diff --git a/README.md b/README.md
index d63a6a1..dc8b816 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,189 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture,
 Project 1 - Flocking**
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Nick Moon
+  * [LinkedIn](https://www.linkedin.com/in/nick-moon1/), [personal website](https://nicholasmoon.github.io/)
+* Tested on: Windows 10, AMD Ryzen 9 5900HS @ 3.0GHz 32GB, NVIDIA RTX 3060 Laptop 6GB (Personal Laptop)
 
-### (TODO: Your README)
+**This is a Reynolds Boids implementation in C++ using CUDA for GPU acceleration of the simulation. This allows for
+simulating even 5,000,000 boids at real-time rates (>30 fps).**
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+
+### Results
+
+![](images/results/5000000_boids_smaller.gif)
+
+*Figure 1: 5,000,000 boids, 1.75 dt, 400.0 scene scale, 1.5 max speed*
+
+
+![](images/results/5000_boids_02_dt.gif)
+*Figure 2: 5,000 boids, 0.2 dt timestep*
+
+
+![](images/results/5000_boids_1_dt.gif)
+*Figure 3: 5,000 boids, 1.0 dt timestep*
+
+**For more GIFs of the simulator with different settings, see images/results.**
+
+## Implementation
+
+#### Boid Flocking
+
+In the Boids flocking simulation, particles representing birds or fish
+(boids) move around the simulation space according to three rules:
+
+1. Cohesion - boids move towards the perceived center of mass of their neighbors
+2. Separation - boids avoid getting too close to their neighbors
+3. Alignment - boids generally try to move with the same direction and speed as
+their neighbors
+
+These three rules specify a boid's velocity change in a timestep.
+At every timestep, a boid thus has to look at each of its neighboring boids
+and compute the velocity change contribution from each of the three rules.
+
+In pseudocode, these three rules are as follows:
+
+**Rule 1: Boids try to fly towards the centre of mass of neighbouring boids**
+
+```
+function rule1(Boid boid)
+
+    Vector perceived_center
+
+    foreach Boid b:
+        if b != boid and distance(b, boid) < rule1Distance then
+            perceived_center += b.position
+        endif
+    end
+
+    perceived_center /= number_of_neighbors
+
+    return (perceived_center - boid.position) * rule1Scale
+end
+```
+
+**Rule 2: Boids try to keep a small distance away from other objects (including other boids)**
+
+```
+function rule2(Boid boid)
+
+    Vector c = 0
+
+    foreach Boid b
+        if b != boid and distance(b, boid) < rule2Distance then
+            c -= (b.position - boid.position)
+        endif
+    end
+
+    return c * rule2Scale
+end
+```
+
+**Rule 3: Boids try to match velocity with near boids**
+
+```
+function rule3(Boid boid)
+
+    Vector perceived_velocity
+
+    foreach Boid b
+        if b != boid and distance(b, boid) < rule3Distance then
+            perceived_velocity += b.velocity
+        endif
+    end
+
+    perceived_velocity /= number_of_neighbors
+
+    return perceived_velocity * rule3Scale
+end
+```
+
+#### Naive Implementation
+
+The naive implementation simply checks every boid against every other boid to determine which boids are
+within its neighborhood (i.e. within some max distance of itself). Ping-pong buffers are needed for the
+velocity values: writing a boid's new velocity requires reading the velocities of its neighbors, so reading
+and writing a single shared velocity array in the same kernel would be a race condition. Instead, two
+buffers are allocated, and the program switches which one is the "active" buffer at the end of each time step.
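+
+As a minimal sketch of this ping-pong pattern (illustrative; these are the project's
+actual kernel names, and the full step function appears in src/kernel.cu further down
+this diff):
+
+```
+// each thread reads only dev_vel1 and writes only dev_vel2, so no thread
+// ever reads a slot that another thread may be writing this step
+kernUpdateVelocityBruteForce <<< numBlocks, blockSize >>> (numObjects, dev_pos, dev_vel1, dev_vel2);
+kernUpdatePos <<< numBlocks, blockSize >>> (numObjects, dt, dev_pos, dev_vel1);
+
+// swap roles: the freshly written buffer becomes the one read next step
+glm::vec3* temp = dev_vel1;
+dev_vel1 = dev_vel2;
+dev_vel2 = temp;
+```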
+
+#### Uniform Grid-Based Implementation
+
+One way to optimize this simulation, given that the boids are confined to an axis-aligned cube volume,
+is to cut this cube into uniform sub-cube chunks (cells). Each boid then only needs to check the grid cells
+that fall within the maximum neighborhood search distance and retrieve the boid data of those cells. If the
+cell width is set to 2x the maximum search distance, then only 8 neighboring cells need to be checked:
+the cells in the 2x2x2 block around the current boid. Likewise, if the cell width is about equal to the
+maximum search distance, then a 3x3x3 block (27 cells) needs to be searched.
+This means only a subspace of the entire volume, and thus a subset of all the boids, is searched per boid,
+and it also allows "skipping" empty cells, i.e. cells which contain no boids.
+
+#### Coherent Uniform Grid-Based Implementation
+
+A way to optimize this approach even further is to get rid of the array that maps between the boids sorted
+by grid-cell index and their original order. Instead, the velocity and position buffers themselves are
+rearranged into the sorted order. This requires only one additional position buffer to ping-pong with while
+rearranging the values. This optimization speeds up memory accesses when iterating through a grid cell,
+as all of the position and velocity data of the boids in that cell is contiguous in memory.
+
+#### Extra Credit
+
+I also implemented the extra credit to optimize and parameterize the grid-based looping to handle arbitrary
+cell widths. This involved calculating the number of cells that need to be checked in the x, y, z directions
+of the cell grid, and then using these values to offset the grid cell indices from the current boid's grid
+cell index (see the sketch below). This feature is how I generated Figure 7.
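+
+A simplified sketch of the per-axis cell-range computation (illustrative only -- the
+actual indexing lives in the neighbor-search kernels in src/kernel.cu below; `posInCells`
+is assumed to be the boid position expressed in cell units, i.e. `(pos - gridMin) * inverseCellWidth`):
+
+```
+// inclusive [min, max] range of cells to search along one axis
+float radiusInCells = maxSearchDistance * gridInverseCellWidth;
+int minCell = imax(0, (int)floorf(posInCells - radiusInCells));
+int maxCell = imin(gridSideCount - 1, (int)floorf(posInCells + radiusInCells));
+
+// cell width == 2 * search distance -> at most 2 cells per axis (2x2x2 = 8)
+// cell width ==     search distance -> at most 3 cells per axis (3x3x3 = 27)
+```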
+
+
+## Performance Analysis
+
+As can be seen in the two graphs below (Figures 4 and 5), increasing the number of boids simulated results
+in a larger runtime. This is because the more boids that need to be simulated, the more of the hardware is
+in use and the more threads are needed to handle them.
+
+As expected, the grid-based approach **significantly** increased performance, and also reduced the rate at
+which performance falls off as the number of boids rises. This is because only a subset of the overall
+simulation space is checked per boid, which makes boids with very few nearby neighbors very fast to compute,
+but also helps boids with many neighbors.
+
+Also as expected, the coherent grid-based simulation was faster than the original uniform grid-based
+simulation, at points achieving double the FPS and always at least somewhat faster than the scattered
+uniform version. This effect can be seen in Figures 4 and 5 below.
+This is what I had hoped to achieve with this optimization: it improves memory locality, particularly when
+iterating through all the boids in a grid cell and retrieving their position and velocity data.
+
+![](images/figures/boids_vs_fps.png)
+*Figure 4: Effect of number of boids on average FPS (with visualization disabled)*
+
+![](images/figures/boids_vs_fps_with_vis.png)
+*Figure 5: Effect of number of boids on average FPS (with visualization enabled)*
+
+Below is a graph measuring the impact of various block sizes, i.e. the number of threads allocated to each
+block. As shown, the FPS of the grid-based simulation stays relatively constant for block sizes of 256 and
+greater, where it peaks. The reason it is mostly constant is that so many boids are being simulated relative
+to the block size that most block size values allocate threads at roughly the same efficiency; block sizes
+below this level incur a performance penalty. Note that all block sizes measured are multiples of 32: GPU
+threads are grouped into collections of 32 threads, called warps, that execute simultaneously. If a block
+size is not a multiple of 32, some warps will not be filled to capacity, which causes a slowdown (see the
+launch-configuration sketch below Figure 6).
+
+![](images/figures/blocksize_vs_fps.png)
+*Figure 6: Effect of CUDA kernel block size on average FPS for the uniform grid-based simulation*
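+
+The launch configuration behind this measurement (mirroring the step functions in
+src/kernel.cu; `blockSize` is the value being varied here):
+
+```
+// round up so every boid gets a thread; keeping blockSize a multiple of
+// the 32-thread warp size means every warp launched is fully populated
+int numBlocks = (numObjects + blockSize - 1) / blockSize;
+kernUpdatePos <<< numBlocks, blockSize >>> (numObjects, dt, dev_pos, dev_vel1);
+```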
+
+As a final point of analysis, Figure 7 below displays the effect of changing the grid cell width with
+respect to the maximum neighbor search distance each boid uses to gather the nearby boids that influence
+its velocity. As shown, balancing this ratio is important for maintaining the benefits of the grid-based
+system. With too low a resolution (the first data point on the graph), performance is suboptimal because
+the large cells encompass a lot of potentially empty space. With too high a resolution, the benefit of the
+grid-based system is lost: so many neighboring cells must be checked by a single boid that even the
+increased prevalence of empty cells cannot compensate. The graph shows that a cell width approximately
+equal to the maximum search distance is optimal.
+
+![](images/figures/ratio_vs_fps.png)
+*Figure 7: Effect of the ratio of grid cell width to boid neighbor max search distance on average FPS*
+
+### Bloopers
+
+![](images/bloopers/blooper_graydeath.PNG)
+*Blooper caused by accessing wrong index values for boid position and velocity arrays*
+
+**For more bloopers, see images/bloopers.**
diff --git a/images/bloopers/blooper1.gif b/images/bloopers/blooper1.gif
new file mode 100644
index 0000000..1771545
Binary files /dev/null and b/images/bloopers/blooper1.gif differ
diff --git a/images/bloopers/blooper2.gif b/images/bloopers/blooper2.gif
new file mode 100644
index 0000000..f2e16b5
Binary files /dev/null and b/images/bloopers/blooper2.gif differ
diff --git a/images/bloopers/blooper3.gif b/images/bloopers/blooper3.gif
new file mode 100644
index 0000000..2a188fb
Binary files /dev/null and b/images/bloopers/blooper3.gif differ
diff --git a/images/bloopers/blooper_graydeath.PNG b/images/bloopers/blooper_graydeath.PNG
new file mode 100644
index 0000000..6f9c223
Binary files /dev/null and b/images/bloopers/blooper_graydeath.PNG differ
diff --git a/images/bloopers/blooper_nightsky.PNG b/images/bloopers/blooper_nightsky.PNG
new file mode 100644
index 0000000..c4f5ef7
Binary files /dev/null and b/images/bloopers/blooper_nightsky.PNG differ
diff --git a/images/bloopers/blooper_vacuum.PNG b/images/bloopers/blooper_vacuum.PNG
new file mode 100644
index 0000000..04a4477
Binary files /dev/null and b/images/bloopers/blooper_vacuum.PNG differ
diff --git a/images/figures/blocksize_vs_fps.png b/images/figures/blocksize_vs_fps.png
new file mode 100644
index 0000000..7ba6670
Binary files /dev/null and b/images/figures/blocksize_vs_fps.png differ
diff --git a/images/figures/boids_vs_fps.png b/images/figures/boids_vs_fps.png
new file mode 100644
index 0000000..775172b
Binary files /dev/null and b/images/figures/boids_vs_fps.png differ
diff --git a/images/figures/boids_vs_fps_with_vis.png b/images/figures/boids_vs_fps_with_vis.png
new file mode 100644
index 0000000..a9aefbd
Binary files /dev/null and b/images/figures/boids_vs_fps_with_vis.png differ
diff --git a/images/figures/ratio_vs_fps.png b/images/figures/ratio_vs_fps.png
new file mode 100644
index 0000000..a3e63d1
Binary files /dev/null and b/images/figures/ratio_vs_fps.png differ
diff --git a/images/results/5000000_boids.gif b/images/results/5000000_boids.gif
new file mode 100644
index 0000000..7f2d077
Binary files /dev/null and b/images/results/5000000_boids.gif differ
diff --git a/images/results/5000000_boids_smaller.gif b/images/results/5000000_boids_smaller.gif
new file mode 100644
index 0000000..8e0e6dd
Binary files /dev/null and b/images/results/5000000_boids_smaller.gif differ
diff --git a/images/results/500000_boids_1_dt.gif b/images/results/500000_boids_1_dt.gif
new file mode 100644
index 0000000..0fd3d95
Binary files /dev/null and b/images/results/500000_boids_1_dt.gif differ
diff --git a/images/results/5000_boids_02_dt.gif b/images/results/5000_boids_02_dt.gif
new file mode 100644
index 0000000..b335b8d
Binary files /dev/null and b/images/results/5000_boids_02_dt.gif differ
diff --git a/images/results/5000_boids_1_dt.gif b/images/results/5000_boids_1_dt.gif
new file mode 100644
index 0000000..1ca79c4
Binary files /dev/null and b/images/results/5000_boids_1_dt.gif differ
diff --git a/images/results/500_boids_1_dt.gif b/images/results/500_boids_1_dt.gif
new file mode 100644
index 0000000..b3bf090
Binary files /dev/null and b/images/results/500_boids_1_dt.gif differ
diff --git a/images/results/more_distance_half_maxspeed.gif b/images/results/more_distance_half_maxspeed.gif
new file mode 100644
index 0000000..666ba65
Binary files /dev/null and b/images/results/more_distance_half_maxspeed.gif differ
diff --git a/images/results/quarter_distances_maxspeed.gif b/images/results/quarter_distances_maxspeed.gif
new file mode 100644
index 0000000..dfe785f
Binary files /dev/null and b/images/results/quarter_distances_maxspeed.gif differ
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..c9e497a 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -54,6 +54,8 @@ void checkCUDAError(const char *msg, int line = -1) {
 /*! Size of the starting area in simulation space. */
 #define scene_scale 100.0f
 
+#define search_distance_to_cell_width 2.0f
+
 /***********************************************
 * Kernel state (pointers are device pointers) *
 ***********************************************/
@@ -86,6 +88,8 @@ int *dev_gridCellEndIndices;   // to this cell?
 
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3* dev_pos_rearranged;
+
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
 int gridCellCount;
@@ -93,6 +97,8 @@ int gridSideCount;
 float gridCellWidth;
 float gridInverseCellWidth;
 glm::vec3 gridMinimum;
+float numCellsToSearchPerDir;
+float maxSearchDistance;
 
 /******************
 * initSimulation *
 ******************/
@@ -157,7 +163,14 @@ void Boids::initSimulation(int N) {
   checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
 
   // LOOK-2.1 computing grid params
-  gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
+  maxSearchDistance = std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
+  gridCellWidth = search_distance_to_cell_width * maxSearchDistance;
+
+  // Extra Credit -- Grid-Loop Optimization
+  // calculate number of cells in each direction encompassed by max search distance
+  float searchRadiusToCellWidthRatio = maxSearchDistance / gridCellWidth;
+  numCellsToSearchPerDir = glm::floor(searchRadiusToCellWidthRatio * 1.99) + 2;
+
   int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
   gridSideCount = 2 * halfSideCount;
 
@@ -169,6 +182,22 @@ void Boids::initSimulation(int N) {
   gridMinimum.z -= halfGridWidth;
 
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
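+  // per-boid buffers: each boid's grid-cell index and its index into pos/vel;
+  // per-cell buffers: the start/end span of each cell in the sorted boid arrays;
+  // plus a scratch position buffer for the coherent-grid reshuffle (2.3)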
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+  cudaMalloc((void**)&dev_pos_rearranged, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_pos_rearranged failed!");
+
+  cudaDeviceSynchronize();
 }
 
@@ -233,7 +262,45 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos
   // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
   // Rule 2: boids try to stay a distance d away from each other
   // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+  int rule1NeighborCount = 0;
+  int rule2NeighborCount = 0;
+  int rule3NeighborCount = 0;
+  glm::vec3 rule1Center = glm::vec3(0, 0, 0);
+  glm::vec3 rule2Seperate = glm::vec3(0, 0, 0);
+  glm::vec3 rule3Cohesion = glm::vec3(0, 0, 0);
+
+  for (int i = 0; i < N; ++i) {
+    if (i != iSelf) {
+      float distance = glm::length(pos[iSelf] - pos[i]);
+      if (distance < rule1Distance) {
+        // Rule 1: Cohesion: boids fly towards the center of mass of neighboring boids
+        rule1Center += pos[i];
+        rule1NeighborCount += 1;
+      }
+      if (distance < rule2Distance) {
+        // Rule 2: Separation: boids try to keep a small distance away from each other
+        rule2Seperate -= pos[i] - pos[iSelf];
+      }
+      if (distance < rule3Distance) {
+        // Rule 3: Alignment: boids try to match the velocities of neighboring boids
+        rule3Cohesion += vel[i];
+        rule3NeighborCount += 1;
+      }
+    }
+  }
+
+  if (rule1NeighborCount > 0) {
+    rule1Center /= rule1NeighborCount;
+    rule1Center = rule1Center - pos[iSelf];
+    rule1Center *= rule1Scale;
+  }
+  rule2Seperate *= rule2Scale;
+  if (rule3NeighborCount > 0) {
+    rule3Cohesion /= rule3NeighborCount;
+    rule3Cohesion *= rule3Scale;
+  }
+
+  return vel[iSelf] + rule1Center + rule2Seperate + rule3Cohesion;
 }
 
 /**
@@ -243,6 +310,17 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
   glm::vec3 *vel1, glm::vec3 *vel2) {
   // Compute a new velocity based on pos and vel1
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+  glm::vec3 newVel = computeVelocityChange(N, index, pos, vel1);
+  float newVelSpeed = glm::length(newVel);
+  if (newVelSpeed > maxSpeed) {
+    newVel = (newVel / newVelSpeed) * maxSpeed;
+  }
+  vel2[index] = newVel;
+
   // Clamp the speed
   // Record the new velocity into vel2. Question: why NOT vel1?
 }
 
@@ -289,6 +367,19 @@ __global__ void kernComputeIndices(int N, int gridResolution,
   // - Label each boid with the index of its grid cell.
   // - Set up a parallel array of integer indices as pointers to the actual
   //   boid data in pos and vel1/vel2
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+  glm::vec3 thisBoidPos = pos[index];
+
+  // get cell this boid is in
+  glm::vec3 thisBoidGrid = (thisBoidPos - gridMin) * inverseCellWidth;
+  int thisBoidGridIndex = gridIndex3Dto1D(thisBoidGrid.x, thisBoidGrid.y, thisBoidGrid.z, gridResolution);
+
+  // set our indices
+  indices[index] = index;
+  gridIndices[index] = thisBoidGridIndex;
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
 //          does not enclose any boids
@@ -306,6 +397,25 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+  int thisGridIndex = particleGridIndices[index];
+
+  // a boid opens its cell if it is the first boid overall, or if its cell
+  // differs from the previous boid's (which also closes the previous cell);
+  // the last boid must additionally close its own cell, so that check is
+  // separate (an else-if would drop the last cell's start index whenever
+  // the final boid also begins a new cell)
+  if (index == 0) {
+    gridCellStartIndices[thisGridIndex] = index;
+  }
+  else if (thisGridIndex != particleGridIndices[index - 1]) {
+    gridCellStartIndices[thisGridIndex] = index;
+    gridCellEndIndices[particleGridIndices[index - 1]] = index - 1;
+  }
+  if (index == N - 1) {
+    gridCellEndIndices[thisGridIndex] = index;
+  }
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
@@ -313,7 +423,7 @@ __global__ void kernUpdateVelNeighborSearchScattered(
   int N, int gridResolution, glm::vec3 gridMin,
   float inverseCellWidth, float cellWidth,
   int *gridCellStartIndices, int *gridCellEndIndices,
   int *particleArrayIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2, int numCellsToSearchPerDir) {
   // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
   // the number of boids that need to be checked.
   // - Identify the grid cell that this particle is in
   // - Identify which cells may contain neighbors. This isn't always 8.
   // - For each cell, read the start/end indices in the boid pointer array.
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+
+  int gridArrayIndexNum = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (gridArrayIndexNum >= N) {
+    return;
+  }
+  int index = particleArrayIndices[gridArrayIndexNum];
+  glm::vec3 thisBoidPos = pos[index];
+
+  // get cell this boid is in
+  glm::vec3 thisBoidGrid = (thisBoidPos - gridMin) * inverseCellWidth;
+  int thisBoidGridIndex = gridIndex3Dto1D(thisBoidGrid[0], thisBoidGrid[1], thisBoidGrid[2], gridResolution);
+
+  int lastCellIndex = (gridResolution * gridResolution * gridResolution) - 1;
+
+  int minCells[3];
+  int maxCells[3];
+  int cellsToAllocate = numCellsToSearchPerDir - 2;
+  for (int i = 0; i < 3; ++i) {
+    // offsets needed to calculate each of the 8 cells to check
+    int gridOffset = 1;
+    if (glm::fract(thisBoidGrid[i]) <= 0.5) {
+      gridOffset = -1;
+    }
+    int oneEnd = thisBoidGrid[i];
+    int otherEnd = oneEnd + gridOffset;
+
+    minCells[i] = imin(oneEnd, otherEnd);
+    maxCells[i] = imax(oneEnd, otherEnd);
+
+    // handle case where there are more than 2 cells in each direction you must search
+    if (numCellsToSearchPerDir > 2) {
+      if (gridOffset == 1) {
+        minCells[i] -= (cellsToAllocate + 1) / 2;
+        maxCells[i] += cellsToAllocate / 2;
+      }
+      else {
+        minCells[i] -= cellsToAllocate / 2;
+        maxCells[i] += (cellsToAllocate + 1) / 2;
+      }
+    }
+  }
+
+  // search neighboring cells for nearby boids
+  int rule1NeighborCount = 0;
+  int rule3NeighborCount = 0;
+  glm::vec3 rule1Center = glm::vec3(0, 0, 0);
+  glm::vec3 rule2Seperate = glm::vec3(0, 0, 0);
+  glm::vec3 rule3Cohesion = glm::vec3(0, 0, 0);
+
+  // loop through the cells we need to test
+  for (int zi = minCells[2]; zi <= maxCells[2]; ++zi) {
+    for (int yi = minCells[1]; yi <= maxCells[1]; ++yi) {
+      for (int xi = minCells[0]; xi <= maxCells[0]; ++xi) {
+
+        // get cell index based on coordinates, skip cell if past grid bounds
+        int currCellIndex = gridIndex3Dto1D(xi, yi, zi, gridResolution);
+        if (currCellIndex < 0 || currCellIndex > lastCellIndex) continue;
+
+        // get cell start and end indices, skip cell if it is empty (start/end = -1)
+        int currCellStart = gridCellStartIndices[currCellIndex];
+        int currCellEnd = gridCellEndIndices[currCellIndex];
+        if (currCellStart == -1 || currCellEnd == -1) continue;
+
+        // loop through all boids in this cell
+        for (int boidNum = currCellStart; boidNum <= currCellEnd; ++boidNum) {
+          int boidInCell = particleArrayIndices[boidNum];
+          if (boidInCell == index) continue;
+
+          float distance = glm::length(pos[index] - pos[boidInCell]);
+          if (distance < rule1Distance) {
+            // Rule 1: Cohesion: boids fly towards the center of mass of neighboring boids
+            rule1Center += pos[boidInCell];
+            rule1NeighborCount += 1;
+          }
+          if (distance < rule2Distance) {
+            // Rule 2: Separation: boids try to keep a small distance away from each other
+            rule2Seperate -= pos[boidInCell] - pos[index];
+          }
+          if (distance < rule3Distance) {
+            // Rule 3: Alignment: boids try to match the velocities of neighboring boids
+            rule3Cohesion += vel1[boidInCell];
+            rule3NeighborCount += 1;
+          }
+        }
+      }
+    }
+  }
+
+  if (rule1NeighborCount > 0) {
+    rule1Center /= rule1NeighborCount;
+    rule1Center = rule1Center - pos[index];
+    rule1Center *= rule1Scale;
+  }
+  rule2Seperate *= rule2Scale;
+  if (rule3NeighborCount > 0) {
+    rule3Cohesion /= rule3NeighborCount;
+    rule3Cohesion *= rule3Scale;
+  }
+  glm::vec3 newVel = vel1[index] + rule1Center + rule2Seperate + rule3Cohesion;
+  float newVelSpeed = glm::length(newVel);
+  if (newVelSpeed > maxSpeed) {
+    newVel = (newVel / newVelSpeed) * maxSpeed;
+  }
+  vel2[index] = newVel;
 }
 
 __global__ void kernUpdateVelNeighborSearchCoherent(
   int N, int gridResolution, glm::vec3 gridMin,
   float inverseCellWidth, float cellWidth,
   int *gridCellStartIndices, int *gridCellEndIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2, int numCellsToSearchPerDir) {
   // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
   // except with one less level of indirection.
   // This should expect gridCellStartIndices and gridCellEndIndices to refer
@@ -341,6 +556,139 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+  glm::vec3 thisBoidPos = pos[index];
+
+  // get cell this boid is in
+  glm::vec3 thisBoidGrid = (thisBoidPos - gridMin) * inverseCellWidth;
+  int thisBoidGridIndex = gridIndex3Dto1D(thisBoidGrid[0], thisBoidGrid[1], thisBoidGrid[2], gridResolution);
+
+  int lastCellIndex = (gridResolution * gridResolution * gridResolution) - 1;
+
+  int minCells[3];
+  int maxCells[3];
+  int cellsToAllocate = numCellsToSearchPerDir - 2;
+  for (int i = 0; i < 3; ++i) {
+    // offsets needed to calculate each of the 8 cells to check
+    int gridOffset = 1;
+    if (glm::fract(thisBoidGrid[i]) <= 0.5) {
+      gridOffset = -1;
+    }
+    int oneEnd = thisBoidGrid[i];
+    int otherEnd = oneEnd + gridOffset;
+
+    minCells[i] = imin(oneEnd, otherEnd);
+    maxCells[i] = imax(oneEnd, otherEnd);
+
+    // handle case where there are more than 2 cells in each direction you must search
+    if (numCellsToSearchPerDir > 2) {
+      if (gridOffset == 1) {
+        minCells[i] -= (cellsToAllocate + 1) / 2;
+        maxCells[i] += cellsToAllocate / 2;
+      }
+      else {
+        minCells[i] -= cellsToAllocate / 2;
+        maxCells[i] += (cellsToAllocate + 1) / 2;
+      }
+    }
+  }
+
+  // search neighboring cells for nearby boids
+  int rule1NeighborCount = 0;
+  int rule3NeighborCount = 0;
+  glm::vec3 rule1Center = glm::vec3(0, 0, 0);
+  glm::vec3 rule2Seperate = glm::vec3(0, 0, 0);
+  glm::vec3 rule3Cohesion = glm::vec3(0, 0, 0);
+
+  // loop through the cells we need to test
+  for (int zi = minCells[2]; zi <= maxCells[2]; ++zi) {
+    for (int yi = minCells[1]; yi <= maxCells[1]; ++yi) {
+      for (int xi = minCells[0]; xi <= maxCells[0]; ++xi) {
+
+        // get cell index based on coordinates, skip cell if past grid bounds
+        int currCellIndex = gridIndex3Dto1D(xi, yi, zi, gridResolution);
+        if (currCellIndex < 0 || currCellIndex > lastCellIndex) continue;
+
+        // get cell start and end indices, skip cell if it is empty (start/end = -1)
+        int currCellStart = gridCellStartIndices[currCellIndex];
+        int currCellEnd = gridCellEndIndices[currCellIndex];
+        if (currCellStart == -1 || currCellEnd == -1) continue;
+
+        // loop through all boids in this cell; the data is already sorted, so
+        // the array position is the boid index (no particleArrayIndices lookup)
+        for (int boidNum = currCellStart; boidNum <= currCellEnd; ++boidNum) {
+          int boidInCell = boidNum;
+          if (boidInCell == index) continue;
+
+          float distance = glm::length(pos[index] - pos[boidInCell]);
+          if (distance < rule1Distance) {
+            // Rule 1: Cohesion: boids fly towards the center of mass of neighboring boids
+            rule1Center += pos[boidInCell];
+            rule1NeighborCount += 1;
+          }
+          if (distance < rule2Distance) {
+            // Rule 2: Separation: boids try to keep a small distance away from each other
+            rule2Seperate -= pos[boidInCell] - pos[index];
+          }
+          if (distance < rule3Distance) {
+            // Rule 3: Alignment: boids try to match the velocities of neighboring boids
+            rule3Cohesion += vel1[boidInCell];
+            rule3NeighborCount += 1;
+          }
+        }
+      }
+    }
+  }
+
+  if (rule1NeighborCount > 0) {
+    rule1Center /= rule1NeighborCount;
+    rule1Center = rule1Center - pos[index];
+    rule1Center *= rule1Scale;
+  }
+  rule2Seperate *= rule2Scale;
+  if (rule3NeighborCount > 0) {
+    rule3Cohesion /= rule3NeighborCount;
+    rule3Cohesion *= rule3Scale;
+  }
+  glm::vec3 newVel = vel1[index] + rule1Center + rule2Seperate + rule3Cohesion;
+  float newVelSpeed = glm::length(newVel);
+  if (newVelSpeed > maxSpeed) {
+    newVel = (newVel / newVelSpeed) * maxSpeed;
+  }
+  vel2[index] = newVel;
+
+}
+
+__global__ void kernRearrangePosVel(int N, int* particleArrayIndices,
+  glm::vec3* pos, glm::vec3* pos_rearranged, glm::vec3* vel, glm::vec3* vel_rearranged) {
+  // TODO-2.3 - gather pos and vel into grid-cell-sorted order so that the
+  // data for each cell is contiguous in memory for the coherent search
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+  int oldIndex = particleArrayIndices[index];
+  pos_rearranged[index] = pos[oldIndex];
+  vel_rearranged[index] = vel[oldIndex];
+}
+
+__global__ void kernUnRearrangePosVel(int N, int* particleArrayIndices,
+  glm::vec3* pos, glm::vec3* pos_rearranged, glm::vec3* vel, glm::vec3* vel_rearranged) {
+  // TODO-2.3 - scatter the rearranged pos and vel back to the original boid order
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+  pos[particleArrayIndices[index]] = pos_rearranged[index];
+  vel[particleArrayIndices[index]] = vel_rearranged[index];
 }
 
 /**
@@ -349,6 +697,12 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
   // TODO-1.2 ping-pong the velocity buffers
+  int numBlocks = (numObjects + blockSize - 1) / blockSize;
+  kernUpdateVelocityBruteForce <<< numBlocks, blockSize >>> (numObjects, dev_pos, dev_vel1, dev_vel2);
+  kernUpdatePos <<< numBlocks, blockSize >>> (numObjects, dt, dev_pos, dev_vel1);
+  glm::vec3* temp = dev_vel1;
+  dev_vel1 = dev_vel2;
+  dev_vel2 = temp;
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
@@ -364,6 +718,37 @@ void Boids::stepSimulationScatteredGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed
+  int numBlocksForPerBoid = (numObjects + blockSize - 1) / blockSize;
+  int numBlocksForPerCell = (gridCellCount + blockSize - 1) / blockSize;
+
+  // label each particle with its array index as well as its grid index
+  kernComputeIndices <<< numBlocksForPerBoid, blockSize >>> (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth,
+    dev_pos, dev_particleArrayIndices, dev_particleGridIndices
+  );
+
+  dev_thrust_particleArrayIndices = thrust::device_pointer_cast(dev_particleArrayIndices);
+  dev_thrust_particleGridIndices = thrust::device_pointer_cast(dev_particleGridIndices);
+
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+  // reset start and end indices arrays
+  kernResetIntBuffer <<< numBlocksForPerCell, blockSize >>> (gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer <<< numBlocksForPerCell, blockSize >>> (gridCellCount, dev_gridCellEndIndices, -1);
+
+  // fill in start and end index values of each cell in the sorted cell array
+  kernIdentifyCellStartEnd <<< numBlocksForPerBoid, blockSize >>> (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+  // generate new velocities
+  kernUpdateVelNeighborSearchScattered <<< numBlocksForPerBoid, blockSize >>> (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth,
+    dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices,
+    dev_pos, dev_vel1, dev_vel2, numCellsToSearchPerDir
+  );
+
+  // update position with new velocities
+  kernUpdatePos <<< numBlocksForPerBoid, blockSize >>> (numObjects, dt, dev_pos, dev_vel2);
+  glm::vec3* temp = dev_vel1;
+  dev_vel1 = dev_vel2;
+  dev_vel2 = temp;
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
@@ -382,6 +767,44 @@ void Boids::stepSimulationCoherentGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
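+  // buffer choreography for the coherent path: dev_vel2 temporarily holds the
+  // rearranged copy of dev_vel1, the neighbor search then writes fresh
+  // velocities back into dev_vel1 (already in cell-sorted order), and only
+  // the position buffers are pointer-swapped at the end of the step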
+
+  int numBlocksForPerBoid = (numObjects + blockSize - 1) / blockSize;
+  int numBlocksForPerCell = (gridCellCount + blockSize - 1) / blockSize;
+
+  // label each particle with its array index as well as its grid index
+  kernComputeIndices <<< numBlocksForPerBoid, blockSize >>> (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth,
+    dev_pos, dev_particleArrayIndices, dev_particleGridIndices
+  );
+
+  // rearrange pos and vel by order of sorted gridIndices
+  dev_thrust_particleArrayIndices = thrust::device_pointer_cast(dev_particleArrayIndices);
+  dev_thrust_particleGridIndices = thrust::device_pointer_cast(dev_particleGridIndices);
+
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+  kernRearrangePosVel <<< numBlocksForPerBoid, blockSize >>> (numObjects, dev_particleArrayIndices,
+    dev_pos, dev_pos_rearranged, dev_vel1, dev_vel2); // use vel2 as rearranged vel1
+
+  // reset start and end indices arrays
+  kernResetIntBuffer <<< numBlocksForPerCell, blockSize >>> (gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer <<< numBlocksForPerCell, blockSize >>> (gridCellCount, dev_gridCellEndIndices, -1);
+
+  // fill in start and end index values of each cell in the sorted cell array
+  kernIdentifyCellStartEnd <<< numBlocksForPerBoid, blockSize >>> (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+  // generate new velocities
+  kernUpdateVelNeighborSearchCoherent <<< numBlocksForPerBoid, blockSize >>> (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth,
+    dev_gridCellStartIndices, dev_gridCellEndIndices,
+    dev_pos_rearranged, dev_vel2, dev_vel1, numCellsToSearchPerDir
+  ); // store updated vel back into vel1
+
+  // update position with new velocities
+  kernUpdatePos <<< numBlocksForPerBoid, blockSize >>> (numObjects, dt, dev_pos_rearranged, dev_vel1);
+
+  glm::vec3* temp = dev_pos;
+  dev_pos = dev_pos_rearranged;
+  dev_pos_rearranged = temp;
 }
 
 void Boids::endSimulation() {
@@ -390,6 +813,12 @@ void Boids::endSimulation() {
   cudaFree(dev_pos);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_gridCellEndIndices);
+
+  cudaFree(dev_pos_rearranged);
 }
 
 void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..c5d20b1 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,22 +14,49 @@
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
 #define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
+
+#define DATA_COLLECTION_AVG_FPS 1
+
+#define FRAMES_TO_RECORD 20
+
+#ifdef DATA_COLLECTION_AVG_FPS
+int fps_record[FRAMES_TO_RECORD];
+#endif
+
 // LOOK-1.2 - change this to adjust particle count in the simulation
 const int N_FOR_VIS = 5000;
 const float DT = 0.2f;
 
+int wait_for_gif_recorder = 0;
+
 /**
 * C main function.
 */
 int main(int argc, char* argv[]) {
   projectName = "565 CUDA Intro: Boids";
+  using namespace std::this_thread; // sleep_for, sleep_until
+  using namespace std::chrono;      // nanoseconds, system_clock, seconds
+
   if (init(argc, argv)) {
+    //sleep_for(seconds(10));
     mainLoop();
     Boids::endSimulation();
+#ifdef DATA_COLLECTION_AVG_FPS
+    int average_fps = 0;
+    std::cout << "{ ";
+    for (int i = 0; i < FRAMES_TO_RECORD; ++i) {
+      average_fps += fps_record[i];
+      std::cout << fps_record[i] << " ";
+    }
+    std::cout << " }" << std::endl;
+    average_fps /= FRAMES_TO_RECORD;
+    std::cout << N_FOR_VIS << " " << average_fps << std::endl;
+#endif
     return 0;
   } else {
     return 1;
@@ -217,10 +244,16 @@ void initShaders(GLuint * program) {
   double timebase = 0;
   int frame = 0;
+  int seconds = 0;
+
   Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure
                      // your CUDA development setup is ready to go.
 
   while (!glfwWindowShouldClose(window)) {
+#ifdef DATA_COLLECTION_AVG_FPS
+    if (seconds > FRAMES_TO_RECORD) break;
+#endif
+
     glfwPollEvents();
 
     frame++;
@@ -230,6 +263,10 @@ void initShaders(GLuint * program) {
       fps = frame / (time - timebase);
       timebase = time;
       frame = 0;
+#ifdef DATA_COLLECTION_AVG_FPS
+      if (seconds > 0) fps_record[seconds - 1] = fps;
+#endif
+      ++seconds;
     }
 
     runCUDA();
diff --git a/src/main.hpp b/src/main.hpp
index 88e9df7..fce24c0 100644
--- a/src/main.hpp
+++ b/src/main.hpp
@@ -5,6 +5,8 @@
 #include
 #include
 #include
+#include <thread>
+#include <chrono>
 #include
 #include
 #include
@@ -43,6 +45,7 @@ int pointSize = 2;
 // For camera controls
 bool leftMousePressed = false;
 bool rightMousePressed = false;
+bool middleMousePressed = false;
 double lastX;
 double lastY;
 float theta = 1.22f;