1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
/cuda-introduction/build
3 changes: 3 additions & 0 deletions .vs/CMakeWorkspaceSettings.json
@@ -0,0 +1,3 @@
{
"enableCMake": false
}
Binary file added .vs/Project0-Getting-Started/v17/.wsuo
Binary file added .vs/Project0-Getting-Started/v17/Browse.VC.db
3 changes: 3 additions & 0 deletions .vs/ProjectSettings.json
@@ -0,0 +1,3 @@
{
"CurrentProjectSetting": null
}
6 changes: 6 additions & 0 deletions .vs/VSWorkspaceState.json
@@ -0,0 +1,6 @@
{
"ExpandedNodes": [
""
],
"PreviewInSolutionExplorer": false
}
Binary file added .vs/slnx.sqlite
32 changes: 28 additions & 4 deletions README.md
@@ -3,11 +3,35 @@ Project 0 Getting Started

**University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 0**

* (TODO) YOUR NAME HERE
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
* Griffin Evans
* [email protected], [personal website](evanses.com/griffin)
* Tested on lab computer: Windows 11, i9-12900F @ 2.40GHz 22GB, NVIDIA GeForce RTX 3090 (Levine 057 #3)

### (TODO: Your README)
### My README

Include screenshots, analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)

2.1.2

![](images/Screenshot%202025-08-28%20143319.png)

2.1.3

![](images/Screenshot%202025-08-28%20151313.png)

2.1.4

![](images/Screenshot%202025-08-29%20161428.png)

2.1.5 — had error as described in https://edstem.org/us/courses/81464/discussion/6880884

![](images/Screenshot%202025-08-29%20161908.png)

2.2

![](images/Screenshot%202025-08-29%20162107.png)

2.3

![](images/Screenshot%202025-08-29%20162534.png)
15 changes: 15 additions & 0 deletions cuda-gl-check/CMakeSettings.json
@@ -0,0 +1,15 @@
{
"configurations": [
{
"name": "x64-Debug",
"generator": "Ninja",
"configurationType": "Debug",
"inheritEnvironments": [ "msvc_x64_x64" ],
"buildRoot": "${projectDir}\\out\\build\\${name}",
"installRoot": "${projectDir}\\out\\install\\${name}",
"cmakeCommandArgs": "",
"buildCommandArgs": "",
"ctestCommandArgs": ""
}
]
}
2 changes: 1 addition & 1 deletion cuda-gl-check/src/main.cpp
@@ -11,7 +11,7 @@
*/
int main(int argc, char* argv[]) {
// TODO: Change this line to use your name!
m_yourName = "TODO: YOUR NAME HERE";
m_yourName = "Griffin Evans";

if (init(argc, argv)) {
mainLoop();
2 changes: 1 addition & 1 deletion cuda-introduction/source/common.cu
@@ -9,7 +9,7 @@ unsigned divup(unsigned size, unsigned div)
{
// TODO: implement a 1 line function to return the divup operation.
// Note: You only need to use addition, subtraction, and division operations.
return 0;
return (size - 1) / div + 1;
}

void clearHostAndDeviceArray(float *res, float *dev_res, unsigned size, const int value)
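A quick sanity check of the `divup` above (a hypothetical snippet, not part of the repo): it computes the ceiling of `size / div` using only integer operations, and it assumes `size > 0`, since `size == 0` would underflow the unsigned subtraction.

```cpp
#include <cassert>

// divup as implemented in common.cu; assumes size > 0.
unsigned divup(unsigned size, unsigned div) { return (size - 1) / div + 1; }

int main() {
    assert(divup(64, 128) == 1);   // a partial block still needs one launch
    assert(divup(128, 128) == 1);  // exact multiple: no extra block
    assert(divup(129, 128) == 2);  // one spilled element adds a whole block
    assert(divup(771, 32) == 25);  // 24 * 32 = 768 < 771, so 25 blocks
    return 0;
}
```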
52 changes: 41 additions & 11 deletions cuda-introduction/source/matmul.cu
@@ -12,17 +12,24 @@ __global__ void matrixMultiplicationNaive(float* const matrixP, const float* con
{
// TODO 10a: Compute the P matrix global index for each thread along x and y dimensions.
// Remember that each thread of the kernel computes the result of 1 unique element of P
unsigned px;
unsigned py;
unsigned px = blockIdx.x * blockDim.x + threadIdx.x;
unsigned py = blockIdx.y * blockDim.y + threadIdx.y;

// TODO 10b: Check if px or py are out of bounds. If they are, return.
if (px >= sizeMX || py >= sizeNY) {
return;
}

// TODO 10c: Compute the dot product for the P element in each thread
// This loop will be the same as the host loop
float dot = 0.0;
for (unsigned k = 0; k < sizeXY; ++k) {
dot += matrixM[k * sizeMX + px] * matrixN[py * sizeXY + k];
}

// TODO 10d: Copy dot to P matrix
// matrixP[] = dot;
matrixP[py * sizeMX + px] = dot;
}

int main(int argc, char *argv[])
@@ -31,19 +38,19 @@ int main(int argc, char *argv[])
// Then try large multiple-block square matrix like 64x64 up to 2048x2048.
// Then try square, non-power-of-two like 15x15, 33x33, 67x67, 123x123, and 771x771
// Then try rectangles with powers of two and then non-power-of-two.
const unsigned sizeMX = 0;
const unsigned sizeXY = 0;
const unsigned sizeNY = 0;
const unsigned sizeMX = 245;
const unsigned sizeXY = 2049;
const unsigned sizeNY = 771;

// TODO 2: Allocate host 1D arrays for:
// matrixM[sizeMX, sizeXY]
// matrixN[sizeXY, sizeNY]
// matrixP[sizeMX, sizeNY]
// matrixPGold[sizeMX, sizeNY]
float* matrixM;
float* matrixN;
float* matrixP;
float* matrixPGold;
float* matrixM = new float[sizeMX * sizeXY];
float* matrixN = new float[sizeXY * sizeNY];
float* matrixP = new float[sizeMX * sizeNY];
float* matrixPGold = new float[sizeMX * sizeNY];

// LOOK: Setup random number generator and fill host arrays and the scalar a.
std::random_device rd;
@@ -65,13 +72,30 @@
// for k -> 0 to sizeXY
// dot = m[k, px] * n[py, k]
// matrixPGold[py, px] = dot
for (unsigned py = 0; py < sizeNY; ++py) {
for (unsigned px = 0; px < sizeMX; ++px) {
float dot = 0.f;
for (unsigned k = 0; k < sizeXY; ++k) {
dot += matrixM[k * sizeMX + px] * matrixN[py * sizeXY + k];
}
matrixPGold[py * sizeMX + px] = dot;
}
}

// Device arrays
float *d_matrixM, *d_matrixN, *d_matrixP;

// TODO 4: Allocate memory on the device for d_matrixM, d_matrixN, d_matrixP.
CUDA(cudaMalloc((void**)&d_matrixM, sizeMX * sizeXY * sizeof(float)));
CUDA(cudaMalloc((void**)&d_matrixN, sizeXY * sizeNY * sizeof(float)));
CUDA(cudaMalloc((void**)&d_matrixP, sizeMX * sizeNY * sizeof(float)));

// TODO 5: Copy array contents of M and N from the host (CPU) to the device (GPU)
CUDA(cudaMemcpy(d_matrixM, matrixM, sizeMX * sizeXY * sizeof(float), cudaMemcpyHostToDevice));
CUDA(cudaMemcpy(d_matrixN, matrixN, sizeXY * sizeNY * sizeof(float), cudaMemcpyHostToDevice));



CUDA(cudaDeviceSynchronize());

@@ -86,13 +110,16 @@ int main(int argc, char *argv[])
// Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
// HINT: The shape of matrices has no impact on launch configuration
DIMS dims;
dims.dimBlock = dim3(1, 1, 1);
dims.dimGrid = dim3(1, 1, 1);
const unsigned BS_X = 32, BS_Y = 32;
dims.dimBlock = dim3(BS_X, BS_Y, 1);
dims.dimGrid = dim3(divup(sizeMX, BS_X), divup(sizeNY, BS_Y), 1);

// TODO 7: Launch the matrix multiplication kernel
// matrixMultiplicationNaive<<<>>>();
matrixMultiplicationNaive<<<dims.dimGrid, dims.dimBlock>>>(d_matrixP, d_matrixM, d_matrixN, sizeMX, sizeNY, sizeXY);

// TODO 8: copy the answer back to the host (CPU) from the device (GPU)
CUDA(cudaMemcpy(matrixP, d_matrixP, sizeMX * sizeNY * sizeof(float), cudaMemcpyDeviceToHost));

// LOOK: Use compareReferenceAndResult to check the result
compareReferenceAndResult(matrixPGold, matrixP, sizeMX * sizeNY, 1e-3);
@@ -101,6 +128,9 @@ int main(int argc, char *argv[])
////////////////////////////////////////////////////////////

// TODO 9: free device memory using cudaFree
CUDA(cudaFree(d_matrixM));
CUDA(cudaFree(d_matrixN));
CUDA(cudaFree(d_matrixP));

// free host memory
delete[] matrixM;
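The linearized indices in this file are easy to get backwards, so here is a tiny worked example of the same convention (illustration only, not part of the diff). Both `matrixM` and `matrixP` use `sizeMX` as the row width and `matrixN` uses `sizeXY`, which makes the row-major product `P = N * M` with `P` sized `sizeNY x sizeMX`.

```cpp
#include <cassert>

// Hypothetical 2x2 check of the indexing convention in matmul.cu:
//   m[k, px]  = matrixM[k * sizeMX + px]
//   n[py, k]  = matrixN[py * sizeXY + k]
//   p[py, px] = matrixP[py * sizeMX + px]
int main() {
    const unsigned sizeMX = 2, sizeXY = 2, sizeNY = 2;
    const float M[] = {1, 2,   // k = 0 row: px = 0, 1
                       3, 4};  // k = 1 row
    const float N[] = {5, 6,   // py = 0 row: k = 0, 1
                       7, 8};  // py = 1 row
    float P[4];
    for (unsigned py = 0; py < sizeNY; ++py)
        for (unsigned px = 0; px < sizeMX; ++px) {
            float dot = 0.f;
            for (unsigned k = 0; k < sizeXY; ++k)
                dot += M[k * sizeMX + px] * N[py * sizeXY + k];
            P[py * sizeMX + px] = dot;
        }
    // Same convention as the kernel and the gold loop above,
    // e.g. P[0] = M[0][0]*N[0][0] + M[1][0]*N[0][1] = 1*5 + 3*6 = 23.
    assert(P[0] == 23 && P[1] == 34 && P[2] == 31 && P[3] == 46);
    return 0;
}
```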
20 changes: 16 additions & 4 deletions cuda-introduction/source/saxpy.cu
@@ -9,20 +9,21 @@
__global__ void saxpy(float* const z, const float* const x, const float* const y, const float a, const unsigned size)
{
// TODO 9: Compute the global index for each thread.
unsigned idx = 0;
unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;

// TODO 10: Check if idx is out of bounds. If yes, return.
if (idx >= 0)
if (idx >= size)
return;

// TODO 11: Perform the SAXPY operation: z = a * x + y.
z[idx] = a * x[idx] + y[idx];
}

int main(int argc, char *argv[])
{
// TODO 1: Set the size. Start with something simple like 64.
// TODO Optional: Try out these sizes: 256, 1024, 2048, 14, 103, 1025, 3127
const unsigned size = 0;
const unsigned size = 64;

// Host arrays.
float* x = new float[size];
@@ -53,9 +54,15 @@ int main(int argc, char *argv[])

// TODO 2: Allocate memory on the device. Fill in the blanks for d_x, then do the same commands for d_y and d_z.
// CUDA(cudaMalloc((void **)& pointer, size in bytes)));
unsigned sizeInBytes = size * sizeof(float);
CUDA(cudaMalloc((void**)&d_x, sizeInBytes));
CUDA(cudaMalloc((void**)&d_y, sizeInBytes));
CUDA(cudaMalloc((void**)&d_z, sizeInBytes));

// TODO 3: Copy array contents of X and Y from the host (CPU) to the device (GPU). Follow what you did for 2,
// CUDA(cudaMemcpy(dest ptr, source ptr, size in bytes, direction enum));
CUDA(cudaMemcpy(d_x, x, sizeInBytes, cudaMemcpyHostToDevice));
CUDA(cudaMemcpy(d_y, y, sizeInBytes, cudaMemcpyHostToDevice));

CUDA(cudaDeviceSynchronize());

@@ -69,16 +76,18 @@ int main(int argc, char *argv[])
// TODO 4: Setup threads and blocks.
// Start threadsPerBlock as 128, then try out different configurations: 32, 64, 256, 512, 1024
// Use divup to get the number of blocks to launch.
const unsigned threadsPerBlock = 0;
const unsigned threadsPerBlock = 128;

// TODO 5: Implement the divup function in common.cpp
const unsigned blocks = divup(size, threadsPerBlock);

// TODO 6: Launch the GPU kernel with blocks and threadsPerBlock as launch configuration
// saxpy<<< >>> (....);
saxpy<<<blocks, threadsPerBlock>>>(d_z, d_x, d_y, a, size);

// TODO 7: Copy the answer back to the host (CPU) from the device (GPU).
// Copy what you did in 3, except for d_z -> z.
CUDA(cudaMemcpy(z, d_z, sizeInBytes, cudaMemcpyDeviceToHost));

// LOOK: Use postprocess to check the result
compareReferenceAndResult(z_gold, z, size, 1e-6);
@@ -87,6 +96,9 @@ int main(int argc, char *argv[])

// TODO 8: free device memory using cudaFree
// CUDA(cudaFree(device pointer));
CUDA(cudaFree(d_x));
CUDA(cudaFree(d_y));
CUDA(cudaFree(d_z));

// free host memory
delete[] x;
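One thing neither kernel launch in this diff checks is the launch itself: launches fail asynchronously, so a bad configuration stays silent until the next CUDA API call. A common pattern is to poll `cudaGetLastError()` right after the launch (a sketch; it assumes the `CUDA()` macro from `common.cu` validates a returned `cudaError_t`, which is how it is used elsewhere in this diff):

```cpp
saxpy<<<blocks, threadsPerBlock>>>(d_z, d_x, d_y, a, size);
CUDA(cudaGetLastError());       // catches an invalid launch configuration
CUDA(cudaDeviceSynchronize());  // surfaces errors raised inside the kernel
```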
42 changes: 30 additions & 12 deletions cuda-introduction/source/transpose.cu
@@ -19,16 +19,20 @@
__global__ void copyKernel(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY)
{
// TODO 6a: Compute the global index for each thread along x and y dimensions.
unsigned i = 0;
unsigned j = 0;;
unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned j = blockIdx.y * blockDim.y + threadIdx.y;

// TODO 6b: Check if i or j are out of bounds. If they are, return.
if (i >= sizeX || j >= sizeY) {
return;
}

// TODO 6c: Compute global 1D index from i and j
unsigned index = 0;
unsigned index = j * sizeX + i;

// TODO 6d: Copy data from A to B. Note that in copy kernel source and destination indices are the same
// b[] = a[];
b[index] = a[index];
}

// TODO 11: Implement the transpose kernel
@@ -38,16 +42,19 @@ __global__ void copyKernel(const float* const a, float* const b, const unsigned
__global__ void matrixTransposeNaive(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY)
{
// TODO 11a: Compute the global index for each thread along x and y dimensions.
unsigned i = 0;
unsigned j = 0;
unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned j = blockIdx.y * blockDim.y + threadIdx.y;

// TODO 11b: Check if i or j are out of bounds. If they are, return.
if (i >= sizeX || j >= sizeY) {
return;
}

// TODO 11c: Compute index_in as (i,j) (same as index in copy kernel) and index_out as (j,i)
unsigned index_in = 0; // Compute input index (i,j) from matrix A
unsigned index_out = 0; // Compute output index (j,i) in matrix B = transpose(A)

unsigned index_in = j * sizeX + i; // Compute input index (i,j) from matrix A
unsigned index_out = i * sizeY + j; // Compute output index (j,i) in matrix B = transpose(A)
// TODO 11d: Copy data from A to B using transpose indices
b[index_out] = a[index_in];
}

int main(int argc, char *argv[])
@@ -82,8 +89,12 @@ int main(int argc, char *argv[])
float *d_a, *d_b;

// TODO 2: Allocate memory on the device for d_a and d_b.
unsigned sizeInBytes = sizeX * sizeY * sizeof(float);
CUDA(cudaMalloc((void**)&d_a, sizeInBytes));
CUDA(cudaMalloc((void**)&d_b, sizeInBytes));

// TODO 3: Copy array contents of A from the host (CPU) to the device (GPU)
CUDA(cudaMemcpy(d_a, a, sizeInBytes, cudaMemcpyHostToDevice));

CUDA(cudaDeviceSynchronize());

@@ -97,13 +108,15 @@
// TODO 4: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within
// Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
DIMS dims;
dims.dimBlock = dim3(1, 1, 1);
dims.dimGrid = dim3(1, 1, 1);
const unsigned BS_X = 32, BS_Y = 32;
dims.dimBlock = dim3(BS_X, BS_Y, 1);
dims.dimGrid = dim3(divup(sizeX, BS_X), divup(sizeY, BS_Y), 1);

// LOOK: Launch the copy kernel
copyKernel<<<dims.dimGrid, dims.dimBlock>>>(d_a, d_b, sizeX, sizeY);

// TODO 5: copy the answer back to the host (CPU) from the device (GPU)
CUDA(cudaMemcpy(b, d_b, sizeInBytes, cudaMemcpyDeviceToHost));

// LOOK: Use compareReferenceAndResult to check the result
compareReferenceAndResult(a_gold, b, sizeX * sizeY);
@@ -121,13 +134,16 @@ int main(int argc, char *argv[])
// TODO 8: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within
// Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
DIMS dims;
dims.dimBlock = dim3(1, 1, 1);
dims.dimGrid = dim3(1, 1, 1);
const unsigned BS_X = 32, BS_Y = 32;
dims.dimBlock = dim3(BS_X, BS_Y, 1);
dims.dimGrid = dim3(divup(sizeX, BS_X), divup(sizeY, BS_Y), 1);

// TODO 9: Launch the matrix transpose kernel
// matrixTransposeNaive<<<>>>(......);
matrixTransposeNaive<<<dims.dimGrid, dims.dimBlock>>>(d_a, d_b, sizeX, sizeY);

// TODO 10: copy the answer back to the host (CPU) from the device (GPU)
CUDA(cudaMemcpy(b, d_b, sizeInBytes, cudaMemcpyDeviceToHost));

// LOOK: Use compareReferenceAndResult to check the result
compareReferenceAndResult(b_gold, b, sizeX * sizeY);
@@ -136,6 +152,8 @@ int main(int argc, char *argv[])
////////////////////////////////////////////////////////////

// TODO 7: free device memory using cudaFree
CUDA(cudaFree(d_a));
CUDA(cudaFree(d_b));

// free host memory
delete[] a;
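`matrixTransposeNaive` earns its name: the reads from `a` are coalesced, but the writes to `b` stride by `sizeY`, so consecutive threads scatter across global memory. The usual next step, sketched below purely as an illustration (it is not part of this diff), stages a tile in shared memory so both the global read and the global write are contiguous; the `TILE + 1` padding keeps the transposed shared-memory reads free of bank conflicts.

```cuda
#define TILE 32

// Hypothetical tiled transpose; launches with the same dimBlock/dimGrid
// as matrixTransposeNaive (32x32 blocks, divup(sizeX, 32) x divup(sizeY, 32)).
__global__ void matrixTransposeTiled(const float* const a, float* const b,
                                     const unsigned sizeX, const unsigned sizeY)
{
    __shared__ float tile[TILE][TILE + 1];  // +1 avoids bank conflicts

    unsigned i = blockIdx.x * TILE + threadIdx.x;
    unsigned j = blockIdx.y * TILE + threadIdx.y;
    if (i < sizeX && j < sizeY)
        tile[threadIdx.y][threadIdx.x] = a[j * sizeX + i];  // coalesced read

    __syncthreads();  // whole tile is loaded before any thread writes

    // Swap the block coordinates so the write to b is contiguous.
    i = blockIdx.y * TILE + threadIdx.x;
    j = blockIdx.x * TILE + threadIdx.y;
    if (i < sizeY && j < sizeX)
        b[j * sizeY + i] = tile[threadIdx.x][threadIdx.y];  // coalesced write
}
```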
Binary file added images/Screenshot 2025-08-28 143319.png
Binary file added images/Screenshot 2025-08-28 151313.png
Binary file added images/Screenshot 2025-08-29 161428.png
Binary file added images/Screenshot 2025-08-29 161908.png
Binary file added images/Screenshot 2025-08-29 162107.png
Binary file added images/Screenshot 2025-08-29 162534.png