2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
**/build/
**/.vs/
41 changes: 35 additions & 6 deletions README.md
@@ -3,11 +3,40 @@ Project 0 Getting Started

**University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 0**

* (TODO) YOUR NAME HERE
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
* Jiangman (Lobi) Zhao
* [Lobi Zhao - LinkedIn](https://www.linkedin.com/in/lobizhao/), [Lobi Zhao - personal website](https://lobizhao.github.io/).
* Tested on: Windows 11 Pro, i5-10600KF @ 4.10GHz 32GB, RTX 3080 10GB

### (TODO: Your README)
### README

Include screenshots, analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
## Summary

This is my first time coding with CUDA, so I'm not sure if I have successfully completed all the tasks for Project 0.

Yesterday, I reviewed the class slides, which reinforced my understanding of how the GPU operates and of its specialized terminology. I had been particularly confused about how many threads can run simultaneously, especially the difference between CPU cores and GPU cores, but that is clear to me now.

I also found some good learning material on thread indexing:

[CUDA Thread indexing](https://www.eecs.umich.edu/courses/eecs471/resources/materials/CUDA-Thread-Indexing-Cheatsheet.pdf).
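
To pin the pattern down, here is a minimal indexing sketch I wrote for myself (toy kernels, not part of the project code; they assume the launch exactly covers the output array):

```cuda
// 1D grid of 1D blocks: each thread derives one unique global index.
__global__ void index1D(int* out)
{
    unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
    out[idx] = (int)idx;
}

// 2D grid of 2D blocks: each thread derives a unique (x, y) pair,
// flattened to a 1D offset in row-major order.
__global__ void index2D(int* out, unsigned width)
{
    unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned y = blockIdx.y * blockDim.y + threadIdx.y;
    out[y * width + x] = (int)(y * width + x);
}
```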

## Screenshots
- part 2.12
![part 2.12](images/01_cudaGLCheck_Lobi.png)
- part 2.13
![part 2.13](images/02_AutosWarpInfo.png)
- part 2.14
![part 2.14](images/03_nsightAnalysisSummary.png)
![part 2.14.2](images/03_nsightTimelineView.png)
- part 2.15
![part 2.15](images/04_NsightComputingError.png)
- part 2.2
![part 2.2](images/05_webGLSupport.png)
- part 2.3
![part 2.3](images/05_webGPUSupport.png)

## Analysis
- SAXPY warp info test
![SAXPY warp info test](images/saxpyWarpInfo.png)
2 changes: 1 addition & 1 deletion cuda-gl-check/src/main.cpp
@@ -11,7 +11,7 @@
*/
int main(int argc, char* argv[]) {
// TODO: Change this line to use your name!
m_yourName = "TODO: YOUR NAME HERE";
m_yourName = "Jiangman(Lobi) Zhao";

if (init(argc, argv)) {
mainLoop();
4 changes: 3 additions & 1 deletion cuda-introduction/source/common.cu
@@ -9,7 +9,9 @@ unsigned divup(unsigned size, unsigned div)
{
// TODO: implement a 1 line function to return the divup operation.
// Note: You only need to use addition, subtraction, and division operations.
return 0;
// Ceiling division: rounds size / div up, e.g. divup(10, 4) = 3.
return (size + div - 1) / div;
}

void clearHostAndDeviceArray(float *res, float *dev_res, unsigned size, const int value)
59 changes: 52 additions & 7 deletions cuda-introduction/source/matmul.cu
@@ -12,17 +12,28 @@ __global__ void matrixMultiplicationNaive(float* const matrixP, const float* con
{
// TODO 10a: Compute the P matrix global index for each thread along x and y dimensions.
// Remember that each thread of the kernel computes the result of 1 unique element of P
unsigned px;
unsigned py;
unsigned px = blockIdx.x * blockDim.x + threadIdx.x;
unsigned py = blockIdx.y * blockDim.y + threadIdx.y;

// TODO 10b: Check if px or py are out of bounds. If they are, return.
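// Note: the grid is sized with divup, so the last block along each axis may
// contain threads that fall outside P; they must exit before touching memory.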
if (px >= sizeMX || py >= sizeNY) {
return;
}

// TODO 10c: Compute the dot product for the P element in each thread
// This loop will be the same as the host loop

float dot = 0.0f;
for (unsigned i = 0; i < sizeXY; i++) {
    dot += matrixM[px * sizeXY + i] * matrixN[i * sizeNY + py];
}

// TODO 10d: Copy dot to P matrix
// matrixP[] = dot;
matrixP[px * sizeNY + py] = dot;
}

int main(int argc, char *argv[])
@@ -31,9 +42,9 @@
// Then try large multiple-block square matrix like 64x64 up to 2048x2048.
// Then try square, non-power-of-two like 15x15, 33x33, 67x67, 123x123, and 771x771
// Then try rectangles with powers of two and then non-power-of-two.
const unsigned sizeMX = 0;
const unsigned sizeXY = 0;
const unsigned sizeNY = 0;
const unsigned sizeMX = 16;
const unsigned sizeXY = 16;
const unsigned sizeNY = 16;

// TODO 2: Allocate host 1D arrays for:
// matrixM[sizeMX, sizeXY]
@@ -45,6 +56,11 @@
float* matrixP;
float* matrixPGold;

matrixM = new float[sizeMX * sizeXY];
matrixN = new float[sizeXY * sizeNY];
matrixP = new float[sizeMX * sizeNY];
matrixPGold = new float[sizeMX * sizeNY];

// LOOK: Setup random number generator and fill host arrays and the scalar a.
std::random_device rd;
std::mt19937 mt(rd());
@@ -66,13 +82,30 @@
// dot = m[k, px] * n[py, k]
// matrixPGold[py, px] = dot

for (unsigned py = 0; py < sizeNY; py++) {
    for (unsigned px = 0; px < sizeMX; px++) {
        float dot = 0.0f;
        for (unsigned k = 0; k < sizeXY; k++) {
            dot += matrixM[px * sizeXY + k] * matrixN[k * sizeNY + py];
        }
        matrixPGold[px * sizeNY + py] = dot;
    }
}

// Device arrays
float *d_matrixM, *d_matrixN, *d_matrixP;

// TODO 4: Allocate memory on the device for d_matrixM, d_matrixN, d_matrixP.

CUDA(cudaMalloc((void**)&d_matrixM, sizeMX * sizeXY * sizeof(float)));
CUDA(cudaMalloc((void**)&d_matrixN, sizeXY * sizeNY * sizeof(float)));
CUDA(cudaMalloc((void**)&d_matrixP, sizeMX * sizeNY * sizeof(float)));

// TODO 5: Copy array contents of M and N from the host (CPU) to the device (GPU)

CUDA(cudaMemcpy(d_matrixM, matrixM, sizeMX * sizeXY * sizeof(float), cudaMemcpyHostToDevice));
CUDA(cudaMemcpy(d_matrixN, matrixN, sizeXY * sizeNY * sizeof(float), cudaMemcpyHostToDevice));

CUDA(cudaDeviceSynchronize());

////////////////////////////////////////////////////////////
@@ -85,15 +118,23 @@
// TODO 6: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within
// Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
// HINT: The shape of matrices has no impact on launch configuration

const unsigned BS_X = 16;
const unsigned BS_Y = 16;

DIMS dims;
dims.dimBlock = dim3(1, 1, 1);
dims.dimGrid = dim3(1, 1, 1);
dims.dimBlock = dim3(BS_X, BS_Y, 1);
dims.dimGrid = dim3(divup(sizeMX, BS_X), divup(sizeNY, BS_Y), 1);
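// e.g. a 16x16 P fits in a single 16x16 block, while 771x771 needs
// divup(771, 16) = 49 blocks per axis, since 48 * 16 = 768 < 771.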

// TODO 7: Launch the matrix transpose kernel
// matrixMultiplicationNaive<<<>>>();

matrixMultiplicationNaive<<<dims.dimGrid, dims.dimBlock>>>(d_matrixP, d_matrixM, d_matrixN, sizeMX, sizeNY, sizeXY);

// TODO 8: copy the answer back to the host (CPU) from the device (GPU)

CUDA(cudaMemcpy(matrixP, d_matrixP, sizeMX * sizeNY * sizeof(float), cudaMemcpyDeviceToHost));

// LOOK: Use compareReferenceAndResult to check the result
compareReferenceAndResult(matrixPGold, matrixP, sizeMX * sizeNY, 1e-3);

@@ -102,6 +143,10 @@

// TODO 9: free device memory using cudaFree

CUDA(cudaFree(d_matrixM));
CUDA(cudaFree(d_matrixN));
CUDA(cudaFree(d_matrixP));

// free host memory
delete[] matrixM;
delete[] matrixN;
30 changes: 26 additions & 4 deletions cuda-introduction/source/saxpy.cu
@@ -9,20 +9,25 @@
__global__ void saxpy(float* const z, const float* const x, const float* const y, const float a, const unsigned size)
{
// TODO 9: Compute the global index for each thread.
unsigned idx = 0;
unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;

// TODO 10: Check if idx is out of bounds. If yes, return.
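// (Threads in the final block whose global index is past the end of the
// arrays must return without reading or writing.)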
if (idx >= 0)
if (idx >= size)
return;

// TODO 11: Perform the SAXPY operation: z = a * x + y.

z[idx] = a * x[idx] + y[idx];
}

int main(int argc, char *argv[])
{
// TODO 1: Set the size. Start with something simple like 64.
// TODO Optional: Try out these sizes: 256, 1024, 2048, 14, 103, 1025, 3127
const unsigned size = 0;

const unsigned size = 64;

// Host arrays.
float* x = new float[size];
@@ -54,9 +59,18 @@ int main(int argc, char *argv[])
// TODO 2: Allocate memory on the device. Fill in the blanks for d_x, then do the same commands for d_y and d_z.
// CUDA(cudaMalloc((void **)& pointer, size in bytes)));

CUDA(cudaMalloc((void**)&d_x, size * sizeof(float)));
CUDA(cudaMalloc((void**)&d_y, size * sizeof(float)));
CUDA(cudaMalloc((void**)&d_z, size * sizeof(float)));


// TODO 3: Copy array contents of X and Y from the host (CPU) to the device (GPU). Follow what you did for 2,
// CUDA(cudaMemcpy(dest ptr, source ptr, size in bytes, direction enum));

CUDA(cudaMemcpy(d_x, x, size * sizeof(float), cudaMemcpyHostToDevice));
CUDA(cudaMemcpy(d_y, y, size * sizeof(float), cudaMemcpyHostToDevice));


CUDA(cudaDeviceSynchronize());

////////////////////////////////////////////////////////////
@@ -69,16 +83,20 @@
// TODO 4: Setup threads and blocks.
// Start threadsPerBlock as 128, then try out different configurations: 32, 64, 256, 512, 1024
// Use divup to get the number of blocks to launch.
const unsigned threadsPerBlock = 0;

const unsigned threadsPerBlock = 128;

// TODO 5: Implement the divup function in common.cpp
const unsigned blocks = divup(size, threadsPerBlock);
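// e.g. divup(64, 128) = 1 block; divup(3127, 128) = 25 blocks,
// since 24 * 128 = 3072 < 3127 <= 25 * 128.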

// TODO 6: Launch the GPU kernel with blocks and threadsPerBlock as the launch configuration
// saxpy<<< >>> (....);
saxpy<<<blocks, threadsPerBlock>>>(d_z, d_x, d_y, a, size);

// TODO 7: Copy the answer back to the host (CPU) from the device (GPU).
// Copy what you did in 3, except for d_z -> z.
CUDA(cudaMemcpy(z, d_z, size * sizeof(float), cudaMemcpyDeviceToHost));

// LOOK: Use postprocess to check the result
compareReferenceAndResult(z_gold, z, size, 1e-6);
@@ -87,6 +105,10 @@

// TODO 8: free device memory using cudaFree
// CUDA(cudaFree(device pointer));

CUDA(cudaFree(d_x));
CUDA(cudaFree(d_y));
CUDA(cudaFree(d_z));

// free host memory
delete[] x;