diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c4f5023 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/cuda-introduction/build diff --git a/.vs/CMakeWorkspaceSettings.json b/.vs/CMakeWorkspaceSettings.json new file mode 100644 index 0000000..d3e1057 --- /dev/null +++ b/.vs/CMakeWorkspaceSettings.json @@ -0,0 +1,3 @@ +{ + "enableCMake": false +} \ No newline at end of file diff --git a/.vs/Project0-Getting-Started/v17/.wsuo b/.vs/Project0-Getting-Started/v17/.wsuo new file mode 100644 index 0000000..44b501b Binary files /dev/null and b/.vs/Project0-Getting-Started/v17/.wsuo differ diff --git a/.vs/Project0-Getting-Started/v17/Browse.VC.db b/.vs/Project0-Getting-Started/v17/Browse.VC.db new file mode 100644 index 0000000..a9363b9 Binary files /dev/null and b/.vs/Project0-Getting-Started/v17/Browse.VC.db differ diff --git a/.vs/ProjectSettings.json b/.vs/ProjectSettings.json new file mode 100644 index 0000000..f8b4888 --- /dev/null +++ b/.vs/ProjectSettings.json @@ -0,0 +1,3 @@ +{ + "CurrentProjectSetting": null +} \ No newline at end of file diff --git a/.vs/VSWorkspaceState.json b/.vs/VSWorkspaceState.json new file mode 100644 index 0000000..6b61141 --- /dev/null +++ b/.vs/VSWorkspaceState.json @@ -0,0 +1,6 @@ +{ + "ExpandedNodes": [ + "" + ], + "PreviewInSolutionExplorer": false +} \ No newline at end of file diff --git a/.vs/slnx.sqlite b/.vs/slnx.sqlite new file mode 100644 index 0000000..fd1ee83 Binary files /dev/null and b/.vs/slnx.sqlite differ diff --git a/README.md b/README.md index d2fa33d..aa65b1a 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,35 @@ Project 0 Getting Started **University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 0** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. 
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Griffin Evans + * gpevans@seas.upenn.edu, [personal website](evanses.com/griffin) +* Tested on lab computer: Windows 11, i9-12900F @ 2.40GHz 22GB, NVIDIA GeForce RTX 3090 (Levine 057 #3) -### (TODO: Your README) +### My README Include screenshots, analysis, etc. (Remember, this is public, so don't put anything here that you don't want to share with the world.) + +2.1.2 + +![](images/Screenshot%202025-08-28%20143319.png) + +2.1.3 + +![](images/Screenshot%202025-08-28%20151313.png) + +2.1.4 + +![](images/Screenshot%202025-08-29%20161428.png) + +2.1.5 — had error as described in https://edstem.org/us/courses/81464/discussion/6880884 + +![](images/Screenshot%202025-08-29%20161908.png) + +2.2 + +![](images/Screenshot%202025-08-29%20162107.png) + +2.3 + +![](images/Screenshot%202025-08-29%20162534.png) diff --git a/cuda-gl-check/CMakeSettings.json b/cuda-gl-check/CMakeSettings.json new file mode 100644 index 0000000..9204f06 --- /dev/null +++ b/cuda-gl-check/CMakeSettings.json @@ -0,0 +1,15 @@ +{ + "configurations": [ + { + "name": "x64-Debug", + "generator": "Ninja", + "configurationType": "Debug", + "inheritEnvironments": [ "msvc_x64_x64" ], + "buildRoot": "${projectDir}\\out\\build\\${name}", + "installRoot": "${projectDir}\\out\\install\\${name}", + "cmakeCommandArgs": "", + "buildCommandArgs": "", + "ctestCommandArgs": "" + } + ] +} \ No newline at end of file diff --git a/cuda-gl-check/src/main.cpp b/cuda-gl-check/src/main.cpp index 886fd4c..5b82920 100644 --- a/cuda-gl-check/src/main.cpp +++ b/cuda-gl-check/src/main.cpp @@ -11,7 +11,7 @@ */ int main(int argc, char* argv[]) { // TODO: Change this line to use your name! 
- m_yourName = "TODO: YOUR NAME HERE"; + m_yourName = "Griffin Evans"; if (init(argc, argv)) { mainLoop(); diff --git a/cuda-introduction/source/common.cu b/cuda-introduction/source/common.cu index dce8793..3446ce1 100644 --- a/cuda-introduction/source/common.cu +++ b/cuda-introduction/source/common.cu @@ -9,7 +9,7 @@ unsigned divup(unsigned size, unsigned div) { // TODO: implement a 1 line function to return the divup operation. // Note: You only need to use addition, subtraction, and division operations. - return 0; + return (size - 1) / div + 1; } void clearHostAndDeviceArray(float *res, float *dev_res, unsigned size, const int value) diff --git a/cuda-introduction/source/matmul.cu b/cuda-introduction/source/matmul.cu index 826e535..1983712 100644 --- a/cuda-introduction/source/matmul.cu +++ b/cuda-introduction/source/matmul.cu @@ -12,17 +12,24 @@ __global__ void matrixMultiplicationNaive(float* const matrixP, const float* con { // TODO 10a: Compute the P matrix global index for each thread along x and y dimentions. // Remember that each thread of the kernel computes the result of 1 unique element of P - unsigned px; - unsigned py; + unsigned px = blockIdx.x * blockDim.x + threadIdx.x; + unsigned py = blockIdx.y * blockDim.y + threadIdx.y; // TODO 10b: Check if px or py are out of bounds. If they are, return. + if (px >= sizeMX || py >= sizeNY) { + return; + } // TODO 10c: Compute the dot product for the P element in each thread // This loop will be the same as the host loop float dot = 0.0; + for (unsigned k = 0; k < sizeXY; ++k) { + dot += matrixM[k * sizeMX + px] * matrixN[py * sizeXY + k]; + } // TODO 10d: Copy dot to P matrix // matrixP[] = dot; + matrixP[py * sizeMX + px] = dot; } int main(int argc, char *argv[]) @@ -31,19 +38,19 @@ int main(int argc, char *argv[]) // Then try large multiple-block square matrix like 64x64 up to 2048x2048. 
// Then try square, non-power-of-two like 15x15, 33x33, 67x67, 123x123, and 771x771 // Then try rectangles with powers of two and then non-power-of-two. - const unsigned sizeMX = 0; - const unsigned sizeXY = 0; - const unsigned sizeNY = 0; + const unsigned sizeMX = 245; + const unsigned sizeXY = 2049; + const unsigned sizeNY = 771; // TODO 2: Allocate host 1D arrays for: // matrixM[sizeMX, sizeXY] // matrixN[sizeXY, sizeNY] // matrixP[sizeMX, sizeNY] // matrixPGold[sizeMX, sizeNY] - float* matrixM; - float* matrixN; - float* matrixP; - float* matrixPGold; + float* matrixM = new float[sizeMX * sizeXY]; + float* matrixN = new float[sizeXY * sizeNY]; + float* matrixP = new float[sizeMX * sizeNY]; + float* matrixPGold = new float[sizeMX * sizeNY]; // LOOK: Setup random number generator and fill host arrays and the scalar a. std::random_device rd; @@ -65,13 +72,30 @@ int main(int argc, char *argv[]) // for k -> 0 to sizeXY // dot = m[k, px] * n[py, k] // matrixPGold[py, px] = dot + for (unsigned py = 0; py < sizeNY; ++py) { + for (unsigned px = 0; px < sizeMX; ++px) { + float dot = 0.f; + for (unsigned k = 0; k < sizeXY; ++k) { + dot += matrixM[k * sizeMX + px] * matrixN[py * sizeXY + k]; + } + matrixPGold[py * sizeMX + px] = dot; + // TODO check this direction right + } + } // Device arrays float *d_matrixM, *d_matrixN, *d_matrixP; // TODO 4: Allocate memory on the device for d_matrixM, d_matrixN, d_matrixP. 
+ CUDA(cudaMalloc((void**)&d_matrixM, sizeMX * sizeXY * sizeof(float))); + CUDA(cudaMalloc((void**)&d_matrixN, sizeXY * sizeNY * sizeof(float))); + CUDA(cudaMalloc((void**)&d_matrixP, sizeMX * sizeNY * sizeof(float))); // TODO 5: Copy array contents of M and N from the host (CPU) to the device (GPU) + CUDA(cudaMemcpy(d_matrixM, matrixM, sizeMX * sizeXY * sizeof(float), cudaMemcpyHostToDevice)); + CUDA(cudaMemcpy(d_matrixN, matrixN, sizeXY * sizeNY * sizeof(float), cudaMemcpyHostToDevice)); + + CUDA(cudaDeviceSynchronize()); @@ -86,13 +110,16 @@ int main(int argc, char *argv[]) // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup // HINT: The shape of matrices has no impact on launch configuaration DIMS dims; - dims.dimBlock = dim3(1, 1, 1); - dims.dimGrid = dim3(1, 1, 1); + const unsigned BS_X = 32, BS_Y = 32; + dims.dimBlock = dim3(BS_X, BS_Y, 1); + dims.dimGrid = dim3(divup(sizeMX, BS_X), divup(sizeNY, BS_Y), 1); // TODO 7: Launch the matrix transpose kernel // matrixMultiplicationNaive<<<>>>(); + matrixMultiplicationNaive<<<dims.dimGrid, dims.dimBlock>>>(d_matrixP, d_matrixM, d_matrixN, sizeMX, sizeNY, sizeXY); // TODO 8: copy the answer back to the host (CPU) from the device (GPU) + CUDA(cudaMemcpy(matrixP, d_matrixP, sizeMX * sizeNY * sizeof(float), cudaMemcpyDeviceToHost)); // LOOK: Use compareReferenceAndResult to check the result compareReferenceAndResult(matrixPGold, matrixP, sizeMX * sizeNY, 1e-3); @@ -101,6 +128,9 @@ int main(int argc, char *argv[]) //////////////////////////////////////////////////////////// // TODO 9: free device memory using cudaFree + CUDA(cudaFree(d_matrixM)); + CUDA(cudaFree(d_matrixN)); + CUDA(cudaFree(d_matrixP)); // free host memory delete[] matrixM; diff --git a/cuda-introduction/source/saxpy.cu b/cuda-introduction/source/saxpy.cu index 5ed591f..e548407 100644 --- a/cuda-introduction/source/saxpy.cu +++ b/cuda-introduction/source/saxpy.cu @@ -9,20 +9,21 @@ __global__ void saxpy(float* const z, const float* const x, const float* const 
y, const float a, const unsigned size) { // TODO 9: Compute the global index for each thread. - unsigned idx = 0; + unsigned idx = blockIdx.x * blockDim.x + threadIdx.x; // TODO 10: Check if idx is out of bounds. If yes, return. - if (idx >= 0) + if (idx >= size) return; // TODO 11: Perform the SAXPY operation: z = a * x + y. + z[idx] = a * x[idx] + y[idx]; } int main(int argc, char *argv[]) { // TODO 1: Set the size. Start with something simple like 64. // TODO Optional: Try out these sizes: 256, 1024, 2048, 14, 103, 1025, 3127 - const unsigned size = 0; + const unsigned size = 64; // Host arrays. float* x = new float[size]; @@ -53,9 +54,15 @@ int main(int argc, char *argv[]) // TODO 2: Allocate memory on the device. Fill in the blanks for d_x, then do the same commands for d_y and d_z. // CUDA(cudaMalloc((void **)& pointer, size in bytes))); + unsigned sizeInBytes = size * sizeof(float); + CUDA(cudaMalloc((void**)&d_x, sizeInBytes)); + CUDA(cudaMalloc((void**)&d_y, sizeInBytes)); + CUDA(cudaMalloc((void**)&d_z, sizeInBytes)); // TODO 3: Copy array contents of X and Y from the host (CPU) to the device (GPU). Follow what you did for 2, // CUDA(cudaMemcpy(dest ptr, source ptr, size in bytes, direction enum)); + CUDA(cudaMemcpy(d_x, x, sizeInBytes, cudaMemcpyHostToDevice)); + CUDA(cudaMemcpy(d_y, y, sizeInBytes, cudaMemcpyHostToDevice)); CUDA(cudaDeviceSynchronize()); @@ -69,16 +76,18 @@ int main(int argc, char *argv[]) // TODO 4: Setup threads and blocks. // Start threadPerBlock as 128, then try out differnt configurations: 32, 64, 256, 512, 1024 // Use divup to get the number of blocks to launch. 
- const unsigned threadsPerBlock = 0; + const unsigned threadsPerBlock = 128; // TODO 5: Implement the divup function in common.cpp const unsigned blocks = divup(size, threadsPerBlock); // TODO 6: Launch the GPU kernel with blocks and threadPerBlock as launch configuration // saxpy<<< >>> (....); + saxpy<<<blocks, threadsPerBlock>>>(d_z, d_x, d_y, a, size); // TODO 7: Copy the answer back to the host (CPU) from the device (GPU). // Copy what you did in 3, except for d_z -> z. + CUDA(cudaMemcpy(z, d_z, sizeInBytes, cudaMemcpyDeviceToHost)); // LOOK: Use postprocess to check the result compareReferenceAndResult(z_gold, z, size, 1e-6); @@ -87,6 +96,9 @@ int main(int argc, char *argv[]) // TODO 8: free device memory using cudaFree // CUDA(cudaFree(device pointer)); + CUDA(cudaFree(d_x)); + CUDA(cudaFree(d_y)); + CUDA(cudaFree(d_z)); // free host memory delete[] x; diff --git a/cuda-introduction/source/transpose.cu b/cuda-introduction/source/transpose.cu index 89f6f8f..0f4b599 100644 --- a/cuda-introduction/source/transpose.cu +++ b/cuda-introduction/source/transpose.cu @@ -19,16 +19,20 @@ __global__ void copyKernel(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY) { // TODO 6a: Compute the global index for each thread along x and y dimentions. - unsigned i = 0; - unsigned j = 0;; + unsigned i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned j = blockIdx.y * blockDim.y + threadIdx.y; // TODO 6b: Check if i or j are out of bounds. If they are, return. + if (i >= sizeX || j >= sizeY) { + return; + } // TODO 6c: Compute global 1D index from i and j - unsigned index = 0; + unsigned index = j * sizeX + i; // TODO 6d: Copy data from A to B. 
Note that in copy kernel source and destination indices are the same // b[] = a[]; + b[index] = a[index]; } // TODO 11: Implement the transpose kernel @@ -38,16 +42,19 @@ __global__ void copyKernel(const float* const a, float* const b, const unsigned __global__ void matrixTransposeNaive(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY) { // TODO 11a: Compute the global index for each thread along x and y dimentions. - unsigned i = 0; - unsigned j = 0; + unsigned i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned j = blockIdx.y * blockDim.y + threadIdx.y; // TODO 11b: Check if i or j are out of bounds. If they are, return. + if (i >= sizeX || j >= sizeY) { + return; + } // TODO 11c: Compute index_in as (i,j) (same as index in copy kernel) and index_out as (j,i) - unsigned index_in = 0; // Compute input index (i,j) from matrix A - unsigned index_out = 0; // Compute output index (j,i) in matrix B = transpose(A) - + unsigned index_in = j * sizeX + i; // Compute input index (i,j) from matrix A + unsigned index_out = i * sizeY + j; // Compute output index (j,i) in matrix B = transpose(A) // TODO 11d: Copy data from A to B using transpose indices + b[index_out] = a[index_in]; } int main(int argc, char *argv[]) @@ -82,8 +89,12 @@ int main(int argc, char *argv[]) float *d_a, *d_b; // TODO 2: Allocate memory on the device for d_a and d_b. 
+ unsigned sizeInBytes = sizeX * sizeY * sizeof(float); + CUDA(cudaMalloc((void**)&d_a, sizeInBytes)); + CUDA(cudaMalloc((void**)&d_b, sizeInBytes)); // TODO 3: Copy array contents of A from the host (CPU) to the device (GPU) + CUDA(cudaMemcpy(d_a, a, sizeInBytes, cudaMemcpyHostToDevice)); CUDA(cudaDeviceSynchronize()); @@ -97,13 +108,15 @@ int main(int argc, char *argv[]) // TODO 4: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup DIMS dims; - dims.dimBlock = dim3(1, 1, 1); - dims.dimGrid = dim3(1, 1, 1); + const unsigned BS_X = 32, BS_Y = 32; + dims.dimBlock = dim3(BS_X, BS_Y, 1); + dims.dimGrid = dim3(divup(sizeX,BS_X), divup(sizeY,BS_Y), 1); // LOOK: Launch the copy kernel copyKernel<<<dims.dimGrid, dims.dimBlock>>>(d_a, d_b, sizeX, sizeY); // TODO 5: copy the answer back to the host (CPU) from the device (GPU) + CUDA(cudaMemcpy(b, d_b, sizeInBytes, cudaMemcpyDeviceToHost)); // LOOK: Use compareReferenceAndResult to check the result compareReferenceAndResult(a_gold, b, sizeX * sizeY); @@ -121,13 +134,16 @@ int main(int argc, char *argv[]) // TODO 8: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup DIMS dims; - dims.dimBlock = dim3(1, 1, 1); - dims.dimGrid = dim3(1, 1, 1); + const unsigned BS_X = 32, BS_Y = 32; + dims.dimBlock = dim3(BS_X, BS_Y, 1); + dims.dimGrid = dim3(divup(sizeX, BS_X), divup(sizeY, BS_Y), 1); // TODO 9: Launch the matrix transpose kernel // matrixTransposeNaive<<<>>>(......); + matrixTransposeNaive<<<dims.dimGrid, dims.dimBlock>>>(d_a, d_b, sizeX, sizeY); // TODO 10: copy the answer back to the host (CPU) from the device (GPU) + CUDA(cudaMemcpy(b, d_b, sizeInBytes, cudaMemcpyDeviceToHost)); // LOOK: Use compareReferenceAndResult to check the result compareReferenceAndResult(b_gold, b, sizeX * sizeY); @@ -136,6 +152,8 @@ int main(int argc, char *argv[]) //////////////////////////////////////////////////////////// // 
TODO 7: free device memory using cudaFree + CUDA(cudaFree(d_a)); + CUDA(cudaFree(d_b)); // free host memory delete[] a; diff --git a/images/Screenshot 2025-08-28 143319.png b/images/Screenshot 2025-08-28 143319.png new file mode 100644 index 0000000..f149e58 Binary files /dev/null and b/images/Screenshot 2025-08-28 143319.png differ diff --git a/images/Screenshot 2025-08-28 151313.png b/images/Screenshot 2025-08-28 151313.png new file mode 100644 index 0000000..2cdb928 Binary files /dev/null and b/images/Screenshot 2025-08-28 151313.png differ diff --git a/images/Screenshot 2025-08-29 161428.png b/images/Screenshot 2025-08-29 161428.png new file mode 100644 index 0000000..05b8260 Binary files /dev/null and b/images/Screenshot 2025-08-29 161428.png differ diff --git a/images/Screenshot 2025-08-29 161908.png b/images/Screenshot 2025-08-29 161908.png new file mode 100644 index 0000000..0fafffc Binary files /dev/null and b/images/Screenshot 2025-08-29 161908.png differ diff --git a/images/Screenshot 2025-08-29 162107.png b/images/Screenshot 2025-08-29 162107.png new file mode 100644 index 0000000..9e3f4f2 Binary files /dev/null and b/images/Screenshot 2025-08-29 162107.png differ diff --git a/images/Screenshot 2025-08-29 162534.png b/images/Screenshot 2025-08-29 162534.png new file mode 100644 index 0000000..ca6ef06 Binary files /dev/null and b/images/Screenshot 2025-08-29 162534.png differ