1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
/cuda-introduction/build
3 changes: 3 additions & 0 deletions .vs/CMakeWorkspaceSettings.json
@@ -0,0 +1,3 @@
{
"enableCMake": false
}
Binary file added .vs/Project0-Getting-Started/v17/.wsuo
Binary file added .vs/Project0-Getting-Started/v17/Browse.VC.db
3 changes: 3 additions & 0 deletions .vs/ProjectSettings.json
@@ -0,0 +1,3 @@
{
"CurrentProjectSetting": null
}
6 changes: 6 additions & 0 deletions .vs/VSWorkspaceState.json
@@ -0,0 +1,6 @@
{
"ExpandedNodes": [
""
],
"PreviewInSolutionExplorer": false
}
Binary file added .vs/slnx.sqlite
32 changes: 28 additions & 4 deletions README.md
@@ -3,11 +3,35 @@ Project 0 Getting Started

**University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 0**

* (TODO) YOUR NAME HERE
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
* Griffin Evans
* [email protected], [personal website](evanses.com/griffin)
* Tested on lab computer: Windows 11, i9-12900F @ 2.40GHz 22GB, NVIDIA GeForce RTX 3090 (Levine 057 #3)

### (TODO: Your README)
### My README

Include screenshots, analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)

2.1.2

![](images/Screenshot%202025-08-28%20143319.png)

2.1.3

![](images/Screenshot%202025-08-28%20151313.png)

2.1.4

![](images/Screenshot%202025-08-29%20161428.png)

2.1.5 — had error as described in https://edstem.org/us/courses/81464/discussion/6880884

![](images/Screenshot%202025-08-29%20161908.png)

2.2

![](images/Screenshot%202025-08-29%20162107.png)

2.3

![](images/Screenshot%202025-08-29%20162534.png)
15 changes: 15 additions & 0 deletions cuda-gl-check/CMakeSettings.json
@@ -0,0 +1,15 @@
{
"configurations": [
{
"name": "x64-Debug",
"generator": "Ninja",
"configurationType": "Debug",
"inheritEnvironments": [ "msvc_x64_x64" ],
"buildRoot": "${projectDir}\\out\\build\\${name}",
"installRoot": "${projectDir}\\out\\install\\${name}",
"cmakeCommandArgs": "",
"buildCommandArgs": "",
"ctestCommandArgs": ""
}
]
}
2 changes: 1 addition & 1 deletion cuda-gl-check/src/main.cpp
@@ -11,7 +11,7 @@
*/
int main(int argc, char* argv[]) {
// TODO: Change this line to use your name!
m_yourName = "TODO: YOUR NAME HERE";
m_yourName = "Griffin Evans";

if (init(argc, argv)) {
mainLoop();
2 changes: 1 addition & 1 deletion cuda-introduction/source/common.cu
@@ -9,7 +9,7 @@ unsigned divup(unsigned size, unsigned div)
{
// TODO: implement a 1 line function to return the divup operation.
// Note: You only need to use addition, subtraction, and division operations.
return 0;
return (size - 1) / div + 1;
}

void clearHostAndDeviceArray(float *res, float *dev_res, unsigned size, const int value)
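A quick sanity check of the `divup` above (a hypothetical snippet, not part of the repo): it computes the ceiling of `size / div` using only integer operations, and it assumes `size > 0`, since `size == 0` would underflow the unsigned subtraction.

```cpp
#include <cassert>

// divup as implemented in common.cu; assumes size > 0.
unsigned divup(unsigned size, unsigned div) { return (size - 1) / div + 1; }

int main() {
    assert(divup(64, 128) == 1);   // a partial block still needs one launch
    assert(divup(128, 128) == 1);  // exact multiple: no extra block
    assert(divup(129, 128) == 2);  // one spilled element adds a whole block
    assert(divup(771, 32) == 25);  // 24 * 32 = 768 < 771, so 25 blocks
    return 0;
}
```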
52 changes: 41 additions & 11 deletions cuda-introduction/source/matmul.cu
@@ -12,17 +12,24 @@ __global__ void matrixMultiplicationNaive(float* const matrixP, const float* con
{
// TODO 10a: Compute the P matrix global index for each thread along x and y dimensions.
// Remember that each thread of the kernel computes the result of 1 unique element of P
unsigned px;
unsigned py;
unsigned px = blockIdx.x * blockDim.x + threadIdx.x;
unsigned py = blockIdx.y * blockDim.y + threadIdx.y;

// TODO 10b: Check if px or py are out of bounds. If they are, return.
if (px >= sizeMX || py >= sizeNY) {
return;
}

// TODO 10c: Compute the dot product for the P element in each thread
// This loop will be the same as the host loop
float dot = 0.0;
for (unsigned k = 0; k < sizeXY; ++k) {
dot += matrixM[k * sizeMX + px] * matrixN[py * sizeXY + k];
}

// TODO 10d: Copy dot to P matrix
// matrixP[] = dot;
matrixP[py * sizeMX + px] = dot;
}

int main(int argc, char *argv[])
@@ -31,19 +38,19 @@ int main(int argc, char *argv[])
// Then try large multiple-block square matrix like 64x64 up to 2048x2048.
// Then try square, non-power-of-two like 15x15, 33x33, 67x67, 123x123, and 771x771
// Then try rectangles with powers of two and then non-power-of-two.
const unsigned sizeMX = 0;
const unsigned sizeXY = 0;
const unsigned sizeNY = 0;
const unsigned sizeMX = 245;
const unsigned sizeXY = 2049;
const unsigned sizeNY = 771;

// TODO 2: Allocate host 1D arrays for:
// matrixM[sizeMX, sizeXY]
// matrixN[sizeXY, sizeNY]
// matrixP[sizeMX, sizeNY]
// matrixPGold[sizeMX, sizeNY]
float* matrixM;
float* matrixN;
float* matrixP;
float* matrixPGold;
float* matrixM = new float[sizeMX * sizeXY];
float* matrixN = new float[sizeXY * sizeNY];
float* matrixP = new float[sizeMX * sizeNY];
float* matrixPGold = new float[sizeMX * sizeNY];

// LOOK: Setup random number generator and fill host arrays and the scalar a.
std::random_device rd;
@@ -65,13 +72,30 @@
// for k -> 0 to sizeXY
// dot = m[k, px] * n[py, k]
// matrixPGold[py, px] = dot
for (unsigned py = 0; py < sizeNY; ++py) {
for (unsigned px = 0; px < sizeMX; ++px) {
float dot = 0.f;
for (unsigned k = 0; k < sizeXY; ++k) {
dot += matrixM[k * sizeMX + px] * matrixN[py * sizeXY + k];
}
matrixPGold[py * sizeMX + px] = dot;
}
}

// Device arrays
float *d_matrixM, *d_matrixN, *d_matrixP;

// TODO 4: Allocate memory on the device for d_matrixM, d_matrixN, d_matrixP.
CUDA(cudaMalloc((void**)&d_matrixM, sizeMX * sizeXY * sizeof(float)));
CUDA(cudaMalloc((void**)&d_matrixN, sizeXY * sizeNY * sizeof(float)));
CUDA(cudaMalloc((void**)&d_matrixP, sizeMX * sizeNY * sizeof(float)));

// TODO 5: Copy array contents of M and N from the host (CPU) to the device (GPU)
CUDA(cudaMemcpy(d_matrixM, matrixM, sizeMX * sizeXY * sizeof(float), cudaMemcpyHostToDevice));
CUDA(cudaMemcpy(d_matrixN, matrixN, sizeXY * sizeNY * sizeof(float), cudaMemcpyHostToDevice));



CUDA(cudaDeviceSynchronize());

@@ -86,13 +110,16 @@ int main(int argc, char *argv[])
// Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
// HINT: The shape of matrices has no impact on launch configuration
DIMS dims;
dims.dimBlock = dim3(1, 1, 1);
dims.dimGrid = dim3(1, 1, 1);
const unsigned BS_X = 32, BS_Y = 32;
dims.dimBlock = dim3(BS_X, BS_Y, 1);
dims.dimGrid = dim3(divup(sizeMX, BS_X), divup(sizeNY, BS_Y), 1);

// TODO 7: Launch the matrix multiplication kernel
// matrixMultiplicationNaive<<<>>>();
matrixMultiplicationNaive<<<dims.dimGrid, dims.dimBlock>>>(d_matrixP, d_matrixM, d_matrixN, sizeMX, sizeNY, sizeXY);

// TODO 8: copy the answer back to the host (CPU) from the device (GPU)
CUDA(cudaMemcpy(matrixP, d_matrixP, sizeMX * sizeNY * sizeof(float), cudaMemcpyDeviceToHost));

// LOOK: Use compareReferenceAndResult to check the result
compareReferenceAndResult(matrixPGold, matrixP, sizeMX * sizeNY, 1e-3);
@@ -101,6 +128,9 @@ int main(int argc, char *argv[])
////////////////////////////////////////////////////////////

// TODO 9: free device memory using cudaFree
CUDA(cudaFree(d_matrixM));
CUDA(cudaFree(d_matrixN));
CUDA(cudaFree(d_matrixP));

// free host memory
delete[] matrixM;
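The linearized indices in this file are easy to get backwards, so here is a tiny worked example of the same convention (illustration only, not part of the diff). Both `matrixM` and `matrixP` use `sizeMX` as the row width and `matrixN` uses `sizeXY`, which makes the row-major product `P = N * M` with `P` sized `sizeNY x sizeMX`.

```cpp
#include <cassert>

// Hypothetical 2x2 check of the indexing convention in matmul.cu:
//   m[k, px]  = matrixM[k * sizeMX + px]
//   n[py, k]  = matrixN[py * sizeXY + k]
//   p[py, px] = matrixP[py * sizeMX + px]
int main() {
    const unsigned sizeMX = 2, sizeXY = 2, sizeNY = 2;
    const float M[] = {1, 2,   // k = 0 row: px = 0, 1
                       3, 4};  // k = 1 row
    const float N[] = {5, 6,   // py = 0 row: k = 0, 1
                       7, 8};  // py = 1 row
    float P[4];
    for (unsigned py = 0; py < sizeNY; ++py)
        for (unsigned px = 0; px < sizeMX; ++px) {
            float dot = 0.f;
            for (unsigned k = 0; k < sizeXY; ++k)
                dot += M[k * sizeMX + px] * N[py * sizeXY + k];
            P[py * sizeMX + px] = dot;
        }
    // Same convention as the kernel and the gold loop above,
    // e.g. P[0] = M[0][0]*N[0][0] + M[1][0]*N[0][1] = 1*5 + 3*6 = 23.
    assert(P[0] == 23 && P[1] == 34 && P[2] == 31 && P[3] == 46);
    return 0;
}
```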
20 changes: 16 additions & 4 deletions cuda-introduction/source/saxpy.cu
@@ -9,20 +9,21 @@
__global__ void saxpy(float* const z, const float* const x, const float* const y, const float a, const unsigned size)
{
// TODO 9: Compute the global index for each thread.
unsigned idx = 0;
unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;

// TODO 10: Check if idx is out of bounds. If yes, return.
if (idx >= 0)
if (idx >= size)
return;

// TODO 11: Perform the SAXPY operation: z = a * x + y.
z[idx] = a * x[idx] + y[idx];
}

int main(int argc, char *argv[])
{
// TODO 1: Set the size. Start with something simple like 64.
// TODO Optional: Try out these sizes: 256, 1024, 2048, 14, 103, 1025, 3127
const unsigned size = 0;
const unsigned size = 64;

// Host arrays.
float* x = new float[size];
@@ -53,9 +54,15 @@ int main(int argc, char *argv[])

// TODO 2: Allocate memory on the device. Fill in the blanks for d_x, then do the same commands for d_y and d_z.
// CUDA(cudaMalloc((void **)& pointer, size in bytes)));
unsigned sizeInBytes = size * sizeof(float);
CUDA(cudaMalloc((void**)&d_x, sizeInBytes));
CUDA(cudaMalloc((void**)&d_y, sizeInBytes));
CUDA(cudaMalloc((void**)&d_z, sizeInBytes));

// TODO 3: Copy array contents of X and Y from the host (CPU) to the device (GPU). Follow what you did for 2,
// CUDA(cudaMemcpy(dest ptr, source ptr, size in bytes, direction enum));
CUDA(cudaMemcpy(d_x, x, sizeInBytes, cudaMemcpyHostToDevice));
CUDA(cudaMemcpy(d_y, y, sizeInBytes, cudaMemcpyHostToDevice));

CUDA(cudaDeviceSynchronize());

@@ -69,16 +76,18 @@ int main(int argc, char *argv[])
// TODO 4: Setup threads and blocks.
// Start threadsPerBlock as 128, then try out different configurations: 32, 64, 256, 512, 1024
// Use divup to get the number of blocks to launch.
const unsigned threadsPerBlock = 0;
const unsigned threadsPerBlock = 128;

// TODO 5: Implement the divup function in common.cpp
const unsigned blocks = divup(size, threadsPerBlock);

// TODO 6: Launch the GPU kernel with blocks and threadsPerBlock as launch configuration
// saxpy<<< >>> (....);
saxpy<<<blocks, threadsPerBlock>>>(d_z, d_x, d_y, a, size);

// TODO 7: Copy the answer back to the host (CPU) from the device (GPU).
// Copy what you did in 3, except for d_z -> z.
CUDA(cudaMemcpy(z, d_z, sizeInBytes, cudaMemcpyDeviceToHost));

// LOOK: Use postprocess to check the result
compareReferenceAndResult(z_gold, z, size, 1e-6);
@@ -87,6 +96,9 @@ int main(int argc, char *argv[])

// TODO 8: free device memory using cudaFree
// CUDA(cudaFree(device pointer));
CUDA(cudaFree(d_x));
CUDA(cudaFree(d_y));
CUDA(cudaFree(d_z));

// free host memory
delete[] x;
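One thing neither kernel launch in this diff checks is the launch itself: launches fail asynchronously, so a bad configuration stays silent until the next CUDA API call. A common pattern is to poll `cudaGetLastError()` right after the launch (a sketch; it assumes the `CUDA()` macro from `common.cu` validates a returned `cudaError_t`, which is how it is used elsewhere in this diff):

```cpp
saxpy<<<blocks, threadsPerBlock>>>(d_z, d_x, d_y, a, size);
CUDA(cudaGetLastError());       // catches an invalid launch configuration
CUDA(cudaDeviceSynchronize());  // surfaces errors raised inside the kernel
```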
42 changes: 30 additions & 12 deletions cuda-introduction/source/transpose.cu
@@ -19,16 +19,20 @@
__global__ void copyKernel(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY)
{
// TODO 6a: Compute the global index for each thread along x and y dimensions.
unsigned i = 0;
unsigned j = 0;;
unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned j = blockIdx.y * blockDim.y + threadIdx.y;

// TODO 6b: Check if i or j are out of bounds. If they are, return.
if (i >= sizeX || j >= sizeY) {
return;
}

// TODO 6c: Compute global 1D index from i and j
unsigned index = 0;
unsigned index = j * sizeX + i;

// TODO 6d: Copy data from A to B. Note that in copy kernel source and destination indices are the same
// b[] = a[];
b[index] = a[index];
}

// TODO 11: Implement the transpose kernel
@@ -38,16 +42,19 @@ __global__ void copyKernel(const float* const a, float* const b, const unsigned
__global__ void matrixTransposeNaive(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY)
{
// TODO 11a: Compute the global index for each thread along x and y dimensions.
unsigned i = 0;
unsigned j = 0;
unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned j = blockIdx.y * blockDim.y + threadIdx.y;

// TODO 11b: Check if i or j are out of bounds. If they are, return.
if (i >= sizeX || j >= sizeY) {
return;
}

// TODO 11c: Compute index_in as (i,j) (same as index in copy kernel) and index_out as (j,i)
unsigned index_in = 0; // Compute input index (i,j) from matrix A
unsigned index_out = 0; // Compute output index (j,i) in matrix B = transpose(A)

unsigned index_in = j * sizeX + i; // Compute input index (i,j) from matrix A
unsigned index_out = i * sizeY + j; // Compute output index (j,i) in matrix B = transpose(A)
// TODO 11d: Copy data from A to B using transpose indices
b[index_out] = a[index_in];
}

int main(int argc, char *argv[])
@@ -82,8 +89,12 @@ int main(int argc, char *argv[])
float *d_a, *d_b;

// TODO 2: Allocate memory on the device for d_a and d_b.
unsigned sizeInBytes = sizeX * sizeY * sizeof(float);
CUDA(cudaMalloc((void**)&d_a, sizeInBytes));
CUDA(cudaMalloc((void**)&d_b, sizeInBytes));

// TODO 3: Copy array contents of A from the host (CPU) to the device (GPU)
CUDA(cudaMemcpy(d_a, a, sizeInBytes, cudaMemcpyHostToDevice));

CUDA(cudaDeviceSynchronize());

@@ -97,13 +108,15 @@
// TODO 4: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within
// Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
DIMS dims;
dims.dimBlock = dim3(1, 1, 1);
dims.dimGrid = dim3(1, 1, 1);
const unsigned BS_X = 32, BS_Y = 32;
dims.dimBlock = dim3(BS_X, BS_Y, 1);
dims.dimGrid = dim3(divup(sizeX, BS_X), divup(sizeY, BS_Y), 1);

// LOOK: Launch the copy kernel
copyKernel<<<dims.dimGrid, dims.dimBlock>>>(d_a, d_b, sizeX, sizeY);

// TODO 5: copy the answer back to the host (CPU) from the device (GPU)
CUDA(cudaMemcpy(b, d_b, sizeInBytes, cudaMemcpyDeviceToHost));

// LOOK: Use compareReferenceAndResult to check the result
compareReferenceAndResult(a_gold, b, sizeX * sizeY);
@@ -121,13 +134,16 @@ int main(int argc, char *argv[])
// TODO 8: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within
// Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
DIMS dims;
dims.dimBlock = dim3(1, 1, 1);
dims.dimGrid = dim3(1, 1, 1);
const unsigned BS_X = 32, BS_Y = 32;
dims.dimBlock = dim3(BS_X, BS_Y, 1);
dims.dimGrid = dim3(divup(sizeX, BS_X), divup(sizeY, BS_Y), 1);

// TODO 9: Launch the matrix transpose kernel
// matrixTransposeNaive<<<>>>(......);
matrixTransposeNaive<<<dims.dimGrid, dims.dimBlock>>>(d_a, d_b, sizeX, sizeY);

// TODO 10: copy the answer back to the host (CPU) from the device (GPU)
CUDA(cudaMemcpy(b, d_b, sizeInBytes, cudaMemcpyDeviceToHost));

// LOOK: Use compareReferenceAndResult to check the result
compareReferenceAndResult(b_gold, b, sizeX * sizeY);
@@ -136,6 +152,8 @@ int main(int argc, char *argv[])
////////////////////////////////////////////////////////////

// TODO 7: free device memory using cudaFree
CUDA(cudaFree(d_a));
CUDA(cudaFree(d_b));

// free host memory
delete[] a;
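`matrixTransposeNaive` earns its name: the reads from `a` are coalesced, but the writes to `b` stride by `sizeY`, so consecutive threads scatter across global memory. The usual next step, sketched below purely as an illustration (it is not part of this diff), stages a tile in shared memory so both the global read and the global write are contiguous; the `TILE + 1` padding keeps the transposed shared-memory reads free of bank conflicts.

```cuda
#define TILE 32

// Hypothetical tiled transpose; launches with the same dimBlock/dimGrid
// as matrixTransposeNaive (32x32 blocks, divup(sizeX, 32) x divup(sizeY, 32)).
__global__ void matrixTransposeTiled(const float* const a, float* const b,
                                     const unsigned sizeX, const unsigned sizeY)
{
    __shared__ float tile[TILE][TILE + 1];  // +1 avoids bank conflicts

    unsigned i = blockIdx.x * TILE + threadIdx.x;
    unsigned j = blockIdx.y * TILE + threadIdx.y;
    if (i < sizeX && j < sizeY)
        tile[threadIdx.y][threadIdx.x] = a[j * sizeX + i];  // coalesced read

    __syncthreads();  // whole tile is loaded before any thread writes

    // Swap the block coordinates so the write to b is contiguous.
    i = blockIdx.y * TILE + threadIdx.x;
    j = blockIdx.x * TILE + threadIdx.y;
    if (i < sizeY && j < sizeX)
        b[j * sizeY + i] = tile[threadIdx.x][threadIdx.y];  // coalesced write
}
```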
Binary file added images/Screenshot 2025-08-28 143319.png
Binary file added images/Screenshot 2025-08-28 151313.png
Binary file added images/Screenshot 2025-08-29 161428.png
Binary file added images/Screenshot 2025-08-29 161908.png
Binary file added images/Screenshot 2025-08-29 162107.png
Binary file added images/Screenshot 2025-08-29 162534.png