2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
**/build/
**/.vs/
41 changes: 35 additions & 6 deletions README.md
@@ -3,11 +3,40 @@ Project 0 Getting Started

**University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 0**

* (TODO) YOUR NAME HERE
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
* Jiangman (Lobi) Zhao
* [Lobi Zhao - LinkedIn](https://www.linkedin.com/in/lobizhao/), [Lobi Zhao - personal website](https://lobizhao.github.io/).
* Tested on: Windows 11 Pro, i5-10600KF @ 4.10GHz 32GB, RTX 3080 10GB

### (TODO: Your README)
### README

Include screenshots, analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
## Summary

This is my first time coding with CUDA, so I'm not sure if I have successfully completed all the tasks for Project 0.

Yesterday, I reviewed the class slides, which reinforced my understanding of how the GPU operates and of its specialized terminology. I had been particularly confused about how many threads can run simultaneously, especially the difference between CPU cores and GPU cores, but that is clear to me now.

I also found some good learning material on thread indexing:

[CUDA Thread indexing](https://www.eecs.umich.edu/courses/eecs471/resources/materials/CUDA-Thread-Indexing-Cheatsheet.pdf).
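
To pin the pattern down, here is a minimal indexing sketch I wrote for myself (toy kernels, not part of the project code; they assume the launch exactly covers the output array):

```cuda
// 1D grid of 1D blocks: each thread derives one unique global index.
__global__ void index1D(int* out)
{
    unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
    out[idx] = (int)idx;
}

// 2D grid of 2D blocks: each thread derives a unique (x, y) pair,
// flattened to a 1D offset in row-major order.
__global__ void index2D(int* out, unsigned width)
{
    unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned y = blockIdx.y * blockDim.y + threadIdx.y;
    out[y * width + x] = (int)(y * width + x);
}
```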

## Screenshots
- part 2.12
![part 2.12](images/01_cudaGLCheck_Lobi.png)
- part 2.13
![part 2.13](images/02_AutosWarpInfo.png)
- part 2.14
![part 2.14](images/03_nsightAnalysisSummary.png)
![part 2.14.2](images/03_nsightTimelineView.png)
- part 2.15
![part 2.15](images/04_NsightComputingError.png)
- part 2.2
![part 2.2](images/05_webGLSupport.png)
- part 2.3
![part 2.3](images/05_webGPUSupport.png)

## Analysis
- SAXPY warp info test
![SAXPY warp info test](images/saxpyWarpInfo.png)
2 changes: 1 addition & 1 deletion cuda-gl-check/src/main.cpp
@@ -11,7 +11,7 @@
*/
int main(int argc, char* argv[]) {
// TODO: Change this line to use your name!
m_yourName = "TODO: YOUR NAME HERE";
m_yourName = "Jiangman(Lobi) Zhao";

if (init(argc, argv)) {
mainLoop();
4 changes: 3 additions & 1 deletion cuda-introduction/source/common.cu
@@ -9,7 +9,9 @@ unsigned divup(unsigned size, unsigned div)
{
// TODO: implement a 1 line function to return the divup operation.
// Note: You only need to use addition, subtraction, and division operations.
return 0;
// Ceiling division: rounds size / div up, e.g. divup(10, 4) = 3.
return (size + div - 1) / div;
}

void clearHostAndDeviceArray(float *res, float *dev_res, unsigned size, const int value)
59 changes: 52 additions & 7 deletions cuda-introduction/source/matmul.cu
@@ -12,17 +12,28 @@ __global__ void matrixMultiplicationNaive(float* const matrixP, const float* con
{
// TODO 10a: Compute the P matrix global index for each thread along x and y dimensions.
// Remember that each thread of the kernel computes the result of 1 unique element of P
unsigned px;
unsigned py;
unsigned px = blockIdx.x * blockDim.x + threadIdx.x;
unsigned py = blockIdx.y * blockDim.y + threadIdx.y;

// TODO 10b: Check if px or py are out of bounds. If they are, return.
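// Note: the grid is sized with divup, so the last block along each axis may
// contain threads that fall outside P; they must exit before touching memory.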
if (px >= sizeMX || py >= sizeNY) {
return;
}

// TODO 10c: Compute the dot product for the P element in each thread
// This loop will be the same as the host loop

float dot = 0.0f;
for (unsigned i = 0; i < sizeXY; i++) {
    dot += matrixM[px * sizeXY + i] * matrixN[i * sizeNY + py];
}

// TODO 10d: Copy dot to P matrix
// matrixP[] = dot;
matrixP[px * sizeNY + py] = dot;
}

int main(int argc, char *argv[])
@@ -31,9 +42,9 @@
// Then try large multiple-block square matrix like 64x64 up to 2048x2048.
// Then try square, non-power-of-two like 15x15, 33x33, 67x67, 123x123, and 771x771
// Then try rectangles with powers of two and then non-power-of-two.
const unsigned sizeMX = 0;
const unsigned sizeXY = 0;
const unsigned sizeNY = 0;
const unsigned sizeMX = 16;
const unsigned sizeXY = 16;
const unsigned sizeNY = 16;

// TODO 2: Allocate host 1D arrays for:
// matrixM[sizeMX, sizeXY]
@@ -45,6 +56,11 @@
float* matrixP;
float* matrixPGold;

matrixM = new float[sizeMX * sizeXY];
matrixN = new float[sizeXY * sizeNY];
matrixP = new float[sizeMX * sizeNY];
matrixPGold = new float[sizeMX * sizeNY];

// LOOK: Setup random number generator and fill host arrays and the scalar a.
std::random_device rd;
std::mt19937 mt(rd());
@@ -66,13 +82,30 @@
// dot = m[k, px] * n[py, k]
// matrixPGold[py, px] = dot

for (unsigned py = 0; py < sizeNY; py++) {
    for (unsigned px = 0; px < sizeMX; px++) {
        float dot = 0.0f;
        for (unsigned k = 0; k < sizeXY; k++) {
            dot += matrixM[px * sizeXY + k] * matrixN[k * sizeNY + py];
        }
        matrixPGold[px * sizeNY + py] = dot;
    }
}

// Device arrays
float *d_matrixM, *d_matrixN, *d_matrixP;

// TODO 4: Allocate memory on the device for d_matrixM, d_matrixN, d_matrixP.

CUDA(cudaMalloc((void**)&d_matrixM, sizeMX * sizeXY * sizeof(float)));
CUDA(cudaMalloc((void**)&d_matrixN, sizeXY * sizeNY * sizeof(float)));
CUDA(cudaMalloc((void**)&d_matrixP, sizeMX * sizeNY * sizeof(float)));

// TODO 5: Copy array contents of M and N from the host (CPU) to the device (GPU)

CUDA(cudaMemcpy(d_matrixM, matrixM, sizeMX * sizeXY * sizeof(float), cudaMemcpyHostToDevice));
CUDA(cudaMemcpy(d_matrixN, matrixN, sizeXY * sizeNY * sizeof(float), cudaMemcpyHostToDevice));

CUDA(cudaDeviceSynchronize());

////////////////////////////////////////////////////////////
@@ -85,15 +118,23 @@
// TODO 6: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within
// Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
// HINT: The shape of matrices has no impact on launch configuration

const unsigned BS_X = 16;
const unsigned BS_Y = 16;

DIMS dims;
dims.dimBlock = dim3(1, 1, 1);
dims.dimGrid = dim3(1, 1, 1);
dims.dimBlock = dim3(BS_X, BS_Y, 1);
dims.dimGrid = dim3(divup(sizeMX, BS_X), divup(sizeNY, BS_Y), 1);
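// e.g. a 16x16 P fits in a single 16x16 block, while 771x771 needs
// divup(771, 16) = 49 blocks per axis, since 48 * 16 = 768 < 771.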

// TODO 7: Launch the matrix transpose kernel
// matrixMultiplicationNaive<<<>>>();

matrixMultiplicationNaive<<<dims.dimGrid, dims.dimBlock>>>(d_matrixP, d_matrixM, d_matrixN, sizeMX, sizeNY, sizeXY);

// TODO 8: copy the answer back to the host (CPU) from the device (GPU)

CUDA(cudaMemcpy(matrixP, d_matrixP, sizeMX * sizeNY * sizeof(float), cudaMemcpyDeviceToHost));

// LOOK: Use compareReferenceAndResult to check the result
compareReferenceAndResult(matrixPGold, matrixP, sizeMX * sizeNY, 1e-3);

@@ -102,6 +143,10 @@

// TODO 9: free device memory using cudaFree

CUDA(cudaFree(d_matrixM));
CUDA(cudaFree(d_matrixN));
CUDA(cudaFree(d_matrixP));

// free host memory
delete[] matrixM;
delete[] matrixN;
30 changes: 26 additions & 4 deletions cuda-introduction/source/saxpy.cu
@@ -9,20 +9,25 @@
__global__ void saxpy(float* const z, const float* const x, const float* const y, const float a, const unsigned size)
{
// TODO 9: Compute the global index for each thread.
unsigned idx = 0;
unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;

// TODO 10: Check if idx is out of bounds. If yes, return.
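// (Threads in the final block whose global index is past the end of the
// arrays must return without reading or writing.)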
if (idx >= 0)
if (idx >= size)
return;

// TODO 11: Perform the SAXPY operation: z = a * x + y.

z[idx] = a * x[idx] + y[idx];
}

int main(int argc, char *argv[])
{
// TODO 1: Set the size. Start with something simple like 64.
// TODO Optional: Try out these sizes: 256, 1024, 2048, 14, 103, 1025, 3127
const unsigned size = 0;

const unsigned size = 64;

// Host arrays.
float* x = new float[size];
@@ -54,9 +59,18 @@ int main(int argc, char *argv[])
// TODO 2: Allocate memory on the device. Fill in the blanks for d_x, then do the same commands for d_y and d_z.
// CUDA(cudaMalloc((void **)& pointer, size in bytes)));

CUDA(cudaMalloc((void**)&d_x, size * sizeof(float)));
CUDA(cudaMalloc((void**)&d_y, size * sizeof(float)));
CUDA(cudaMalloc((void**)&d_z, size * sizeof(float)));


// TODO 3: Copy array contents of X and Y from the host (CPU) to the device (GPU). Follow what you did for 2,
// CUDA(cudaMemcpy(dest ptr, source ptr, size in bytes, direction enum));

CUDA(cudaMemcpy(d_x, x, size * sizeof(float), cudaMemcpyHostToDevice));
CUDA(cudaMemcpy(d_y, y, size * sizeof(float), cudaMemcpyHostToDevice));


CUDA(cudaDeviceSynchronize());

////////////////////////////////////////////////////////////
@@ -69,16 +83,20 @@
// TODO 4: Setup threads and blocks.
// Start threadsPerBlock as 128, then try out different configurations: 32, 64, 256, 512, 1024
// Use divup to get the number of blocks to launch.
const unsigned threadsPerBlock = 0;

const unsigned threadsPerBlock = 128;

// TODO 5: Implement the divup function in common.cpp
const unsigned blocks = divup(size, threadsPerBlock);
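// e.g. divup(64, 128) = 1 block; divup(3127, 128) = 25 blocks,
// since 24 * 128 = 3072 < 3127 <= 25 * 128.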

// TODO 6: Launch the GPU kernel with blocks and threadsPerBlock as the launch configuration
// saxpy<<< >>> (....);
saxpy<<<blocks, threadsPerBlock>>>(d_z, d_x, d_y, a, size);

// TODO 7: Copy the answer back to the host (CPU) from the device (GPU).
// Copy what you did in 3, except for d_z -> z.
CUDA(cudaMemcpy(z, d_z, size * sizeof(float), cudaMemcpyDeviceToHost));

// LOOK: Use postprocess to check the result
compareReferenceAndResult(z_gold, z, size, 1e-6);
@@ -87,6 +105,10 @@

// TODO 8: free device memory using cudaFree
// CUDA(cudaFree(device pointer));

CUDA(cudaFree(d_x));
CUDA(cudaFree(d_y));
CUDA(cudaFree(d_z));

// free host memory
delete[] x;