@@ -31,7 +31,6 @@ using namespace nvcuda;
3131// Support A and B matrices in row-major layout in order to compare with the kernels using CUDA Cores in
3232// hgemm.cu and hgemm_async.cu.
3333
34-
// div_ceil: integer ceiling division helper.
// Returns a / b + 1 when a is not an exact multiple of b, otherwise a / b.
// For non-negative a and positive b this equals ceil(a / b). Because integer
// division truncates toward zero, mixed-sign operands are not rounded up
// correctly. b must be nonzero (a % b and a / b are undefined for b == 0).
3534HOST_DEVICE_INLINE
3635int div_ceil(int a, int b) { return (a % b != 0 ) ? (a / b + 1 ) : (a / b); }
3736
@@ -41,7 +40,7 @@ int div_ceil(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); }
4140// 共享内存,调用kernel时 需要指定动态共享内存大小,且smem的寻址
4241// 方式需要按照一维数组来使用 2. 提高L2 Cache的局部性(Thread
4342// Block Swizzle): https://zhuanlan.zhihu.com/p/555339335
44- // 3. nedd __launch_bounds__ to avoid error 'too many resources required for launch'
43+ // 3. __launch_bounds__: avoid error 'too many resources required for launch'
4544// reference: https://blog.csdn.net/feng__shuai/article/details/124395023
4645template <const int WMMA_M=16 , const int WMMA_N=16 , const int WMMA_K=16 ,
4746 const int WMMA_TILE_M=4 , const int WMMA_TILE_N=2 ,
@@ -257,7 +256,7 @@ hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_kernel(
257256// 共享内存,调用kernel时 需要指定动态共享内存大小,且smem的寻址
258257// 方式需要按照一维数组来使用 2. 提高L2 Cache的局部性(Thread
259258// Block Swizzle): https://zhuanlan.zhihu.com/p/555339335
260- // 3. nedd __launch_bounds__ to avoid error 'too many resources required for launch'
259+ // 3. __launch_bounds__: avoid error 'too many resources required for launch'
261260// reference: https://blog.csdn.net/feng__shuai/article/details/124395023
262261template <const int WMMA_M=16 , const int WMMA_N=16 , const int WMMA_K=16 ,
263262 const int WMMA_TILE_M=4 , const int WMMA_TILE_N=2 ,
@@ -476,7 +475,7 @@ hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem_kernel(
476475}
477476
478477// stage with 256x256 block, dynamic smem
479- // nedd __launch_bounds__ to avoid error 'too many resources required for launch'
478+ // __launch_bounds__: avoid error 'too many resources required for launch'
480479// reference: https://blog.csdn.net/feng__shuai/article/details/124395023
481480template <const int WMMA_M=16 , const int WMMA_N=16 , const int WMMA_K=16 ,
482481 const int WMMA_TILE_M=4 , const int WMMA_TILE_N=4 ,
0 commit comments