Skip to content

Commit 987a23f

Browse files
authored
Fix correlation on A100 CUTLASS GEMM Kernels (#127)
* 1. Add a queue to promote fairness in CTA scheduling 2. Add a Perfect xbar interconnect option * Remove merge queue * Should not change this * Rename pending cta to allocated_ctas * Run formatter
1 parent 21b18be commit 987a23f

File tree

8 files changed

+101
-88
lines changed

8 files changed

+101
-88
lines changed

.github/workflows/format_check.yml

Lines changed: 0 additions & 32 deletions
This file was deleted.

.github/workflows/main.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ on:
77
push:
88
branches-ignore:
99
- "gh-readonly-queue**"
10-
merge_group:
10+
pull_request:
1111
# Allows you to run this workflow manually from the Actions tab
1212
workflow_dispatch:
1313

src/abstract_hardware_model.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -782,6 +782,7 @@ kernel_info_t::kernel_info_t(dim3 gridDim, dim3 blockDim,
782782
num_blocks() * entry->gpgpu_ctx->device_runtime->g_TB_launch_latency;
783783

784784
cache_config_set = false;
785+
allocated_ctas = 0;
785786
}
786787

787788
/*A snapshot of the texture mappings needs to be stored in the kernel's info as
@@ -815,6 +816,7 @@ kernel_info_t::kernel_info_t(
815816
cache_config_set = false;
816817
m_NameToCudaArray = nameToCudaArray;
817818
m_NameToTextureInfo = nameToTextureInfo;
819+
allocated_ctas = 0;
818820
}
819821

820822
kernel_info_t::~kernel_info_t() {

src/abstract_hardware_model.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -369,6 +369,7 @@ class kernel_info_t {
369369

370370
// Jin: kernel timing
371371
public:
372+
unsigned allocated_ctas;
372373
unsigned long long launch_cycle;
373374
unsigned long long start_cycle;
374375
unsigned long long end_cycle;

src/gpgpu-sim/local_interconnect.cc

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,10 +116,24 @@ void xbar_router::Advance() {
116116
RR_Advance();
117117
else if (arbit_type == iSLIP)
118118
iSLIP_Advance();
119+
else if (arbit_type == PERFECT)
120+
Perfect_Advance();
119121
else
120122
assert(0);
121123
}
122124

125+
void xbar_router::Perfect_Advance() {
126+
for (unsigned node_id = 0; node_id < total_nodes; node_id++) {
127+
if (!in_buffers[node_id].empty()) {
128+
Packet _packet = in_buffers[node_id].front();
129+
if (Has_Buffer_Out(_packet.output_deviceID, 1)) {
130+
out_buffers[_packet.output_deviceID].push(_packet);
131+
in_buffers[node_id].pop();
132+
}
133+
}
134+
}
135+
};
136+
123137
void xbar_router::RR_Advance() {
124138
bool active = false;
125139
vector<bool> issued(total_nodes, false);
@@ -433,4 +447,4 @@ unsigned LocalInterconnect::GetFlitSize() const { return LOCAL_INCT_FLIT_SIZE; }
433447

434448
void LocalInterconnect::DisplayState(FILE* fp) const {
435449
fprintf(fp, "GPGPU-Sim uArch: ICNT:Display State: Under implementation\n");
436-
}
450+
}

src/gpgpu-sim/local_interconnect.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,11 @@ using namespace std;
3737

3838
enum Interconnect_type { REQ_NET = 0, REPLY_NET = 1 };
3939

40-
enum Arbiteration_type { NAIVE_RR = 0, iSLIP = 1 };
40+
enum Arbiteration_type {
41+
NAIVE_RR = 0, //
42+
iSLIP,
43+
PERFECT
44+
};
4145

4246
struct inct_config {
4347
// config for local interconnect
@@ -80,6 +84,7 @@ class xbar_router {
8084
private:
8185
void iSLIP_Advance();
8286
void RR_Advance();
87+
void Perfect_Advance();
8388

8489
struct Packet {
8590
Packet(void* m_data, unsigned m_output_deviceID) {

src/gpgpu-sim/shader.cc

Lines changed: 75 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -2282,32 +2282,40 @@ bool ldst_unit::memory_cycle(warp_inst_t &inst,
22822282
// bypass L1 cache
22832283
unsigned control_size =
22842284
inst.is_store() ? WRITE_PACKET_SIZE : READ_PACKET_SIZE;
2285-
unsigned size = access.get_size() + control_size;
2286-
// printf("Interconnect:Addr: %x, size=%d\n",access.get_addr(),size);
2287-
if (m_memory_config->SST_mode &&
2288-
(static_cast<sst_memory_interface *>(m_icnt)->full(
2289-
size, inst.is_store() || inst.isatomic(), access.get_type()))) {
2290-
// SST need mf type here
2291-
// Cast it to sst_memory_interface pointer first as this full() method
2292-
// is not a virtual method in parent class
2293-
stall_cond = ICNT_RC_FAIL;
2294-
} else if (!m_memory_config->SST_mode &&
2295-
(m_icnt->full(size, inst.is_store() || inst.isatomic()))) {
2296-
stall_cond = ICNT_RC_FAIL;
2297-
} else {
2298-
mem_fetch *mf =
2299-
m_mf_allocator->alloc(inst, access,
2300-
m_core->get_gpu()->gpu_sim_cycle +
2301-
m_core->get_gpu()->gpu_tot_sim_cycle);
2302-
m_icnt->push(mf);
2303-
inst.accessq_pop_back();
2304-
// inst.clear_active( access.get_warp_mask() );
2305-
if (inst.is_load()) {
2306-
for (unsigned r = 0; r < MAX_OUTPUT_VALUES; r++)
2307-
if (inst.out[r] > 0)
2308-
assert(m_pending_writes[inst.warp_id()][inst.out[r]] > 0);
2309-
} else if (inst.is_store())
2310-
m_core->inc_store_req(inst.warp_id());
2285+
for (unsigned i = 0; i < m_config->m_L1D_config.l1_banks; i++) {
2286+
if (inst.accessq_empty()) {
2287+
break;
2288+
}
2289+
const mem_access_t &access = inst.accessq_back();
2290+
unsigned size = access.get_size() + control_size;
2291+
// printf("Interconnect:Addr: %x, size=%d\n",access.get_addr(),size);
2292+
if (m_memory_config->SST_mode &&
2293+
(static_cast<sst_memory_interface *>(m_icnt)->full(
2294+
size, inst.is_store() || inst.isatomic(), access.get_type()))) {
2295+
// SST need mf type here
2296+
// Cast it to sst_memory_interface pointer first as this full() method
2297+
// is not a virtual method in parent class
2298+
stall_cond = ICNT_RC_FAIL;
2299+
break;
2300+
} else if (!m_memory_config->SST_mode &&
2301+
(m_icnt->full(size, inst.is_store() || inst.isatomic()))) {
2302+
stall_cond = ICNT_RC_FAIL;
2303+
break;
2304+
} else {
2305+
mem_fetch *mf =
2306+
m_mf_allocator->alloc(inst, access,
2307+
m_core->get_gpu()->gpu_sim_cycle +
2308+
m_core->get_gpu()->gpu_tot_sim_cycle);
2309+
m_icnt->push(mf);
2310+
inst.accessq_pop_back();
2311+
// inst.clear_active( access.get_warp_mask() );
2312+
if (inst.is_load()) {
2313+
for (unsigned r = 0; r < MAX_OUTPUT_VALUES; r++)
2314+
if (inst.out[r] > 0)
2315+
assert(m_pending_writes[inst.warp_id()][inst.out[r]] > 0);
2316+
} else if (inst.is_store())
2317+
m_core->inc_store_req(inst.warp_id());
2318+
}
23112319
}
23122320
} else {
23132321
assert(CACHE_UNDEFINED != inst.cache_op);
@@ -4534,41 +4542,55 @@ unsigned simt_core_cluster::get_n_active_sms() const {
45344542
}
45354543

45364544
unsigned simt_core_cluster::issue_block2core() {
4545+
const unsigned max_pending_ctas = 4;
4546+
for (unsigned core = 0; core < m_config->n_simt_cores_per_cluster; core++) {
4547+
if (m_core[core]->pending_ctas.size() < max_pending_ctas) {
4548+
kernel_info_t *kernel;
4549+
// Jin: fetch kernel according to concurrent kernel setting
4550+
if (m_config->gpgpu_concurrent_kernel_sm) { // concurrent kernel on sm
4551+
// always select latest issued kernel
4552+
kernel_info_t *k = m_gpu->select_kernel();
4553+
kernel = k;
4554+
} else {
4555+
// first select core kernel, if no more cta, get a new kernel
4556+
// only when core completes
4557+
kernel = m_core[core]->get_kernel();
4558+
if (!m_gpu->kernel_more_cta_left(kernel)) {
4559+
// wait till current kernel finishes
4560+
if (m_core[core]->get_not_completed() == 0 &&
4561+
m_core[core]->pending_ctas.empty()) {
4562+
kernel_info_t *k = m_gpu->select_kernel();
4563+
if (k) m_core[core]->set_kernel(k);
4564+
kernel = k;
4565+
}
4566+
}
4567+
}
4568+
if (kernel) {
4569+
if (kernel->allocated_ctas < kernel->num_blocks()) {
4570+
m_core[core]->pending_ctas.push_back(kernel);
4571+
kernel->allocated_ctas++;
4572+
}
4573+
}
4574+
}
4575+
}
4576+
45374577
unsigned num_blocks_issued = 0;
45384578
for (unsigned i = 0; i < m_config->n_simt_cores_per_cluster; i++) {
45394579
unsigned core =
45404580
(i + m_cta_issue_next_core + 1) % m_config->n_simt_cores_per_cluster;
45414581

4542-
kernel_info_t *kernel;
4543-
// Jin: fetch kernel according to concurrent kernel setting
4544-
if (m_config->gpgpu_concurrent_kernel_sm) { // concurrent kernel on sm
4545-
// always select latest issued kernel
4546-
kernel_info_t *k = m_gpu->select_kernel();
4547-
kernel = k;
4548-
} else {
4549-
// first select core kernel, if no more cta, get a new kernel
4550-
// only when core completes
4551-
kernel = m_core[core]->get_kernel();
4552-
if (!m_gpu->kernel_more_cta_left(kernel)) {
4553-
// wait till current kernel finishes
4554-
if (m_core[core]->get_not_completed() == 0) {
4555-
kernel_info_t *k = m_gpu->select_kernel();
4556-
if (k) m_core[core]->set_kernel(k);
4557-
kernel = k;
4558-
}
4582+
if (m_core[core]->pending_ctas.size() > 0) {
4583+
kernel_info_t *pending_cta = m_core[core]->pending_ctas.front();
4584+
if (m_core[core]->can_issue_1block(*pending_cta)) {
4585+
m_core[core]->issue_block2core(*pending_cta);
4586+
m_core[core]->pending_ctas.pop_front();
4587+
num_blocks_issued++;
4588+
m_cta_issue_next_core = core;
4589+
break;
45594590
}
45604591
}
4561-
4562-
if (m_gpu->kernel_more_cta_left(kernel) &&
4563-
// (m_core[core]->get_n_active_cta() <
4564-
// m_config->max_cta(*kernel)) ) {
4565-
m_core[core]->can_issue_1block(*kernel)) {
4566-
m_core[core]->issue_block2core(*kernel);
4567-
num_blocks_issued++;
4568-
m_cta_issue_next_core = core;
4569-
break;
4570-
}
45714592
}
4593+
45724594
return num_blocks_issued;
45734595
}
45744596

src/gpgpu-sim/shader.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2564,6 +2564,7 @@ class shader_core_ctx : public core_t {
25642564

25652565
// Jin: concurrent kernels on a sm
25662566
public:
2567+
std::deque<kernel_info_t *> pending_ctas;
25672568
bool can_issue_1block(kernel_info_t &kernel);
25682569
bool occupy_shader_resource_1block(kernel_info_t &kernel, bool occupy);
25692570
void release_shader_resource_1block(unsigned hw_ctaid, kernel_info_t &kernel);

0 commit comments

Comments
 (0)