Fix correlation on A100 CUTLASS GEMM Kernels (#127)

JRPan · web-flow · commit 987a23f84f87 · 2025-09-03T13:01:54.000-07:00
* 1. Add a queue to promote fair CTA scheduling
2. Add a Perfect xbar interconnect option

* remove merge queue

* Should not change this

* rename pending cta to allocated_ctas

* run formatter
diff --git a/.github/workflows/format_check.yml b/.github/workflows/format_check.yml
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -7,7 +7,7 @@ on:
   push:
     branches-ignore:
       - "gh-readonly-queue**"
-  merge_group:
+  pull_request:
   # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
 
diff --git a/src/abstract_hardware_model.cc b/src/abstract_hardware_model.cc
@@ -782,6 +782,7 @@ kernel_info_t::kernel_info_t(dim3 gridDim, dim3 blockDim,
       num_blocks() * entry->gpgpu_ctx->device_runtime->g_TB_launch_latency;
 
   cache_config_set = false;
+  allocated_ctas = 0;
 }
 
 /*A snapshot of the texture mappings needs to be stored in the kernel's info as
@@ -815,6 +816,7 @@ kernel_info_t::kernel_info_t(
   cache_config_set = false;
   m_NameToCudaArray = nameToCudaArray;
   m_NameToTextureInfo = nameToTextureInfo;
+  allocated_ctas = 0;
 }
 
 kernel_info_t::~kernel_info_t() {
diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h
@@ -369,6 +369,7 @@ class kernel_info_t {
 
   // Jin: kernel timing
  public:
+  unsigned allocated_ctas;
   unsigned long long launch_cycle;
   unsigned long long start_cycle;
   unsigned long long end_cycle;
diff --git a/src/gpgpu-sim/local_interconnect.cc b/src/gpgpu-sim/local_interconnect.cc
@@ -116,10 +116,24 @@ void xbar_router::Advance() {
     RR_Advance();
   else if (arbit_type == iSLIP)
     iSLIP_Advance();
+  else if (arbit_type == PERFECT)
+    Perfect_Advance();
   else
     assert(0);
 }
 
+// PERFECT arbitration: every input forwards its head packet, space permitting.
+void xbar_router::Perfect_Advance() {
+  for (unsigned node_id = 0; node_id < total_nodes; node_id++) {
+    if (in_buffers[node_id].empty()) continue;  // nothing queued at this input
+    const Packet head = in_buffers[node_id].front();  // head-of-line only
+    if (Has_Buffer_Out(head.output_deviceID, 1)) {  // presumably a space check
+      out_buffers[head.output_deviceID].push(head);
+      in_buffers[node_id].pop();
+    }
+  }
+}
+
 void xbar_router::RR_Advance() {
   bool active = false;
   vector<bool> issued(total_nodes, false);
@@ -433,4 +447,4 @@ unsigned LocalInterconnect::GetFlitSize() const { return LOCAL_INCT_FLIT_SIZE; }
 
 void LocalInterconnect::DisplayState(FILE* fp) const {
   fprintf(fp, "GPGPU-Sim uArch: ICNT:Display State: Under implementation\n");
-}
+}
diff --git a/src/gpgpu-sim/local_interconnect.h b/src/gpgpu-sim/local_interconnect.h
@@ -37,7 +37,11 @@ using namespace std;
 
 enum Interconnect_type { REQ_NET = 0, REPLY_NET = 1 };
 
-enum Arbiteration_type { NAIVE_RR = 0, iSLIP = 1 };
+enum Arbiteration_type {
+  NAIVE_RR = 0,  // presumably served by RR_Advance() - TODO confirm
+  iSLIP,         // 1: xbar_router::Advance() calls iSLIP_Advance()
+  PERFECT        // 2: xbar_router::Advance() calls Perfect_Advance()
+};
 
 struct inct_config {
   // config for local interconnect
@@ -80,6 +84,7 @@ class xbar_router {
  private:
   void iSLIP_Advance();
   void RR_Advance();
+  void Perfect_Advance();
 
   struct Packet {
     Packet(void* m_data, unsigned m_output_deviceID) {
diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc
@@ -2282,32 +2282,40 @@ bool ldst_unit::memory_cycle(warp_inst_t &inst,
     // bypass L1 cache
     unsigned control_size =
         inst.is_store() ? WRITE_PACKET_SIZE : READ_PACKET_SIZE;
-    unsigned size = access.get_size() + control_size;
-    // printf("Interconnect:Addr: %x, size=%d\n",access.get_addr(),size);
-    if (m_memory_config->SST_mode &&
-        (static_cast<sst_memory_interface *>(m_icnt)->full(
-            size, inst.is_store() || inst.isatomic(), access.get_type()))) {
-      // SST need mf type here
-      // Cast it to sst_memory_interface pointer first as this full() method
-      // is not a virtual method in parent class
-      stall_cond = ICNT_RC_FAIL;
-    } else if (!m_memory_config->SST_mode &&
-               (m_icnt->full(size, inst.is_store() || inst.isatomic()))) {
-      stall_cond = ICNT_RC_FAIL;
-    } else {
-      mem_fetch *mf =
-          m_mf_allocator->alloc(inst, access,
-                                m_core->get_gpu()->gpu_sim_cycle +
-                                    m_core->get_gpu()->gpu_tot_sim_cycle);
-      m_icnt->push(mf);
-      inst.accessq_pop_back();
-      // inst.clear_active( access.get_warp_mask() );
-      if (inst.is_load()) {
-        for (unsigned r = 0; r < MAX_OUTPUT_VALUES; r++)
-          if (inst.out[r] > 0)
-            assert(m_pending_writes[inst.warp_id()][inst.out[r]] > 0);
-      } else if (inst.is_store())
-        m_core->inc_store_req(inst.warp_id());
+    for (unsigned i = 0; i < m_config->m_L1D_config.l1_banks; i++) {
+      if (inst.accessq_empty()) {
+        break;
+      }
+      const mem_access_t &access = inst.accessq_back();
+      unsigned size = access.get_size() + control_size;
+      // printf("Interconnect:Addr: %x, size=%d\n",access.get_addr(),size);
+      if (m_memory_config->SST_mode &&
+          (static_cast<sst_memory_interface *>(m_icnt)->full(
+              size, inst.is_store() || inst.isatomic(), access.get_type()))) {
+        // SST need mf type here
+        // Cast it to sst_memory_interface pointer first as this full() method
+        // is not a virtual method in parent class
+        stall_cond = ICNT_RC_FAIL;
+        break;
+      } else if (!m_memory_config->SST_mode &&
+                 (m_icnt->full(size, inst.is_store() || inst.isatomic()))) {
+        stall_cond = ICNT_RC_FAIL;
+        break;
+      } else {
+        mem_fetch *mf =
+            m_mf_allocator->alloc(inst, access,
+                                  m_core->get_gpu()->gpu_sim_cycle +
+                                      m_core->get_gpu()->gpu_tot_sim_cycle);
+        m_icnt->push(mf);
+        inst.accessq_pop_back();
+        // inst.clear_active( access.get_warp_mask() );
+        if (inst.is_load()) {
+          for (unsigned r = 0; r < MAX_OUTPUT_VALUES; r++)
+            if (inst.out[r] > 0)
+              assert(m_pending_writes[inst.warp_id()][inst.out[r]] > 0);
+        } else if (inst.is_store())
+          m_core->inc_store_req(inst.warp_id());
+      }
     }
   } else {
     assert(CACHE_UNDEFINED != inst.cache_op);
@@ -4534,41 +4542,55 @@ unsigned simt_core_cluster::get_n_active_sms() const {
 }
 
 unsigned simt_core_cluster::issue_block2core() {
+  const unsigned max_pending_ctas = 4;  // cap on each core's pending queue
+  for (unsigned core = 0; core < m_config->n_simt_cores_per_cluster; core++) {
+    if (m_core[core]->pending_ctas.size() < max_pending_ctas) {  // has room
+      kernel_info_t *kernel;
+      // Jin: fetch kernel according to concurrent kernel setting
+      if (m_config->gpgpu_concurrent_kernel_sm) {  // concurrent kernel on sm
+        // always select latest issued kernel
+        kernel_info_t *k = m_gpu->select_kernel();
+        kernel = k;
+      } else {
+        // first select core kernel, if no more cta, get a new kernel
+        // only when core completes
+        kernel = m_core[core]->get_kernel();
+        if (!m_gpu->kernel_more_cta_left(kernel)) {
+          // wait till current kernel finishes
+          if (m_core[core]->get_not_completed() == 0 &&
+              m_core[core]->pending_ctas.empty()) {
+            kernel_info_t *k = m_gpu->select_kernel();
+            if (k) m_core[core]->set_kernel(k);
+            kernel = k;
+          }
+        }
+      }
+      if (kernel) {
+        if (kernel->allocated_ctas < kernel->num_blocks()) {
+          m_core[core]->pending_ctas.push_back(kernel);  // reserve one CTA
+          kernel->allocated_ctas++;  // counts CTAs queued so far
+        }
+      }
+    }
+  }
+
   unsigned num_blocks_issued = 0;
   for (unsigned i = 0; i < m_config->n_simt_cores_per_cluster; i++) {
     unsigned core =
         (i + m_cta_issue_next_core + 1) % m_config->n_simt_cores_per_cluster;
 
-    kernel_info_t *kernel;
-    // Jin: fetch kernel according to concurrent kernel setting
-    if (m_config->gpgpu_concurrent_kernel_sm) {  // concurrent kernel on sm
-      // always select latest issued kernel
-      kernel_info_t *k = m_gpu->select_kernel();
-      kernel = k;
-    } else {
-      // first select core kernel, if no more cta, get a new kernel
-      // only when core completes
-      kernel = m_core[core]->get_kernel();
-      if (!m_gpu->kernel_more_cta_left(kernel)) {
-        // wait till current kernel finishes
-        if (m_core[core]->get_not_completed() == 0) {
-          kernel_info_t *k = m_gpu->select_kernel();
-          if (k) m_core[core]->set_kernel(k);
-          kernel = k;
-        }
+    if (m_core[core]->pending_ctas.size() > 0) {
+      kernel_info_t *pending_cta = m_core[core]->pending_ctas.front();
+      if (m_core[core]->can_issue_1block(*pending_cta)) {  // oldest entry only
+        m_core[core]->issue_block2core(*pending_cta);
+        m_core[core]->pending_ctas.pop_front();
+        num_blocks_issued++;
+        m_cta_issue_next_core = core;
+        break;  // at most one CTA issued per call
       }
     }
-
-    if (m_gpu->kernel_more_cta_left(kernel) &&
-        //            (m_core[core]->get_n_active_cta() <
-        //            m_config->max_cta(*kernel)) ) {
-        m_core[core]->can_issue_1block(*kernel)) {
-      m_core[core]->issue_block2core(*kernel);
-      num_blocks_issued++;
-      m_cta_issue_next_core = core;
-      break;
-    }
   }
+
   return num_blocks_issued;
 }
 
diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h
@@ -2564,6 +2564,7 @@ class shader_core_ctx : public core_t {
 
   // Jin: concurrent kernels on a sm
  public:
+  std::deque<kernel_info_t *> pending_ctas;
   bool can_issue_1block(kernel_info_t &kernel);
   bool occupy_shader_resource_1block(kernel_info_t &kernel, bool occupy);
   void release_shader_resource_1block(unsigned hw_ctaid, kernel_info_t &kernel);

Original file line number	Diff line number	Diff line change
`@@ -782,6 +782,7 @@ kernel_info_t::kernel_info_t(dim3 gridDim, dim3 blockDim,`
`782`	`782`	`num_blocks() * entry->gpgpu_ctx->device_runtime->g_TB_launch_latency;`
`783`	`783`
`784`	`784`	`cache_config_set = false;`
	`785`	`+ allocated_ctas = 0;`
`785`	`786`	`}`
`786`	`787`
`787`	`788`	`/*A snapshot of the texture mappings needs to be stored in the kernel's info as`
`@@ -815,6 +816,7 @@ kernel_info_t::kernel_info_t(`
`815`	`816`	`cache_config_set = false;`
`816`	`817`	`m_NameToCudaArray = nameToCudaArray;`
`817`	`818`	`m_NameToTextureInfo = nameToTextureInfo;`
	`819`	`+ allocated_ctas = 0;`
`818`	`820`	`}`
`819`	`821`
`820`	`822`	`kernel_info_t::~kernel_info_t() {`