@@ -2282,32 +2282,40 @@ bool ldst_unit::memory_cycle(warp_inst_t &inst,
     // bypass L1 cache
     unsigned control_size =
         inst.is_store() ? WRITE_PACKET_SIZE : READ_PACKET_SIZE;
-    unsigned size = access.get_size() + control_size;
-    // printf("Interconnect:Addr: %x, size=%d\n",access.get_addr(),size);
-    if (m_memory_config->SST_mode &&
-        (static_cast<sst_memory_interface *>(m_icnt)->full(
-            size, inst.is_store() || inst.isatomic(), access.get_type()))) {
-      // SST need mf type here
-      // Cast it to sst_memory_interface pointer first as this full() method
-      // is not a virtual method in parent class
-      stall_cond = ICNT_RC_FAIL;
-    } else if (!m_memory_config->SST_mode &&
-               (m_icnt->full(size, inst.is_store() || inst.isatomic()))) {
-      stall_cond = ICNT_RC_FAIL;
-    } else {
-      mem_fetch *mf =
-          m_mf_allocator->alloc(inst, access,
-                                m_core->get_gpu()->gpu_sim_cycle +
-                                    m_core->get_gpu()->gpu_tot_sim_cycle);
-      m_icnt->push(mf);
-      inst.accessq_pop_back();
-      // inst.clear_active( access.get_warp_mask() );
-      if (inst.is_load()) {
-        for (unsigned r = 0; r < MAX_OUTPUT_VALUES; r++)
-          if (inst.out[r] > 0)
-            assert(m_pending_writes[inst.warp_id()][inst.out[r]] > 0);
-      } else if (inst.is_store())
-        m_core->inc_store_req(inst.warp_id());
+    for (unsigned i = 0; i < m_config->m_L1D_config.l1_banks; i++) {
+      if (inst.accessq_empty()) {
+        break;
+      }
+      const mem_access_t &access = inst.accessq_back();
+      unsigned size = access.get_size() + control_size;
+      // printf("Interconnect:Addr: %x, size=%d\n",access.get_addr(),size);
+      if (m_memory_config->SST_mode &&
+          (static_cast<sst_memory_interface *>(m_icnt)->full(
+              size, inst.is_store() || inst.isatomic(), access.get_type()))) {
+        // SST need mf type here
+        // Cast it to sst_memory_interface pointer first as this full() method
+        // is not a virtual method in parent class
+        stall_cond = ICNT_RC_FAIL;
+        break;
+      } else if (!m_memory_config->SST_mode &&
+                 (m_icnt->full(size, inst.is_store() || inst.isatomic()))) {
+        stall_cond = ICNT_RC_FAIL;
+        break;
+      } else {
+        mem_fetch *mf =
+            m_mf_allocator->alloc(inst, access,
+                                  m_core->get_gpu()->gpu_sim_cycle +
+                                      m_core->get_gpu()->gpu_tot_sim_cycle);
+        m_icnt->push(mf);
+        inst.accessq_pop_back();
+        // inst.clear_active( access.get_warp_mask() );
+        if (inst.is_load()) {
+          for (unsigned r = 0; r < MAX_OUTPUT_VALUES; r++)
+            if (inst.out[r] > 0)
+              assert(m_pending_writes[inst.warp_id()][inst.out[r]] > 0);
+        } else if (inst.is_store())
+          m_core->inc_store_req(inst.warp_id());
+      }
     }
   } else {
     assert(CACHE_UNDEFINED != inst.cache_op);
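Note on the hunk above: the single-request L1-bypass path becomes a loop bounded by m_config->m_L1D_config.l1_banks, so the load/store unit can inject up to one memory request per L1D bank per cycle; both full() checks now also break out of the loop, leaving the remaining accesses on the instruction's queue to retry next cycle with stall_cond set to ICNT_RC_FAIL. Below is a minimal, self-contained sketch of the same drain-up-to-N-per-cycle pattern, assuming a plain std::deque and placeholder icnt_full/icnt_push callbacks rather than the simulator's interfaces.

// Illustrative sketch only: Request, icnt_full, and icnt_push are assumptions,
// not the simulator's types or APIs.
#include <cstdio>
#include <deque>

struct Request { unsigned addr; unsigned size; };

// Inject at most one request per L1D bank this cycle, stopping early when the
// access queue empties or the interconnect reports back-pressure.
unsigned drain_one_cycle(std::deque<Request> &accessq, unsigned l1_banks,
                         bool (*icnt_full)(const Request &),
                         void (*icnt_push)(const Request &)) {
  unsigned injected = 0;
  for (unsigned i = 0; i < l1_banks; i++) {
    if (accessq.empty()) break;           // nothing left for this instruction
    const Request &req = accessq.back();  // same end the simulator pops from
    if (icnt_full(req)) break;            // back-pressure: retry next cycle
    icnt_push(req);
    accessq.pop_back();
    injected++;
  }
  return injected;
}

// Toy back-pressure model for the example: accept everything.
static bool never_full(const Request &) { return false; }
static void print_push(const Request &r) { std::printf("push addr=%u\n", r.addr); }

int main() {
  std::deque<Request> q = {{0x100, 32}, {0x200, 32}, {0x300, 32}};
  unsigned issued = drain_one_cycle(q, 4, never_full, print_push);
  std::printf("issued %u requests, %zu left\n", issued, q.size());
  return 0;
}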
@@ -4534,41 +4542,55 @@ unsigned simt_core_cluster::get_n_active_sms() const {
 }
 
 unsigned simt_core_cluster::issue_block2core() {
+  const unsigned max_pending_ctas = 4;
+  for (unsigned core = 0; core < m_config->n_simt_cores_per_cluster; core++) {
+    if (m_core[core]->pending_ctas.size() < max_pending_ctas) {
+      kernel_info_t *kernel;
+      // Jin: fetch kernel according to concurrent kernel setting
+      if (m_config->gpgpu_concurrent_kernel_sm) {  // concurrent kernel on sm
+        // always select latest issued kernel
+        kernel_info_t *k = m_gpu->select_kernel();
+        kernel = k;
+      } else {
+        // first select core kernel, if no more cta, get a new kernel
+        // only when core completes
+        kernel = m_core[core]->get_kernel();
+        if (!m_gpu->kernel_more_cta_left(kernel)) {
+          // wait till current kernel finishes
+          if (m_core[core]->get_not_completed() == 0 &&
+              m_core[core]->pending_ctas.empty()) {
+            kernel_info_t *k = m_gpu->select_kernel();
+            if (k) m_core[core]->set_kernel(k);
+            kernel = k;
+          }
+        }
+      }
+      if (kernel) {
+        if (kernel->allocated_ctas < kernel->num_blocks()) {
+          m_core[core]->pending_ctas.push_back(kernel);
+          kernel->allocated_ctas++;
+        }
+      }
+    }
+  }
+
   unsigned num_blocks_issued = 0;
   for (unsigned i = 0; i < m_config->n_simt_cores_per_cluster; i++) {
     unsigned core =
         (i + m_cta_issue_next_core + 1) % m_config->n_simt_cores_per_cluster;
 
-    kernel_info_t *kernel;
-    // Jin: fetch kernel according to concurrent kernel setting
-    if (m_config->gpgpu_concurrent_kernel_sm) {  // concurrent kernel on sm
-      // always select latest issued kernel
-      kernel_info_t *k = m_gpu->select_kernel();
-      kernel = k;
-    } else {
-      // first select core kernel, if no more cta, get a new kernel
-      // only when core completes
-      kernel = m_core[core]->get_kernel();
-      if (!m_gpu->kernel_more_cta_left(kernel)) {
-        // wait till current kernel finishes
-        if (m_core[core]->get_not_completed() == 0) {
-          kernel_info_t *k = m_gpu->select_kernel();
-          if (k) m_core[core]->set_kernel(k);
-          kernel = k;
-        }
+    if (m_core[core]->pending_ctas.size() > 0) {
+      kernel_info_t *pending_cta = m_core[core]->pending_ctas.front();
+      if (m_core[core]->can_issue_1block(*pending_cta)) {
+        m_core[core]->issue_block2core(*pending_cta);
+        m_core[core]->pending_ctas.pop_front();
+        num_blocks_issued++;
+        m_cta_issue_next_core = core;
+        break;
       }
     }
-
-    if (m_gpu->kernel_more_cta_left(kernel) &&
-        // (m_core[core]->get_n_active_cta() <
-        // m_config->max_cta(*kernel)) ) {
-        m_core[core]->can_issue_1block(*kernel)) {
-      m_core[core]->issue_block2core(*kernel);
-      num_blocks_issued++;
-      m_cta_issue_next_core = core;
-      break;
-    }
   }
+
   return num_blocks_issued;
 }
 
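Note on the hunk above: CTA issue is split into two phases. A reservation pass lets each core enqueue at most one kernel pointer per cycle into its pending_ctas queue, capped at max_pending_ctas = 4 and at kernel->num_blocks() via the kernel->allocated_ctas counter; a core only switches to a new kernel once it has completed and its pending queue is empty. The issue pass then pops the front of a core's queue when can_issue_1block() succeeds, keeping the round-robin m_cta_issue_next_core rotation and the one-block-per-cluster-per-cycle limit. Below is a minimal, self-contained sketch of this reserve-then-issue pattern; Kernel, PendingCore, and can_issue are stand-ins, not the simulator's classes.

// Illustrative sketch only: types and member names here are assumptions.
#include <cstdio>
#include <deque>
#include <vector>

struct Kernel {
  unsigned num_blocks;
  unsigned allocated_ctas;  // CTAs already placed on some core's pending queue
};

struct PendingCore {
  std::deque<Kernel *> pending_ctas;
  bool can_issue = true;  // stands in for can_issue_1block()
};

// One simulated cycle of the cluster-level issue logic.
unsigned issue_cycle(std::vector<PendingCore> &cores, Kernel &k,
                     unsigned &next_core, unsigned max_pending_ctas) {
  // Phase 1: each core may reserve at most one CTA this cycle, bounded by the
  // per-core queue depth and by the kernel's total block count.
  for (auto &core : cores) {
    if (core.pending_ctas.size() < max_pending_ctas &&
        k.allocated_ctas < k.num_blocks) {
      core.pending_ctas.push_back(&k);
      k.allocated_ctas++;
    }
  }
  // Phase 2: round-robin over cores, issue at most one pending CTA per cycle.
  for (unsigned i = 0; i < cores.size(); i++) {
    unsigned c = (i + next_core + 1) % cores.size();
    if (!cores[c].pending_ctas.empty() && cores[c].can_issue) {
      cores[c].pending_ctas.pop_front();
      next_core = c;
      return 1;  // one block per cluster per cycle, as in the diff
    }
  }
  return 0;
}

int main() {
  std::vector<PendingCore> cores(2);
  Kernel k{8, 0};
  unsigned next_core = 0, total = 0;
  for (int cycle = 0; cycle < 10; cycle++)
    total += issue_cycle(cores, k, next_core, 4);
  std::printf("issued %u of %u blocks\n", total, k.num_blocks);
  return 0;
}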