Skip to content

Commit 2a9ecf9

Browse files
authored
refactor tdr detect (#734)
Signed-off-by: Lizhi Hou <[email protected]>
1 parent: c3635af; commit: 2a9ecf9

File tree

7 files changed: 40 additions (+40) and 33 deletions (-33).

src/driver/amdxdna/aie2_ctx.c

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -213,11 +213,14 @@ aie2_sched_notify(struct amdxdna_sched_job *job)
213213
{
214214
struct amdxdna_ctx *ctx = job->ctx;
215215
struct dma_fence *fence = job->fence;
216+
struct amdxdna_dev_hdl *ndev;
216217
int idx;
217218

218219
amdxdna_pm_suspend_put(ctx->client->xdna);
219220

220221
ctx->completed++;
222+
ndev = ctx->client->xdna->dev_handle;
223+
WRITE_ONCE(ndev->tdr_status, AIE2_TDR_SIGNALED);
221224
trace_xdna_job(&job->base, ctx->name, "signaling fence", job->seq, job->opcode);
222225
job->job_done = true;
223226
dma_fence_signal(fence);
@@ -358,6 +361,7 @@ aie2_sched_job_run(struct drm_sched_job *sched_job)
358361
struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
359362
struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
360363
struct amdxdna_ctx *ctx = job->ctx;
364+
struct amdxdna_dev_hdl *ndev;
361365
enum cmd_chain_class class;
362366
struct dma_fence *fence;
363367
int ret = 0;
@@ -415,6 +419,8 @@ aie2_sched_job_run(struct drm_sched_job *sched_job)
415419
} else {
416420
if (job->opcode != OP_NOOP)
417421
amdxdna_stats_start(ctx->client);
422+
ndev = ctx->client->xdna->dev_handle;
423+
WRITE_ONCE(ndev->tdr_status, AIE2_TDR_SIGNALED);
418424
}
419425

420426
return fence;
@@ -968,7 +974,12 @@ int aie2_cmd_submit(struct amdxdna_ctx *ctx, struct amdxdna_sched_job *job,
968974
goto rq_yield;
969975
}
970976

977+
#if KERNEL_VERSION(6, 17, 0) <= LINUX_VERSION_CODE
978+
ret = drm_sched_job_init(&job->base, &ctx->priv->entity, 1, ctx,
979+
ctx->client->filp->client_id);
980+
#else
971981
ret = drm_sched_job_init(&job->base, &ctx->priv->entity, 1, ctx);
982+
#endif
972983
if (ret) {
973984
XDNA_ERR(xdna, "DRM job init failed, ret %d", ret);
974985
goto free_chain;

src/driver/amdxdna/aie2_ctx_runqueue.c

Lines changed: 23 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -171,33 +171,21 @@ static bool part_handle_idle_ctx(struct aie2_partition *part, bool force)
171171
}
172172

173173
static bool
174-
part_is_all_ctx_stuck(struct aie2_partition *part)
174+
part_has_pending_cmd(struct aie2_partition *part)
175175
{
176176
struct amdxdna_dev *xdna;
177177
struct amdxdna_ctx *ctx;
178-
int progress_cnt = 0;
179-
int running_cnt = 0;
180178

181179
xdna = ctx_rq_to_xdna_dev(part->rq);
182180
list_for_each_entry(ctx, &part->conn_list, entry) {
183-
u64 completed = ctx->completed;
184-
u64 last = ctx->last_completed;
185-
u64 submitted = ctx->submitted;
186-
187-
XDNA_DBG(xdna, "%s @[%d, %d] submitted %lld completed %lld last %lld",
181+
XDNA_DBG(xdna, "%s @[%d, %d] submitted %lld completed %lld",
188182
ctx->name, part->start_col, part->end_col,
189-
submitted, completed, last);
190-
if (submitted == completed)
191-
continue;
192-
193-
running_cnt++;
194-
if (last != completed) {
195-
ctx->last_completed = completed;
196-
progress_cnt++;
197-
}
183+
ctx->submitted, ctx->completed);
184+
if (ctx->submitted != ctx->completed)
185+
return true;
198186
}
199187

200-
return running_cnt && !progress_cnt;
188+
return false;
201189
}
202190

203191
static struct aie2_partition *
@@ -812,26 +800,37 @@ static void rq_parts_work(struct work_struct *work)
812800
*/
813801
bool aie2_rq_is_all_context_stuck(struct aie2_ctx_rq *rq)
814802
{
803+
struct amdxdna_dev_hdl *ndev;
815804
struct aie2_partition *part;
816805
struct amdxdna_dev *xdna;
817-
int active_cnt = 0;
818-
int stuck_cnt = 0;
806+
bool pending = false;
807+
u32 tdr;
819808
int i;
820809

821810
xdna = ctx_rq_to_xdna_dev(rq);
811+
ndev = xdna->dev_handle;
822812
mutex_lock(&xdna->dev_lock);
823813
for (i = 0; i < rq->num_parts; i++) {
824814
part = &rq->parts[i];
825815
if (!part->hwctx_cnt)
826816
continue;
827817

828-
active_cnt++;
829-
if (part_is_all_ctx_stuck(part))
830-
stuck_cnt++;
818+
pending = part_has_pending_cmd(part);
819+
if (pending)
820+
break;
831821
}
832822
mutex_unlock(&xdna->dev_lock);
833823

834-
return active_cnt && active_cnt == stuck_cnt;
824+
tdr = READ_ONCE(ndev->tdr_status);
825+
if (pending && xdna->tdr.progress == tdr && tdr == AIE2_TDR_WAIT)
826+
return true;
827+
828+
if (tdr != AIE2_TDR_WAIT)
829+
WRITE_ONCE(ndev->tdr_status, AIE2_TDR_WAIT);
830+
831+
xdna->tdr.progress = tdr;
832+
833+
return false;
835834
}
836835

837836
bool aie2_rq_handle_idle_ctx(struct aie2_ctx_rq *rq)

src/driver/amdxdna/aie2_pci.h

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -76,6 +76,9 @@
7676
#define ctx_rq_to_xdna_dev(r) \
7777
(ctx_rq_to_ndev(r)->xdna)
7878

79+
#define AIE2_TDR_WAIT 0
80+
#define AIE2_TDR_SIGNALED 1
81+
7982
struct amdxdna_ctx_priv;
8083
struct event_trace_req_buf;
8184
struct start_event_trace_resp;
@@ -360,6 +363,8 @@ struct amdxdna_dev_hdl {
360363
struct mutex aie2_lock;
361364

362365
struct aie2_ctx_rq ctx_rq;
366+
367+
u32 tdr_status;
363368
};
364369

365370
#define DEFINE_BAR_OFFSET(reg_name, bar, reg_addr) \

src/driver/amdxdna/amdxdna_ctx.c

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -113,7 +113,6 @@ int amdxdna_drm_create_hwctx_ioctl(struct drm_device *dev, void *data, struct dr
113113
}
114114

115115
ctx->client = client;
116-
ctx->last_completed = -1;
117116
ctx->num_tiles = args->num_tiles;
118117
ctx->mem_size = args->mem_size;
119118
ctx->max_opc = args->max_opc;

src/driver/amdxdna/amdxdna_ctx.h

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -151,8 +151,6 @@ struct amdxdna_ctx {
151151
u64 completed ____cacheline_aligned_in_smp;
152152
/* Counter for freed job */
153153
atomic64_t job_free_cnt;
154-
/* For context runqueue to keep last completed. low frequency update */
155-
u64 last_completed;
156154
/* For command completion notification. */
157155
u32 syncobj_hdl;
158156
struct amdxdna_ctx_health_data health_data;

src/driver/amdxdna/amdxdna_pci_drv.c

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -5,9 +5,7 @@
55

66
#include <linux/module.h>
77
#include <linux/version.h>
8-
#if KERNEL_VERSION(6, 10, 0) > LINUX_VERSION_CODE
98
#include <drm/drm_managed.h>
10-
#endif
119

1210
#include "amdxdna_pci_drv.h"
1311
#include "amdxdna_sysfs.h"
@@ -85,11 +83,7 @@ static int amdxdna_probe(struct pci_dev *pdev, const struct pci_device_id *id)
8583
if (!xdna->dev_info)
8684
return -ENODEV;
8785

88-
#if KERNEL_VERSION(6, 10, 0) > LINUX_VERSION_CODE
8986
drmm_mutex_init(&xdna->ddev, &xdna->dev_lock);
90-
#else
91-
devm_mutex_init(dev, &xdna->dev_lock);
92-
#endif
9387
init_rwsem(&xdna->notifier_lock);
9488
INIT_LIST_HEAD(&xdna->client_list);
9589
pci_set_drvdata(pdev, xdna);

src/driver/amdxdna/amdxdna_tdr.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ struct amdxdna_tdr {
1919
struct work_struct tdr_work;
2020
int tdr_counter;
2121
int started;
22+
u32 progress;
2223
};
2324

2425
void amdxdna_tdr_start(struct amdxdna_tdr *tdr);

0 commit comments

Comments (0)