Skip to content

Commit 14da1e2

Browse files
committed
vllm_queue_status
Signed-off-by: Huamin Li <[email protected]>
1 parent a287d59 commit 14da1e2

File tree

4 files changed

+385
-0
lines changed

4 files changed

+385
-0
lines changed
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
{
2+
"description": "vLLM: per-build GPU queue run/wait hours & cost (windowed, UTC)",
3+
"params": ["startTime", "stopTime"],
4+
"cache_ttl_seconds": 60,
5+
"tests": [
6+
{
7+
"name": "smoke-24h-iso",
8+
"params": {
9+
"startTime": "2025-10-17T00:00:00.000Z",
10+
"stopTime": "2025-10-18T00:00:00.000Z"
11+
},
12+
"row_limit": 10
13+
}
14+
]
15+
}
Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
/* Windowed per-build table (UTC), incl. PR & main builds, with queue totals, cost, and is_main_branch.
2+
WAIT: only attempts with started_at IS NOT NULL contribute wait (runnable → started).
3+
RUN: clip to [w_start, w_end]; 1-day zombie guard for open 'running' attempts.
4+
COST: 1.3232 * gpu_1_queue_run_hours + 4.602 * gpu_4_queue_run_hours (fixed).
5+
*/
6+
7+
WITH
    /* Window bounds, taken from the query parameters.
       The time zone is pinned to 'UTC' so that a bound supplied without an
       explicit offset is not interpreted in the server's time zone. Strings
       that carry their own offset (for example a trailing 'Z') denote the
       same instant as before. */
    parseDateTime64BestEffort({startTime:String}, 3, 'UTC') AS w_start,  -- inclusive (UTC)
    parseDateTime64BestEffort({stopTime:String}, 3, 'UTC')  AS w_end,    -- exclusive (UTC)

    /* One day before the window end; compared against a job's last_event_at
       when an open 'running' attempt has to be given an end time. */
    (w_end - INTERVAL 1 DAY) AS zombie_cutoff,

    /* Sentinel substituted for a NULL finished_at in range comparisons. */
    toDateTime64('2100-01-01 00:00:00', 3, 'UTC') AS FAR_FUTURE,

    /* Only attempts whose queue name is in this list are scored. */
    ['gpu_1_queue','gpu_4_queue'] AS QUEUES
14+
15+
/* 1) All builds created within the window (+ branch/PR context) */
16+
, builds_window AS (
    /* One row per build id. Build-level attributes are read from the row that
       has the greatest job.created_at within the build (argMax). The HAVING
       clause keeps builds whose created_at falls in [w_start, w_end). */
    SELECT
        tupleElement(build,'id') AS build_id,

        argMax(tupleElement(build,'number'), tupleElement(job,'created_at')) AS build_number,
        argMax(tupleElement(build,'web_url'), tupleElement(job,'created_at')) AS build_url,
        concat(
            argMax(tupleElement(build,'web_url'), tupleElement(job,'created_at')),
            '/steps/table'
        ) AS steps_table_url,
        argMax(tupleElement(build,'commit'), tupleElement(job,'created_at')) AS commit_sha,

        /* Start / finish: build-level value first, otherwise the earliest job
           start / latest job finish seen for the build. */
        coalesce(
            argMax(tupleElement(build,'started_at'), tupleElement(job,'created_at')),
            min(tupleElement(job,'started_at'))
        ) AS robust_start,
        coalesce(
            argMax(tupleElement(build,'finished_at'), tupleElement(job,'created_at')),
            max(tupleElement(job,'finished_at'))
        ) AS robust_finish,

        countDistinct(tupleElement(job,'id')) AS steps_count,
        argMax(tupleElement(build,'state'), tupleElement(job,'created_at')) AS latest_build_state,

        /* owner/name slug: first non-empty match among
             1. pipeline repository matched against the github.com pattern,
             2. pull-request repository matched against the github.com pattern,
             3. pull-request repository matched against a bare owner/name pattern. */
        coalesce(
            nullIf(
                extract(
                    argMax(tupleElement(pipeline,'repository'), tupleElement(job,'created_at')),
                    'github\\.com[:/]+([^/]+/[^/.]+)'
                ),
                ''
            ),
            nullIf(
                extract(
                    argMax(tupleElement(build,'pull_request').repository, tupleElement(job,'created_at')),
                    'github\\.com[:/]+([^/]+/[^/.]+)'
                ),
                ''
            ),
            nullIf(
                extract(
                    argMax(tupleElement(build,'pull_request').repository, tupleElement(job,'created_at')),
                    '([^/]+/[^/.]+)'
                ),
                ''
            )
        ) AS repo_slug,

        /* PR number: the pull_request id when it parses as an integer,
           otherwise the digits following 'pull/' in the branch name. */
        coalesce(
            toInt64OrNull(
                argMax(tupleElement(build,'pull_request').id, tupleElement(job,'created_at'))
            ),
            toInt64OrNull(
                extract(
                    argMax(tupleElement(build,'branch'), tupleElement(job,'created_at')),
                    'pull/([0-9]+)'
                )
            )
        ) AS pr_number,

        argMax(tupleElement(build,'created_at'), tupleElement(job,'created_at')) AS build_created_at_utc,
        argMax(tupleElement(build,'branch'), tupleElement(job,'created_at')) AS branch_name
    FROM vllm.vllm_buildkite_jobs
    GROUP BY tupleElement(build,'id')
    HAVING build_created_at_utc >= w_start
       AND build_created_at_utc < w_end
)
51+
52+
/* 2) Agent-run attempts for those builds that can overlap the window */
53+
, base_agent AS (
    /* Job rows of type 'script' or 'command' that belong to a build in
       builds_window. queue_key is the first agent_query_rules entry starting
       with 'queue=', with that prefix removed. */
    SELECT
        tupleElement(build,'id')        AS build_id,
        tupleElement(job,'id')          AS job_id,
        tupleElement(job,'created_at')  AS created_at,
        tupleElement(job,'state')       AS state,
        tupleElement(job,'runnable_at') AS runnable_at,
        tupleElement(job,'started_at')  AS started_at,
        tupleElement(job,'finished_at') AS finished_at,
        replaceOne(
            arrayFirst(x -> startsWith(x,'queue='), tupleElement(job,'agent_query_rules')),
            'queue=',
            ''
        ) AS queue_key
    FROM vllm.vllm_buildkite_jobs
    INNER JOIN builds_window AS b
        ON tupleElement(build,'id') = b.build_id
    WHERE tupleElement(job,'type') IN ('script','command')
      /* The three range tests are OR-ed, so a row passes when any one of them
         holds. Exact clipping to the window happens downstream in
         runs_scored and waits_scored. */
      AND (
             tupleElement(job,'runnable_at') < w_end
          OR tupleElement(job,'started_at') < w_end
          OR ifNull(tupleElement(job,'finished_at'), FAR_FUTURE) >= w_start
      )
)
73+
74+
/* 3) Collapse to (build_id, job_id) and collect attempts keyed by queue */
75+
, jobs_by_build AS (
    /* One row per (build_id, job_id): the state carried by the row with the
       greatest created_at, that greatest created_at itself, and two
       de-duplicated arrays of attempt tuples. */
    SELECT
        build_id,
        job_id,
        argMax(state, created_at) AS latest_state,
        max(created_at)           AS last_event_at,

        /* RUN attempts: (queue, started_at, finished_at), start must be set */
        arrayDistinct(
            arrayFilter(
                run -> run.2 IS NOT NULL,
                groupArray((queue_key, started_at, finished_at))
            )
        ) AS run_triplets,

        /* WAIT attempts: (queue, runnable_at, started_at), both times must be
           set, i.e. only attempts that actually started */
        arrayDistinct(
            arrayFilter(
                wait -> wait.2 IS NOT NULL AND wait.3 IS NOT NULL,
                groupArray((queue_key, runnable_at, started_at))
            )
        ) AS wait_triplets
    FROM base_agent
    GROUP BY
        build_id,
        job_id
)
94+
95+
/* 4) RUN attempts → per build × queue (clip to window; zombie guard for open runs) */
96+
, runs_scored AS (
    /* One row per run attempt on a scored queue, with the attempt interval
       clipped to [w_start, w_end]. */
    SELECT
        build_id,
        attempt.1 AS queue_key,

        /* start, never earlier than the window start */
        greatest(attempt.2, w_start) AS s_clip,

        /* end, never later than the window end. When finished_at is NULL:
             - job's latest state is 'running' and its last row is older than
               zombie_cutoff -> last_event_at + 1 minute (capped at w_end)
             - otherwise -> w_end */
        least(
            ifNull(
                attempt.3,
                if(
                    latest_state = 'running' AND last_event_at < zombie_cutoff,
                    least(last_event_at + INTERVAL 1 MINUTE, w_end),
                    w_end
                )
            ),
            w_end
        ) AS e_clip
    FROM jobs_by_build
    ARRAY JOIN run_triplets AS attempt
    WHERE attempt.1 IN QUEUES
)
114+
, run_by_build AS (
    /* Clipped run seconds summed per (build, queue). Attempts whose clipped
       end is not after the clipped start contribute nothing. */
    SELECT
        build_id,
        queue_key,
        sumIf(
            dateDiff('second', s_clip, e_clip),
            e_clip > s_clip
        ) AS total_run_s
    FROM runs_scored
    GROUP BY
        build_id,
        queue_key
)
121+
122+
/* 5) WAIT attempts (runnable → started) → per build × queue (clip to window) */
123+
, waits_scored AS (
    /* One row per wait attempt on a scored queue: the runnable -> started
       interval clipped to [w_start, w_end]. */
    SELECT
        build_id,
        attempt.1                    AS queue_key,
        greatest(attempt.2, w_start) AS ra_clip,  -- runnable_at, not before window start
        least(attempt.3, w_end)      AS st_clip   -- started_at, not after window end
    FROM jobs_by_build
    ARRAY JOIN wait_triplets AS attempt
    WHERE attempt.1 IN QUEUES
)
133+
, wait_by_build AS (
    /* Clipped wait seconds summed per (build, queue). Attempts whose clipped
       start is not after the clipped runnable time contribute nothing. */
    SELECT
        build_id,
        queue_key,
        sumIf(
            dateDiff('second', ra_clip, st_clip),
            st_clip > ra_clip
        ) AS total_wait_s
    FROM waits_scored
    GROUP BY
        build_id,
        queue_key
)
140+
141+
/* 6) Pivot per-build totals to hour columns */
142+
, totals_by_build AS (
    /* One row per build: run and wait seconds of each scored queue converted
       to hours and rounded to two decimals. Run and wait totals are stacked
       with the missing measure set to 0 so a single GROUP BY can pivot both. */
    SELECT
        build_id,
        round(ifNull(sumIf(total_wait_s, queue_key='gpu_1_queue'), 0) / 3600.0, 2) AS gpu_1_queue_wait_hours,
        round(ifNull(sumIf(total_run_s,  queue_key='gpu_1_queue'), 0) / 3600.0, 2) AS gpu_1_queue_run_hours,
        round(ifNull(sumIf(total_wait_s, queue_key='gpu_4_queue'), 0) / 3600.0, 2) AS gpu_4_queue_wait_hours,
        round(ifNull(sumIf(total_run_s,  queue_key='gpu_4_queue'), 0) / 3600.0, 2) AS gpu_4_queue_run_hours
    FROM (
        SELECT
            build_id,
            queue_key,
            total_run_s,
            toInt64(0) AS total_wait_s
        FROM run_by_build

        UNION ALL

        SELECT
            build_id,
            queue_key,
            toInt64(0) AS total_run_s,
            total_wait_s
        FROM wait_by_build
    )
    GROUP BY build_id
)
156+
157+
/* 7) Final table (UTC) — includes both PR and main builds */
158+
SELECT
    /* Every column is qualified with its source (b = builds_window,
       t = totals_by_build) and given an explicit output alias. build_id
       exists on both sides of the join, so the qualification keeps its
       resolution independent of how unqualified names are disambiguated;
       the aliases keep the result column names unchanged. */

    /* PR URL (NULL for builds without a PR number or repo slug) */
    if(
        (b.pr_number IS NULL) OR (b.repo_slug IS NULL),
        NULL,
        concat('https://github.com/', b.repo_slug, '/pull/', toString(b.pr_number))
    ) AS pr_url,

    b.build_number    AS build_number,
    b.build_id        AS build_id,
    b.build_url       AS build_url,
    b.steps_table_url AS steps_table_url,
    b.commit_sha      AS commit_sha,

    b.robust_start  AS build_started_at,
    b.robust_finish AS build_finished_at,

    /* duration (hours) = finish − start; NULL when either end is missing */
    if(
        b.robust_start IS NULL OR b.robust_finish IS NULL,
        NULL,
        round(dateDiff('second', b.robust_start, b.robust_finish) / 3600.0, 2)
    ) AS duration_hours,

    b.steps_count        AS steps_count,
    b.latest_build_state AS latest_build_state,

    /* Builds with no scored attempts have no row in totals_by_build;
       report 0 hours for them. */
    ifNull(t.gpu_1_queue_wait_hours, 0) AS gpu_1_queue_wait_hours,
    ifNull(t.gpu_1_queue_run_hours, 0)  AS gpu_1_queue_run_hours,
    ifNull(t.gpu_4_queue_wait_hours, 0) AS gpu_4_queue_wait_hours,
    ifNull(t.gpu_4_queue_run_hours, 0)  AS gpu_4_queue_run_hours,

    /* Fixed-rate cost, computed from the (already rounded) run hours */
    round(
        1.3232 * ifNull(t.gpu_1_queue_run_hours, 0) +
        4.602  * ifNull(t.gpu_4_queue_run_hours, 0),
        2
    ) AS cost,

    /* 1 when the build branch is literally 'main', else 0 */
    toUInt8(b.branch_name = 'main') AS is_main_branch

FROM builds_window AS b
LEFT JOIN totals_by_build AS t
    ON t.build_id = b.build_id
/* build_number / build_id break ties between builds created at the same
   instant so the row order is deterministic. */
ORDER BY
    b.build_created_at_utc ASC,
    b.build_number ASC,
    b.build_id ASC;
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
import React, { useMemo, useState } from "react";
2+
import { Box, Stack, Switch, Tooltip, Typography } from "@mui/material";
3+
import ReactECharts from "echarts-for-react";
4+
5+
/** One per-build record consumed by QueuePerBuildPanel. */
type Row = {
  /** Pull-request URL; null when the build has none. */
  pr_url: string | null;
  build_number: number;
  build_id: string;
  build_url: string;
  steps_table_url: string;
  commit_sha: string;
  /** Build start timestamp (UTC); null when unknown. */
  build_started_at: string | null;
  /** Build finish timestamp (UTC); null when unknown. */
  build_finished_at: string | null;
  /** Build duration in hours; null when start or finish is unknown. */
  duration_hours: number | null;
  steps_count: number;
  latest_build_state: string;

  /* Per-queue wait / run totals, in hours. */
  gpu_1_queue_wait_hours: number;
  gpu_1_queue_run_hours: number;
  gpu_4_queue_wait_hours: number;
  gpu_4_queue_run_hours: number;

  cost: number;
  /** 1 when the build is on the main branch, otherwise 0. */
  is_main_branch: number;
};
26+
27+
/** Escape a value for interpolation into the tooltip's HTML string. */
function escapeHtml(value: unknown): string {
  return String(value ?? "")
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}

/**
 * Format a numeric field with two decimals. Values that do not convert to a
 * finite number (undefined, non-numeric strings) render as "0.00" instead of
 * throwing the way a direct `.toFixed` call on them would.
 */
function formatFixed2(value: unknown): string {
  const n = Number(value);
  return Number.isFinite(n) ? n.toFixed(2) : "0.00";
}

/**
 * Sort key for a build start timestamp: epoch milliseconds, or 0 when the
 * value is missing or unparsable (so the comparator never yields NaN).
 * NOTE(review): timestamps without an explicit offset are parsed by `Date`
 * in the viewer's local zone — confirm the API returns offset-qualified UTC.
 */
function startMillis(ts: string | null): number {
  if (!ts) return 0;
  const ms = new Date(ts).getTime();
  return Number.isNaN(ms) ? 0 : ms;
}

/**
 * Chart panel showing, per build, GPU queue run hours (lines), optional wait
 * hours (dashed lines) and optional cost (bars on a secondary axis), plotted
 * against the build start time.
 *
 * @param data Per-build rows; `undefined` is treated as an empty list.
 */
export default function QueuePerBuildPanel({ data }: { data: Row[] | undefined }) {
  const [mainOnly, setMainOnly] = useState(false);
  const [showWait, setShowWait] = useState(false);
  const [showCost, setShowCost] = useState(true);

  // Filtered copy of the input (filter() allocates, so `data` is never
  // mutated), ordered by build start and then by build number.
  const rows = useMemo(() => {
    const visible = (data ?? []).filter((x) => (mainOnly ? x.is_main_branch === 1 : true));
    return visible.sort(
      (a, b) =>
        startMillis(a.build_started_at) - startMillis(b.build_started_at) ||
        a.build_number - b.build_number
    );
  }, [data, mainOnly]);

  const option = useMemo(() => {
    // Every series is built from `rows` in the same order, so a series
    // dataIndex is also an index into `rows` (the tooltip relies on this).
    const lineSeries = (name: string, pick: (r: Row) => number, dashed = false) => ({
      name,
      type: "line",
      showSymbol: false,
      ...(dashed ? { lineStyle: { type: "dashed" } } : {}),
      data: rows.map((r) => [r.build_started_at, pick(r)])
    });

    const series: any[] = [
      lineSeries("GPU1 run (h)", (r) => r.gpu_1_queue_run_hours),
      lineSeries("GPU4 run (h)", (r) => r.gpu_4_queue_run_hours)
    ];

    if (showWait) {
      series.push(
        lineSeries("GPU1 wait (h)", (r) => r.gpu_1_queue_wait_hours, true),
        lineSeries("GPU4 wait (h)", (r) => r.gpu_4_queue_wait_hours, true)
      );
    }

    if (showCost) {
      series.push({
        name: "Cost ($)",
        type: "bar",
        yAxisIndex: 1,
        emphasis: { focus: "series" },
        data: rows.map((r) => [r.build_started_at, r.cost])
      });
    }

    return {
      tooltip: {
        trigger: "axis",
        axisPointer: { type: "cross" },
        confine: true,
        formatter: (params: any) => {
          const first = Array.isArray(params) ? params[0] : params;
          const idx = first?.dataIndex ?? 0;
          const r = rows[idx];
          // No row behind the pointer (empty data set or stale index):
          // render nothing rather than dereferencing undefined.
          if (!r) return "";

          // The tooltip is rendered from an HTML string, so every value that
          // comes from the data is escaped before interpolation.
          const buildLabel = `#${escapeHtml(r.build_number)}`;
          const buildLink = r.build_url
            ? `<a href="${escapeHtml(r.build_url)}" target="_blank" rel="noreferrer">${buildLabel}</a>`
            : buildLabel;
          const prLabel = r.pr_url
            ? r.pr_url.replace("https://github.com/", "").replace("/pull/", "#")
            : "";
          const prLine = r.pr_url
            ? `<div>PR: <a href="${escapeHtml(r.pr_url)}" target="_blank" rel="noreferrer">${escapeHtml(prLabel)}</a></div>`
            : "";
          return `
            <div>
              <div><b>${escapeHtml(r.build_started_at ?? "")}</b></div>
              <div>Build: ${buildLink}</div>
              ${prLine}
              <div>GPU1 run: ${formatFixed2(r.gpu_1_queue_run_hours)} h</div>
              <div>GPU4 run: ${formatFixed2(r.gpu_4_queue_run_hours)} h</div>
              ${showWait ? `<div>GPU1 wait: ${formatFixed2(r.gpu_1_queue_wait_hours)} h</div>` : ""}
              ${showWait ? `<div>GPU4 wait: ${formatFixed2(r.gpu_4_queue_wait_hours)} h</div>` : ""}
              ${showCost ? `<div>Cost: $${formatFixed2(r.cost)}</div>` : ""}
              <div>Branch: ${r.is_main_branch ? "main" : "PR/other"}</div>
            </div>
          `;
        }
      },
      legend: { top: 0 },
      grid: { left: 40, right: 50, bottom: 40, top: 40 },
      xAxis: { type: "time", name: "Build start (UTC)" },
      yAxis: [
        { type: "value", name: "Hours" },
        { type: "value", name: "Cost ($)", alignTicks: true }
      ],
      series
    };
  }, [rows, showWait, showCost]);

  return (
    <Box sx={{ height: "100%", display: "flex", flexDirection: "column" }}>
      <Stack direction="row" spacing={2} alignItems="center" sx={{ px: 2, pt: 1 }}>
        <Typography variant="h6" sx={{ fontWeight: "bold" }}>
          Queue Run/Wait & Cost (per build)
        </Typography>
        <Tooltip title="Show only builds on branch 'main'">
          <Stack direction="row" spacing={1} alignItems="center">
            <Typography variant="body2">Main only</Typography>
            <Switch size="small" checked={mainOnly} onChange={() => setMainOnly((s) => !s)} />
          </Stack>
        </Tooltip>
        <Tooltip title="Overlay wait hours series">
          <Stack direction="row" spacing={1} alignItems="center">
            <Typography variant="body2">Wait</Typography>
            <Switch size="small" checked={showWait} onChange={() => setShowWait((s) => !s)} />
          </Stack>
        </Tooltip>
        <Tooltip title="Overlay cost series (secondary axis)">
          <Stack direction="row" spacing={1} alignItems="center">
            <Typography variant="body2">Cost</Typography>
            <Switch size="small" checked={showCost} onChange={() => setShowCost((s) => !s)} />
          </Stack>
        </Tooltip>
      </Stack>
      <Box sx={{ flex: 1, minHeight: 240 }}>
        <ReactECharts style={{ height: "100%" }} option={option} notMerge />
      </Box>
    </Box>
  );
}

0 commit comments

Comments
 (0)