Skip to content

Commit 8917478

Browse files
authored
fix(runner): SIGTERM handling during warm start long poll (#2593)
1 parent f5caa66 commit 8917478

File tree

3 files changed

+93
-34
lines changed

3 files changed

+93
-34
lines changed

.changeset/poor-files-taste.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
"trigger.dev": patch
3+
"@trigger.dev/core": patch
4+
---
5+
6+
Fix SIGTERM handling during warm start long poll

packages/cli-v3/src/entryPoints/managed/controller.ts

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -106,11 +106,18 @@ export class ManagedRunController {
106106
message: "Received SIGTERM, stopping worker",
107107
});
108108

109-
// Disable warm starts
109+
// Disable warm starts - prevents new warm start requests
110110
this.warmStartEnabled = false;
111111

112-
// ..now we wait for any active runs to finish
113-
// SIGKILL will handle the rest, nothing to do here
112+
// Abort any ongoing warm start long poll - immediately stops waiting for next run
113+
// This prevents the scenario where:
114+
// 1. SIGTERM kills a prepared child process
115+
// 2. Warm start poll returns a new run
116+
// 3. Controller tries to use the dead child process
117+
this.warmStartClient?.abort();
118+
119+
// Now we wait for any active runs to finish gracefully
120+
// SIGKILL will handle forced termination after termination grace period
114121
});
115122
}
116123

packages/core/src/v3/workers/warmStartClient.ts

Lines changed: 77 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ export class WarmStartClient {
1717
private readonly logger = new SimpleStructuredLogger("warm-start-client");
1818
private readonly apiUrl: URL;
1919
private backoff = new ExponentialBackoff("FullJitter");
20+
private abortController: AbortController | null = null;
2021

2122
private get connectUrl() {
2223
return new URL("/connect", this.apiUrl);
@@ -30,6 +31,30 @@ export class WarmStartClient {
3031
this.apiUrl = opts.apiUrl;
3132
}
3233

34+
abort() {
35+
if (!this.abortController) {
36+
this.logger.warn("Abort called but no abort controller exists");
37+
return;
38+
}
39+
40+
this.abortController.abort();
41+
this.abortController = null;
42+
}
43+
44+
private async withAbort<T>(fn: (signal: AbortSignal) => Promise<T>): Promise<T> {
45+
if (this.abortController) {
46+
throw new Error("A warm start is already in progress");
47+
}
48+
49+
this.abortController = new AbortController();
50+
51+
try {
52+
return await fn(this.abortController.signal);
53+
} finally {
54+
this.abortController = null;
55+
}
56+
}
57+
3358
async connect(): Promise<ApiResult<WarmStartConnectResponse>> {
3459
return wrapZodFetch(
3560
WarmStartConnectResponse,
@@ -61,39 +86,42 @@ export class WarmStartClient {
6186
connectionTimeoutMs: number;
6287
keepaliveMs: number;
6388
}): Promise<DequeuedMessage | null> {
64-
const res = await this.longPoll<unknown>(
65-
this.warmStartUrl.href,
66-
{
67-
method: "GET",
68-
headers: {
69-
"x-trigger-workload-controller-id": this.opts.controllerId,
70-
"x-trigger-deployment-id": this.opts.deploymentId,
71-
"x-trigger-deployment-version": this.opts.deploymentVersion,
72-
"x-trigger-machine-cpu": this.opts.machineCpu,
73-
"x-trigger-machine-memory": this.opts.machineMemory,
74-
"x-trigger-worker-instance-name": workerInstanceName,
89+
return this.withAbort(async (abortSignal) => {
90+
const res = await this.longPoll<unknown>(
91+
this.warmStartUrl.href,
92+
{
93+
method: "GET",
94+
headers: {
95+
"x-trigger-workload-controller-id": this.opts.controllerId,
96+
"x-trigger-deployment-id": this.opts.deploymentId,
97+
"x-trigger-deployment-version": this.opts.deploymentVersion,
98+
"x-trigger-machine-cpu": this.opts.machineCpu,
99+
"x-trigger-machine-memory": this.opts.machineMemory,
100+
"x-trigger-worker-instance-name": workerInstanceName,
101+
},
75102
},
76-
},
77-
{
78-
timeoutMs: connectionTimeoutMs,
79-
totalDurationMs: keepaliveMs,
103+
{
104+
timeoutMs: connectionTimeoutMs,
105+
totalDurationMs: keepaliveMs,
106+
abortSignal,
107+
}
108+
);
109+
110+
if (!res.ok) {
111+
this.logger.error("warmStart: failed", {
112+
error: res.error,
113+
connectionTimeoutMs,
114+
keepaliveMs,
115+
});
116+
return null;
80117
}
81-
);
82-
83-
if (!res.ok) {
84-
this.logger.error("warmStart: failed", {
85-
error: res.error,
86-
connectionTimeoutMs,
87-
keepaliveMs,
88-
});
89-
return null;
90-
}
91118

92-
const nextRun = DequeuedMessage.parse(res.data);
119+
const nextRun = DequeuedMessage.parse(res.data);
93120

94-
this.logger.debug("warmStart: got next run", { nextRun });
121+
this.logger.debug("warmStart: got next run", { nextRun });
95122

96-
return nextRun;
123+
return nextRun;
124+
});
97125
}
98126

99127
private async longPoll<T = any>(
@@ -102,9 +130,11 @@ export class WarmStartClient {
102130
{
103131
timeoutMs,
104132
totalDurationMs,
133+
abortSignal,
105134
}: {
106135
timeoutMs: number;
107136
totalDurationMs: number;
137+
abortSignal: AbortSignal;
108138
}
109139
): Promise<
110140
| {
@@ -123,11 +153,20 @@ export class WarmStartClient {
123153
let retries = 0;
124154

125155
while (Date.now() < endTime) {
156+
if (abortSignal.aborted) {
157+
return {
158+
ok: false,
159+
error: "Aborted - abort signal triggered before fetch",
160+
};
161+
}
162+
126163
try {
127-
const controller = new AbortController();
128-
const signal = controller.signal;
164+
const timeoutController = new AbortController();
165+
const timeoutId = setTimeout(() => timeoutController.abort(), timeoutMs);
129166

130-
const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
167+
// Create compound signal that aborts on either timeout or parent abort
168+
const signals = [timeoutController.signal, abortSignal];
169+
const signal = AbortSignal.any(signals);
131170

132171
const response = await fetch(url, { ...requestInit, signal });
133172

@@ -148,6 +187,13 @@ export class WarmStartClient {
148187
}
149188
} catch (error) {
150189
if (error instanceof Error && error.name === "AbortError") {
190+
// Check if this was a parent abort or just a timeout
191+
if (abortSignal.aborted) {
192+
return {
193+
ok: false,
194+
error: "Aborted - abort signal triggered during fetch",
195+
};
196+
}
151197
this.logger.log("Long poll request timed out, retrying...");
152198
continue;
153199
} else {

0 commit comments

Comments
 (0)