fix: async bash job timeout hangs indefinitely instead of erroring out (#2214)
When an async bash job exceeds its timeout, killTree sends SIGTERM but some processes (e.g. those trapping SIGTERM) never exit, causing the promise to hang forever since the 'close' event never fires. Add a three-stage escalation: SIGTERM -> SIGKILL after 5s grace -> force-resolve after 3s hard deadline. Use settled guards to prevent double-resolution when the close event races with the hard deadline. Fixes #2186 Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
b0fc552a2e
commit
97bdf3b071
2 changed files with 162 additions and 4 deletions
122
src/resources/extensions/async-jobs/async-bash-timeout.test.ts
Normal file
122
src/resources/extensions/async-jobs/async-bash-timeout.test.ts
Normal file
|
|
@ -0,0 +1,122 @@
|
|||
/**
|
||||
* async-bash-timeout.test.ts — Tests for async_bash timeout behavior.
|
||||
*
|
||||
* Reproduces issue #2186: when an async bash job exceeds its timeout and
|
||||
* the child process ignores SIGTERM, the promise hangs indefinitely.
|
||||
* The fix adds a SIGKILL fallback and a hard deadline that force-resolves
|
||||
* the promise so execution can continue.
|
||||
*/
|
||||
|
||||
import test from "node:test";
|
||||
import assert from "node:assert/strict";
|
||||
import { createAsyncBashTool } from "./async-bash-tool.ts";
|
||||
import { AsyncJobManager } from "./job-manager.ts";
|
||||
|
||||
function getTextFromResult(result: { content: Array<{ type: string; text?: string }> }): string {
|
||||
return result.content.map((c) => c.text ?? "").join("\n");
|
||||
}
|
||||
|
||||
const noopSignal = new AbortController().signal;
|
||||
|
||||
test("async_bash with timeout resolves even if process ignores SIGTERM", async () => {
|
||||
const manager = new AsyncJobManager();
|
||||
const tool = createAsyncBashTool(() => manager, () => process.cwd());
|
||||
|
||||
// Start a job that traps SIGTERM (ignores it), with a 2s timeout.
|
||||
// The process installs a SIGTERM trap and sleeps for 60s.
|
||||
// Before the fix, this would hang forever because SIGTERM is ignored
|
||||
// and the close event never fires.
|
||||
const result = await tool.execute(
|
||||
"tc-timeout",
|
||||
{
|
||||
command: "trap '' TERM; sleep 60",
|
||||
timeout: 2,
|
||||
label: "sigterm-resistant",
|
||||
},
|
||||
noopSignal,
|
||||
() => {},
|
||||
undefined as never,
|
||||
);
|
||||
|
||||
const text = getTextFromResult(result);
|
||||
assert.match(text, /sigterm-resistant/);
|
||||
|
||||
const jobId = text.match(/\*\*(bg_[a-f0-9]+)\*\*/)?.[1];
|
||||
assert.ok(jobId, "Should have returned a job ID");
|
||||
|
||||
// Now await the job — it should resolve within a reasonable time
|
||||
// (timeout 2s + SIGKILL grace 5s + buffer = well under 15s)
|
||||
const start = Date.now();
|
||||
const job = manager.getJob(jobId)!;
|
||||
assert.ok(job, "Job should exist");
|
||||
|
||||
await Promise.race([
|
||||
job.promise,
|
||||
new Promise<never>((_, reject) => {
|
||||
const t = setTimeout(() => reject(new Error(
|
||||
`Job promise hung for ${Date.now() - start}ms — ` +
|
||||
`this is the bug from issue #2186: timeout hangs indefinitely`,
|
||||
)), 15_000);
|
||||
if (typeof t === "object" && "unref" in t) t.unref();
|
||||
}),
|
||||
]);
|
||||
|
||||
const elapsed = Date.now() - start;
|
||||
// Should have resolved well within 15s (timeout 2s + kill grace ~5s)
|
||||
assert.ok(elapsed < 15_000, `Job took ${elapsed}ms — expected <15s`);
|
||||
|
||||
// Job should have completed (resolved, not rejected) with timeout message
|
||||
assert.ok(
|
||||
job.status === "completed" || job.status === "failed",
|
||||
`Job status should be completed or failed, got: ${job.status}`,
|
||||
);
|
||||
|
||||
if (job.status === "completed") {
|
||||
assert.ok(
|
||||
job.resultText?.includes("timed out") || job.resultText?.includes("Timed out"),
|
||||
`Result should mention timeout, got: ${job.resultText}`,
|
||||
);
|
||||
}
|
||||
|
||||
manager.shutdown();
|
||||
});
|
||||
|
||||
test("async_bash with timeout resolves normally when process exits on SIGTERM", async () => {
|
||||
const manager = new AsyncJobManager();
|
||||
const tool = createAsyncBashTool(() => manager, () => process.cwd());
|
||||
|
||||
// Start a normal sleep that will die on SIGTERM, with a 1s timeout
|
||||
const result = await tool.execute(
|
||||
"tc-normal-timeout",
|
||||
{
|
||||
command: "sleep 60",
|
||||
timeout: 1,
|
||||
label: "normal-timeout",
|
||||
},
|
||||
noopSignal,
|
||||
() => {},
|
||||
undefined as never,
|
||||
);
|
||||
|
||||
const text = getTextFromResult(result);
|
||||
const jobId = text.match(/\*\*(bg_[a-f0-9]+)\*\*/)?.[1];
|
||||
assert.ok(jobId, "Should have returned a job ID");
|
||||
|
||||
const job = manager.getJob(jobId)!;
|
||||
const start = Date.now();
|
||||
|
||||
await Promise.race([
|
||||
job.promise,
|
||||
new Promise<never>((_, reject) => {
|
||||
const t = setTimeout(() => reject(new Error("Job hung")), 10_000);
|
||||
if (typeof t === "object" && "unref" in t) t.unref();
|
||||
}),
|
||||
]);
|
||||
|
||||
const elapsed = Date.now() - start;
|
||||
assert.ok(elapsed < 5_000, `Expected quick resolution after SIGTERM, took ${elapsed}ms`);
|
||||
assert.equal(job.status, "completed");
|
||||
assert.ok(job.resultText?.includes("timed out"), `Should mention timeout: ${job.resultText}`);
|
||||
|
||||
manager.shutdown();
|
||||
});
|
||||
|
|
@ -109,6 +109,10 @@ function executeBashInBackground(
|
|||
timeout?: number,
|
||||
): Promise<string> {
|
||||
return new Promise<string>((resolve, reject) => {
|
||||
let settled = false;
|
||||
const safeResolve = (value: string) => { if (!settled) { settled = true; resolve(value); } };
|
||||
const safeReject = (err: unknown) => { if (!settled) { settled = true; reject(err); } };
|
||||
|
||||
const { shell, args } = getShellConfig();
|
||||
const resolvedCommand = sanitizeCommand(command);
|
||||
|
||||
|
|
@ -121,11 +125,39 @@ function executeBashInBackground(
|
|||
|
||||
let timedOut = false;
|
||||
let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
|
||||
let sigkillHandle: ReturnType<typeof setTimeout> | undefined;
|
||||
let hardDeadlineHandle: ReturnType<typeof setTimeout> | undefined;
|
||||
|
||||
/** Grace period (ms) between SIGTERM and SIGKILL. */
|
||||
const SIGKILL_GRACE_MS = 5_000;
|
||||
/** Hard deadline (ms) after SIGKILL to force-resolve the promise. */
|
||||
const HARD_DEADLINE_MS = 3_000;
|
||||
|
||||
if (timeout !== undefined && timeout > 0) {
|
||||
timeoutHandle = setTimeout(() => {
|
||||
timedOut = true;
|
||||
if (child.pid) killTree(child.pid);
|
||||
|
||||
// If the process ignores SIGTERM, escalate to SIGKILL
|
||||
sigkillHandle = setTimeout(() => {
|
||||
if (child.pid) {
|
||||
try { process.kill(-child.pid, "SIGKILL"); } catch { /* ignore */ }
|
||||
try { process.kill(child.pid, "SIGKILL"); } catch { /* ignore */ }
|
||||
}
|
||||
|
||||
// Hard deadline: if even SIGKILL doesn't trigger 'close',
|
||||
// force-resolve so the job doesn't hang forever (#2186).
|
||||
hardDeadlineHandle = setTimeout(() => {
|
||||
const output = Buffer.concat(chunks).toString("utf-8");
|
||||
safeResolve(
|
||||
output
|
||||
? `${output}\n\nCommand timed out after ${timeout} seconds (force-killed)`
|
||||
: `Command timed out after ${timeout} seconds (force-killed)`,
|
||||
);
|
||||
}, HARD_DEADLINE_MS);
|
||||
if (typeof hardDeadlineHandle === "object" && "unref" in hardDeadlineHandle) hardDeadlineHandle.unref();
|
||||
}, SIGKILL_GRACE_MS);
|
||||
if (typeof sigkillHandle === "object" && "unref" in sigkillHandle) sigkillHandle.unref();
|
||||
}, timeout * 1000);
|
||||
}
|
||||
|
||||
|
|
@ -168,24 +200,28 @@ function executeBashInBackground(
|
|||
|
||||
child.on("error", (err) => {
|
||||
if (timeoutHandle) clearTimeout(timeoutHandle);
|
||||
if (sigkillHandle) clearTimeout(sigkillHandle);
|
||||
if (hardDeadlineHandle) clearTimeout(hardDeadlineHandle);
|
||||
signal.removeEventListener("abort", onAbort);
|
||||
reject(err);
|
||||
safeReject(err);
|
||||
});
|
||||
|
||||
child.on("close", (code) => {
|
||||
if (timeoutHandle) clearTimeout(timeoutHandle);
|
||||
if (sigkillHandle) clearTimeout(sigkillHandle);
|
||||
if (hardDeadlineHandle) clearTimeout(hardDeadlineHandle);
|
||||
signal.removeEventListener("abort", onAbort);
|
||||
if (spillStream) spillStream.end();
|
||||
|
||||
if (signal.aborted) {
|
||||
const output = Buffer.concat(chunks).toString("utf-8");
|
||||
resolve(output ? `${output}\n\nCommand aborted` : "Command aborted");
|
||||
safeResolve(output ? `${output}\n\nCommand aborted` : "Command aborted");
|
||||
return;
|
||||
}
|
||||
|
||||
if (timedOut) {
|
||||
const output = Buffer.concat(chunks).toString("utf-8");
|
||||
resolve(output ? `${output}\n\nCommand timed out after ${timeout} seconds` : `Command timed out after ${timeout} seconds`);
|
||||
safeResolve(output ? `${output}\n\nCommand timed out after ${timeout} seconds` : `Command timed out after ${timeout} seconds`);
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -208,7 +244,7 @@ function executeBashInBackground(
|
|||
text += `\n\nCommand exited with code ${code}`;
|
||||
}
|
||||
|
||||
resolve(text);
|
||||
safeResolve(text);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue