diff --git a/src/resources/extensions/sf/auto/phases.js b/src/resources/extensions/sf/auto/phases.js index a90306017..aea33968f 100644 --- a/src/resources/extensions/sf/auto/phases.js +++ b/src/resources/extensions/sf/auto/phases.js @@ -2336,6 +2336,7 @@ export async function runUnitPhase(ic, iterData, loopState, sidecarItem) { solverAssessment.repairAttempt, solverAssessment.maxRepairAttempts, ), + { keepSession: true }, ); s.lastUnitAgentEndMessages = currentUnitResult.event?.messages ?? null; if (currentUnitResult.status === "cancelled") { diff --git a/src/resources/extensions/sf/auto/run-unit.js b/src/resources/extensions/sf/auto/run-unit.js index 246cbf0b0..86ac9c6cb 100644 --- a/src/resources/extensions/sf/auto/run-unit.js +++ b/src/resources/extensions/sf/auto/run-unit.js @@ -37,9 +37,16 @@ let sessionSwitchGeneration = 0; * The promise is one-shot: resolveAgentEnd() is the only way to resolve it. * On session creation failure or timeout, returns { status: 'cancelled' } * without awaiting the promise. + * + * Options: + * keepSession — when true, skip newSession() and continue in the current + * conversation. Used for repair retries within the same unit so the agent + * sees what it tried before and why it failed, rather than starting cold. + * Default: false (each new unit starts with a clean session). */ -export async function runUnit(ctx, pi, s, unitType, unitId, prompt) { - debugLog("runUnit", { phase: "start", unitType, unitId }); +export async function runUnit(ctx, pi, s, unitType, unitId, prompt, options) { + const keepSession = options?.keepSession === true; + debugLog("runUnit", { phase: "start", unitType, unitId, keepSession }); // GAP-10: Ensure cwd matches basePath BEFORE newSession() captures it. The // new session reads process.cwd() during construction to anchor its tool // runtime and system prompt; if cwd has drifted (async_bash, background @@ -62,70 +69,75 @@ export async function runUnit(ctx, pi, s, unitType, unitId, prompt) { }, }; } - // ── Session creation with timeout ── - debugLog("runUnit", { phase: "session-create", unitType, unitId }); - let sessionResult; - let sessionTimeoutHandle; - const mySessionSwitchGeneration = ++sessionSwitchGeneration; - // GAP-07: Cancellation controller for newSession(). When the session-creation - // timeout fires, we abort this controller so that any still-in-flight - // newSession() work (which may clobber process.cwd()) is signalled to stop. - // Note: SF's newSession() does not currently accept abortSignal in its - // options type, so we cannot pass it directly — but aborting the controller - // documents the intent clearly and is a no-op call site when the API adds it. - const sessionAbortController = new AbortController(); - _setSessionSwitchInFlight(true); - try { - const sessionPromise = s.cmdCtx.newSession().finally(() => { - if (sessionSwitchGeneration === mySessionSwitchGeneration) { - _setSessionSwitchInFlight(false); - } - }); - const timeoutPromise = new Promise((resolve) => { - sessionTimeoutHandle = setTimeout(() => { - sessionAbortController.abort(); - resolve({ cancelled: true }); - }, NEW_SESSION_TIMEOUT_MS); - }); - sessionResult = await Promise.race([sessionPromise, timeoutPromise]); - } catch (sessionErr) { + // ── Session creation (skipped for same-unit repair retries) ── + // keepSession=true: send the repair prompt into the existing conversation so + // the agent has full context of what it already tried and why it failed. + // keepSession=false (default): start a clean session for each new unit. + if (!keepSession) { + debugLog("runUnit", { phase: "session-create", unitType, unitId }); + let sessionResult; + let sessionTimeoutHandle; + const mySessionSwitchGeneration = ++sessionSwitchGeneration; + // GAP-07: Cancellation controller for newSession(). When the session-creation + // timeout fires, we abort this controller so that any still-in-flight + // newSession() work (which may clobber process.cwd()) is signalled to stop. + // Note: SF's newSession() does not currently accept abortSignal in its + // options type, so we cannot pass it directly — but aborting the controller + // documents the intent clearly and is a no-op call site when the API adds it. + const sessionAbortController = new AbortController(); + _setSessionSwitchInFlight(true); + try { + const sessionPromise = s.cmdCtx.newSession().finally(() => { + if (sessionSwitchGeneration === mySessionSwitchGeneration) { + _setSessionSwitchInFlight(false); + } + }); + const timeoutPromise = new Promise((resolve) => { + sessionTimeoutHandle = setTimeout(() => { + sessionAbortController.abort(); + resolve({ cancelled: true }); + }, NEW_SESSION_TIMEOUT_MS); + }); + sessionResult = await Promise.race([sessionPromise, timeoutPromise]); + } catch (sessionErr) { + if (sessionTimeoutHandle) clearTimeout(sessionTimeoutHandle); + const msg = + sessionErr instanceof Error ? sessionErr.message : String(sessionErr); + debugLog("runUnit", { + phase: "session-error", + unitType, + unitId, + error: msg, + }); + return { + status: "cancelled", + errorContext: { + message: `Session creation failed: ${msg}`, + category: "session-failed", + isTransient: true, + }, + }; + } if (sessionTimeoutHandle) clearTimeout(sessionTimeoutHandle); - const msg = - sessionErr instanceof Error ? sessionErr.message : String(sessionErr); - debugLog("runUnit", { - phase: "session-error", - unitType, - unitId, - error: msg, - }); - return { - status: "cancelled", - errorContext: { - message: `Session creation failed: ${msg}`, - category: "session-failed", - isTransient: true, - }, - }; - } - if (sessionTimeoutHandle) clearTimeout(sessionTimeoutHandle); - if (sessionResult.cancelled) { - debugLog("runUnit-session-timeout", { unitType, unitId }); - // On timeout, do NOT clear the in-flight guard here. The dangling - // sessionPromise's .finally() has a generation check — it will clear the - // guard when the underlying newSession promise eventually settles, but only - // if no newer runUnit call has already incremented the generation. This is - // the correct design: the guard stays true until the next session is ready, - // preventing stale agent_end events from the timed-out session from being - // processed by handleAgentEnd. The next runUnit call sets inFlight=true - // again and its own .finally() manages the clearing. - return { - status: "cancelled", - errorContext: { - message: "Session creation timed out", - category: "timeout", - isTransient: true, - }, - }; + if (sessionResult.cancelled) { + debugLog("runUnit-session-timeout", { unitType, unitId }); + // On timeout, do NOT clear the in-flight guard here. The dangling + // sessionPromise's .finally() has a generation check — it will clear the + // guard when the underlying newSession promise eventually settles, but only + // if no newer runUnit call has already incremented the generation. This is + // the correct design: the guard stays true until the next session is ready, + // preventing stale agent_end events from the timed-out session from being + // processed by handleAgentEnd. The next runUnit call sets inFlight=true + // again and its own .finally() manages the clearing. + return { + status: "cancelled", + errorContext: { + message: "Session creation timed out", + category: "timeout", + isTransient: true, + }, + }; + } } if (!s.active) { return { status: "cancelled" }; @@ -150,9 +162,12 @@ export async function runUnit(ctx, pi, s, unitType, unitId, prompt) { } } // ── Create the agent_end promise (per-unit one-shot) ── - // This happens after newSession completes so session-switch agent_end events - // from the previous session cannot resolve the new unit. - _setSessionSwitchInFlight(false); + // When keepSession=false: clear the in-flight guard now that the new session + // is fully ready, so handleAgentEnd processes events for this unit only. + // When keepSession=true: no session switch happened, guard is already clear. + if (!keepSession) { + _setSessionSwitchInFlight(false); + } const unitPromise = new Promise((resolve) => { _setCurrentResolve(resolve); });