diff --git a/docs/RECORDS_KEEPER.md b/docs/RECORDS_KEEPER.md index d3992aa83..5da2e1dd8 100644 --- a/docs/RECORDS_KEEPER.md +++ b/docs/RECORDS_KEEPER.md @@ -1,4 +1,4 @@ - + # Records Keeper The records keeper keeps repo memory ordered after meaningful changes. Run this checklist at milestone close, after architecture changes, after product behavior changes, and whenever docs/source disagree. diff --git a/src/resources/extensions/sf/auto/loop.js b/src/resources/extensions/sf/auto/loop.js index 360b0e67b..5dc4e4fda 100644 --- a/src/resources/extensions/sf/auto/loop.js +++ b/src/resources/extensions/sf/auto/loop.js @@ -17,7 +17,7 @@ import { debugLog } from "../debug-logger.js"; import { resolveEngine } from "../engine-resolver.js"; import { getErrorMessage } from "../error-utils.js"; import { NOTICE_KIND } from "../notification-store.js"; -import { sfRoot } from "../paths.js"; +import { resolveSliceFile, sfRoot } from "../paths.js"; import { dispatchSelfFeedbackInlineFixIfNeeded } from "../self-feedback-drain.js"; import { recordSelfFeedback } from "../self-feedback.js"; import { getDatabase } from "../sf-db.js"; @@ -1209,31 +1209,52 @@ export async function autoLoop(ctx, pi, s, deps) { // will either advance state via a different // unit or hit detectStuck and bail. } else { - ctx.ui.notify( - `Health issues detected with slice references — queuing reassess-roadmap instead of pausing.`, - "warning", - { - noticeKind: NOTICE_KIND.SYSTEM_NOTICE, - dedupe_key: "doctor-health-reassess-roadmap", - }, - ); - const { buildReassessRoadmapPrompt } = await import( - "../auto-prompts.js" - ); - const reassessPrompt = await buildReassessRoadmapPrompt( - mid, - midTitle, - sliceId, + // Guard: if the target slice already has an ASSESSMENT file on disk + // (the only condition checked here), skip queuing reassess-roadmap — + // there's nothing for it to do and it would loop on completed work. 
+ const assessFile = resolveSliceFile( s.basePath, + mid, + sliceId, + "ASSESS", ); - s.sidecarQueue.unshift({ - kind: "hook", - unitType: "reassess-roadmap", - unitId: `${mid}/${sliceId}`, - prompt: `## Doctor Health Issues\n\n${healthCheck.issues.map((i) => `- ${i}`).join("\n")}\n\n${reassessPrompt}`, - }); - finishTurn("retry"); - continue; + if (assessFile && existsSync(assessFile)) { + ctx.ui.notify( + `Doctor health issues referenced ${sliceId} but it already has an ASSESSMENT — skipping redundant reassess-roadmap, falling through to normal dispatch.`, + "info", + { + noticeKind: NOTICE_KIND.TOOL_NOTICE, + dedupe_key: "doctor-reassess-already-assessed", + }, + ); + // Fall through to normal pre-dispatch + } else { + ctx.ui.notify( + `Health issues detected with slice references — queuing reassess-roadmap instead of pausing.`, + "warning", + { + noticeKind: NOTICE_KIND.SYSTEM_NOTICE, + dedupe_key: "doctor-health-reassess-roadmap", + }, + ); + const { buildReassessRoadmapPrompt } = await import( + "../auto-prompts.js" + ); + const reassessPrompt = await buildReassessRoadmapPrompt( + mid, + midTitle, + sliceId, + s.basePath, + ); + s.sidecarQueue.unshift({ + kind: "hook", + unitType: "reassess-roadmap", + unitId: `${mid}/${sliceId}`, + prompt: `## Doctor Health Issues\n\n${healthCheck.issues.map((i) => `- ${i}`).join("\n")}\n\n${reassessPrompt}`, + }); + finishTurn("retry"); + continue; + } } } } diff --git a/src/resources/extensions/sf/auto/phases-unit.js b/src/resources/extensions/sf/auto/phases-unit.js index b30615ac1..d03883d0c 100644 --- a/src/resources/extensions/sf/auto/phases-unit.js +++ b/src/resources/extensions/sf/auto/phases-unit.js @@ -41,6 +41,10 @@ import { recordExecutorRefusalEscalation, } from "../autonomous-solver.js"; import { blockModel } from "../blocked-models.js"; +import { + getCooldownRetryAfterMs, + isTransientCooldownError, +} from "../infra-errors.js"; import { resumeAutoAfterProviderDelay } from 
"../bootstrap/provider-error-resume.js"; import { debugLog } from "../debug-logger.js"; import { PROJECT_FILES } from "../detection.js"; @@ -1385,13 +1389,68 @@ export async function runUnitPhase(ic, iterData, loopState, sidecarItem) { // Provider-error: try to reselect a ready provider and continue rather // than stopping autonomous mode. Only stop if no ready provider exists. if (currentUnitResult.errorContext?.category === "provider") { - await emitCancelledUnitEnd( - ic, - unitType, - unitId, - unitStartSeq, - currentUnitResult.errorContext, - ); + // Check whether this is a transient cooldown (rate-limit / quota), which + // should attempt failover to a non-cooldowned provider rather than stopping. + // The raw error is read from errorContext.error; when it is missing, the + // failure is handled as a non-cooldown provider error further below. + const rawErr = currentUnitResult.errorContext?.error; + const isCooldown = + rawErr != null && isTransientCooldownError(rawErr); + if (isCooldown) { + // Mark the failing model as temporarily blocked with its retry-after + // window so the next selectAndApplyModel call skips it. + const cooldownModel = s.currentUnitModel; + const retryAfterMs = rawErr != null + ? getCooldownRetryAfterMs(rawErr) + : undefined; + const expiresAt = retryAfterMs != null && retryAfterMs > 0 + ? Date.now() + retryAfterMs + : Date.now() + 60_000; // default 60s block if no retry hint + if (cooldownModel?.provider && cooldownModel?.id) { + blockModel( + s.basePath, + cooldownModel.provider, + cooldownModel.id, + `transient cooldown: retry after ${retryAfterMs ?? 60_000}ms`, + { expiresAt }, + ); + } + // Try to find any non-cooldowned, provider-ready fallback. + const failedProvider = + s.currentUnitModel?.provider ?? ctx.model?.provider; + const allModels = ctx.modelRegistry?.getAvailable?.() ?? 
[]; + const fallback = allModels.find( + (m) => + m.provider !== failedProvider && + ctx.modelRegistry?.isProviderRequestReady?.(m.provider), + ); + if (fallback) { + const ok = await pi.setModel(fallback, { persist: false }); + if (ok) { + s.currentUnitModel = fallback; + const waitNote = retryAfterMs != null && retryAfterMs > 0 + ? ` (retrying immediately — cooldown expires in ${Math.round(retryAfterMs / 1000)}s)` + : ""; + ctx.ui.notify( + `Provider ${failedProvider} in cooldown${waitNote} — switched to ${fallback.provider}/${fallback.id}`, + "warning", + ); + return { action: "continue" }; + } + } + // All providers are unavailable — stopAuto with enumeration. + const cooldowned = allModels + .filter((m) => !ctx.modelRegistry?.isProviderRequestReady?.(m.provider)) + .map((m) => `${m.provider}/${m.id}`); + const cooldownedList = cooldowned.length > 0 + ? cooldowned.join(", ") + : "(none listed)"; + const msg = `All providers in cooldown. Failed: ${failedProvider}. Remaining unavailable: ${cooldownedList}. Stopping.`; + ctx.ui.notify(msg, "error"); + await deps.stopAuto(ctx, pi, msg); + return { action: "break", reason: "all-providers-cooldown" }; + } + // Non-cooldown provider error: try a different provider, stop if none ready. const failedProvider = s.currentUnitModel?.provider ?? ctx.model?.provider; const allModels = ctx.modelRegistry?.getAvailable?.() ?? []; diff --git a/src/resources/extensions/sf/prompts/complete-slice.md b/src/resources/extensions/sf/prompts/complete-slice.md index dcedbb260..59bd505b2 100644 --- a/src/resources/extensions/sf/prompts/complete-slice.md +++ b/src/resources/extensions/sf/prompts/complete-slice.md @@ -10,6 +10,13 @@ Executor agents built each task and wrote task summaries. You are the closer — Write the summary for those downstream readers. What did this slice actually deliver? What patterns did it establish? What should the next slice know? 
+**Structured frontmatter fields:** Three fields in the slice summary YAML frontmatter — `key_files`, `key_decisions`, and `patterns_established` — must be populated from actual work, not left as `(none)`. Before calling `complete_slice`, derive these from the context inlined below: +- `key_files`: scan all task summaries for files created or modified; deduplicate and list the most significant ones (at most about 8 entries). +- `key_decisions`: extract every named architectural decision from task narratives (e.g. "chose X over Y because Z"); list each as one bullet. +- `patterns_established`: extract repeatable patterns, conventions, or lessons that future agents should reuse; avoid obvious observations. + +If the slice did not produce any meaningful files, decisions, or patterns, write a brief honest entry rather than leaving the field as `(none)`. A one-line entry (e.g. "No new patterns — this slice only applied existing conventions") is better than silence — it shows you considered the field. + All relevant context has been preloaded below — the slice plan, all task summaries, and the milestone roadmap are inlined. Start working immediately without re-reading these files. {{closeoutControl}}