fix(self-feedback): 3 sf-internal defects resolved
Some checks are pending
CI / detect-changes (push) Waiting to run
CI / docs-check (push) Blocked by required conditions
CI / lint (push) Blocked by required conditions
CI / build (push) Blocked by required conditions
CI / integration-tests (push) Blocked by required conditions
CI / windows-portability (push) Blocked by required conditions
CI / rtk-portability (linux, blacksmith-4vcpu-ubuntu-2404) (push) Blocked by required conditions
CI / rtk-portability (macos, macos-15) (push) Blocked by required conditions
CI / rtk-portability (windows, blacksmith-4vcpu-windows-2025) (push) Blocked by required conditions
1. Cooldown failover (sf-mp8w9cg9-arixq7, high): when a provider hits AUTH_COOLDOWN during unit execution, block the failing model with an expiry using the existing blockModel() API, then try a non-cooldowned provider via isProviderRequestReady. Autonomous mode now stops only if every provider is unavailable, with an enumerated message showing which ones are down. loop.js consecutiveCooldowns is not touched here: it tracks the loop-level retry budget for provider-not-ready errors that bypass phases-unit. The cooldown path in loop.js is separate; it handles errors thrown before runUnitPhase, while this fix handles cancellation returned from runUnitPhase due to a provider error during session creation.

2. Redundant reassess-roadmap on completed slices (sf-mp8wa4qr-xw8fjb, medium): the doctor-triggered reassess path (loop.js P4-A) now checks whether the target slice already has an ASSESSMENT file before queuing reassess-roadmap. This mirrors the guard already present in the normal dispatch path (checkNeedsReassessment).

3. Empty structured fields in slice summary (sf-mp8w6s88-ckv4yr, low): added an explicit instruction to the complete-slice.md prompt template directing the executor to derive key_files, key_decisions, and patterns_established from task summaries before calling complete_slice.
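The expiry selection described in fix 1 can be sketched as follows. This is a minimal illustration, not the codebase's actual code: computeBlockExpiry is a hypothetical helper name, and the real change passes the computed expiry to blockModel() as its expiresAt option.

```javascript
// Sketch of the cooldown-expiry choice in fix 1 (hypothetical helper name).
// Prefer the provider's retry-after hint when it is a positive number,
// otherwise fall back to a fixed 60s block, matching the diff below.
const DEFAULT_BLOCK_MS = 60_000;

function computeBlockExpiry(retryAfterMs, now = Date.now()) {
  return retryAfterMs != null && retryAfterMs > 0
    ? now + retryAfterMs
    : now + DEFAULT_BLOCK_MS;
}
```

The returned timestamp is what lets the next model-selection pass skip the blocked model until the cooldown window has elapsed.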
This commit is contained in:
parent 41276a7b7a
commit 53259aebf1

4 changed files with 119 additions and 32 deletions
@@ -1,4 +1,4 @@
-<!-- sf-doc: version=2.75.3 template=docs/RECORDS_KEEPER.md state=pending hash=sha256:3872de9cd72bd9129814a5e77e3b86abe76bef33f3ca34e04ae7582b4cfd066a -->
+<!-- sf-doc: version=2.75.4 template=docs/RECORDS_KEEPER.md state=pending hash=sha256:3872de9cd72bd9129814a5e77e3b86abe76bef33f3ca34e04ae7582b4cfd066a -->
 # Records Keeper
 
 The records keeper keeps repo memory ordered after meaningful changes. Run this checklist at milestone close, after architecture changes, after product behavior changes, and whenever docs/source disagree.
@@ -17,7 +17,7 @@ import { debugLog } from "../debug-logger.js";
 import { resolveEngine } from "../engine-resolver.js";
 import { getErrorMessage } from "../error-utils.js";
 import { NOTICE_KIND } from "../notification-store.js";
-import { sfRoot } from "../paths.js";
+import { resolveSliceFile, sfRoot } from "../paths.js";
 import { dispatchSelfFeedbackInlineFixIfNeeded } from "../self-feedback-drain.js";
 import { recordSelfFeedback } from "../self-feedback.js";
 import { getDatabase } from "../sf-db.js";
@@ -1209,31 +1209,52 @@ export async function autoLoop(ctx, pi, s, deps) {
// will either advance state via a different
// unit or hit detectStuck and bail.
} else {
ctx.ui.notify(
`Health issues detected with slice references — queuing reassess-roadmap instead of pausing.`,
"warning",
{
noticeKind: NOTICE_KIND.SYSTEM_NOTICE,
dedupe_key: "doctor-health-reassess-roadmap",
},
);
const { buildReassessRoadmapPrompt } = await import(
"../auto-prompts.js"
);
const reassessPrompt = await buildReassessRoadmapPrompt(
mid,
midTitle,
sliceId,
// Guard: if the target slice is already complete AND has an
// ASSESSMENT file, skip queuing reassess-roadmap — there's
// nothing for it to do and it would loop on completed work.
const assessFile = resolveSliceFile(
s.basePath,
mid,
sliceId,
"ASSESS",
);
s.sidecarQueue.unshift({
kind: "hook",
unitType: "reassess-roadmap",
unitId: `${mid}/${sliceId}`,
prompt: `## Doctor Health Issues\n\n${healthCheck.issues.map((i) => `- ${i}`).join("\n")}\n\n${reassessPrompt}`,
});
finishTurn("retry");
continue;
if (assessFile && existsSync(assessFile)) {
ctx.ui.notify(
`Doctor health issues referenced ${sliceId} but it already has an ASSESSMENT — skipping redundant reassess-roadmap, falling through to normal dispatch.`,
"info",
{
noticeKind: NOTICE_KIND.TOOL_NOTICE,
dedupe_key: "doctor-reassess-already-assessed",
},
);
// Fall through to normal pre-dispatch
} else {
ctx.ui.notify(
`Health issues detected with slice references — queuing reassess-roadmap instead of pausing.`,
"warning",
{
noticeKind: NOTICE_KIND.SYSTEM_NOTICE,
dedupe_key: "doctor-health-reassess-roadmap",
},
);
const { buildReassessRoadmapPrompt } = await import(
"../auto-prompts.js"
);
const reassessPrompt = await buildReassessRoadmapPrompt(
mid,
midTitle,
sliceId,
s.basePath,
);
s.sidecarQueue.unshift({
kind: "hook",
unitType: "reassess-roadmap",
unitId: `${mid}/${sliceId}`,
prompt: `## Doctor Health Issues\n\n${healthCheck.issues.map((i) => `- ${i}`).join("\n")}\n\n${reassessPrompt}`,
});
finishTurn("retry");
continue;
}
}
}
}
@@ -41,6 +41,10 @@ import {
  recordExecutorRefusalEscalation,
} from "../autonomous-solver.js";
import { blockModel } from "../blocked-models.js";
import {
  getCooldownRetryAfterMs,
  isTransientCooldownError,
} from "../infra-errors.js";
import { resumeAutoAfterProviderDelay } from "../bootstrap/provider-error-resume.js";
import { debugLog } from "../debug-logger.js";
import { PROJECT_FILES } from "../detection.js";
@@ -1385,13 +1389,68 @@ export async function runUnitPhase(ic, iterData, loopState, sidecarItem) {
// Provider-error: try to reselect a ready provider and continue rather
// than stopping autonomous mode. Only stop if no ready provider exists.
if (currentUnitResult.errorContext?.category === "provider") {
  await emitCancelledUnitEnd(
    ic,
    unitType,
    unitId,
    unitStartSeq,
    currentUnitResult.errorContext,
  );
  // Check whether this is a transient cooldown (rate-limit / quota), which
  // should attempt failover to a non-cooldowned provider rather than stopping.
  // Use the raw error from errorContext when available, falling back to
  // the structured error field if present.
  const rawErr = currentUnitResult.errorContext?.error;
  const isCooldown =
    rawErr != null && isTransientCooldownError(rawErr);
  if (isCooldown) {
    // Mark the failing model as temporarily blocked with its retry-after
    // window so the next selectAndApplyModel call skips it.
    const cooldownModel = s.currentUnitModel;
    const retryAfterMs = rawErr != null
      ? getCooldownRetryAfterMs(rawErr)
      : undefined;
    const expiresAt = retryAfterMs != null && retryAfterMs > 0
      ? Date.now() + retryAfterMs
      : Date.now() + 60_000; // default 60s block if no retry hint
    if (cooldownModel?.provider && cooldownModel?.id) {
      blockModel(
        s.basePath,
        cooldownModel.provider,
        cooldownModel.id,
        `transient cooldown: retry after ${retryAfterMs ?? 60_000}ms`,
        { expiresAt },
      );
    }
    // Try to find any non-cooldowned, provider-ready fallback.
    const failedProvider =
      s.currentUnitModel?.provider ?? ctx.model?.provider;
    const allModels = ctx.modelRegistry?.getAvailable?.() ?? [];
    const fallback = allModels.find(
      (m) =>
        m.provider !== failedProvider &&
        ctx.modelRegistry?.isProviderRequestReady?.(m.provider),
    );
    if (fallback) {
      const ok = await pi.setModel(fallback, { persist: false });
      if (ok) {
        s.currentUnitModel = fallback;
        const waitNote = retryAfterMs != null && retryAfterMs > 0
          ? ` (retrying immediately — cooldown expires in ${Math.round(retryAfterMs / 1000)}s)`
          : "";
        ctx.ui.notify(
          `Provider ${failedProvider} in cooldown${waitNote} — switched to ${fallback.provider}/${fallback.id}`,
          "warning",
        );
        return { action: "continue" };
      }
    }
    // All providers are unavailable — stopAuto with enumeration.
    const cooldowned = allModels
      .filter((m) => !ctx.modelRegistry?.isProviderRequestReady?.(m.provider))
      .map((m) => `${m.provider}/${m.id}`);
    const cooldownedList = cooldowned.length > 0
      ? cooldowned.join(", ")
      : "(none listed)";
    const msg = `All providers in cooldown. Failed: ${failedProvider}. Remaining unavailable: ${cooldownedList}. Stopping.`;
    ctx.ui.notify(msg, "error");
    await deps.stopAuto(ctx, pi, msg);
    return { action: "break", reason: "all-providers-cooldown" };
  }
  // Non-cooldown provider error: try a different provider, stop if none ready.
  const failedProvider =
    s.currentUnitModel?.provider ?? ctx.model?.provider;
  const allModels = ctx.modelRegistry?.getAvailable?.() ?? [];
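The provider reselection in the hunk above reduces to two small decisions: pick the first model on a different, request-ready provider, and otherwise enumerate the unavailable ones for the stop message. As a standalone sketch with hypothetical helper names (the real code inlines this logic against ctx.modelRegistry.isProviderRequestReady):

```javascript
// Hypothetical helpers illustrating the fallback logic in the hunk above.

// First model on a different, non-cooldowned provider wins; undefined
// means every provider is unavailable and autonomous mode must stop.
function pickFallbackModel(models, failedProvider, isProviderReady) {
  return models.find(
    (m) => m.provider !== failedProvider && isProviderReady(m.provider),
  );
}

// Enumerate unavailable models as "provider/id" for the stop message.
function listCooldowned(models, isProviderReady) {
  return models
    .filter((m) => !isProviderReady(m.provider))
    .map((m) => `${m.provider}/${m.id}`);
}
```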
@@ -10,6 +10,13 @@ Executor agents built each task and wrote task summaries. You are the closer —
 
 Write the summary for those downstream readers. What did this slice actually deliver? What patterns did it establish? What should the next slice know?
 
+**Structured frontmatter fields:** Three fields in the slice summary YAML frontmatter — `key_files`, `key_decisions`, and `patterns_established` — must be populated from actual work, not left as `(none)`. Before calling `complete_slice`, derive these from your inlined context:
+- `key_files`: scan all task summaries for files created or modified; deduplicate and list the most significant ones (max ~8 entries).
+- `key_decisions`: extract every named architectural decision from task narratives (e.g. "chose X over Y because Z"); list each as one bullet.
+- `patterns_established`: extract repeatable patterns, conventions, or lessons that future agents should reuse; avoid obvious observations.
+
+If the slice did not produce any meaningful files, decisions, or patterns, write a brief honest entry rather than leaving the field as `(none)`. A one-line entry is better than silence — it shows the agent thought about it.
+
 All relevant context has been preloaded below — the slice plan, all task summaries, and the milestone roadmap are inlined. Start working immediately without re-reading these files.
 
 {{closeoutControl}}
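The `key_files` derivation the template asks for (scan, deduplicate, cap at roughly 8 entries) could look like the following sketch. The function name and the task-summary shape (`files` arrays) are illustrative assumptions, not the executor's actual data model:

```javascript
// Illustrative sketch of deriving key_files from task summaries, per the
// complete-slice.md instruction. The summary shape here is an assumption.
function deriveKeyFiles(taskSummaries, max = 8) {
  const seen = new Set();
  for (const summary of taskSummaries) {
    for (const file of summary.files ?? []) {
      seen.add(file); // deduplicate across tasks, preserving first-seen order
    }
  }
  return [...seen].slice(0, max); // keep at most the first ~8 entries
}
```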