diff --git a/.sf/model-performance.json b/.sf/model-performance.json new file mode 100644 index 000000000..9e93a240f --- /dev/null +++ b/.sf/model-performance.json @@ -0,0 +1,14 @@ +{ + "research-slice": { + "kimi-coding/kimi-k2.6": { + "successes": 4, + "failures": 0, + "timeouts": 0, + "totalTokens": 1590810, + "totalCost": 0.22167976, + "lastUsed": "2026-05-08T13:36:05.865Z", + "successRate": 1, + "total": 4 + } + } +} \ No newline at end of file diff --git a/src/resources/extensions/sf/autonomous-solver.js b/src/resources/extensions/sf/autonomous-solver.js index 052ce55d8..4c3fb2de0 100644 --- a/src/resources/extensions/sf/autonomous-solver.js +++ b/src/resources/extensions/sf/autonomous-solver.js @@ -657,7 +657,15 @@ export function buildAutonomousSolverMissingCheckpointRepairPrompt( "2. List files in the milestone/slice/task directories to find what artifacts exist.", "3. Read any SUMMARY.md or PLAN.md files to understand what progress was made.", "4. Based on the evidence, call sf_autonomous_checkpoint with the appropriate outcome and PDD fields.", - "5. If you cannot determine what happened, use outcome='decide' and ask the human what the checkpoint should contain.", + "5. **Important**: If you cannot determine what happened with high confidence (≥0.98), use outcome='decide' and ask the human what the checkpoint should contain.", + ); + lines.push( + "", + "**Low-confidence reconstruction guidance**:", + "- Use outcome='decide' when evidence is sparse or ambiguous (confidence < 0.98)", + "- Use outcome='decide' when you cannot verify what work was actually completed", + "- Use outcome='decide' when there are multiple possible interpretations of progress", + "- This ensures autonomous mode pauses for human acceptance rather than guessing incorrectly", ); } else if (repairAttempt <= 1) { lines.push("Do not continue implementation work in this repair turn."); @@ -685,6 +693,9 @@ export function buildAutonomousSolverMissingCheckpointRepairPrompt( lines.push( 'This is the final automatic repair attempt. Prefer outcome="decide" over guessing; autonomous mode will pause with your decision question for human acceptance.', ); + lines.push( + '**Final guidance**: If there is any doubt about the correctness of the checkpoint, use outcome="decide" with a clear question asking the human to specify the correct state.', + ); } lines.push( "If no useful progress happened, use outcome=blocked and explain why.",