From 8c549bd9c71622d818afbd55c7b106fb6ff8c01d Mon Sep 17 00:00:00 2001 From: Lex Christopherson Date: Fri, 13 Mar 2026 00:31:37 -0600 Subject: [PATCH] feat(M002/S05): Intent-ranked retrieval and semantic actions Tasks: - chore(M002/S05): auto-commit after complete-slice - chore(M002/S05): auto-commit after complete-slice - chore(M002/S05/T01): auto-commit after execute-task - chore(M002/S05/T01): auto-commit after execute-task - chore(M002/S05): auto-commit after plan-slice - docs(S05): add slice plan Branch: gsd/M002/S05 --- .gsd/DECISIONS.md | 2 + .gsd/PROJECT.md | 4 +- .gsd/REQUIREMENTS.md | 16 +- .gsd/STATE.md | 4 +- .gsd/milestones/M002/M002-ROADMAP.md | 2 +- .gsd/milestones/M002/slices/S05/S05-PLAN.md | 52 ++ .../M002/slices/S05/tasks/T01-PLAN.md | 85 +++ .../extensions/browser-tools/index.ts | 2 + .../extensions/browser-tools/tools/intent.ts | 614 ++++++++++++++++++ 9 files changed, 768 insertions(+), 13 deletions(-) create mode 100644 .gsd/milestones/M002/slices/S05/S05-PLAN.md create mode 100644 .gsd/milestones/M002/slices/S05/tasks/T01-PLAN.md create mode 100644 src/resources/extensions/browser-tools/tools/intent.ts diff --git a/.gsd/DECISIONS.md b/.gsd/DECISIONS.md index 0c3159a18..24c061b31 100644 --- a/.gsd/DECISIONS.md +++ b/.gsd/DECISIONS.md @@ -28,3 +28,5 @@ | D020 | M002/S04 | pattern | Form analysis evaluate location | Form analysis evaluate logic lives in tools/forms.ts, not extracted to evaluate-helpers.ts | Form-specific, not a shared utility. The label resolution heuristic is only used by form tools. Keeping it local avoids bloating the shared injection. | Yes — if S05 intent tools need label resolution | | D021 | M002/S04 | pattern | Fill uses Playwright APIs, not evaluate | browser_fill_form uses Playwright locator.fill()/selectOption()/setChecked() instead of page.evaluate() value setting | Playwright APIs trigger proper input/change events and handle framework-specific reactivity (React, Vue). 
Direct value setting via evaluate skips event dispatch and breaks reactive frameworks. | No | | D022 | M002/S04 | pattern | Fill field matching priority | Label (exact → case-insensitive) → name → placeholder → aria-label | Label is the most human-readable identifier. Name is the most reliable programmatic identifier. Placeholder and aria-label are fallbacks. Exact match before fuzzy prevents wrong-field fills. | Yes — if real-world usage shows a different priority works better | +| D023 | M002/S05 | pattern | Intent scoring model | 4 orthogonal dimensions per intent, each 0-1, summed and clamped | Consistent scoring structure across all 8 intents. Makes scoring testable and debuggable — each dimension has a named reason. 4 dimensions balance discrimination vs complexity. | Yes — could add/remove dimensions per intent if real-world usage shows imbalance | +| D024 | M002/S05 | pattern | search_field action type | Focus instead of click for search_field intent in browser_act | Search fields need keyboard focus for typing, not a click that might submit or toggle. Focus is the semantically correct action. Other intents use click. 
| Yes — if focus proves unreliable on specific input implementations | diff --git a/.gsd/PROJECT.md b/.gsd/PROJECT.md index fd216e036..9fd033848 100644 --- a/.gsd/PROJECT.md +++ b/.gsd/PROJECT.md @@ -16,7 +16,7 @@ The GSD extension is fully functional with: - Guided `/gsd` wizard flow - `secure_env_collect` tool with masked TUI input, multi-destination write support, guidance display, and summary screen - Proactive secret management: planning prompts forecast secrets, manifests persist them, auto-mode collects them before first dispatch -- Browser-tools extension with 45 registered tools covering navigation, interaction, inspection, verification, tracing, debugging, and form intelligence (browser_analyze_form, browser_fill_form) +- Browser-tools extension with 47 registered tools covering navigation, interaction, inspection, verification, tracing, debugging, form intelligence (browser_analyze_form, browser_fill_form), and intent-ranked retrieval and semantic actions (browser_find_best, browser_act) - Browser-tools `core.js` with shared utilities for action timeline, page registry, state diffing, assertions, fingerprinting ## Architecture / Key Patterns @@ -26,7 +26,7 @@ The GSD extension is fully functional with: - **Secrets gate**: `startAuto()` checks `getManifestStatus()` before first dispatch - **Disk-driven state**: `.gsd/` files are the source of truth, `STATE.md` is derived cache - **File parsing**: `files.ts` has markdown parsers for all GSD file types -- **Browser-tools**: Modular structure — slim `index.ts` orchestrator, 8 focused infrastructure modules (state.ts, utils.ts, evaluate-helpers.ts, lifecycle.ts, capture.ts, settle.ts, refs.ts), 10 categorized tool files under `tools/` (including forms.ts), shared infrastructure in `core.js` (~1000 lines). Browser-side utilities injected once via `addInitScript` under `window.__pi` namespace. Uses Playwright for browser control. 
Accessibility-first state representation, deterministic versioned refs, adaptive DOM settling, compact post-action summaries. Form tools use Playwright locator APIs for type-aware filling with structured result reporting. +- **Browser-tools**: Modular structure — slim `index.ts` orchestrator, 8 focused infrastructure modules (state.ts, utils.ts, evaluate-helpers.ts, lifecycle.ts, capture.ts, settle.ts, refs.ts), 11 categorized tool files under `tools/` (including forms.ts, intent.ts), shared infrastructure in `core.js` (~1000 lines). Browser-side utilities injected once via `addInitScript` under `window.__pi` namespace. Uses Playwright for browser control. Accessibility-first state representation, deterministic versioned refs, adaptive DOM settling, compact post-action summaries. Form tools use Playwright locator APIs for type-aware filling with structured result reporting. Intent tools use deterministic 4-dimension heuristic scoring for element retrieval and one-call semantic actions. - **Prompt templates**: `prompts/` directory with mustache-like `{{var}}` substitution - **TUI components**: `@gsd/pi-tui` provides `Editor`, `Text`, key handling, themes - **Branch-per-slice**: git branches isolate slice work, squash-merged to main on completion diff --git a/.gsd/REQUIREMENTS.md b/.gsd/REQUIREMENTS.md index c562f7655..52c5c0f2a 100644 --- a/.gsd/REQUIREMENTS.md +++ b/.gsd/REQUIREMENTS.md @@ -50,24 +50,24 @@ This file is the explicit capability and coverage contract for the project. ### R024 — Intent-ranked element retrieval (browser_find_best) - Class: core-capability -- Status: active +- Status: validated - Description: A browser_find_best tool that takes an intent string (e.g. "submit form", "close dialog", "primary CTA") and returns scored candidates with reasons, using deterministic heuristic ranking. - Why it matters: The agent frequently needs "which button submits this form?" Currently it does browser_find → gets 15 candidates → reasons about which one. 
A heuristic ranker cuts a round trip and reduces reasoning tokens. - Source: user - Primary owning slice: M002/S05 - Supporting slices: M002/S01 -- Validation: unmapped +- Validation: 8 intents implemented with 4-dimension scoring (submit_form, close_dialog, primary_cta, search_field, next_step, dismiss, auth_action, back_navigation). Each returns up to 5 candidates sorted by score with CSS selectors and reason strings. Intent normalization accepts underscores/spaces/hyphens. Verified via Playwright tests against real HTML pages with differentiated rankings. Build passes, tool count = 47. - Notes: Deterministic heuristics only. No hidden LLM calls. ### R025 — Semantic action tool (browser_act) - Class: core-capability -- Status: active +- Status: validated - Description: A browser_act tool that takes a semantic intent (e.g. "submit the current form", "close the active modal", "click the primary CTA") and executes the obvious action sequence internally. - Why it matters: Each of these common micro-tasks currently takes 2-4 tool calls. browser_act collapses them into one. - Source: user - Primary owning slice: M002/S05 - Supporting slices: M002/S04 -- Validation: unmapped +- Validation: Resolves top candidate via same scoring engine as browser_find_best. Executes via Playwright locator.click() with getByRole fallback (focus for search_field). Settles via settleAfterActionAdaptive, returns before/after diff. Zero-candidate returns isError:true without throwing. Verified via Playwright test scripts. Build passes, tool count = 47. - Notes: Builds on browser_find_best for element selection. Bounded — does not loop or retry. ### R026 — Test coverage for new and refactored code @@ -345,16 +345,16 @@ This file is the explicit capability and coverage contract for the project. 
| R021 | core-capability | validated | M002/S03 | none | screenshot param default false, capture gated, browser_reload unchanged, build passes | | R022 | core-capability | validated | M002/S04 | M002/S01 | 7-level label resolution, form auto-detection, verified against 12-field test form | | R023 | core-capability | validated | M002/S04 | M002/S01 | 5-strategy field resolution, type-aware fill, verified end-to-end with 10 fields | -| R024 | core-capability | active | M002/S05 | M002/S01 | unmapped | -| R025 | core-capability | active | M002/S05 | M002/S04 | unmapped | +| R024 | core-capability | validated | M002/S05 | M002/S01 | 8-intent scoring, Playwright tests, differentiated rankings, build passes | +| R025 | core-capability | validated | M002/S05 | M002/S04 | top candidate execution via Playwright locator, settle + diff, graceful error, build passes | | R026 | quality-attribute | active | M002/S06 | all M002 | unmapped | | R027 | core-capability | deferred | none | none | unmapped | | R028 | anti-feature | out-of-scope | none | none | n/a | ## Coverage Summary -- Active requirements: 3 -- Validated requirements: 19 +- Active requirements: 1 +- Validated requirements: 21 - Deferred requirements: 3 - Out of scope: 3 - Unmapped active requirements: 3 diff --git a/.gsd/STATE.md b/.gsd/STATE.md index 3baaef13a..d2e098e0c 100644 --- a/.gsd/STATE.md +++ b/.gsd/STATE.md @@ -1,7 +1,7 @@ # GSD State **Active Milestone:** M002 — Browser Tools Performance & Intelligence -**Active Slice:** S05 — Intent-ranked retrieval and semantic actions +**Active Slice:** S06 — Test coverage **Phase:** planning **Requirements Status:** 7 active · 15 validated · 3 deferred · 3 out of scope @@ -16,4 +16,4 @@ - None ## Next Action -Plan slice S05 (Intent-ranked retrieval and semantic actions). +Plan slice S06 (Test coverage). 
diff --git a/.gsd/milestones/M002/M002-ROADMAP.md b/.gsd/milestones/M002/M002-ROADMAP.md index 1fb3c1880..22f327edf 100644 --- a/.gsd/milestones/M002/M002-ROADMAP.md +++ b/.gsd/milestones/M002/M002-ROADMAP.md @@ -70,7 +70,7 @@ This milestone is complete only when all are true: - [x] **S04: Form intelligence** `risk:medium` `depends:[S01]` > After this: browser_analyze_form returns field inventory (labels, types, required, values, validation) for any form; browser_fill_form fills fields by label/name/placeholder mapping and optionally submits — verified by running both tools against a real multi-field form. -- [ ] **S05: Intent-ranked retrieval and semantic actions** `risk:medium` `depends:[S01]` +- [x] **S05: Intent-ranked retrieval and semantic actions** `risk:medium` `depends:[S01]` > After this: browser_find_best returns scored candidates for intents like "submit form", "close dialog", "primary CTA"; browser_act executes common micro-tasks in one call — verified by running both tools against real pages. - [ ] **S06: Test coverage** `risk:low` `depends:[S01,S02,S03,S04,S05]` diff --git a/.gsd/milestones/M002/slices/S05/S05-PLAN.md b/.gsd/milestones/M002/slices/S05/S05-PLAN.md new file mode 100644 index 000000000..195e95e5b --- /dev/null +++ b/.gsd/milestones/M002/slices/S05/S05-PLAN.md @@ -0,0 +1,52 @@ +# S05: Intent-ranked retrieval and semantic actions + +**Goal:** `browser_find_best` returns scored candidates for semantic intents; `browser_act` resolves the top candidate and executes it in one call. +**Demo:** Run `browser_find_best` with intent "submit_form" against a real page with a form and get ranked candidates. Run `browser_act` with intent "close_dialog" against a page with a modal and see it dismissed. 
+ +## Must-Haves + +- `browser_find_best` registered and functional with 8 intents: submit_form, close_dialog, primary_cta, search_field, next_step, dismiss, auth_action, back_navigation +- Each intent uses deterministic heuristic scoring (no LLM calls) with 2+ scoring dimensions per intent +- Candidates include CSS selectors usable with Playwright locator APIs +- Results capped at 5 candidates, scored 0-1 with human-readable reasons +- Intent strings normalized (accept underscores, spaces, mixed case) +- `browser_act` resolves top candidate, executes via Playwright locator click (not evaluate click), settles, returns before/after diff +- `browser_act` returns error (not throw) when zero candidates found +- Both tools wired into index.ts, tool count = 47 +- Build passes + +## Proof Level + +- This slice proves: integration (new tools against real browser pages) +- Real runtime required: yes (Playwright against real pages) +- Human/UAT required: no (automated verification sufficient) + +## Verification + +- `npm run build` passes +- `grep -c "pi.registerTool" src/resources/extensions/browser-tools/tools/*.ts` sums to 47 +- `browser_find_best` with intent "submit_form" against a page with a `<form>
` returns candidates with scores > 0 +- `browser_find_best` with intent "close_dialog" against a page with a `[role="dialog"]` returns close button candidates +- `browser_act` with intent "submit_form" clicks the submit button and returns before/after state +- `browser_act` against a page with no dialog returns a graceful error (not throw) for "close_dialog" intent +- Scoring heuristics produce differentiated rankings (top candidate scores higher than others) + +## Integration Closure + +- Upstream surfaces consumed: `evaluate-helpers.ts` (window.__pi utilities), `lifecycle.ts` (ensureBrowser, getActiveTarget), `state.ts` (ToolDeps, CompactPageState), `utils.ts` (action tracking, formatting), `core.js` (diffCompactStates), `settle.ts` (settleAfterActionAdaptive) +- New wiring introduced: `tools/intent.ts` + import/call in `index.ts` +- What remains before the milestone is truly usable end-to-end: S06 (test coverage) + +## Tasks + +- [x] **T01: Implement browser_find_best and browser_act with 8-intent scoring engine** `est:45m` + - Why: This is the entire slice — two tools sharing a single intent resolution engine, all in one file following the established forms.ts pattern. The scoring evaluate script, both tool registrations, and the index.ts wiring are tightly coupled and well within a single context window (~350 lines new code, 2 files created/modified). + - Files: `src/resources/extensions/browser-tools/tools/intent.ts` (new), `src/resources/extensions/browser-tools/index.ts` (wire) + - Do: Build `buildIntentScoringScript(intent, scope?)` as a string template evaluate returning scored candidates with cssPath selectors. Implement 8 intent scoring functions using window.__pi utilities (inferRole, accessibleName, isVisible, isEnabled, isInteractiveEl). Register `browser_find_best` (intent + optional scope → scored candidates) and `browser_act` (intent + optional scope → resolve top candidate → Playwright locator click → settle → diff). 
Wire via registerIntentTools import + call in index.ts. + - Verify: `npm run build` passes; grep tool count = 47; run both tools against real test pages via Playwright scripts + - Done when: Both tools registered, build passes, verified against real pages with forms and dialogs + +## Files Likely Touched + +- `src/resources/extensions/browser-tools/tools/intent.ts` (new) +- `src/resources/extensions/browser-tools/index.ts` (wire registration) diff --git a/.gsd/milestones/M002/slices/S05/tasks/T01-PLAN.md b/.gsd/milestones/M002/slices/S05/tasks/T01-PLAN.md new file mode 100644 index 000000000..55eb2b7d0 --- /dev/null +++ b/.gsd/milestones/M002/slices/S05/tasks/T01-PLAN.md @@ -0,0 +1,85 @@ +--- +estimated_steps: 5 +estimated_files: 2 +--- + +# T01: Implement browser_find_best and browser_act with 8-intent scoring engine + +**Slice:** S05 — Intent-ranked retrieval and semantic actions +**Milestone:** M002 + +## Description + +Create `tools/intent.ts` with both `browser_find_best` and `browser_act`, sharing a single intent resolution engine built as a string template evaluate script (same pattern as forms.ts `buildFormAnalysisScript`). The scoring engine runs entirely in-browser via `page.evaluate()`, using `window.__pi` utilities for element metadata. Each of 8 intents has a candidate selector strategy and multi-dimensional scoring function. `browser_act` takes the top candidate from the same scoring logic, executes via Playwright `locator().click()` (D021), settles, and returns a before/after diff. + +## Steps + +1. **Create `tools/intent.ts`** with the `registerIntentTools(pi, deps)` export function. Define the 8 intent names as a const array and use `StringEnum` for the parameter schema. 
Build `buildIntentScoringScript(intent, scope?)` as a string template that: + - Normalizes the intent string (lowercase, strip spaces/underscores/hyphens) + - For each intent, selects candidate elements (e.g., submit_form → buttons/inputs inside or near forms; close_dialog → buttons inside `[role="dialog"]` or `dialog` elements) + - Scores each candidate 0-1 across 2-4 dimensions (structural position, role, text signals, visibility/enabled state) + - Returns top 5 candidates sorted by score, each with: `{ score, selector, tag, role, name, text, reason }` + - Uses `window.__pi.cssPath()` for selector generation, `window.__pi.inferRole()` / `window.__pi.accessibleName()` / `window.__pi.isVisible()` / `window.__pi.isEnabled()` for scoring signals + +2. **Implement the 8 intent scoring functions** inside the evaluate string template: + - `submitform` — query `button[type="submit"], input[type="submit"], button:not([type])` within forms; score by: is-submit-type, inside-form, text-suggests-submission, visible+enabled + - `closedialog` — query buttons/links inside `[role="dialog"], dialog, [aria-modal="true"]`; score by: text-matches-close-pattern, has-aria-label-close, is-visible, position (top-right gets a boost) + - `primarycta` — query all visible enabled buttons/links; score by: visual prominence (size), semantic weight (role=button > link), text-not-cancel/dismiss, position (main content area) + - `searchfield` — query inputs with type=search or role=searchbox or name/placeholder matching "search"; score by: type-match, placeholder-match, visibility, is-in-header/nav + - `nextstep` — query buttons/links with text matching next/continue/proceed/forward patterns; score by: text-match-strength, is-button, visible+enabled, not-disabled + - `dismiss` — query buttons/links matching close/cancel/dismiss/skip/no-thanks patterns; score by: text-match, position, inside-dialog/modal/overlay, is-visible + - `authaction` — query buttons/links matching 
login/sign-in/signup/register patterns; score by: text-match-strength, is-button-or-link, prominent-position, visible + - `backnavigation` — query buttons/links matching back/previous/return patterns; score by: text-match, has-back-arrow/icon, is-in-nav/header, visible + +3. **Register `browser_find_best`** tool: + - Parameters: `intent` (StringEnum of 8 intents), optional `scope` (CSS selector to narrow search) + - Execute: ensureBrowser → getActiveTarget → captureCompactPageState (before) → target.evaluate(buildIntentScoringScript) → format results as markdown with scores and selectors → tracked action finish + - Output format: numbered candidates with score, selector, role, text, and reason + +4. **Register `browser_act`** tool: + - Parameters: `intent` (same StringEnum), optional `scope` (CSS selector) + - Execute: ensureBrowser → captureCompactPageState (before) → target.evaluate(buildIntentScoringScript) → if zero candidates, return error → take top candidate → locator(candidate.selector).click() with getByRole fallback → settleAfterActionAdaptive → captureCompactPageState (after) → diffCompactStates → format result with before/after diff + - For search_field intent: focus instead of click + - Error handling: graceful error return when no candidates found, captureErrorScreenshot on unexpected failures + +5. **Wire into index.ts**: Add `import { registerIntentTools } from "./tools/intent.js"` and `registerIntentTools(pi, deps)` call. Verify build passes and tool count = 47. 
+ +## Must-Haves + +- [ ] `browser_find_best` registered with 8-intent StringEnum parameter +- [ ] `browser_act` registered with same 8-intent parameter +- [ ] Intent scoring runs as a single page.evaluate() string template per call +- [ ] Each intent has 2+ orthogonal scoring dimensions producing differentiated rankings +- [ ] Scoring uses `window.__pi.*` utilities (no inline redeclarations) +- [ ] Candidates include CSS selectors from `window.__pi.cssPath()` +- [ ] Results capped at 5 candidates, scored 0-1 +- [ ] Intent string normalization handles underscores, spaces, mixed case +- [ ] `browser_act` clicks via `target.locator(selector).click()` not `page.evaluate(() => el.click())` +- [ ] `browser_act` returns error (not throw) when zero candidates +- [ ] Both tools use tracked action pattern (beginTrackedAction / finishTrackedAction) +- [ ] Tool count = 47 after wiring +- [ ] `npm run build` passes + +## Verification + +- `npm run build` passes with zero errors +- `grep -c "pi.registerTool" src/resources/extensions/browser-tools/tools/*.ts | awk -F: '{s+=$2} END {print s}'` outputs 47 +- Playwright verification script against a test HTML page with form + dialog: + - `browser_find_best` intent="submit_form" returns candidates with submit button scored highest + - `browser_find_best` intent="close_dialog" returns close/dismiss button inside dialog + - `browser_act` intent="submit_form" clicks the submit button + - `browser_act` intent="close_dialog" with no dialog on page returns error, not crash + +## Inputs + +- `src/resources/extensions/browser-tools/tools/forms.ts` — pattern for string template evaluates, tool registration, error handling +- `src/resources/extensions/browser-tools/tools/interaction.ts` — pattern for Playwright locator click with getByRole fallback +- `src/resources/extensions/browser-tools/evaluate-helpers.ts` — window.__pi API surface (9 functions) +- `src/resources/extensions/browser-tools/index.ts` — wiring pattern (import + ToolDeps + 
registerXTools call) +- `src/resources/extensions/browser-tools/state.ts` — ToolDeps interface, CompactPageState type +- S05-RESEARCH.md — intent list, scoring guidance, common pitfalls + +## Expected Output + +- `src/resources/extensions/browser-tools/tools/intent.ts` — new file with ~350-400 lines containing `registerIntentTools(pi, deps)`, `buildIntentScoringScript()`, and both tool registrations +- `src/resources/extensions/browser-tools/index.ts` — modified with 1 new import line + 1 new registration call diff --git a/src/resources/extensions/browser-tools/index.ts b/src/resources/extensions/browser-tools/index.ts index c69fdfa9d..b9753ba0e 100644 --- a/src/resources/extensions/browser-tools/index.ts +++ b/src/resources/extensions/browser-tools/index.ts @@ -16,6 +16,7 @@ import { registerRefTools } from "./tools/refs.js"; import { registerWaitTools } from "./tools/wait.js"; import { registerPageTools } from "./tools/pages.js"; import { registerFormTools } from "./tools/forms.js"; +import { registerIntentTools } from "./tools/intent.js"; export default function (pi: ExtensionAPI) { pi.on("session_shutdown", async () => { await closeBrowser(); }); @@ -46,4 +47,5 @@ export default function (pi: ExtensionAPI) { registerRefTools(pi, deps); registerWaitTools(pi, deps); registerPageTools(pi, deps); registerFormTools(pi, deps); + registerIntentTools(pi, deps); } diff --git a/src/resources/extensions/browser-tools/tools/intent.ts b/src/resources/extensions/browser-tools/tools/intent.ts new file mode 100644 index 000000000..09d4892b2 --- /dev/null +++ b/src/resources/extensions/browser-tools/tools/intent.ts @@ -0,0 +1,614 @@ +import type { ExtensionAPI } from "@gsd/pi-coding-agent"; +import { Type } from "@sinclair/typebox"; +import { StringEnum } from "@gsd/pi-ai"; +import { diffCompactStates } from "../core.js"; +import type { ToolDeps, CompactPageState } from "../state.js"; +import { + setLastActionBeforeState, + setLastActionAfterState, +} from "../state.js"; + +// 
--------------------------------------------------------------------------- +// Intent definitions +// --------------------------------------------------------------------------- + +const INTENTS = [ + "submit_form", + "close_dialog", + "primary_cta", + "search_field", + "next_step", + "dismiss", + "auth_action", + "back_navigation", +] as const; + +type Intent = (typeof INTENTS)[number]; + +// --------------------------------------------------------------------------- +// Scoring evaluate script — runs entirely in-browser via page.evaluate() +// --------------------------------------------------------------------------- + +/** + * Builds a self-contained IIFE string that scores candidate elements for a + * given intent. Returns top 5 candidates sorted by score descending, each + * with { score, selector, tag, role, name, text, reason }. + * + * Uses window.__pi utilities (injected via addInitScript) for element + * metadata — no inline redeclarations. + */ +function buildIntentScoringScript(intent: string, scope?: string): string { + const scopeSelector = JSON.stringify(scope ?? null); + + return `(() => { + var pi = window.__pi; + if (!pi) return { error: "window.__pi not available — browser helpers not injected" }; + + var intentRaw = ${JSON.stringify(intent)}; + var normalized = intentRaw.toLowerCase().replace(/[\\s_\\-]+/g, ""); + var scopeSel = ${scopeSelector}; + var root = scopeSel ? 
document.querySelector(scopeSel) : document.body; + if (!root) return { error: "Scope selector not found: " + scopeSel }; + + // --- Shared helpers --- + function textOf(el) { + return (el.textContent || "").trim().replace(/\\s+/g, " ").slice(0, 120).toLowerCase(); + } + + function clamp01(v) { return Math.max(0, Math.min(1, v)); } + + function makeCandidate(el, score, reason) { + return { + score: Math.round(clamp01(score) * 100) / 100, + selector: pi.cssPath(el), + tag: el.tagName.toLowerCase(), + role: pi.inferRole(el) || "", + name: pi.accessibleName(el) || "", + text: textOf(el).slice(0, 80), + reason: reason, + }; + } + + function qsa(sel) { return Array.from(root.querySelectorAll(sel)); } + + function visibleEnabled(el) { + return pi.isVisible(el) && pi.isEnabled(el); + } + + function textMatches(el, patterns) { + var t = textOf(el); + var n = (pi.accessibleName(el) || "").toLowerCase(); + var combined = t + " " + n; + for (var i = 0; i < patterns.length; i++) { + if (combined.indexOf(patterns[i]) !== -1) return true; + } + return false; + } + + function textMatchStrength(el, patterns) { + var t = textOf(el); + var n = (pi.accessibleName(el) || "").toLowerCase(); + var combined = t + " " + n; + var count = 0; + for (var i = 0; i < patterns.length; i++) { + if (combined.indexOf(patterns[i]) !== -1) count++; + } + return Math.min(count / Math.max(patterns.length, 1), 1); + } + + // --- Intent-specific scoring --- + var candidates = []; + + if (normalized === "submitform") { + var els = qsa('button[type="submit"], input[type="submit"], button:not([type]), button[type="button"]'); + for (var i = 0; i < els.length; i++) { + var el = els[i]; + if (!visibleEnabled(el)) continue; + var d1 = el.type === "submit" || el.getAttribute("type") === "submit" ? 0.35 : 0; + var d2 = el.closest("form") ? 0.3 : 0; + var d3 = textMatches(el, ["submit", "send", "save", "create", "add", "post", "confirm", "ok", "done", "register", "sign up", "log in"]) ? 
0.2 : 0; + var d4 = 0.15; + var score = d1 + d2 + d3 + d4; + var reasons = []; + if (d1 > 0) reasons.push("submit-type"); + if (d2 > 0) reasons.push("inside-form"); + if (d3 > 0) reasons.push("text-suggests-submit"); + reasons.push("visible+enabled"); + candidates.push(makeCandidate(el, score, reasons.join(", "))); + } + } + + else if (normalized === "closedialog") { + var containers = qsa('[role="dialog"], dialog, [aria-modal="true"], [role="alertdialog"]'); + for (var ci = 0; ci < containers.length; ci++) { + var btns = containers[ci].querySelectorAll("button, a, [role='button']"); + for (var bi = 0; bi < btns.length; bi++) { + var el = btns[bi]; + if (!visibleEnabled(el)) continue; + var d1 = textMatches(el, ["close", "cancel", "dismiss", "×", "✕", "x", "got it", "ok", "done"]) ? 0.35 : 0; + var ariaLbl = (el.getAttribute("aria-label") || "").toLowerCase(); + var d2 = (ariaLbl.indexOf("close") !== -1 || ariaLbl.indexOf("dismiss") !== -1) ? 0.25 : 0; + var d3 = 0.2; + var rect = el.getBoundingClientRect(); + var parentRect = containers[ci].getBoundingClientRect(); + var isTopRight = rect.top - parentRect.top < 60 && parentRect.right - rect.right < 60; + var d4 = isTopRight ? 0.2 : 0; + var score = d1 + d2 + d3 + d4; + var reasons = []; + if (d1 > 0) reasons.push("text-matches-close"); + if (d2 > 0) reasons.push("aria-label-close"); + reasons.push("inside-dialog"); + if (d4 > 0) reasons.push("top-right-position"); + candidates.push(makeCandidate(el, score, reasons.join(", "))); + } + } + } + + else if (normalized === "primarycta") { + var els = qsa("button, a, [role='button'], input[type='submit'], input[type='button']"); + for (var i = 0; i < els.length; i++) { + var el = els[i]; + if (!visibleEnabled(el)) continue; + var rect = el.getBoundingClientRect(); + var area = rect.width * rect.height; + var d1 = clamp01(area / 12000); + var role = pi.inferRole(el); + var d2 = role === "button" ? 0.25 : (role === "link" ? 
0.1 : 0.15); + var isNegative = textMatches(el, ["cancel", "dismiss", "close", "skip", "no thanks", "no, thanks", "maybe later"]); + var d3 = isNegative ? 0 : 0.2; + var inMain = !!el.closest("main, [role='main'], article, section, .hero, .content"); + var d4 = inMain ? 0.15 : 0; + var score = d1 + d2 + d3 + d4; + var reasons = []; + reasons.push("size:" + Math.round(area)); + if (d2 >= 0.25) reasons.push("button-role"); + if (d3 > 0) reasons.push("non-dismissive"); + if (d4 > 0) reasons.push("in-main-content"); + candidates.push(makeCandidate(el, score, reasons.join(", "))); + } + } + + else if (normalized === "searchfield") { + var els = qsa("input, textarea, [role='searchbox'], [role='combobox'], [contenteditable='true']"); + for (var i = 0; i < els.length; i++) { + var el = els[i]; + if (!pi.isVisible(el)) continue; + var type = (el.getAttribute("type") || "text").toLowerCase(); + if (["hidden", "submit", "button", "reset", "image", "checkbox", "radio", "file"].indexOf(type) !== -1 && el.tagName.toLowerCase() === "input") continue; + var d1 = type === "search" || pi.inferRole(el) === "searchbox" ? 0.4 : 0; + var ph = (el.getAttribute("placeholder") || "").toLowerCase(); + var nm = (el.getAttribute("name") || "").toLowerCase(); + var ariaLbl = (el.getAttribute("aria-label") || "").toLowerCase(); + var combined = ph + " " + nm + " " + ariaLbl; + var d2 = combined.indexOf("search") !== -1 || combined.indexOf("query") !== -1 || combined.indexOf("find") !== -1 ? 0.3 : 0; + var d3 = pi.isEnabled(el) ? 0.15 : 0; + var inHeader = !!el.closest("header, nav, [role='banner'], [role='navigation'], [role='search']"); + var d4 = inHeader ? 
0.15 : 0; + var score = d1 + d2 + d3 + d4; + if (score < 0.1) continue; + var reasons = []; + if (d1 > 0) reasons.push("search-type/role"); + if (d2 > 0) reasons.push("name/placeholder-match"); + if (d3 > 0) reasons.push("enabled"); + if (d4 > 0) reasons.push("in-header/nav"); + candidates.push(makeCandidate(el, score, reasons.join(", "))); + } + } + + else if (normalized === "nextstep") { + var els = qsa("button, a, [role='button'], input[type='submit'], input[type='button']"); + var patterns = ["next", "continue", "proceed", "forward", "go", "step"]; + for (var i = 0; i < els.length; i++) { + var el = els[i]; + if (!visibleEnabled(el)) continue; + var d1 = textMatchStrength(el, patterns) * 0.4; + if (d1 === 0) continue; + var role = pi.inferRole(el); + var d2 = role === "button" ? 0.25 : 0.1; + var d3 = 0.2; + var isDisabled = !pi.isEnabled(el); + var d4 = isDisabled ? 0 : 0.15; + var score = d1 + d2 + d3 + d4; + var reasons = []; + reasons.push("text-match"); + if (d2 >= 0.25) reasons.push("button-role"); + reasons.push("visible"); + if (d4 > 0) reasons.push("enabled"); + candidates.push(makeCandidate(el, score, reasons.join(", "))); + } + } + + else if (normalized === "dismiss") { + var els = qsa("button, a, [role='button'], [role='link']"); + var patterns = ["close", "cancel", "dismiss", "skip", "no thanks", "no, thanks", "maybe later", "not now", "×", "✕"]; + for (var i = 0; i < els.length; i++) { + var el = els[i]; + if (!visibleEnabled(el)) continue; + var d1 = textMatchStrength(el, patterns) * 0.35; + if (d1 === 0) continue; + var inOverlay = !!el.closest('[role="dialog"], dialog, [aria-modal="true"], [role="alertdialog"], .modal, .overlay, .popup, .popover, .toast, .banner'); + var d2 = inOverlay ? 0.3 : 0.05; + var rect = el.getBoundingClientRect(); + var isEdge = rect.top < 80 || rect.right > window.innerWidth - 80; + var d3 = isEdge ? 
0.15 : 0; + var d4 = 0.15; + var score = d1 + d2 + d3 + d4; + var reasons = []; + reasons.push("text-match"); + if (d2 >= 0.3) reasons.push("inside-overlay"); + if (d3 > 0) reasons.push("edge-position"); + reasons.push("visible+enabled"); + candidates.push(makeCandidate(el, score, reasons.join(", "))); + } + } + + else if (normalized === "authaction") { + var els = qsa("button, a, [role='button'], [role='link'], input[type='submit']"); + var patterns = ["log in", "login", "sign in", "signin", "sign up", "signup", "register", "create account", "join", "get started"]; + for (var i = 0; i < els.length; i++) { + var el = els[i]; + if (!visibleEnabled(el)) continue; + var d1 = textMatchStrength(el, patterns) * 0.4; + if (d1 === 0) continue; + var role = pi.inferRole(el); + var d2 = (role === "button" || role === "link") ? 0.25 : 0.1; + var rect = el.getBoundingClientRect(); + var inHeader = !!el.closest("header, nav, [role='banner'], [role='navigation']"); + var isProminent = inHeader || rect.top < 200; + var d3 = isProminent ? 0.2 : 0.05; + var d4 = 0.15; + var score = d1 + d2 + d3 + d4; + var reasons = []; + reasons.push("text-match"); + if (d2 >= 0.25) reasons.push("button-or-link"); + if (d3 >= 0.2) reasons.push("prominent-position"); + reasons.push("visible+enabled"); + candidates.push(makeCandidate(el, score, reasons.join(", "))); + } + } + + else if (normalized === "backnavigation") { + var els = qsa("button, a, [role='button'], [role='link']"); + var patterns = ["back", "previous", "prev", "return", "go back"]; + for (var i = 0; i < els.length; i++) { + var el = els[i]; + if (!visibleEnabled(el)) continue; + var d1 = textMatchStrength(el, patterns) * 0.35; + if (d1 === 0) continue; + var innerHtml = el.innerHTML.toLowerCase(); + var hasArrow = innerHtml.indexOf("←") !== -1 || innerHtml.indexOf("&larr") !== -1 || innerHtml.indexOf("arrow") !== -1 || innerHtml.indexOf("chevron-left") !== -1 || innerHtml.indexOf("back") !== -1; + var d2 = hasArrow ? 
0.25 : 0;
+      var inNav = !!el.closest("header, nav, [role='banner'], [role='navigation'], .breadcrumb, .toolbar");
+      var d3 = inNav ? 0.25 : 0.05;
+      var d4 = 0.15;
+      var score = d1 + d2 + d3 + d4;
+      var reasons = [];
+      reasons.push("text-match");
+      if (d2 > 0) reasons.push("has-back-arrow/icon");
+      if (d3 >= 0.25) reasons.push("in-nav/header");
+      reasons.push("visible+enabled");
+      candidates.push(makeCandidate(el, score, reasons.join(", ")));
+    }
+  }
+
+  else {
+    return { error: "Unknown intent: " + intentRaw + ". Valid: submit_form, close_dialog, primary_cta, search_field, next_step, dismiss, auth_action, back_navigation" };
+  }
+
+  // Sort by score descending, cap at 5
+  candidates.sort(function(a, b) { return b.score - a.score; });
+  candidates = candidates.slice(0, 5);
+
+  return { intent: intentRaw, normalized: normalized, count: candidates.length, candidates: candidates };
+})()`;
+}
+
+// ---------------------------------------------------------------------------
+// Result types
+// ---------------------------------------------------------------------------
+
+/**
+ * One ranked element returned by the in-page intent scoring script.
+ *
+ * `score` is the sum of the per-intent dimension scores and `reason` is the
+ * comma-joined list of dimensions that contributed. `selector` is the CSS
+ * selector the tools below hand to Playwright.
+ * NOTE(review): `tag`, `role`, `name` and `text` are filled in by
+ * `makeCandidate`, which is defined outside this section — confirm their exact
+ * derivation (and whether `score` is clamped/rounded) there.
+ */
+interface IntentCandidate {
+  score: number;
+  selector: string;
+  tag: string;
+  role: string;
+  name: string;
+  text: string;
+  reason: string;
+}
+
+/**
+ * Value returned by evaluating the intent scoring script in the page.
+ *
+ * On success the script returns `intent`, `normalized`, `count` and at most
+ * five `candidates`, sorted by descending score. For an unknown intent the
+ * script returns an object carrying only `error`, so callers must check
+ * `error` before touching any other field.
+ */
+interface IntentScoringResult {
+  intent: string;
+  normalized: string;
+  count: number;
+  candidates: IntentCandidate[];
+  error?: string;
+}
+
+// ---------------------------------------------------------------------------
+// Registration
+// ---------------------------------------------------------------------------
+
+/**
+ * Registers the two intent-driven tools on the extension API:
+ *
+ * - `browser_find_best` scores the page for an intent and lists the candidates.
+ * - `browser_act` scores the page, then clicks (or, for `search_field`,
+ *   focuses) the top candidate and reports a before/after diff.
+ *
+ * @param pi   Extension API used to register the tools.
+ * @param deps Shared browser-tool dependencies (browser access, page-state
+ *             capture, action tracking, error helpers).
+ */
+export function registerIntentTools(pi: ExtensionAPI, deps: ToolDeps): void {
+
+  /** Escapes regex metacharacters so `text` is matched literally. */
+  const escapeRegExp = (text: string): string =>
+    text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+
+  /**
+   * Shared failure path for both tools: captures an error screenshot (best
+   * effort), closes the tracked action if one was opened, and builds the
+   * error result returned to the caller.
+   */
+  const buildFailureResult = async (
+    toolName: string,
+    err: unknown,
+    actionId: number | null,
+    beforeState: CompactPageState | null,
+  ) => {
+    const screenshot = await deps.captureErrorScreenshot(
+      // getActivePage() may throw when no page is available; fall back to null.
+      (() => { try { return deps.getActivePage(); } catch { return null; } })()
+    );
+    const errMsg = deps.firstErrorLine(err);
+
+    if (actionId !== null) {
+      deps.finishTrackedAction(actionId, {
+        status: "error",
+        error: errMsg,
+        beforeState: beforeState ?? undefined,
+      });
+    }
+
+    const content: Array<{ type: "text"; text: string } | { type: "image"; data: string; mimeType: string }> = [
+      { type: "text", text: `${toolName} failed: ${errMsg}` },
+    ];
+    if (screenshot) {
+      content.push({ type: "image", data: screenshot.data, mimeType: screenshot.mimeType });
+    }
+    return { content, details: {}, isError: true };
+  };
+
+  // -----------------------------------------------------------------------
+  // browser_find_best
+  // -----------------------------------------------------------------------
+  pi.registerTool({
+    name: "browser_find_best",
+    label: "Find Best",
+    description:
+      "Find the best-matching element for a semantic intent. Returns up to 5 scored candidates (0-1) ranked by structural position, role, text signals, and visibility. Use this to discover which element the agent should interact with for a given goal — e.g. intent=\"submit_form\" finds submit buttons, intent=\"close_dialog\" finds close/dismiss buttons inside dialogs. Each candidate includes a CSS selector usable with browser_click.",
+    parameters: Type.Object({
+      intent: StringEnum(INTENTS, {
+        description:
+          "Semantic intent: submit_form, close_dialog, primary_cta, search_field, next_step, dismiss, auth_action, back_navigation",
+      }),
+      scope: Type.Optional(
+        Type.String({
+          description:
+            "CSS selector to narrow the search area. If omitted, searches the full page.",
+        })
+      ),
+    }),
+
+    async execute(_toolCallId, params, _signal, _onUpdate, _ctx) {
+      let actionId: number | null = null;
+      let beforeState: CompactPageState | null = null;
+      try {
+        const { page: p } = await deps.ensureBrowser();
+        const target = deps.getActiveTarget();
+        beforeState = await deps.captureCompactPageState(p, {
+          selectors: params.scope ? [params.scope] : [],
+          includeBodyText: false,
+          target,
+        });
+        actionId = deps.beginTrackedAction("browser_find_best", params, beforeState.url).id;
+
+        const script = buildIntentScoringScript(params.intent, params.scope);
+        const result = await target.evaluate(script) as IntentScoringResult;
+
+        // The script reports an unknown intent as { error }, not by throwing.
+        if (result.error) {
+          deps.finishTrackedAction(actionId, {
+            status: "error",
+            error: result.error,
+            beforeState,
+          });
+          return {
+            content: [{ type: "text" as const, text: result.error }],
+            details: {},
+            isError: true,
+          };
+        }
+
+        const afterState = await deps.captureCompactPageState(p, {
+          selectors: params.scope ? [params.scope] : [],
+          includeBodyText: false,
+          target,
+        });
+        setLastActionBeforeState(beforeState);
+        setLastActionAfterState(afterState);
+
+        deps.finishTrackedAction(actionId, {
+          status: "success",
+          afterUrl: afterState.url,
+          beforeState,
+          afterState,
+        });
+
+        // Format output
+        const lines: string[] = [];
+        lines.push(`Intent: ${params.intent} → ${result.count} candidate(s)`);
+        if (params.scope) lines.push(`Scope: ${params.scope}`);
+        lines.push("");
+
+        if (result.candidates.length === 0) {
+          lines.push("No candidates found for this intent on the current page.");
+        } else {
+          result.candidates.forEach((c, i) => {
+            lines.push(`${i + 1}. **${c.score}** \`${c.selector}\``);
+            lines.push(` ${c.tag}${c.role ? ` [${c.role}]` : ""} — "${c.name || c.text}"`);
+            lines.push(` Reason: ${c.reason}`);
+          });
+        }
+
+        return {
+          content: [{ type: "text" as const, text: lines.join("\n") }],
+          details: { intentResult: result },
+        };
+      } catch (err: unknown) {
+        return buildFailureResult("browser_find_best", err, actionId, beforeState);
+      }
+    },
+  });
+
+  // -----------------------------------------------------------------------
+  // browser_act
+  // -----------------------------------------------------------------------
+  pi.registerTool({
+    name: "browser_act",
+    label: "Browser Act",
+    description:
+      "Execute a semantic action in one call. Resolves the top candidate for the given intent (same scoring as browser_find_best), performs the action (click for buttons/links, focus for search fields), settles the page, and returns a before/after diff. Use when you know what you want to accomplish semantically — e.g. intent=\"submit_form\" finds and clicks the submit button, intent=\"close_dialog\" dismisses the dialog.",
+    parameters: Type.Object({
+      intent: StringEnum(INTENTS, {
+        description:
+          "Semantic intent: submit_form, close_dialog, primary_cta, search_field, next_step, dismiss, auth_action, back_navigation",
+      }),
+      scope: Type.Optional(
+        Type.String({
+          description:
+            "CSS selector to narrow the search area. If omitted, searches the full page.",
+        })
+      ),
+    }),
+
+    async execute(_toolCallId, params, _signal, _onUpdate, _ctx) {
+      let actionId: number | null = null;
+      let beforeState: CompactPageState | null = null;
+      try {
+        const { page: p } = await deps.ensureBrowser();
+        const target = deps.getActiveTarget();
+        beforeState = await deps.captureCompactPageState(p, {
+          selectors: params.scope ? [params.scope] : [],
+          includeBodyText: true,
+          target,
+        });
+        actionId = deps.beginTrackedAction("browser_act", params, beforeState.url).id;
+
+        // Score candidates
+        const script = buildIntentScoringScript(params.intent, params.scope);
+        const result = await target.evaluate(script) as IntentScoringResult;
+
+        if (result.error) {
+          deps.finishTrackedAction(actionId, {
+            status: "error",
+            error: result.error,
+            beforeState,
+          });
+          return {
+            content: [{ type: "text" as const, text: `browser_act failed: ${result.error}` }],
+            details: {},
+            isError: true,
+          };
+        }
+
+        if (result.candidates.length === 0) {
+          deps.finishTrackedAction(actionId, {
+            status: "error",
+            error: `No candidates found for intent "${params.intent}"`,
+            beforeState,
+          });
+          return {
+            content: [{
+              type: "text" as const,
+              text: `browser_act: No candidates found for intent "${params.intent}" on the current page. The page may not have the expected elements (e.g. no dialog for close_dialog, no form for submit_form).`,
+            }],
+            details: { intentResult: result },
+            isError: true,
+          };
+        }
+
+        // Take top candidate and execute action
+        const top = result.candidates[0];
+        const normalizedIntent = params.intent.toLowerCase().replace(/[\s_-]+/g, "");
+
+        if (normalizedIntent === "searchfield") {
+          // Focus instead of click for search fields
+          try {
+            await target.locator(top.selector).first().focus({ timeout: 5000 });
+          } catch {
+            // Fallback: click to focus
+            await target.locator(top.selector).first().click({ timeout: 5000 });
+          }
+        } else {
+          // Click via Playwright locator (D021)
+          try {
+            await target.locator(top.selector).first().click({ timeout: 5000 });
+          } catch (clickErr: unknown) {
+            // getByRole fallback from interaction.ts pattern
+            const nameMatch = top.selector.match(/\[(?:aria-label|name|placeholder)="([^"]+)"\]/i);
+            const fallbackName = (nameMatch?.[1] || top.name || "").trim();
+            let clicked = false;
+            // Without a name the pattern would be empty and match every
+            // element of the role, clicking an arbitrary control; skip the
+            // role fallback entirely in that case.
+            if (fallbackName) {
+              // Always escape: attribute values and accessible names are
+              // literal text and may contain regex metacharacters.
+              const namePattern = new RegExp(escapeRegExp(fallbackName), "i");
+              for (const role of ["button", "link", "combobox", "textbox"] as const) {
+                try {
+                  await target.getByRole(role, { name: namePattern }).first().click({ timeout: 3000 });
+                  clicked = true;
+                  break;
+                } catch { /* try next role */ }
+              }
+            }
+            if (!clicked) {
+              // Surface the primary click failure instead of discarding it.
+              throw new Error(`Could not click top candidate "${top.selector}" for intent "${params.intent}": ${deps.firstErrorLine(clickErr)}`);
+            }
+          }
+        }
+
+        // Settle after action
+        await deps.settleAfterActionAdaptive(p);
+
+        // Capture after state and diff
+        const afterState = await deps.captureCompactPageState(p, {
+          selectors: params.scope ? [params.scope] : [],
+          includeBodyText: true,
+          target,
+        });
+        const diff = diffCompactStates(beforeState, afterState);
+        const summary = deps.formatCompactStateSummary(afterState);
+        const jsErrors = deps.getRecentErrors(p.url());
+
+        setLastActionBeforeState(beforeState);
+        setLastActionAfterState(afterState);
+
+        deps.finishTrackedAction(actionId, {
+          status: "success",
+          afterUrl: afterState.url,
+          diffSummary: diff.summary,
+          beforeState,
+          afterState,
+        });
+
+        // Format output
+        const lines: string[] = [];
+        lines.push(`Intent: ${params.intent}`);
+        lines.push(`Action: ${normalizedIntent === "searchfield" ? "focused" : "clicked"} top candidate (score: ${top.score})`);
+        lines.push(`Target: \`${top.selector}\` — "${top.name || top.text}"`);
+        lines.push(`Reason: ${top.reason}`);
+        lines.push("");
+        lines.push(`Diff:\n${deps.formatDiffText(diff)}`);
+        if (jsErrors.trim()) {
+          lines.push(`\nJS Errors:\n${jsErrors}`);
+        }
+        lines.push(`\nPage summary:\n${summary}`);
+
+        return {
+          content: [{ type: "text" as const, text: lines.join("\n") }],
+          details: { intentResult: result, topCandidate: top, diff },
+        };
+      } catch (err: unknown) {
+        return buildFailureResult("browser_act", err, actionId, beforeState);
+      }
+    },
+  });
+}