Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,11 @@ Pixels only when observer data is insufficient:
```bash
interceptor canvas read 1 [--format png] [--region 10,20,300,120] [--webgl]
interceptor canvas diff 1
interceptor canvas ocr 1 # Experimental — fallback only
interceptor canvas ocr 1 # Native canvas text: aria/fallback + semantic model (no pixel OCR)
```

`canvas ocr` returns the canvas's *native* accessible text (aria-label / aria-labelledby / fallback subtree / figcaption) plus the page's semantic textbox model — no pixel OCR. For a canvas-rendered editor prefer `scene text`; for genuine pixel-only text use `interceptor macos vision text` (native macOS Vision OCR).

Canvas indexes are DOM canvas indexes.

## Scene (rich editors)
Expand Down
29 changes: 25 additions & 4 deletions bun.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 25 additions & 0 deletions cli/commands/screenshot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,31 @@ export function parseScreenshotCommand(filtered: string[]): Action {
return ssAction
}

case "ocr": {
  // interceptor ocr "<css-selector>" | <ref> | --element N | --region X,Y,W,H
  // Renders the target via the native screenshot path, then OCRs it with the
  // bundled Tesseract engine (offline, cross-platform, no Mac, no agent).
  //
  // A flag whose value is missing or is not a usable number is skipped instead
  // of being forwarded, so the action never carries NaN/undefined fields and a
  // trailing `--region` no longer throws while parsing.
  const a: Action = { type: "ocr" }

  // Token that follows `flag`, or undefined when the flag is absent or last.
  const flagValue = (flag: string): string | undefined => {
    const at = filtered.indexOf(flag)
    return at === -1 ? undefined : filtered[at + 1]
  }

  // Positional target: tokens starting with "e<digit>" are taken as an element
  // ref, anything else as a CSS selector.
  const pos = filtered[1]
  if (pos && !pos.startsWith("--")) {
    if (/^e\d/.test(pos)) a.ref = pos
    else a.selector = pos
  }

  // Explicit flags win over the positional target.
  const selector = flagValue("--selector")
  if (selector !== undefined) a.selector = selector
  const ref = flagValue("--ref")
  if (ref !== undefined) a.ref = ref

  const element = parseInt(flagValue("--element") ?? "", 10)
  if (Number.isFinite(element)) a.element = element

  // --region must be exactly four comma-separated numbers: X,Y,W,H.
  const regionRaw = flagValue("--region")
  if (regionRaw !== undefined) {
    const parts = regionRaw.split(",")
    const rp = parts.map(Number)
    const wellFormed =
      parts.length === 4 &&
      parts.every((p) => p.trim() !== "") && // Number("") is 0, so reject blanks explicitly
      rp.every((n) => Number.isFinite(n))
    if (wellFormed) a.region = { x: rp[0], y: rp[1], width: rp[2], height: rp[3] }
  }

  const scale = parseFloat(flagValue("--scale") ?? "")
  if (Number.isFinite(scale)) a.scale = scale

  const maxLongEdge = parseInt(flagValue("--target-max-long-edge") ?? "", 10)
  if (Number.isFinite(maxLongEdge) && maxLongEdge > 0) a.target_max_long_edge = maxLongEdge

  return a
}

case "canvas":
switch (filtered[1]) {
case "list":
Expand Down
6 changes: 4 additions & 2 deletions cli/help.ts
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,9 @@ Capture:
interceptor screenshot --quality 80 Encode quality 0-100 (defaults: png 92, jpeg 92, webp 85)
interceptor screenshot --target-max-long-edge 1568 Clamp output long edge in pixels (auto-resize at capture)
interceptor screenshot --clip X,Y,W,H [deprecated alias for --region]
interceptor ocr "<css>" OCR text from an element (bundled Tesseract — offline, cross-platform, no Mac)
interceptor ocr --region X,Y,W,H OCR a page region
interceptor ocr --element N OCR an element by ref
interceptor eval <code> Run JS in isolated world
interceptor eval <code> --main Run JS in page context

Expand Down Expand Up @@ -179,8 +182,7 @@ Canvas:
interceptor canvas objects --kind text Filter derived objects by kind
interceptor canvas model Inspect host-state and app-model signals
interceptor canvas routes Inspect candidate first-party canvas-related routes
interceptor canvas ocr N OCR text from canvas N
interceptor canvas ocr N --region X,Y,W,H OCR a canvas crop
interceptor canvas ocr N Native canvas text (aria/fallback + semantic model; no pixel OCR)
interceptor canvas read N Read canvas as data URL
interceptor canvas read N --format png PNG format
interceptor canvas read N --region X,Y,W,H Read pixel region
Expand Down
2 changes: 1 addition & 1 deletion cli/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ const ACTION_CMDS = new Set(["click", "type", "select", "focus", "blur", "hover"
const NAV_CMDS = new Set(["navigate", "back", "forward", "scroll", "wait", "wait-stable", "wait_for"])
const TAB_CMDS = new Set(["tabs", "tab", "window", "frames", "session"])
const NET_CMDS = new Set(["network", "net", "headers"])
const SS_CMDS = new Set(["screenshot", "canvas", "capture"])
const SS_CMDS = new Set(["screenshot", "canvas", "capture", "ocr"])
const DATA_CMDS = new Set(["cookies", "storage", "history", "bookmarks", "downloads", "clear", "clipboard"])
const META_CMDS = new Set(["status", "reload", "meta", "links", "images", "forms", "info", "page_info", "query", "exists", "count", "table", "attr", "style", "events", "search", "notify", "sessions", "capabilities", "modals", "panels"])
const EVAL_CMDS = new Set(["eval"])
Expand Down
5 changes: 4 additions & 1 deletion cli/transport.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,12 @@ const ACTION_TIMEOUT_OVERRIDES_MS: Record<string, number> = {
screenshot: 45_000,
screenshot_background: 45_000,
canvas_read: 45_000,
canvas_ocr: 45_000,
canvas_ocr: 60_000,
canvas_diff: 45_000,
capture_frame: 45_000,
// OCR: native capture + Tesseract. First call also lazy-loads the WASM core +
// language data, so allow generous headroom.
ocr: 60_000,
}

function pickTimeoutForAction(actionType: string): number {
Expand Down
2 changes: 1 addition & 1 deletion cli/version.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// Sentinel values used when running from source (`bun run cli`).
// scripts/build.sh stamps real build values into this file just before
// each `bun build --compile` and restores it afterwards via `git checkout`.
export const VERSION = "0.17.7"
export const VERSION = "0.18.3"
export const BUILD_SHA = "dev"
export const BUILD_DATE = "dev"
166 changes: 122 additions & 44 deletions extension/dist-mv2/background-electron.js
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,7 @@ async function uninstallScreenshotCorsRule(tabId) {

// extension/src/background/capabilities/screenshot.ts
var CAPTURE_TIMEOUT_MS = 5000;
var DOM_RENDER_TIMEOUT_MS = 30000;
var VISIBILITY_HINT = "Chrome/Brave window may not be visible — bring it to the front and retry, or pass --tab <id> of a tab in a visible window.";

class CaptureTimeoutError extends Error {
Expand Down Expand Up @@ -539,18 +540,6 @@ function resolveDomMode(action) {
return "region";
return "full";
}
/**
 * Injects `screenshot-runner.js` into the given tab's ISOLATED world.
 *
 * Never rejects for an injection failure: resolves to `{ success: true }`, or
 * to `{ success: false, error }` carrying the underlying error message.
 */
async function injectScreenshotRunner(tabId) {
  const injection = {
    target: { tabId },
    world: "ISOLATED",
    files: ["screenshot-runner.js"]
  };
  let failure = null;
  try {
    await chrome.scripting.executeScript(injection);
  } catch (err) {
    failure = `failed to inject screenshot-runner.js: ${err.message}`;
  }
  if (failure === null) {
    return { success: true };
  }
  return { success: false, error: failure };
}
async function reencodeAsWebP(dataUrl, qualityPct) {
const res = await fetch(dataUrl);
const blob = await res.blob();
Expand Down Expand Up @@ -596,9 +585,6 @@ async function handleDomRenderScreenshot(action, tabId) {
}
await installScreenshotCorsRule(tabId);
try {
const inject = await injectScreenshotRunner(tabId);
if (!inject.success)
return { success: false, error: inject.error || "runner injection failed" };
const dsAction = { type: "dom_screenshot", mode, format: renderFormat, quality };
if (action.ref !== undefined)
dsAction.ref = action.ref;
Expand All @@ -612,7 +598,19 @@ async function handleDomRenderScreenshot(action, tabId) {
dsAction.scale = scale;
if (targetMaxLongEdge !== undefined)
dsAction.target_max_long_edge = targetMaxLongEdge;
const renderResult = await sendToContentScript(tabId, dsAction);
let renderResult;
try {
renderResult = await withCaptureTimeout("dom-render", sendToContentScript(tabId, dsAction), DOM_RENDER_TIMEOUT_MS);
} catch (err) {
if (err instanceof CaptureTimeoutError) {
return {
success: false,
error: `DOM-render timed out after ${DOM_RENDER_TIMEOUT_MS}ms — the content script did not return image data. The render stalled (e.g. a resource never settled); retry, or use --pixel for a compositor capture.`,
data: { layer: "dom-render-timeout" }
};
}
throw err;
}
if (!renderResult || !renderResult.success || !renderResult.data) {
return { success: false, error: renderResult?.error || "dom render returned no data" };
}
Expand Down Expand Up @@ -860,10 +858,38 @@ async function transformPixelDataUrl(dataUrl, requestedFormat, quality, targetMa
return { success: false, error: `transform failed: ${err.message}` };
}
}
/**
 * Handles the "ocr" action: renders the requested target to a PNG through the
 * DOM-render screenshot path, then sends the image to the offscreen document
 * for text recognition.
 *
 * Only the targeting/sizing fields listed below are copied from `action` onto
 * the screenshot request; the capture is never saved (`save: false`).
 *
 * Returns the capture's failure result unchanged when rendering fails, a
 * `{ success: false, error }` object when no image or no OCR result comes
 * back, and otherwise `{ success: true, data }` with the trimmed text, its
 * source, confidence (null when absent) and the rendered width/height.
 */
async function handleOcr(action, tabId) {
  const forwardedKeys = ["selector", "element", "ref", "region", "clip", "scale", "target_max_long_edge"];
  const captureRequest = { type: "screenshot", format: "png", save: false };
  forwardedKeys
    .filter((key) => action[key] !== undefined)
    .forEach((key) => {
      captureRequest[key] = action[key];
    });

  const capture = await handleDomRenderScreenshot(captureRequest, tabId);
  if (!capture.success) {
    return capture;
  }

  const image = capture.data?.dataUrl;
  if (!image) {
    return { success: false, error: "capture for OCR produced no image" };
  }

  const recognition = await sendToOffscreen({ type: "ocr", dataUrl: image });
  if (!recognition.success) {
    return { success: false, error: recognition.error || "OCR failed" };
  }

  const recognizedText = recognition.data?.text || "";
  const engine = recognition.data?.source || "tesseract";
  const score = recognition.data?.confidence ?? null;
  return {
    success: true,
    data: {
      text: recognizedText.trim(),
      source: engine,
      confidence: score,
      width: capture.data?.width,
      height: capture.data?.height
    }
  };
}
async function handleScreenshotActions(action, tabId) {
switch (action.type) {
case "screenshot_background":
return handleScreenshotBackground(action, tabId);
case "ocr":
return handleOcr(action, tabId);
case "page_capture": {
const mhtml = await chrome.pageCapture.saveAsMHTML({ tabId });
const text = await mhtml.text();
Expand Down Expand Up @@ -1050,6 +1076,50 @@ function hostCanvasSignals(limit = 20) {
globals: candidateGlobalDetails
};
}
/**
 * Collects the text a <canvas> exposes without reading pixels.
 *
 * Sources, in output order: aria-label, the elements named by
 * aria-labelledby, the canvas's own fallback text content, the <figcaption>
 * of an enclosing <figure>, the elements named by aria-describedby, and the
 * title attribute. Each value is whitespace-collapsed; empty ones are omitted.
 *
 * NOTE(review): this function is handed to executeInMainWorld by the caller,
 * so it presumably has to stay self-contained (no references to outer-scope
 * helpers) — confirm before extracting anything from it.
 *
 * @param canvasIndex index into document.querySelectorAll("canvas")
 * @returns `{ found: false, error, canvasCount }` when the index has no
 *   canvas; otherwise `{ found, text, role, sources, canvasCount }` where
 *   `text` is the non-empty sources joined by newlines.
 */
function canvasAccessibleText(canvasIndex) {
  const allCanvases = Array.from(document.querySelectorAll("canvas"));
  const canvas = allCanvases[canvasIndex];
  if (!canvas) {
    return { found: false, error: "canvas index out of range", canvasCount: allCanvases.length };
  }

  // Collapse whitespace runs to single spaces; null/undefined become "".
  const collapse = (raw) => (raw || "").replace(/\s+/g, " ").trim();

  // Joined text of every element referenced by a space-separated id list.
  const textOfIds = (idList) => {
    if (!idList) {
      return "";
    }
    const pieces = [];
    for (const id of idList.split(/\s+/)) {
      const piece = collapse(document.getElementById(id)?.textContent);
      if (piece) {
        pieces.push(piece);
      }
    }
    return pieces.join(" ");
  };

  // Readers are evaluated lazily, top to bottom, which fixes both the key
  // order of `sources` and the line order of `text`.
  const readers = [
    ["ariaLabel", () => collapse(canvas.getAttribute("aria-label"))],
    ["ariaLabelledby", () => textOfIds(canvas.getAttribute("aria-labelledby"))],
    ["fallback", () => collapse(canvas.textContent)],
    ["figcaption", () => {
      const figure = canvas.closest("figure");
      return figure ? collapse(figure.querySelector("figcaption")?.textContent) : "";
    }],
    ["ariaDescribedby", () => textOfIds(canvas.getAttribute("aria-describedby"))],
    ["title", () => collapse(canvas.getAttribute("title"))]
  ];

  const sources = {};
  const lines = [];
  for (const [key, read] of readers) {
    const value = read();
    if (value) {
      sources[key] = value;
      lines.push(value);
    }
  }

  const text = lines.join("\n");
  return {
    found: text !== "",
    text,
    role: canvas.getAttribute("role") || "",
    sources,
    canvasCount: allCanvases.length
  };
}
function canvasObserverSummary(limit = 100, kinds, canvasIndex) {
function normalize(kind) {
return String(kind || "").trim();
Expand Down Expand Up @@ -1303,39 +1373,47 @@ async function handleCanvasActions(action, tabId) {
};
}
case "canvas_ocr": {
const readResult = await handleCanvasActions({
type: "canvas_read",
canvasIndex: action.canvasIndex,
region: action.region,
format: "png"
}, tabId);
if (!readResult.success)
return readResult;
const dataUrl = readResult.data.dataUrl;
if (!dataUrl)
return { success: false, error: "canvas OCR requires a readable canvas image" };
const ocrResult = await sendToOffscreen({
type: "ocr",
dataUrl
});
if (!ocrResult.success)
return { success: false, error: ocrResult.error || "canvas OCR failed" };
const a11y = await executeInMainWorld(tabId, canvasAccessibleText, [action.canvasIndex]);
if (a11y && a11y.error) {
return { success: false, error: a11y.error };
}
const hostSignals = await executeInMainWorld(tabId, hostCanvasSignals, [10]);
const semanticText = hostSignals?.docs?.textbox?.exists ? hostSignals.docs.textbox.textSample || "" : "";
const ocrText = ocrResult.data?.text || "";
const normalizedOcr = ocrText.replace(/\s+/g, " ").trim();
const normalizedSemantic = semanticText.replace(/\s+/g, " ").trim();
const matchedSemantic = normalizedOcr && normalizedSemantic ? normalizedSemantic.includes(normalizedOcr) || normalizedOcr.includes(normalizedSemantic) : false;
const a11yText = a11y && a11y.found ? a11y.text || "" : "";
let text = a11yText || semanticText;
let source = a11yText ? "accessibility" : semanticText ? "semantic-textbox" : "none";
let confidence = null;
let ocrFallbackUsed = false;
if (!text) {
const readResult = await handleCanvasActions({
type: "canvas_read",
canvasIndex: action.canvasIndex,
region: action.region,
format: "png"
}, tabId);
const dataUrl = readResult.success ? readResult.data.dataUrl : undefined;
if (dataUrl) {
const ocr = await sendToOffscreen({ type: "ocr", dataUrl });
if (ocr.success && (ocr.data?.text || "").trim()) {
text = ocr.data.text.trim();
source = "tesseract";
confidence = ocr.data?.confidence ?? null;
ocrFallbackUsed = true;
}
}
}
return {
success: true,
data: {
text: ocrText,
source: ocrResult.data?.source || "ocr",
confidence: ocrResult.data?.confidence ?? null,
text,
source,
confidence,
diagnostics: {
pixelSource: "canvas_read",
strongerSourceAvailable: !!semanticText,
strongerSourceMatched: matchedSemantic
accessibilityText: !!a11yText,
accessibilitySources: a11y?.sources || null,
semanticTextboxAvailable: !!semanticText,
ocrFallbackUsed,
hint: text ? undefined : "No accessible/semantic text and pixel OCR found nothing. For a canvas-rendered editor use `interceptor scene text`."
}
}
};
Expand Down Expand Up @@ -3304,7 +3382,7 @@ async function handleMonitorActions(action, tabId) {
registerMonitorListeners();
restorePageCommCaptureConfig();
var OS_INPUT_ACTIONS = new Set(["os_click", "os_key", "os_type", "os_move"]);
var SCREENSHOT_ACTIONS = new Set(["screenshot", "screenshot_background", "page_capture"]);
var SCREENSHOT_ACTIONS = new Set(["screenshot", "screenshot_background", "page_capture", "ocr"]);
var CAPTURE_STREAM_ACTIONS = new Set(["capture_start", "capture_frame", "capture_stop", "canvas_diff"]);
var CANVAS_ACTIONS = new Set([
"canvas_list",
Expand Down
Loading
Loading