From 6aa957c4cca398bddbefef8cc08b39d6fe38e371 Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 13:12:02 +0300 Subject: [PATCH 01/25] types(scraper): add heal request/envelope/opts types --- src/types/scraper.ts | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/types/scraper.ts b/src/types/scraper.ts index 7e1ab32..4052053 100644 --- a/src/types/scraper.ts +++ b/src/types/scraper.ts @@ -99,6 +99,34 @@ type Batch_pending_response = { message?: string; }; +type Refactor_request = { + prompt: string; + custom_input: unknown[]; +}; + +type Heal_envelope = { + collector_id: string; + status: string; + completed_steps: string[]; + prompt: string; + view_url: string; + next_step: string; + error?: string; +}; + +type Scraper_heal_opts = { + url?: string; + timeout?: string; + output?: string; + json?: boolean; + pretty?: boolean; + timing?: boolean; + apiKey?: string; + legacyOutput?: boolean; + maxRetries?: string; + retry?: boolean; +}; + export type { Deliver_webhook, Create_template_request, @@ -114,4 +142,7 @@ export type { Scraper_run_opts, Batch_trigger_response, Batch_pending_response, + Refactor_request, + Heal_envelope, + Scraper_heal_opts, }; From aa7a25804dcb552ef46ba1c0e830a2647bb5795d Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 13:13:46 +0300 Subject: [PATCH 02/25] chore(scraper): add heal constants and test imports --- src/__tests__/commands/scraper.test.ts | 8 +++++++- src/commands/scraper.ts | 3 +++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index db93225..29ac8a9 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -3,7 +3,7 @@ import {tmpdir} from 'node:os'; import {join} from 'node:path'; import {describe, it, expect, beforeEach, afterEach, vi} from 'vitest'; import {Command} from 'commander'; -import type {Scraper_create_opts} from 
'../../types/scraper'; +import type {Scraper_create_opts, Scraper_heal_opts} from '../../types/scraper'; const mocks = vi.hoisted(()=>({ post: vi.fn(), @@ -75,6 +75,12 @@ import { read_input_file, resolve_run_inputs, is_valid_url, + validate_heal_prompt, + build_refactor_request, + build_next_step, + build_heal_envelope, + print_heal_recovery_note, + handle_heal_scraper, } from '../../commands/scraper'; describe('commands/scraper', ()=>{ diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 7922a42..664973f 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -52,6 +52,9 @@ const GET_RESULT_ENDPOINT = '/dca/get_result'; const SYNC_CRAWL_ENDPOINT = '/dca/crawl'; const BATCH_TRIGGER_ENDPOINT = '/dca/trigger'; const BATCH_DATASET_ENDPOINT = '/dca/dataset'; +const REFACTOR_TRIGGER_PATH = 'refactor_template'; +const REFACTOR_PROGRESS_PATH = 'refactor_template/progress'; +const PROMPT_MAX_LEN = 1000; const SYNC_TIMEOUT_MIN = 25; const SYNC_TIMEOUT_MAX = 50; const SYNC_TIMEOUT_DEFAULT = 50; From b965d3d18f16f68129e55b8ed015c8830eda8e1f Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 13:15:43 +0300 Subject: [PATCH 03/25] feat(scraper): validate_heal_prompt for heal command --- src/__tests__/commands/scraper.test.ts | 27 +++++++++++++++++++++----- src/commands/scraper.ts | 12 ++++++++++++ 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index 29ac8a9..17a08af 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -76,11 +76,6 @@ import { resolve_run_inputs, is_valid_url, validate_heal_prompt, - build_refactor_request, - build_next_step, - build_heal_envelope, - print_heal_recovery_note, - handle_heal_scraper, } from '../../commands/scraper'; describe('commands/scraper', ()=>{ @@ -1190,6 +1185,28 @@ describe('commands/scraper', ()=>{ }); }); + describe('validate_heal_prompt', ()=>{ + 
it('returns the trimmed prompt for valid input', ()=>{ + expect(validate_heal_prompt(' fix the price selector ')) + .toBe('fix the price selector'); + }); + + it('throws on empty / whitespace-only prompt', ()=>{ + expect(()=>validate_heal_prompt('')).toThrow(/prompt/i); + expect(()=>validate_heal_prompt(' ')).toThrow(/prompt/i); + }); + + it('throws when prompt exceeds 1000 chars', ()=>{ + expect(()=>validate_heal_prompt('x'.repeat(1001))) + .toThrow(/1000/); + }); + + it('accepts a prompt exactly at the 1000-char limit', ()=>{ + const p = 'x'.repeat(1000); + expect(validate_heal_prompt(p)).toBe(p); + }); + }); + describe('parse_urls_arg', ()=>{ it('splits, trims, and drops empties', ()=>{ expect(parse_urls_arg( diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 664973f..48830c7 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -78,6 +78,17 @@ const parse_max_retries = (raw: string|undefined): number=>{ return n; }; +const validate_heal_prompt = (raw: string): string=>{ + const prompt = (raw ?? '').trim(); + if (!prompt) + throw new Error('scraper heal requires a non-empty ' + +'describing what to fix.'); + if (prompt.length>PROMPT_MAX_LEN) + throw new Error(`Heal prompt is ${prompt.length} chars; the API ` + +`limit is ${PROMPT_MAX_LEN}. 
Shorten it.`); + return prompt; +}; + const build_ai_trigger_retry = ( opts: Pick ): Retry_config=>{ @@ -983,4 +994,5 @@ export { read_input_file, resolve_run_inputs, is_valid_url, + validate_heal_prompt, }; From c951a5937f561a2a792b21f067b63af0953998b4 Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 13:19:47 +0300 Subject: [PATCH 04/25] feat(scraper): build_refactor_request body builder --- src/__tests__/commands/scraper.test.ts | 10 ++++++++++ src/commands/scraper.ts | 7 +++++++ 2 files changed, 17 insertions(+) diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index 17a08af..c395913 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -76,6 +76,7 @@ import { resolve_run_inputs, is_valid_url, validate_heal_prompt, + build_refactor_request, } from '../../commands/scraper'; describe('commands/scraper', ()=>{ @@ -1207,6 +1208,15 @@ describe('commands/scraper', ()=>{ }); }); + describe('build_refactor_request', ()=>{ + it('wraps the prompt with an empty custom_input array', ()=>{ + expect(build_refactor_request('fix selectors')).toEqual({ + prompt: 'fix selectors', + custom_input: [], + }); + }); + }); + describe('parse_urls_arg', ()=>{ it('splits, trims, and drops empties', ()=>{ expect(parse_urls_arg( diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 48830c7..5fb9726 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -11,6 +11,7 @@ import {print, success, fail, dim, is_tty} from '../utils/output'; import type { Create_template_request, Create_template_response, + Refactor_request, Trigger_ai_request, Trigger_ai_response, Ai_progress_response, @@ -89,6 +90,11 @@ const validate_heal_prompt = (raw: string): string=>{ return prompt; }; +const build_refactor_request = (prompt: string): Refactor_request=>({ + prompt, + custom_input: [], +}); + const build_ai_trigger_retry = ( opts: Pick ): Retry_config=>{ @@ -995,4 +1001,5 @@ 
export { resolve_run_inputs, is_valid_url, validate_heal_prompt, + build_refactor_request, }; From 580a258e943c7b378af4a4b95c71a240f1b3e416 Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 13:22:21 +0300 Subject: [PATCH 05/25] feat(scraper): build_next_step verify-hint builder --- src/__tests__/commands/scraper.test.ts | 13 +++++++++++++ src/commands/scraper.ts | 5 +++++ 2 files changed, 18 insertions(+) diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index c395913..ca7b5f0 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -77,6 +77,7 @@ import { is_valid_url, validate_heal_prompt, build_refactor_request, + build_next_step, } from '../../commands/scraper'; describe('commands/scraper', ()=>{ @@ -1217,6 +1218,18 @@ describe('commands/scraper', ()=>{ }); }); + describe('build_next_step', ()=>{ + it('bakes in the real url when provided', ()=>{ + expect(build_next_step('c_abc', 'https://x.com/p/1')) + .toBe('bdata scraper run c_abc https://x.com/p/1'); + }); + + it('uses a placeholder when no url is provided', ()=>{ + expect(build_next_step('c_abc', undefined)) + .toBe('bdata scraper run c_abc '); + }); + }); + describe('parse_urls_arg', ()=>{ it('splits, trims, and drops empties', ()=>{ expect(parse_urls_arg( diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 5fb9726..5020a1c 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -95,6 +95,10 @@ const build_refactor_request = (prompt: string): Refactor_request=>({ custom_input: [], }); +const build_next_step = (collector_id: string, + url: string|undefined): string=> + `bdata scraper run ${collector_id} ${url ?? 
''}`; + const build_ai_trigger_retry = ( opts: Pick ): Retry_config=>{ @@ -1002,4 +1006,5 @@ export { is_valid_url, validate_heal_prompt, build_refactor_request, + build_next_step, }; From 2ab4abb599d3e2bfe10a07451fd9f5ec0b296e0a Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 13:25:06 +0300 Subject: [PATCH 06/25] feat(scraper): build_heal_envelope output shape --- src/__tests__/commands/scraper.test.ts | 50 ++++++++++++++++++++++++++ src/commands/scraper.ts | 19 ++++++++++ 2 files changed, 69 insertions(+) diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index ca7b5f0..f34e99d 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -78,6 +78,7 @@ import { validate_heal_prompt, build_refactor_request, build_next_step, + build_heal_envelope, } from '../../commands/scraper'; describe('commands/scraper', ()=>{ @@ -1230,6 +1231,55 @@ describe('commands/scraper', ()=>{ }); }); + describe('build_heal_envelope', ()=>{ + it('returns the documented success shape', ()=>{ + const env = build_heal_envelope({ + collector_id: 'c_xyz', + status: 'done', + prompt: 'fix price', + progress: {status: 'done', + completed_steps: ['plan', 'patch']}, + url: 'https://x.com/p/1', + }); + expect(env).toEqual({ + collector_id: 'c_xyz', + status: 'done', + completed_steps: ['plan', 'patch'], + prompt: 'fix price', + view_url: 'https://brightdata.com/cp/scrapers/c_xyz', + next_step: 'bdata scraper run c_xyz https://x.com/p/1', + }); + }); + + it('uses a placeholder in next_step when no url given', ()=>{ + const env = build_heal_envelope({ + collector_id: 'c_xyz', + status: 'done', + prompt: 'fix price', + progress: {status: 'done', completed_steps: []}, + }); + expect(env.next_step) + .toBe('bdata scraper run c_xyz '); + }); + + it('records error + empty steps on failure, keeps view_url ' + +'and next_step', ()=>{ + const env = build_heal_envelope({ + collector_id: 'c_xyz', + status: 
'heal_trigger_failed', + prompt: 'fix price', + error: 'Cannot run more than 3 jobs in parallel', + }); + expect(env.status).toBe('heal_trigger_failed'); + expect(env.error).toMatch(/parallel/); + expect(env.completed_steps).toEqual([]); + expect(env.view_url) + .toBe('https://brightdata.com/cp/scrapers/c_xyz'); + expect(env.next_step) + .toBe('bdata scraper run c_xyz '); + }); + }); + describe('parse_urls_arg', ()=>{ it('splits, trims, and drops empties', ()=>{ expect(parse_urls_arg( diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 5020a1c..b4310ea 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -17,6 +17,7 @@ import type { Ai_progress_response, Scraper_create_opts, Create_envelope, + Heal_envelope, Run_request, Trigger_immediate_response, Scraper_run_opts, @@ -209,6 +210,23 @@ const build_create_envelope = (params: { ...(params.error ? {error: params.error} : {}), }); +const build_heal_envelope = (params: { + collector_id: string; + status: string; + prompt: string; + progress?: Ai_progress_response; + url?: string; + error?: string; +}): Heal_envelope=>({ + collector_id: params.collector_id, + status: params.status, + completed_steps: params.progress?.completed_steps ?? [], + prompt: params.prompt, + view_url: `https://brightdata.com/cp/scrapers/${params.collector_id}`, + next_step: build_next_step(params.collector_id, params.url), + ...(params.error ? 
{error: params.error} : {}), +}); + const wants_machine_output = (opts: Scraper_create_opts): boolean=> !!(opts.json || opts.pretty || opts.output) || !is_tty; @@ -1007,4 +1025,5 @@ export { validate_heal_prompt, build_refactor_request, build_next_step, + build_heal_envelope, }; From c655af21cca30fc867c07d7f092e71f2ee0f19c1 Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 13:27:55 +0300 Subject: [PATCH 07/25] feat(scraper): print_heal_recovery_note (non-destructive) --- src/__tests__/commands/scraper.test.ts | 26 ++++++++++++++++++++++++++ src/commands/scraper.ts | 13 +++++++++++++ 2 files changed, 39 insertions(+) diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index f34e99d..d165e38 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -79,6 +79,7 @@ import { build_refactor_request, build_next_step, build_heal_envelope, + print_heal_recovery_note, } from '../../commands/scraper'; describe('commands/scraper', ()=>{ @@ -1280,6 +1281,31 @@ describe('commands/scraper', ()=>{ }); }); + describe('print_heal_recovery_note', ()=>{ + it('reassures that the scraper is unchanged and points to the UI', + ()=>{ + const error = vi.spyOn(console, 'error') + .mockImplementation(()=>{}); + print_heal_recovery_note('c_xyz'); + const msg = error.mock.calls.map(c=>String(c[0])).join('\n'); + expect(msg).toContain('c_xyz'); + expect(msg).toMatch( + /https:\/\/brightdata\.com\/cp\/scrapers\/c_xyz/); + expect(msg).toMatch(/unchanged|still works|was not modified/i); + // Must NOT reuse create's destructive "half-built" wording. 
+ expect(msg).not.toMatch(/half-built/); + error.mockRestore(); + }); + + it('does nothing when collector_id is empty', ()=>{ + const error = vi.spyOn(console, 'error') + .mockImplementation(()=>{}); + print_heal_recovery_note(''); + expect(error).not.toHaveBeenCalled(); + error.mockRestore(); + }); + }); + describe('parse_urls_arg', ()=>{ it('splits, trims, and drops empties', ()=>{ expect(parse_urls_arg( diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index b4310ea..05afb2f 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -140,6 +140,18 @@ const print_stub_recovery_note = (collector_id: string): void=>{ )); }; +const print_heal_recovery_note = (collector_id: string): void=>{ + if (!collector_id) + return; + console.error(dim( + `Note: the heal did not complete, but scraper ${collector_id} ` + +'is unchanged and still works as it did before.\n' + +`Open https://brightdata.com/cp/scrapers/${collector_id} ` + +'to inspect it, or re-run `bdata scraper heal` with a sharper ' + +'prompt.' 
+ )); +}; + const build_template_request = ( opts: Scraper_create_opts ): Create_template_request=>({ @@ -1026,4 +1038,5 @@ export { build_refactor_request, build_next_step, build_heal_envelope, + print_heal_recovery_note, }; From d4290cac1653c06b24b7f68652333bbd7282c6c3 Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 13:34:38 +0300 Subject: [PATCH 08/25] refactor(scraper): tidy heal prompt validation and test object --- src/__tests__/commands/scraper.test.ts | 3 +-- src/commands/scraper.ts | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index d165e38..c4d2f96 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -1238,8 +1238,7 @@ describe('commands/scraper', ()=>{ collector_id: 'c_xyz', status: 'done', prompt: 'fix price', - progress: {status: 'done', - completed_steps: ['plan', 'patch']}, + progress: {status: 'done', completed_steps: ['plan', 'patch']}, url: 'https://x.com/p/1', }); expect(env).toEqual({ diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 05afb2f..cff002c 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -81,7 +81,7 @@ const parse_max_retries = (raw: string|undefined): number=>{ }; const validate_heal_prompt = (raw: string): string=>{ - const prompt = (raw ?? 
'').trim(); + const prompt = raw.trim(); if (!prompt) throw new Error('scraper heal requires a non-empty ' +'describing what to fix.'); From 496ff54b465b07c5a97b39f8ea7922d411136651 Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 13:39:08 +0300 Subject: [PATCH 09/25] feat(scraper): handle_heal_scraper orchestration --- src/__tests__/commands/scraper.test.ts | 165 +++++++++++++++++++++++++ src/commands/scraper.ts | 160 +++++++++++++++++++++++- 2 files changed, 324 insertions(+), 1 deletion(-) diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index c4d2f96..869d35f 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -80,6 +80,7 @@ import { build_next_step, build_heal_envelope, print_heal_recovery_note, + handle_heal_scraper, } from '../../commands/scraper'; describe('commands/scraper', ()=>{ @@ -1461,6 +1462,170 @@ describe('commands/scraper', ()=>{ }); }); + describe('handle_heal_scraper', ()=>{ + it('chains trigger → poll and prints the envelope in non-TTY', + async()=>{ + mocks.post.mockResolvedValueOnce({id: 'rh_xyz', queued: false}); + mocks.poll_until.mockResolvedValue({ + result: {status: 'done', + completed_steps: ['plan', 'patch']}, + attempts: 3, + }); + await handle_heal_scraper('c_abc', 'fix the price selector', + {url: 'https://x.com/p/1'}); + expect(mocks.post).toHaveBeenCalledWith( + 'api_key', + '/dca/collectors/c_abc/refactor_template', + {prompt: 'fix the price selector', custom_input: []}, + expect.objectContaining({hints: SCRAPER_BODY_HINTS}) + ); + expect(mocks.poll_until).toHaveBeenCalledWith( + expect.objectContaining({ + timeout_seconds: 600, + running_statuses: ['__running__'], + timeout_label: expect.stringContaining('c_abc'), + }) + ); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({ + collector_id: 'c_abc', + status: 'done', + completed_steps: ['plan', 'patch'], + prompt: 'fix the price selector', + view_url: 
'https://brightdata.com/cp/scrapers/c_abc', + next_step: + 'bdata scraper run c_abc https://x.com/p/1', + }), + {json: undefined, pretty: undefined, output: undefined} + ); + }); + + it('passes the AI-trigger retry config to the refactor post', + async()=>{ + mocks.post.mockResolvedValueOnce({id: 'rh_xyz'}); + mocks.poll_until.mockResolvedValue({ + result: {status: 'done', completed_steps: []}, + attempts: 1, + }); + await handle_heal_scraper('c_abc', 'fix it', {maxRetries: '7'}); + const opts = mocks.post.mock.calls[0][3] as + {retry?: {max_attempts: number}}; + expect(opts.retry!.max_attempts).toBe(7); + }); + + it('fails fast on an empty prompt (no network call)', async()=>{ + await expect(handle_heal_scraper('c_abc', ' ', {})) + .rejects.toThrow(/prompt/i); + expect(mocks.fail).toHaveBeenCalledWith( + expect.stringMatching(/prompt/i)); + expect(mocks.post).not.toHaveBeenCalled(); + }); + + it('fails fast on an over-long prompt (no network call)', + async()=>{ + await expect( + handle_heal_scraper('c_abc', 'x'.repeat(1001), {})) + .rejects.toThrow(/1000/); + expect(mocks.post).not.toHaveBeenCalled(); + }); + + it('fails fast on an invalid --url (no network call)', async()=>{ + await expect( + handle_heal_scraper('c_abc', 'fix it', + {url: 'not-a-url'})) + .rejects.toThrow(/url/i); + expect(mocks.post).not.toHaveBeenCalled(); + }); + + it('emits the failure envelope + recovery note when trigger ' + +'fails', async()=>{ + mocks.post.mockRejectedValueOnce( + new Error('Cannot run more than 3 jobs in parallel')); + const exit = vi.spyOn(process, 'exit') + .mockImplementation(()=>undefined as never); + const error = vi.spyOn(console, 'error') + .mockImplementation(()=>{}); + await handle_heal_scraper('c_abc', 'fix it', + {output: 'heal.json'}); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({ + collector_id: 'c_abc', + status: 'heal_trigger_failed', + error: 'Cannot run more than 3 jobs in parallel', + }), + expect.objectContaining({output: 
'heal.json'}) + ); + const msg = error.mock.calls.map(c=>String(c[0])).join('\n'); + expect(msg).toMatch(/unchanged|still works/i); + exit.mockRestore(); + error.mockRestore(); + }); + + it('emits the failure envelope when poll returns status != done', + async()=>{ + mocks.post.mockResolvedValueOnce({id: 'rh_xyz'}); + mocks.poll_until.mockResolvedValue({ + result: {status: 'failed', completed_steps: ['plan']}, + attempts: 2, + }); + const exit = vi.spyOn(process, 'exit') + .mockImplementation(()=>undefined as never); + const error = vi.spyOn(console, 'error') + .mockImplementation(()=>{}); + await handle_heal_scraper('c_abc', 'fix it', + {output: 'heal.json'}); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({ + collector_id: 'c_abc', + status: 'failed', + completed_steps: ['plan'], + error: expect.stringMatching(/finished with status/), + }), + expect.objectContaining({output: 'heal.json'}) + ); + exit.mockRestore(); + error.mockRestore(); + }); + + it('emits the failure envelope when polling throws (timeout)', + async()=>{ + mocks.post.mockResolvedValueOnce({id: 'rh_xyz'}); + mocks.poll_until.mockRejectedValue( + new Error('Timeout after 600 seconds waiting for heal')); + const exit = vi.spyOn(process, 'exit') + .mockImplementation(()=>undefined as never); + const error = vi.spyOn(console, 'error') + .mockImplementation(()=>{}); + await handle_heal_scraper('c_abc', 'fix it', + {output: 'heal.json'}); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({ + collector_id: 'c_abc', + status: 'poll_failed', + error: expect.stringMatching(/Timeout/), + }), + expect.objectContaining({output: 'heal.json'}) + ); + exit.mockRestore(); + error.mockRestore(); + }); + + it('--legacy-output emits the bare progress payload', async()=>{ + mocks.post.mockResolvedValueOnce({id: 'rh_xyz'}); + mocks.poll_until.mockResolvedValue({ + result: {status: 'done', completed_steps: ['plan']}, + attempts: 1, + }); + await handle_heal_scraper('c_abc', 
'fix it', + {output: 'heal.json', legacyOutput: true}); + const written = mocks.print.mock.calls[0][0] as + {collector_id?: unknown; status?: string}; + expect(written.collector_id).toBeUndefined(); + expect(written).not.toHaveProperty('next_step'); + expect(written.status).toBe('done'); + }); + }); + describe('handle_run_scraper multi-URL', ()=>{ let fetch_spy: ReturnType; let tmp_dir: string; diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index cff002c..85d4bfe 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -22,6 +22,7 @@ import type { Trigger_immediate_response, Scraper_run_opts, Batch_trigger_response, + Scraper_heal_opts, } from '../types/scraper'; // Scraper-studio body-pattern hints. Kept here, not in client.ts, so @@ -239,7 +240,9 @@ const build_heal_envelope = (params: { ...(params.error ? {error: params.error} : {}), }); -const wants_machine_output = (opts: Scraper_create_opts): boolean=> +const wants_machine_output = ( + opts: {json?: boolean; pretty?: boolean; output?: string} +): boolean=> !!(opts.json || opts.pretty || opts.output) || !is_tty; const emit_create_output = ( @@ -424,6 +427,160 @@ const handle_create_scraper = async( } }; +const emit_heal_output = ( + envelope: Heal_envelope, + progress: Ai_progress_response|null, + opts: Scraper_heal_opts +): boolean=>{ + if (!wants_machine_output(opts)) + return false; + const print_opts = {json: opts.json, pretty: opts.pretty, + output: opts.output}; + const payload = opts.legacyOutput && progress + ? (progress as unknown) : envelope; + print(payload, print_opts); + return true; +}; + +const format_heal_summary = ( + collector_id: string, + prompt: string, + next_step: string, + progress: Ai_progress_response +): string=>{ + const steps = progress.completed_steps?.length ?? 
0; + return [ + `Scraper healed: ${collector_id}`, + ` Prompt: ${prompt}`, + ` Completed steps: ${steps}`, + ` Next: re-run to verify the fix → ${next_step}`, + ].join('\n'); +}; + +const handle_heal_scraper = async( + collector_id: string, + raw_prompt: string, + opts: Scraper_heal_opts +)=>{ + const api_key = ensure_authenticated(opts.apiKey); + let prompt = ''; + let timeout = 600; + let ai_retry: Retry_config; + try { + prompt = validate_heal_prompt(raw_prompt); + if (opts.url && !is_valid_url(opts.url)) + throw new Error(`Invalid --url "${opts.url}".`); + timeout = parse_timeout(opts.timeout); + ai_retry = build_ai_trigger_retry(opts); + } catch(e) { + fail((e as Error).message); + return; + } + const trigger_spinner = start_spinner('Triggering self-healing...'); + try { + await post( + api_key, + `/dca/collectors/${collector_id}/${REFACTOR_TRIGGER_PATH}`, + build_refactor_request(prompt), + {timing: opts.timing, hints: SCRAPER_BODY_HINTS, + retry: ai_retry} + ); + trigger_spinner.stop(); + } catch(e) { + trigger_spinner.stop(); + const msg = (e as Error).message; + console.error(`Failed to start self-healing for collector ` + +`${collector_id}: ${msg}`); + emit_heal_output( + build_heal_envelope({ + collector_id, + status: 'heal_trigger_failed', + prompt, + url: opts.url, + error: clean_error_message(msg), + }), + null, + opts + ); + print_heal_recovery_note(collector_id); + process.exit(1); + return; + } + const poll_spinner = start_spinner('Healing scraper...'); + try { + const poll_result = await poll_until({ + timeout_seconds: timeout, + fetch_once: ()=>get( + api_key, + `/dca/collectors/${collector_id}/${REFACTOR_PROGRESS_PATH}`, + {timing: opts.timing, hints: SCRAPER_BODY_HINTS} + ), + get_status: extract_progress_status, + running_statuses: [RUNNING_SENTINEL], + timeout_label: `self-healing (collector ${collector_id})`, + on_running: ({attempt, timeout_seconds, result})=>{ + const step = result.step ?? 
'pending'; + console.error(dim(`Step: ${step} — polling ` + +`(attempt ${attempt}/${timeout_seconds})`)); + }, + }); + poll_spinner.stop(); + const progress = poll_result.result; + if (progress.status != DONE_STATUS) + { + console.error(`Self-healing failed (collector ${collector_id}, ` + +`status: ${progress.status}).`); + emit_heal_output( + build_heal_envelope({ + collector_id, + status: progress.status, + prompt, + progress, + url: opts.url, + error: `Self-healing finished with status ` + +`"${progress.status}".`, + }), + progress, + opts + ); + print_heal_recovery_note(collector_id); + process.exit(1); + return; + } + const envelope = build_heal_envelope({ + collector_id, + status: progress.status, + prompt, + progress, + url: opts.url, + }); + if (emit_heal_output(envelope, progress, opts)) + return; + success(format_heal_summary( + collector_id, prompt, envelope.next_step, progress)); + } catch(e) { + poll_spinner.stop(); + const msg = (e as Error).message; + const suffix = msg.includes(collector_id) + ? '' : ` (collector ${collector_id})`; + console.error(`${msg}${suffix}`); + emit_heal_output( + build_heal_envelope({ + collector_id, + status: 'poll_failed', + prompt, + url: opts.url, + error: clean_error_message(msg), + }), + null, + opts + ); + print_heal_recovery_note(collector_id); + process.exit(1); + return; + } +}; + const parse_sync_timeout = (raw: string|undefined): number=>{ const value = raw == null ? 
SYNC_TIMEOUT_DEFAULT : +raw; if (!Number.isFinite(value) @@ -1039,4 +1196,5 @@ export { build_next_step, build_heal_envelope, print_heal_recovery_note, + handle_heal_scraper, }; From f63ebe5109458dd95f132b92986556da6fc1accf Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 13:46:43 +0300 Subject: [PATCH 10/25] feat(scraper): heal poll-attempt log and format_heal_summary test --- src/__tests__/commands/scraper.test.ts | 24 ++++++++++++++++++++++++ src/commands/scraper.ts | 3 +++ 2 files changed, 27 insertions(+) diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index 869d35f..1286bdc 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -81,6 +81,7 @@ import { build_heal_envelope, print_heal_recovery_note, handle_heal_scraper, + format_heal_summary, } from '../../commands/scraper'; describe('commands/scraper', ()=>{ @@ -256,6 +257,29 @@ describe('commands/scraper', ()=>{ }); }); + describe('format_heal_summary', ()=>{ + it('includes the collector id, prompt, step count, and ' + +'next-step command', ()=>{ + const out = format_heal_summary( + 'c_abc', + 'fix the price selector', + 'bdata scraper run c_abc https://x.com/p/1', + {status: 'done', completed_steps: ['plan', 'patch']} + ); + expect(out).toContain('c_abc'); + expect(out).toContain('fix the price selector'); + expect(out).toContain('2'); + expect(out).toContain( + 'bdata scraper run c_abc https://x.com/p/1'); + }); + + it('handles missing completed_steps as zero', ()=>{ + const out = format_heal_summary('c_abc', 'p', + 'bdata scraper run c_abc ', {status: 'done'}); + expect(out).toContain('0'); + }); + }); + describe('build_create_envelope', ()=>{ it('returns the documented success shape', ()=>{ const env = build_create_envelope({ diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 85d4bfe..4f53ba7 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -525,6 +525,8 @@ const 
handle_heal_scraper = async( }, }); poll_spinner.stop(); + console.error(dim( + `Done in ${poll_result.attempts} poll attempts.`)); const progress = poll_result.result; if (progress.status != DONE_STATUS) { @@ -1197,4 +1199,5 @@ export { build_heal_envelope, print_heal_recovery_note, handle_heal_scraper, + format_heal_summary, }; From 988d437b0ae6b6aa9cad3e469aa09287bd18729e Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 13:51:06 +0300 Subject: [PATCH 11/25] feat(scraper): wire heal subcommand with examples --- src/__tests__/commands/scraper.test.ts | 21 +++++++++++ src/commands/scraper.ts | 50 +++++++++++++++++++++++++- 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index 1286bdc..978004b 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -50,6 +50,7 @@ vi.mock('../../utils/config', ()=>({ })); import { + scraper_command, build_template_request, build_ai_request, extract_progress_status, @@ -1761,4 +1762,24 @@ describe('commands/scraper', ()=>{ {url: 'https://only.com'}); }); }); + + describe('heal_subcommand wiring', ()=>{ + it('is registered on scraper_command with required args', ()=>{ + const heal = scraper_command.commands + .find(c=>c.name()=='heal'); + expect(heal).toBeDefined(); + expect(heal!.usage()).toMatch(//); + expect(heal!.usage()).toMatch(//); + }); + + it('exposes --url, --timeout, --max-retries, --no-retry', ()=>{ + const heal = scraper_command.commands + .find(c=>c.name()=='heal')!; + const flags = heal.options.map(o=>o.long); + expect(flags).toContain('--url'); + expect(flags).toContain('--timeout'); + expect(flags).toContain('--max-retries'); + expect(flags).toContain('--no-retry'); + }); + }); }); diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 4f53ba7..de10e32 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -1160,10 +1160,58 @@ 
add_examples(run_subcommand, [ }, ]); +const heal_subcommand = new Command('heal') + .description( + 'Fix an existing scraper in place via AI self-healing') + .argument('', + 'Collector ID of the scraper to fix (from `scraper create`)') + .argument('', + 'What is broken / what to fix (max 1000 chars)') + .option('--url ', + 'Verify target woven into the next-step hint. Not sent to the ' + +'heal call; heal only mutates the scraper.') + .option('--timeout ', + 'Polling timeout in seconds (default: 600)') + .option('--max-retries ', + 'Max retries on the AI-Flow concurrent-job cap 429 ' + +`(default: ${AI_TRIGGER_DEFAULT_RETRIES}). Each wait grows ` + +'exponentially with jitter, up to ~4 min between attempts.') + .option('--no-retry', + 'Fail immediately on 429 instead of waiting through the cap. ' + +'Equivalent to --max-retries 0.') + .option('-o, --output ', 'Write output to file') + .option('--json', 'Force JSON output') + .option('--pretty', 'Pretty-print JSON output') + .option('--legacy-output', + 'Emit the bare AI-progress payload instead of the ' + +'{collector_id, status, prompt, next_step, ...} envelope.') + .option('--timing', 'Show request timing') + .option('-k, --api-key ', 'Override API key') + .action(handle_heal_scraper); + +add_examples(heal_subcommand, [ + { + description: 'Fix a scraper whose price selector drifted, then ' + +'get a ready-to-run verify command back', + command: 'brightdata scraper heal c_mp3tuab31lswoxvpws ' + +'"The price field returns null — the selector moved into a ' + +'span with data-testid. Capture price and currency again." 
' + +'--url https://example.com/product/1', + }, + { + description: 'Heal and save the result envelope (next_step tells ' + +'you how to verify)', + command: 'brightdata scraper heal c_mp3tuab31lswoxvpws ' + +'"Reviews stopped extracting after the page redesign" ' + +'--pretty -o heal.json', + }, +]); + const scraper_command = new Command('scraper') .description('Build and manage Bright Data scrapers') .addCommand(create_subcommand) - .addCommand(run_subcommand); + .addCommand(run_subcommand) + .addCommand(heal_subcommand); export { scraper_command, From 902497912ac170fe270d4e1273396a599f127f5d Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 14:07:06 +0300 Subject: [PATCH 12/25] test(scraper): harden heal fail-fast, progress-url, and exit assertions --- src/__tests__/commands/scraper.test.ts | 23 +++++++++++++++++++---- src/commands/scraper.ts | 2 +- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index 978004b..fcf4dca 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -1491,10 +1491,13 @@ describe('commands/scraper', ()=>{ it('chains trigger → poll and prints the envelope in non-TTY', async()=>{ mocks.post.mockResolvedValueOnce({id: 'rh_xyz', queued: false}); - mocks.poll_until.mockResolvedValue({ - result: {status: 'done', - completed_steps: ['plan', 'patch']}, - attempts: 3, + mocks.get.mockResolvedValue({status: 'done', + completed_steps: ['plan', 'patch']}); + mocks.poll_until.mockImplementation(async(o: never)=>{ + const cfg = o as {fetch_once: ()=>Promise}; + await cfg.fetch_once(); + return {result: {status: 'done', + completed_steps: ['plan', 'patch']}, attempts: 3}; }); await handle_heal_scraper('c_abc', 'fix the price selector', {url: 'https://x.com/p/1'}); @@ -1511,6 +1514,11 @@ describe('commands/scraper', ()=>{ timeout_label: expect.stringContaining('c_abc'), }) ); + 
expect(mocks.get).toHaveBeenCalledWith( + 'api_key', + '/dca/collectors/c_abc/refactor_template/progress', + expect.objectContaining({hints: SCRAPER_BODY_HINTS}) + ); expect(mocks.print).toHaveBeenCalledWith( expect.objectContaining({ collector_id: 'c_abc', @@ -1552,6 +1560,8 @@ describe('commands/scraper', ()=>{ handle_heal_scraper('c_abc', 'x'.repeat(1001), {})) .rejects.toThrow(/1000/); expect(mocks.post).not.toHaveBeenCalled(); + expect(mocks.fail).toHaveBeenCalledWith( + expect.stringMatching(/1000/)); }); it('fails fast on an invalid --url (no network call)', async()=>{ @@ -1560,6 +1570,8 @@ describe('commands/scraper', ()=>{ {url: 'not-a-url'})) .rejects.toThrow(/url/i); expect(mocks.post).not.toHaveBeenCalled(); + expect(mocks.fail).toHaveBeenCalledWith( + expect.stringMatching(/url/i)); }); it('emits the failure envelope + recovery note when trigger ' @@ -1582,6 +1594,7 @@ describe('commands/scraper', ()=>{ ); const msg = error.mock.calls.map(c=>String(c[0])).join('\n'); expect(msg).toMatch(/unchanged|still works/i); + expect(exit).toHaveBeenCalledWith(1); exit.mockRestore(); error.mockRestore(); }); @@ -1608,6 +1621,7 @@ describe('commands/scraper', ()=>{ }), expect.objectContaining({output: 'heal.json'}) ); + expect(exit).toHaveBeenCalledWith(1); exit.mockRestore(); error.mockRestore(); }); @@ -1631,6 +1645,7 @@ describe('commands/scraper', ()=>{ }), expect.objectContaining({output: 'heal.json'}) ); + expect(exit).toHaveBeenCalledWith(1); exit.mockRestore(); error.mockRestore(); }); diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index de10e32..2cd3754 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -86,7 +86,7 @@ const validate_heal_prompt = (raw: string): string=>{ if (!prompt) throw new Error('scraper heal requires a non-empty ' +'describing what to fix.'); - if (prompt.length>PROMPT_MAX_LEN) + if (prompt.length > PROMPT_MAX_LEN) throw new Error(`Heal prompt is ${prompt.length} chars; the API ` +`limit is 
${PROMPT_MAX_LEN}. Shorten it.`); return prompt; From 9adf9a50b2090fc2feaf1a44fcb55bf9f4de9504 Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 17:54:15 +0300 Subject: [PATCH 13/25] types(scraper): gate fields for heal approval + approve opts --- src/types/scraper.ts | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/types/scraper.ts b/src/types/scraper.ts index 4052053..a0b432b 100644 --- a/src/types/scraper.ts +++ b/src/types/scraper.ts @@ -34,6 +34,8 @@ type Ai_progress_response = { step?: string; completed_steps?: string[]; status: string; + diff?: unknown; + preview_result?: unknown; }; type Scraper_create_opts = { @@ -111,6 +113,8 @@ type Heal_envelope = { prompt: string; view_url: string; next_step: string; + preview_result?: unknown; + diff_summary?: string; error?: string; }; @@ -127,6 +131,22 @@ type Scraper_heal_opts = { retry?: boolean; }; +type Refactor_resume_request = { + message: boolean; +}; + +type Scraper_approve_opts = { + reject?: boolean; + url?: string; + timeout?: string; + output?: string; + json?: boolean; + pretty?: boolean; + timing?: boolean; + apiKey?: string; + legacyOutput?: boolean; +}; + export type { Deliver_webhook, Create_template_request, @@ -145,4 +165,6 @@ export type { Refactor_request, Heal_envelope, Scraper_heal_opts, + Refactor_resume_request, + Scraper_approve_opts, }; From c51d26986042eb3e102675775844b11eb092cba7 Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 17:57:24 +0300 Subject: [PATCH 14/25] feat(scraper): recognize pending_answer approval gate in poll --- src/__tests__/commands/scraper.test.ts | 5 +++++ src/commands/scraper.ts | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index fcf4dca..0e4e243 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -237,6 +237,11 @@ describe('commands/scraper', ()=>{ 
expect(extract_progress_status(null as never)).toBeUndefined(); expect(extract_progress_status({} as never)).toBeUndefined(); }); + + it('returns the awaiting-approval sentinel for pending_answer', ()=>{ + expect(extract_progress_status({status: 'pending_answer'})) + .toBe('__awaiting_approval__'); + }); }); describe('format_create_summary', ()=>{ diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 2cd3754..62cc96f 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -50,6 +50,9 @@ const AI_PROGRESS_PATH = 'automate_template/progress'; const RUNNING_SENTINEL = '__running__'; const DONE_STATUS = 'done'; const TERMINAL_FAIL_STATUSES = ['failed', 'error', 'cancelled']; +const AWAITING_APPROVAL = '__awaiting_approval__'; +const AWAITING_STATUS = 'pending_answer'; +const RESUME_JOB_PATH = 'resume_automation_job'; const TRIGGER_IMMEDIATE_ENDPOINT = '/dca/trigger_immediate'; const GET_RESULT_ENDPOINT = '/dca/get_result'; const SYNC_CRAWL_ENDPOINT = '/dca/crawl'; @@ -185,6 +188,9 @@ const extract_progress_status = ( { return result.status; } + // the self-healing flow pauses here awaiting user approval; stop polling. 
+ if (result.status == AWAITING_STATUS) + return AWAITING_APPROVAL; return RUNNING_SENTINEL; }; From 96cc0b1793e9e597194e34507b7519dc0e30021d Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 18:00:07 +0300 Subject: [PATCH 15/25] feat(scraper): build_diff_summary for approval-gate envelope --- src/__tests__/commands/scraper.test.ts | 19 +++++++++++++++++++ src/commands/scraper.ts | 14 ++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index 0e4e243..73136e4 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -79,6 +79,7 @@ import { validate_heal_prompt, build_refactor_request, build_next_step, + build_diff_summary, build_heal_envelope, print_heal_recovery_note, handle_heal_scraper, @@ -1263,6 +1264,24 @@ describe('commands/scraper', ()=>{ }); }); + describe('build_diff_summary', ()=>{ + it('summarizes a well-formed template diff by step count', ()=>{ + const diff = { + template_a: {steps: [{name: 'a'}, {name: 'b'}]}, + template_b: {steps: [{name: 'a'}, {name: 'b'}, {name: 'c'}]}, + }; + const out = build_diff_summary(diff); + expect(out).toMatch(/step/i); + expect(out).toContain('3'); + }); + + it('falls back to a generic note for a malformed diff', ()=>{ + expect(build_diff_summary(null)).toMatch(/see view_url/i); + expect(build_diff_summary({nope: 1})).toMatch(/see view_url/i); + expect(build_diff_summary('garbage')).toMatch(/see view_url/i); + }); + }); + describe('build_heal_envelope', ()=>{ it('returns the documented success shape', ()=>{ const env = build_heal_envelope({ diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 62cc96f..9a83081 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -104,6 +104,19 @@ const build_next_step = (collector_id: string, url: string|undefined): string=> `bdata scraper run ${collector_id} ${url ?? 
''}`; +// Compact, defensive summary of the refactor diff for the awaiting-approval +// envelope. The full diff (both templates) stays in the web UI at view_url. +const build_diff_summary = (diff: unknown): string=>{ + const generic = 'code change pending approval — see view_url'; + if (!diff || typeof diff != 'object') + return generic; + const b = (diff as {template_b?: {steps?: unknown}}).template_b; + const steps = b && Array.isArray(b.steps) ? b.steps.length : undefined; + if (steps == undefined) + return generic; + return `proposed template has ${steps} step(s) — review at view_url`; +}; + const build_ai_trigger_retry = ( opts: Pick ): Retry_config=>{ @@ -1250,6 +1263,7 @@ export { validate_heal_prompt, build_refactor_request, build_next_step, + build_diff_summary, build_heal_envelope, print_heal_recovery_note, handle_heal_scraper, From 1d91500b230f380643b509caeda91311b1f7868b Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 18:02:59 +0300 Subject: [PATCH 16/25] feat(scraper): build_approve_next_step hint builder --- src/__tests__/commands/scraper.test.ts | 14 ++++++++++++++ src/commands/scraper.ts | 6 ++++++ 2 files changed, 20 insertions(+) diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index 73136e4..02cf9dd 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -80,6 +80,7 @@ import { build_refactor_request, build_next_step, build_diff_summary, + build_approve_next_step, build_heal_envelope, print_heal_recovery_note, handle_heal_scraper, @@ -1282,6 +1283,19 @@ describe('commands/scraper', ()=>{ }); }); + describe('build_approve_next_step', ()=>{ + it('builds an approve command with --url when url given', ()=>{ + expect(build_approve_next_step('c_abc', 'https://x.com/p/1')) + .toBe('bdata scraper approve c_abc ' + +'--url https://x.com/p/1'); + }); + + it('omits --url when no url is provided', ()=>{ + expect(build_approve_next_step('c_abc', undefined)) 
+ .toBe('bdata scraper approve c_abc'); + }); + }); + describe('build_heal_envelope', ()=>{ it('returns the documented success shape', ()=>{ const env = build_heal_envelope({ diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 9a83081..4765fe7 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -117,6 +117,11 @@ const build_diff_summary = (diff: unknown): string=>{ return `proposed template has ${steps} step(s) — review at view_url`; }; +const build_approve_next_step = (collector_id: string, + url: string|undefined): string=> + `bdata scraper approve ${collector_id}` + +(url ? ` --url ${url}` : ''); + const build_ai_trigger_retry = ( opts: Pick ): Retry_config=>{ @@ -1264,6 +1269,7 @@ export { build_refactor_request, build_next_step, build_diff_summary, + build_approve_next_step, build_heal_envelope, print_heal_recovery_note, handle_heal_scraper, From b2fa539b862019c95052a9a2a4f1a80f54ca1608 Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 18:05:42 +0300 Subject: [PATCH 17/25] feat(scraper): heal envelope carries preview + diff on gate --- src/__tests__/commands/scraper.test.ts | 37 ++++++++++++++++++++++++++ src/commands/scraper.ts | 29 +++++++++++++------- 2 files changed, 57 insertions(+), 9 deletions(-) diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index 02cf9dd..5c67302 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -1342,6 +1342,43 @@ describe('commands/scraper', ()=>{ expect(env.next_step) .toBe('bdata scraper run c_xyz '); }); + + it('on awaiting_approval includes preview_result, diff_summary, ' + +'and an approve next_step', ()=>{ + const env = build_heal_envelope({ + collector_id: 'c_xyz', + status: 'awaiting_approval', + prompt: 'fix it', + progress: { + status: 'pending_answer', + completed_steps: ['planner', 'code_fixer'], + preview_result: [{title: 'A Light in the Attic'}], + diff: {template_b: {steps: 
[{name: 'x'}, {name: 'y'}]}}, + }, + url: 'https://x.com/p/1', + }); + expect(env.status).toBe('awaiting_approval'); + expect(env.preview_result) + .toEqual([{title: 'A Light in the Attic'}]); + expect(env.diff_summary).toMatch(/2 step/); + expect(env.next_step) + .toBe('bdata scraper approve c_xyz ' + +'--url https://x.com/p/1'); + }); + + it('on done keeps the run next_step and omits gate fields', ()=>{ + const env = build_heal_envelope({ + collector_id: 'c_xyz', + status: 'done', + prompt: 'fix it', + progress: {status: 'done', completed_steps: ['patch']}, + url: 'https://x.com/p/1', + }); + expect(env.next_step) + .toBe('bdata scraper run c_xyz https://x.com/p/1'); + expect(env).not.toHaveProperty('preview_result'); + expect(env).not.toHaveProperty('diff_summary'); + }); }); describe('print_heal_recovery_note', ()=>{ diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 4765fe7..ee8f8d8 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -254,15 +254,26 @@ const build_heal_envelope = (params: { progress?: Ai_progress_response; url?: string; error?: string; -}): Heal_envelope=>({ - collector_id: params.collector_id, - status: params.status, - completed_steps: params.progress?.completed_steps ?? [], - prompt: params.prompt, - view_url: `https://brightdata.com/cp/scrapers/${params.collector_id}`, - next_step: build_next_step(params.collector_id, params.url), - ...(params.error ? {error: params.error} : {}), -}); +}): Heal_envelope=>{ + const awaiting = params.status == 'awaiting_approval'; + const next_step = awaiting + ? build_approve_next_step(params.collector_id, params.url) + : build_next_step(params.collector_id, params.url); + return { + collector_id: params.collector_id, + status: params.status, + completed_steps: params.progress?.completed_steps ?? [], + prompt: params.prompt, + view_url: + `https://brightdata.com/cp/scrapers/${params.collector_id}`, + next_step, + ...(awaiting && params.progress + ? 
{preview_result: params.progress.preview_result, + diff_summary: build_diff_summary(params.progress.diff)} + : {}), + ...(params.error ? {error: params.error} : {}), + }; +}; const wants_machine_output = ( opts: {json?: boolean; pretty?: boolean; output?: string} From 735760c135e8f003c4ef7ec6f52658f2a9a37537 Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 18:10:27 +0300 Subject: [PATCH 18/25] style(scraper): trailing comma on heal envelope gate spread --- src/commands/scraper.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index ee8f8d8..7442f3c 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -269,7 +269,7 @@ const build_heal_envelope = (params: { next_step, ...(awaiting && params.progress ? {preview_result: params.progress.preview_result, - diff_summary: build_diff_summary(params.progress.diff)} + diff_summary: build_diff_summary(params.progress.diff),} : {}), ...(params.error ? 
{error: params.error} : {}), }; From e45137c469d895b461901cc32b7a258aca557e39 Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 18:13:14 +0300 Subject: [PATCH 19/25] feat(scraper): resume_and_poll shared resume+poll helper --- src/__tests__/commands/scraper.test.ts | 38 ++++++++++++++++++++++++++ src/commands/scraper.ts | 37 ++++++++++++++++++++++++- 2 files changed, 74 insertions(+), 1 deletion(-) diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index 5c67302..c3414c6 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -85,6 +85,7 @@ import { print_heal_recovery_note, handle_heal_scraper, format_heal_summary, + resume_and_poll, } from '../../commands/scraper'; describe('commands/scraper', ()=>{ @@ -1741,6 +1742,43 @@ describe('commands/scraper', ()=>{ }); }); + describe('resume_and_poll', ()=>{ + it('posts resume_automation_job then polls progress, returns ' + +'the poll result', async()=>{ + mocks.post.mockResolvedValueOnce({ok: true}); + mocks.poll_until.mockResolvedValue({ + result: {status: 'done', completed_steps: ['patch']}, + attempts: 2, + }); + const out = await resume_and_poll( + 'api_key', 'c_abc', true, {timing: undefined}, 600); + expect(mocks.post).toHaveBeenCalledWith( + 'api_key', + '/dca/collectors/c_abc/resume_automation_job', + {message: true}, + expect.objectContaining({hints: SCRAPER_BODY_HINTS}) + ); + expect(mocks.poll_until).toHaveBeenCalledWith( + expect.objectContaining({ + timeout_seconds: 600, + running_statuses: ['__running__'], + }) + ); + expect(out.result.status).toBe('done'); + }); + + it('sends message:false when approve=false (reject)', async()=>{ + mocks.post.mockResolvedValueOnce({ok: true}); + mocks.poll_until.mockResolvedValue({ + result: {status: 'done', completed_steps: []}, + attempts: 1, + }); + await resume_and_poll('api_key', 'c_abc', false, + {timing: undefined}, 600); + 
expect(mocks.post.mock.calls[0][2]).toEqual({message: false}); + }); + }); + describe('handle_run_scraper multi-URL', ()=>{ let fetch_spy: ReturnType; let tmp_dir: string; diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 7442f3c..f242a72 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -6,7 +6,7 @@ import {post, get, type Body_hint, type Retry_config, import {load as load_config} from '../utils/config'; import {ensure_authenticated} from '../utils/auth'; import {start as start_spinner} from '../utils/spinner'; -import {parse_timeout, poll_until} from '../utils/polling'; +import {parse_timeout, poll_until, type Poll_result} from '../utils/polling'; import {print, success, fail, dim, is_tty} from '../utils/output'; import type { Create_template_request, @@ -492,6 +492,40 @@ const format_heal_summary = ( ].join('\n'); }; +// Resume a self-healing job parked at the approval gate, then poll the +// refactor progress to its next terminal/gate state. Shared by `approve` +// and `heal --auto-approve`. Throws on resume failure or poll timeout. +const resume_and_poll = async( + api_key: string, + collector_id: string, + approve: boolean, + opts: {timing?: boolean}, + timeout: number +): Promise<Poll_result<Ai_progress_response>>=>{ + await post( + api_key, + `/dca/collectors/${collector_id}/${RESUME_JOB_PATH}`, + {message: approve}, + {timing: opts.timing, hints: SCRAPER_BODY_HINTS} + ); + return poll_until({ + timeout_seconds: timeout, + fetch_once: ()=>get( + api_key, + `/dca/collectors/${collector_id}/${REFACTOR_PROGRESS_PATH}`, + {timing: opts.timing, hints: SCRAPER_BODY_HINTS} + ), + get_status: extract_progress_status, + running_statuses: [RUNNING_SENTINEL], + timeout_label: `self-healing (collector ${collector_id})`, + on_running: ({attempt, timeout_seconds, result})=>{ + const step = result.step ??
'pending'; + console.error(dim(`Step: ${step} — polling ` + +`(attempt ${attempt}/${timeout_seconds})`)); + }, + }); +}; + const handle_heal_scraper = async( collector_id: string, raw_prompt: string, @@ -1285,4 +1319,5 @@ export { print_heal_recovery_note, handle_heal_scraper, format_heal_summary, + resume_and_poll, }; From a91d97e8a46b366f82b4aa9e778ebb738f0808a8 Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 18:16:33 +0300 Subject: [PATCH 20/25] feat(scraper): heal stops at approval gate + --auto-approve --- src/__tests__/commands/scraper.test.ts | 64 +++++++++++++++++ src/commands/scraper.ts | 97 ++++++++++++++++++-------- src/types/scraper.ts | 1 + 3 files changed, 133 insertions(+), 29 deletions(-) diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index c3414c6..fbfe7e6 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -1740,6 +1740,70 @@ describe('commands/scraper', ()=>{ expect(written).not.toHaveProperty('next_step'); expect(written.status).toBe('done'); }); + + it('stops at the approval gate with awaiting_approval (exit 0, ' + +'no resume call)', async()=>{ + mocks.post.mockResolvedValueOnce({id: 'rh_xyz'}); + mocks.poll_until.mockResolvedValue({ + result: { + status: 'pending_answer', + completed_steps: ['planner', 'code_fixer'], + preview_result: [{title: 'A Light in the Attic'}], + diff: {template_b: {steps: [{name: 'x'}]}}, + }, + attempts: 5, + }); + const exit = vi.spyOn(process, 'exit') + .mockImplementation(()=>undefined as never); + await handle_heal_scraper('c_abc', 'fix it', + {url: 'https://x.com/p/1', output: 'heal.json'}); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({ + collector_id: 'c_abc', + status: 'awaiting_approval', + preview_result: [{title: 'A Light in the Attic'}], + next_step: + 'bdata scraper approve c_abc --url https://x.com/p/1', + }), + expect.objectContaining({output: 'heal.json'}) + ); + 
expect(mocks.post).toHaveBeenCalledTimes(1); + expect(exit).not.toHaveBeenCalledWith(1); + exit.mockRestore(); + }); + + it('--auto-approve resumes at the gate and polls to done', async()=>{ + mocks.post + .mockResolvedValueOnce({id: 'rh_xyz'}) + .mockResolvedValueOnce({ok: true}); + mocks.poll_until + .mockResolvedValueOnce({ + result: {status: 'pending_answer', + completed_steps: ['code_fixer'], + preview_result: [{title: 't'}], diff: {}}, + attempts: 3, + }) + .mockResolvedValueOnce({ + result: {status: 'done', completed_steps: ['patch']}, + attempts: 2, + }); + await handle_heal_scraper('c_abc', 'fix it', + {url: 'https://x.com/p/1', autoApprove: true, + output: 'heal.json'}); + expect(mocks.post).toHaveBeenNthCalledWith( + 2, 'api_key', + '/dca/collectors/c_abc/resume_automation_job', + {message: true}, + expect.objectContaining({hints: SCRAPER_BODY_HINTS}) + ); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({ + status: 'done', + next_step: 'bdata scraper run c_abc https://x.com/p/1', + }), + expect.objectContaining({output: 'heal.json'}) + ); + }); }); describe('resume_and_poll', ()=>{ diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index f242a72..108d8e1 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -465,7 +465,8 @@ const handle_create_scraper = async( const emit_heal_output = ( envelope: Heal_envelope, progress: Ai_progress_response|null, - opts: Scraper_heal_opts + opts: {url?: string; json?: boolean; pretty?: boolean; + output?: string; legacyOutput?: boolean; timing?: boolean} ): boolean=>{ if (!wants_machine_output(opts)) return false; @@ -526,6 +527,50 @@ const resume_and_poll = async( }); }; +// Emit the terminal heal result: done → success envelope + run next_step; +// any other (genuine-failure) status → failure envelope + recovery note + +// exit 1. Shared by the default poll path and the --auto-approve path. 
+const emit_heal_terminal = ( + collector_id: string, + prompt: string, + opts: {url?: string; json?: boolean; pretty?: boolean; + output?: string; legacyOutput?: boolean; timing?: boolean}, + progress: Ai_progress_response +): void=>{ + if (progress.status != DONE_STATUS) + { + console.error(`Self-healing failed (collector ${collector_id}, ` + +`status: ${progress.status}).`); + emit_heal_output( + build_heal_envelope({ + collector_id, + status: progress.status, + prompt, + progress, + url: opts.url, + error: `Self-healing finished with status ` + +`"${progress.status}".`, + }), + progress, + opts + ); + print_heal_recovery_note(collector_id); + process.exit(1); + return; + } + const envelope = build_heal_envelope({ + collector_id, + status: progress.status, + prompt, + progress, + url: opts.url, + }); + if (emit_heal_output(envelope, progress, opts)) + return; + success(format_heal_summary( + collector_id, prompt, envelope.next_step, progress)); +}; + const handle_heal_scraper = async( collector_id: string, raw_prompt: string, @@ -597,38 +642,32 @@ const handle_heal_scraper = async( console.error(dim( `Done in ${poll_result.attempts} poll attempts.`)); const progress = poll_result.result; - if (progress.status != DONE_STATUS) + if (progress.status == AWAITING_STATUS && opts.autoApprove) { - console.error(`Self-healing failed (collector ${collector_id}, ` - +`status: ${progress.status}).`); - emit_heal_output( - build_heal_envelope({ - collector_id, - status: progress.status, - prompt, - progress, - url: opts.url, - error: `Self-healing finished with status ` - +`"${progress.status}".`, - }), - progress, - opts - ); - print_heal_recovery_note(collector_id); - process.exit(1); + const resumed = await resume_and_poll( + api_key, collector_id, true, opts, timeout); + emit_heal_terminal( + collector_id, prompt, opts, resumed.result); return; } - const envelope = build_heal_envelope({ - collector_id, - status: progress.status, - prompt, - progress, - url: opts.url, - 
}); - if (emit_heal_output(envelope, progress, opts)) + if (progress.status == AWAITING_STATUS) + { + console.error(dim(`Heal ready — awaiting approval ` + +`(collector ${collector_id}).`)); + const envelope = build_heal_envelope({ + collector_id, + status: 'awaiting_approval', + prompt, + progress, + url: opts.url, + }); + if (emit_heal_output(envelope, progress, opts)) + return; + success(format_heal_summary( + collector_id, prompt, envelope.next_step, progress)); return; - success(format_heal_summary( - collector_id, prompt, envelope.next_step, progress)); + } + emit_heal_terminal(collector_id, prompt, opts, progress); } catch(e) { poll_spinner.stop(); const msg = (e as Error).message; diff --git a/src/types/scraper.ts b/src/types/scraper.ts index a0b432b..0d3eca6 100644 --- a/src/types/scraper.ts +++ b/src/types/scraper.ts @@ -129,6 +129,7 @@ type Scraper_heal_opts = { legacyOutput?: boolean; maxRetries?: string; retry?: boolean; + autoApprove?: boolean; }; type Refactor_resume_request = { From 0dc2f2e7bb06613decedf791a3598b52cfda3195 Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 18:21:57 +0300 Subject: [PATCH 21/25] feat(scraper): handle_approve_scraper resume/reject orchestration --- src/__tests__/commands/scraper.test.ts | 115 ++++++++++++++++++++++++- src/commands/scraper.ts | 84 ++++++++++++++++++ 2 files changed, 198 insertions(+), 1 deletion(-) diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index fbfe7e6..4ac3a8d 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -3,7 +3,8 @@ import {tmpdir} from 'node:os'; import {join} from 'node:path'; import {describe, it, expect, beforeEach, afterEach, vi} from 'vitest'; import {Command} from 'commander'; -import type {Scraper_create_opts, Scraper_heal_opts} from '../../types/scraper'; +import type {Scraper_create_opts, Scraper_heal_opts, + Scraper_approve_opts} from '../../types/scraper'; const mocks = 
vi.hoisted(()=>({ post: vi.fn(), @@ -86,6 +87,7 @@ import { handle_heal_scraper, format_heal_summary, resume_and_poll, + handle_approve_scraper, } from '../../commands/scraper'; describe('commands/scraper', ()=>{ @@ -1843,6 +1845,117 @@ describe('commands/scraper', ()=>{ }); }); + describe('handle_approve_scraper', ()=>{ + it('resumes (message:true) and polls to done, emits run next_step', + async()=>{ + mocks.post.mockResolvedValueOnce({ok: true}); + mocks.poll_until.mockResolvedValue({ + result: {status: 'done', completed_steps: ['patch']}, + attempts: 2, + }); + await handle_approve_scraper('c_abc', + {url: 'https://x.com/p/1', output: 'approve.json'}); + expect(mocks.post).toHaveBeenCalledWith( + 'api_key', + '/dca/collectors/c_abc/resume_automation_job', + {message: true}, + expect.objectContaining({hints: SCRAPER_BODY_HINTS}) + ); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({ + collector_id: 'c_abc', + status: 'done', + next_step: 'bdata scraper run c_abc https://x.com/p/1', + }), + expect.objectContaining({output: 'approve.json'}) + ); + }); + + it('--reject sends message:false and reports rejected', async()=>{ + mocks.post.mockResolvedValueOnce({ok: true}); + mocks.poll_until.mockResolvedValue({ + result: {status: 'done', completed_steps: []}, + attempts: 1, + }); + await handle_approve_scraper('c_abc', + {reject: true, output: 'approve.json'}); + expect(mocks.post.mock.calls[0][2]).toEqual({message: false}); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({status: 'rejected'}), + expect.objectContaining({output: 'approve.json'}) + ); + }); + + it('re-gates: poll returns pending_answer again → ' + +'awaiting_approval (re-runnable)', async()=>{ + mocks.post.mockResolvedValueOnce({ok: true}); + mocks.poll_until.mockResolvedValue({ + result: {status: 'pending_answer', + completed_steps: ['code_fixer'], + preview_result: [{title: 't'}], diff: {}}, + attempts: 3, + }); + await handle_approve_scraper('c_abc', + 
{url: 'https://x.com/p/1', output: 'approve.json'}); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({ + status: 'awaiting_approval', + next_step: + 'bdata scraper approve c_abc --url https://x.com/p/1', + }), + expect.objectContaining({output: 'approve.json'}) + ); + }); + + it('fails fast on an invalid --url (no resume call)', async()=>{ + await expect( + handle_approve_scraper('c_abc', {url: 'not-a-url'})) + .rejects.toThrow(/url/i); + expect(mocks.post).not.toHaveBeenCalled(); + }); + + it('emits resume_failed + recovery note + exit 1 when resume ' + +'fails', async()=>{ + mocks.post.mockRejectedValueOnce( + new Error('{"error":"job not awaiting approval"}')); + const exit = vi.spyOn(process, 'exit') + .mockImplementation(()=>undefined as never); + const error = vi.spyOn(console, 'error') + .mockImplementation(()=>{}); + await handle_approve_scraper('c_abc', {output: 'approve.json'}); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({ + collector_id: 'c_abc', + status: 'resume_failed', + }), + expect.objectContaining({output: 'approve.json'}) + ); + expect(exit).toHaveBeenCalledWith(1); + const msg = error.mock.calls.map(c=>String(c[0])).join('\n'); + expect(msg).toMatch(/unchanged|still works/i); + exit.mockRestore(); + error.mockRestore(); + }); + + it('emits poll_failed + recovery note on poll timeout', async()=>{ + mocks.post.mockResolvedValueOnce({ok: true}); + mocks.poll_until.mockRejectedValue( + new Error('Timeout after 600 seconds waiting for heal')); + const exit = vi.spyOn(process, 'exit') + .mockImplementation(()=>undefined as never); + const error = vi.spyOn(console, 'error') + .mockImplementation(()=>{}); + await handle_approve_scraper('c_abc', {output: 'approve.json'}); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({status: 'poll_failed'}), + expect.objectContaining({output: 'approve.json'}) + ); + expect(exit).toHaveBeenCalledWith(1); + exit.mockRestore(); + error.mockRestore(); 
+ }); + }); + describe('handle_run_scraper multi-URL', ()=>{ let fetch_spy: ReturnType; let tmp_dir: string; diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 108d8e1..94c786c 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -23,6 +23,7 @@ import type { Scraper_run_opts, Batch_trigger_response, Scraper_heal_opts, + Scraper_approve_opts, } from '../types/scraper'; // Scraper-studio body-pattern hints. Kept here, not in client.ts, so @@ -691,6 +692,88 @@ const handle_heal_scraper = async( } }; +const handle_approve_scraper = async( + collector_id: string, + opts: Scraper_approve_opts +)=>{ + const api_key = ensure_authenticated(opts.apiKey); + let timeout = 600; + try { + if (opts.url && !is_valid_url(opts.url)) + throw new Error(`Invalid --url "${opts.url}".`); + timeout = parse_timeout(opts.timeout); + } catch(e) { + fail((e as Error).message); + return; + } + const approve = !opts.reject; + const verb = approve ? 'Approving' : 'Rejecting'; + const spinner = start_spinner(`${verb} self-healing...`); + let poll_result: Poll_result<Ai_progress_response>; + try { + poll_result = await resume_and_poll( + api_key, collector_id, approve, opts, timeout); + spinner.stop(); + } catch(e) { + spinner.stop(); + const msg = (e as Error).message; + const is_timeout = /Timeout after/i.test(msg); + const status = is_timeout ? 'poll_failed' : 'resume_failed'; + const suffix = msg.includes(collector_id) + ? '' : ` (collector ${collector_id})`; + console.error(`Failed to ${approve ? 'approve' : 'reject'} ` + +`self-healing for collector ${collector_id}: ` + +`${msg}${suffix}`); + emit_heal_output( + build_heal_envelope({ + collector_id, + status, + prompt: '', + url: opts.url, + error: clean_error_message(msg), + }), + null, + opts + ); + print_heal_recovery_note(collector_id); + process.exit(1); + return; + } + const progress = poll_result.result; + // a resumed job can hit another approval gate (multi-step approval).
+ if (progress.status == AWAITING_STATUS) + { + const envelope = build_heal_envelope({ + collector_id, + status: 'awaiting_approval', + prompt: '', + progress, + url: opts.url, + }); + if (emit_heal_output(envelope, progress, opts)) + return; + success(format_heal_summary( + collector_id, '', envelope.next_step, progress)); + return; + } + if (!approve && progress.status == DONE_STATUS) + { + const envelope = build_heal_envelope({ + collector_id, + status: 'rejected', + prompt: '', + progress, + url: opts.url, + }); + if (emit_heal_output(envelope, progress, opts)) + return; + success(`Heal rejected for ${collector_id}. Re-run ` + +'`bdata scraper heal` with a sharper prompt to try again.'); + return; + } + emit_heal_terminal(collector_id, '', opts, progress); +}; + const parse_sync_timeout = (raw: string|undefined): number=>{ const value = raw == null ? SYNC_TIMEOUT_DEFAULT : +raw; if (!Number.isFinite(value) @@ -1359,4 +1442,5 @@ export { handle_heal_scraper, format_heal_summary, resume_and_poll, + handle_approve_scraper, }; From 6ee2e1d5066a3d8805fc70a6c1e210b53357ed0d Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 18:28:41 +0300 Subject: [PATCH 22/25] fix(scraper): label auto-approve resume failure as resume_failed --- src/__tests__/commands/scraper.test.ts | 28 ++++++++++++++++++++++++++ src/commands/scraper.ts | 27 +++++++++++++++++++++++-- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index 4ac3a8d..5ddbd51 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -1805,6 +1805,34 @@ describe('commands/scraper', ()=>{ }), expect.objectContaining({output: 'heal.json'}) ); + expect(mocks.poll_until).toHaveBeenCalledTimes(2); + }); + + it('--auto-approve labels a resume POST failure as resume_failed ' + +'(not poll_failed)', async()=>{ + mocks.post + .mockResolvedValueOnce({id: 'rh_xyz'}) // trigger + 
.mockRejectedValueOnce( // resume + new Error('{"error":"job not awaiting approval"}')); + mocks.poll_until.mockResolvedValueOnce({ + result: {status: 'pending_answer', + completed_steps: ['code_fixer'], + preview_result: [{title: 't'}], diff: {}}, + attempts: 3, + }); + const exit = vi.spyOn(process, 'exit') + .mockImplementation(()=>undefined as never); + const error = vi.spyOn(console, 'error') + .mockImplementation(()=>{}); + await handle_heal_scraper('c_abc', 'fix it', + {autoApprove: true, output: 'heal.json'}); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({status: 'resume_failed'}), + expect.objectContaining({output: 'heal.json'}) + ); + expect(exit).toHaveBeenCalledWith(1); + exit.mockRestore(); + error.mockRestore(); }); }); diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 94c786c..082a66e 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -645,8 +645,31 @@ const handle_heal_scraper = async( const progress = poll_result.result; if (progress.status == AWAITING_STATUS && opts.autoApprove) { - const resumed = await resume_and_poll( - api_key, collector_id, true, opts, timeout); + let resumed: Poll_result; + try { + resumed = await resume_and_poll( + api_key, collector_id, true, opts, timeout); + } catch(e) { + const msg = (e as Error).message; + const status = /Timeout after/i.test(msg) + ? 
'poll_failed' : 'resume_failed'; + console.error(`Failed to auto-approve self-healing for ` + +`collector ${collector_id}: ${msg}`); + emit_heal_output( + build_heal_envelope({ + collector_id, + status, + prompt, + url: opts.url, + error: clean_error_message(msg), + }), + null, + opts + ); + print_heal_recovery_note(collector_id); + process.exit(1); + return; + } emit_heal_terminal( collector_id, prompt, opts, resumed.result); return; From c84da43eb17b9827d1ad95e5d37e7ab845e42beb Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 18:32:10 +0300 Subject: [PATCH 23/25] feat(scraper): wire approve subcommand and heal --auto-approve --- src/__tests__/commands/scraper.test.ts | 28 ++++++++++++++++++ src/commands/scraper.ts | 40 +++++++++++++++++++++++++- 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index 5ddbd51..6ec9167 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -2115,4 +2115,32 @@ describe('commands/scraper', ()=>{ expect(flags).toContain('--no-retry'); }); }); + + describe('approve_subcommand wiring', ()=>{ + it('is registered on scraper_command with a required ' + +'collector_id', ()=>{ + const approve = scraper_command.commands + .find(c=>c.name()=='approve'); + expect(approve).toBeDefined(); + expect(approve!.usage()).toMatch(//); + }); + + it('exposes --reject, --url, --timeout but NOT retry flags', ()=>{ + const approve = scraper_command.commands + .find(c=>c.name()=='approve')!; + const flags = approve.options.map(o=>o.long); + expect(flags).toContain('--reject'); + expect(flags).toContain('--url'); + expect(flags).toContain('--timeout'); + expect(flags).not.toContain('--max-retries'); + expect(flags).not.toContain('--no-retry'); + }); + + it('heal exposes --auto-approve', ()=>{ + const heal = scraper_command.commands + .find(c=>c.name()=='heal')!; + expect(heal.options.map(o=>o.long)) + 
.toContain('--auto-approve'); + }); + }); }); diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 082a66e..cf8a6c1 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -1384,6 +1384,9 @@ const heal_subcommand = new Command('heal') .option('--url ', 'Verify target woven into the next-step hint. Not sent to the ' +'heal call; heal only mutates the scraper.') + .option('--auto-approve', + 'When the heal hits the approval gate, approve it automatically ' + +'and poll through to done (default: stop and let you review).') .option('--timeout ', 'Polling timeout in seconds (default: 600)') .option('--max-retries ', @@ -1421,11 +1424,46 @@ add_examples(heal_subcommand, [ }, ]); +const approve_subcommand = new Command('approve') + .description( + 'Approve (or --reject) a heal that is awaiting approval') + .argument('', + 'Collector ID of the scraper whose heal is awaiting approval') + .option('--reject', + 'Reject the proposed fix instead of approving it.') + .option('--url ', + 'Verify target woven into the next-step hint on success.') + .option('--timeout ', + 'Polling timeout in seconds (default: 600)') + .option('-o, --output ', 'Write output to file') + .option('--json', 'Force JSON output') + .option('--pretty', 'Pretty-print JSON output') + .option('--legacy-output', + 'Emit the bare AI-progress payload instead of the envelope.') + .option('--timing', 'Show request timing') + .option('-k, --api-key ', 'Override API key') + .action(handle_approve_scraper); + +add_examples(approve_subcommand, [ + { + description: 'Approve a heal that stopped at awaiting_approval, ' + +'then verify', + command: 'brightdata scraper approve c_mp3tuab31lswoxvpws ' + +'--url https://example.com/product/1', + }, + { + description: 'Reject a proposed fix and start over with a sharper ' + +'heal prompt', + command: 'brightdata scraper approve c_mp3tuab31lswoxvpws --reject', + }, +]); + const scraper_command = new Command('scraper') .description('Build and 
manage Bright Data scrapers') .addCommand(create_subcommand) .addCommand(run_subcommand) - .addCommand(heal_subcommand); + .addCommand(heal_subcommand) + .addCommand(approve_subcommand); export { scraper_command, From 41bdd1e6e76e2e97f3ae3525ba5d33efd0a56eff Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Wed, 27 May 2026 18:44:58 +0300 Subject: [PATCH 24/25] refactor(scraper): drop dead resume type, tidy approve summary --- src/commands/scraper.ts | 17 +++++++---------- src/types/scraper.ts | 5 ----- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index cf8a6c1..72b8ca8 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -486,12 +486,12 @@ const format_heal_summary = ( progress: Ai_progress_response ): string=>{ const steps = progress.completed_steps?.length ?? 0; - return [ - `Scraper healed: ${collector_id}`, - ` Prompt: ${prompt}`, - ` Completed steps: ${steps}`, - ` Next: re-run to verify the fix → ${next_step}`, - ].join('\n'); + const lines = [`Scraper healed: ${collector_id}`]; + if (prompt) + lines.push(` Prompt: ${prompt}`); + lines.push(` Completed steps: ${steps}`); + lines.push(` Next: re-run to verify the fix → ${next_step}`); + return lines.join('\n'); }; // Resume a self-healing job parked at the approval gate, then poll the @@ -742,11 +742,8 @@ const handle_approve_scraper = async( const msg = (e as Error).message; const is_timeout = /Timeout after/i.test(msg); const status = is_timeout ? 'poll_failed' : 'resume_failed'; - const suffix = msg.includes(collector_id) - ? '' : ` (collector ${collector_id})`; console.error(`Failed to ${approve ? 
'approve' : 'reject'} ` - +`self-healing for collector ${collector_id}: ` - +`${msg}${suffix}`); + +`self-healing for collector ${collector_id}: ${msg}`); emit_heal_output( build_heal_envelope({ collector_id, diff --git a/src/types/scraper.ts b/src/types/scraper.ts index 0d3eca6..6dfc588 100644 --- a/src/types/scraper.ts +++ b/src/types/scraper.ts @@ -132,10 +132,6 @@ type Scraper_heal_opts = { autoApprove?: boolean; }; -type Refactor_resume_request = { - message: boolean; -}; - type Scraper_approve_opts = { reject?: boolean; url?: string; @@ -166,6 +162,5 @@ export type { Refactor_request, Heal_envelope, Scraper_heal_opts, - Refactor_resume_request, Scraper_approve_opts, }; From e502789b16c195abe5d0cc1b4d58b9cb8de96d69 Mon Sep 17 00:00:00 2001 From: meirk-brd Date: Thu, 28 May 2026 11:28:25 +0300 Subject: [PATCH 25/25] docs(readme): document scraper heal and approve commands --- README.md | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/README.md b/README.md index 4144d02..39f1c74 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,8 @@ | `brightdata discover` | AI-powered web discovery - find and rank results by intent with optional full-page content | | `brightdata scraper create` | Build a Bright Data scraper from a natural-language description using AI | | `brightdata scraper run` | Run a Bright Data scraper on a URL and return the data | +| `brightdata scraper heal` | Fix an existing scraper in place via AI self-healing (stops at an approval gate) | +| `brightdata scraper approve` | Approve (or reject) a self-healing fix that is awaiting approval | | `brightdata pipelines` | Extract structured data from 40+ platforms (Amazon, LinkedIn, TikTok…) | | `brightdata browser` | Control a real browser via Bright Data's Scraping Browser — navigate, snapshot, click, type, and more | | `brightdata zones` | List and inspect your Bright Data proxy zones | @@ -50,6 +52,8 @@ - [discover](#discover) - [scraper 
create](#scraper-create) - [scraper run](#scraper-run) + - [scraper heal](#scraper-heal) + - [scraper approve](#scraper-approve) - [pipelines](#pipelines) - [browser](#browser) - [status](#status) @@ -475,6 +479,105 @@ brightdata scraper run c_mp3tuab31lswoxvpws --input-file urls.json --- +### `scraper heal` + +Fix an existing scraper **in place** when it ran but returned wrong, empty, or partial data. The `collector_id` stays the same — the scraper is improved, not replaced. This is the maintenance twin of `scraper create`: it triggers Bright Data's AI self-healing flow (`POST /dca/collectors/{id}/refactor_template`), then polls progress. + +```bash +brightdata scraper heal <collector_id> "<prompt>" [options] +``` + +**You are the detector.** The CLI never decides on its own that a scraper is broken — you inspect the run output and decide. The `<prompt>` is required (max 1000 chars); name exactly what is wrong and what the correct output should be. Vague prompts produce vague heals. + +| Flag | Description | +|---|---| +| `--url <url>` | Verify target woven into the success `next_step` hint (not sent to the heal call) | +| `--auto-approve` | When the heal hits the approval gate, approve it automatically and poll through to `done` (default: stop and let you review) | +| `--timeout <seconds>` | Polling timeout (default: `600`) | +| `--max-retries <n>` | Max retries on the AI-Flow concurrent-job-cap `429` (default: `4`) | +| `--no-retry` | Fail immediately on `429` instead of waiting through the cap | +| `-o, --output <path>` | Write output to file | +| `--json` / `--pretty` | JSON output (raw / indented) | +| `--legacy-output` | Emit the bare AI-progress payload instead of the envelope | +| `--timing` | Show request timing | +| `-k, --api-key <key>` | Override API key | + +**The approval gate** + +Self-healing is human-in-the-loop.
Without `--auto-approve`, `heal` runs the fix and then **stops at an approval gate** rather than committing it, exiting `0` with a `status: "awaiting_approval"` envelope: + +```json +{ + "collector_id": "c_mp3tuab31lswoxvpws", + "status": "awaiting_approval", + "prompt": "Price returns null — the selector moved …", + "preview_result": [ { "title": "…", "price": { "value": 51.77, "currency": "GBP" } }, … ], + "diff_summary": "proposed template has 1 step(s) — review at view_url", + "view_url": "https://brightdata.com/cp/scrapers/c_mp3tuab31lswoxvpws", + "next_step": "bdata scraper approve c_mp3tuab31lswoxvpws --url https://example.com/product/1" +} +``` + +`preview_result` shows the sample rows the fixed scraper would produce — review them, then run the `next_step` (`scraper approve`) to commit. `awaiting_approval` is **not** a failure; it means the fix is ready and waiting for your decision. A failed heal (`429` cap exhausted, timeout, terminal `failed`) is **non-destructive** — the existing scraper is unchanged and still works as before. + +**Examples** + +```bash +# Heal a scraper, stop at the gate, and get a ready-to-run verify command back +brightdata scraper heal c_mp3tuab31lswoxvpws \ + "The price field returns null — the selector moved into a span with \ + data-testid. Capture price and currency again." \ + --url https://example.com/product/1 --pretty -o heal.json + +# Fully autonomous: heal and approve in one command (no manual review) +brightdata scraper heal c_mp3tuab31lswoxvpws \ + "Reviews stopped extracting after the page redesign" --auto-approve +``` + +--- + +### `scraper approve` + +Commit (or reject) a self-healing fix that `scraper heal` left **awaiting approval**. Calls `POST /dca/collectors/{id}/resume_automation_job`, then polls the refactor job to `done`. 
+ +```bash +brightdata scraper approve <collector_id> [options] +``` + +| Flag | Description | +|---|---| +| `--reject` | Reject the proposed fix instead of approving it | +| `--url <url>` | Verify target woven into the success `next_step` hint | +| `--timeout <seconds>` | Polling timeout (default: `600`) | +| `-o, --output <path>` | Write output to file | +| `--json` / `--pretty` | JSON output (raw / indented) | +| `--legacy-output` | Emit the bare AI-progress payload instead of the envelope | +| `--timing` | Show request timing | +| `-k, --api-key <key>` | Override API key | + +On success the job advances to `status: "done"` and the envelope hands back a `next_step` = `scraper run <collector_id> <url>` so you can verify the committed fix. `--reject` discards the proposed fix (`status: "rejected"`) — re-run `scraper heal` with a sharper prompt to try again. If a heal needs multiple approvals, `approve` may stop at `awaiting_approval` again — just run it once more. + +**The self-healing loop** + +```bash +# 1. Run and inspect the data +brightdata scraper run c_mp3tuab31lswoxvpws https://example.com/product/1 --json -o out.json + +# 2. If the data is wrong, heal (stops at the approval gate) +brightdata scraper heal c_mp3tuab31lswoxvpws \ + "Price returns null — the selector moved; capture price + currency." \ + --url https://example.com/product/1 --pretty -o heal.json + +# 3. Review heal.json's preview_result, then approve +brightdata scraper approve c_mp3tuab31lswoxvpws \ + --url https://example.com/product/1 --pretty -o approve.json + +# 4. Verify the committed fix +brightdata scraper run c_mp3tuab31lswoxvpws https://example.com/product/1 --pretty +``` + +--- + ### `pipelines` Extract structured data from 40+ platforms using Bright Data's Web Scraper API. Triggers an async collection job, polls until ready, and returns results.