From 0348b2933073b8e8940640dede8b4119a9dee130 Mon Sep 17 00:00:00 2001
From: Nathan <sandersn@gmail.com>
Date: Thu, 4 Jun 2026 07:01:07 -0400
Subject: [PATCH 1/4] initial plan

---
 .gitignore                            |   7 +
 docs/lobbying-disclosure-ingestion.md | 363 ++++++++++++++++++++++++++
 2 files changed, 370 insertions(+)
 create mode 100644 docs/lobbying-disclosure-ingestion.md

diff --git a/.gitignore b/.gitignore
index 571150641..7301e0ec2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -92,3 +92,10 @@ cert.txt
 # local MCP server config (contains auth tokens)
 .mcp.json
 mcp-server/create-agent-key.ts
+
+# Claude
+CLAUDE.md
+
+#gcloud
+.gcloudignore
+
diff --git a/docs/lobbying-disclosure-ingestion.md b/docs/lobbying-disclosure-ingestion.md
new file mode 100644
index 000000000..646cc4415
--- /dev/null
+++ b/docs/lobbying-disclosure-ingestion.md
@@ -0,0 +1,363 @@
+# Lobbying Disclosure Ingestion Pipeline
+
+## Overview
+
+The MA Secretary of State lobbying portal
+([sec.state.ma.us/LobbyistPublicSearch](https://www.sec.state.ma.us/LobbyistPublicSearch/))
+publishes semi-annual disclosure filings for all registered lobbyists and
+lobbying entities. This document describes the plan for scraping that data and
+storing it in Firestore in a way that allows joining to MAPLE bill data.
+
+The portal has three levels of pages:
+
+1. **Search page** → one row per registrant per year
+2. **Summary page** → registrant metadata + links to semi-annual disclosure
+   filings
+3. **CompleteDisclosure page** → per-client compensation table + per-client bill
+   activity tables
+
+Historical data goes back to 2005. MAPLE has bill data only from ~2020 onward,
+so bill joins will only resolve for filings from the 192nd General Court (2021)
+and later. All historical filings are ingested regardless.
+
+---
+
+## Terminology
+
+The portal has two registrant types:
+
+- **Lobbyist** — an individual person who lobbies directly on behalf of clients.
+- **Employer** — a lobbying firm that employs individual lobbyists and is
+  retained by clients. Called "Lobbyist Entity" on the portal.
+
+In both cases, the registrant reports compensation received from each **client**
+(the organization that hired them) and which bills they lobbied for that client.
+
+---
+
+## Firestore Data Model
+
+Two top-level collections, normalized by registrant and by lobbying activity
+record.
+
+### `/lobbyingRegistrants/{registrantId}`
+
+`registrantId` is a slugified `{entityName}_{year}` (stable, dedup-safe).
+
+One model covers both individual lobbyists and lobbying firms. A separate model
+is not needed because the portal search returns both under the same schema, and
+per-filing detail pages do not expose which individual lobbyists within a firm
+worked on which bill.
+
+```typescript
+interface LobbyingRegistrant {
+  registrantId: string // "{entityName}_{year}" slugified
+  entityName: string // firm name or individual lobbyist name (raw portal value)
+  entityNameNorm: string // normalized form; see Normalization section
+  year: number
+  generalCourt: number // computed from year
+  regType: "Lobbyist" | "Employer"
+  clients: LobbyingClient[]
+  disclosureUrls: string[] // source portal URLs, for audit trail
+  fetchedAt: Timestamp
+}
+
+interface LobbyingClient {
+  clientName: string
+  clientNameNorm: string // normalized form
+  compensation: number | null
+}
+```
+
+### `/lobbyingFilings/{filingId}`
+
+`filingId` is a slugified
+`{entityName}_{clientName}_{chamber}_{activityRef}_{generalCourt}`.
+
+```typescript
+type LobbyingChamber =
+  | "House Bill"
+  | "Senate Bill"
+  | "House Docket"
+  | "Senate Docket"
+  | "Executive" // lobbying of executive branch agencies
+  | "Other" // catch-all for rare legacy codes (FY, CMR, etc.)
+
+interface LobbyingFiling {
+  filingId: string
+  entityName: string // raw portal value
+  entityNameNorm: string // normalized form
+  clientName: string // raw portal value; "_total_salary_" sentinel for pre-2013
+  clientNameNorm: string // normalized form
+  year: number
+  generalCourt: number
+  chamber: LobbyingChamber
+  // For legislative chambers: the prefixed bill number (e.g. "H1234", "HD56").
+  // For Executive and Other: always null, so it is never mistaken for a bill reference.
+  billId: string | null
+  activityTitle: string // bill title (legislative) or meeting description (executive)
+  position: string // "Support" | "Oppose" | "Neutral" | etc.; empty for executive
+  amount: number | null // compensation allocated to this activity
+  fetchedAt: Timestamp
+}
+```
+
+### Constructing `billId` from Raw Portal Data
+
+The portal stores bill numbers as bare integers and records the chamber
+separately. The `billId` field — which maps to `Bill.id` in MAPLE — is
+constructed during ingest by combining chamber prefix and integer:
+
+| `chamber`       | Prefix | Example raw | `billId` |
+| --------------- | ------ | ----------- | -------- |
+| `House Bill`    | `H`    | `1234`      | `H1234`  |
+| `Senate Bill`   | `S`    | `1234`      | `S1234`  |
+| `House Docket`  | `HD`   | `56`        | `HD56`   |
+| `Senate Docket` | `SD`   | `56`        | `SD56`   |
+| `Executive`     | —      | agency name | `null`   |
+| `Other`         | —      | varies      | `null`   |
+
+Note: `H1234` and `S1234` are distinct bills even though they share the same
+integer. The prefix is required to disambiguate. `billId` is `null` for
+non-legislative chambers.
+
+#### Legacy chamber code normalization
+
+The portal uses short-form codes in older filings, normalized during ingest:
+
+| Raw value | Stored as     |
+| --------- | ------------- |
+| `HB`      | `House Bill`  |
+| `SB`      | `Senate Bill` |
+
+Rare codes (`FY`, `C`, `CMR`, `HR`, etc.) are stored as `Other`.
+
+### Joining to Bill Data
+
+**The join only applies to legislative chambers** (`House Bill`, `Senate Bill`,
+`House Docket`, `Senate Docket`) where `billId` is non-null. For `Executive`
+and `Other`, no join should be attempted.
+
+```typescript
+// Only valid when filing.billId !== null
+db.collection(`/generalCourts/${filing.generalCourt}/bills`).doc(filing.billId)
+```
+
+---
+
+## Entity Name Normalization
+
+The portal does not enforce consistent name formatting. The same client or
+registrant may appear as "Acme Corp.", "ACME CORPORATION", "Acme, Inc. d/b/a
+Acme Consulting", etc. across filings and years. Without normalization,
+grouping by entity is unreliable.
+
+Both `entityName` and `clientName` are normalized using the following pipeline,
+applied in order. The raw portal value is always preserved alongside the
+normalized form.
+
+### Normalization pipeline
+
+1. **Uppercase** — convert the entire string to upper case.
+2. **Strip d/b/a suffix** — remove everything from the first occurrence of
+   `D/B/A` or `DBA` (and spacing variants) onward, so the registered name is
+   used rather than a trade name.
+3. **Hyphen → space** — replace `-` with ` ` so `LAN-TEL` and `LAN TEL`
+   collapse to the same key.
+4. **Punctuation → space** — replace `,`, `.`, `'`, `‘`, `’`, `(`, `)` with
+   space. Replacement with space (not empty string) prevents adjacent tokens
+   from concatenating (e.g. `,INC` becomes ` INC`, which is then caught by step
+   5).
+5. **Remove legal entity type words** — whole-word removal of: `LLC`, `LLP`,
+   `INC`, `INCORPORATED`, `CORPORATION`, `CORP`, `LTD`, `LIMITED`, `PC`,
+   `PLLC`.
+6. **Remove "THE"** — whole-word removal anywhere in the string (not just as a
+   leading prefix).
+7. **Ampersand → AND** — replace `&` with `AND`.
+8. **Fix known typo** — replace `ASSICIATES` with `ASSOCIATES` (legacy portal
+   data).
+9. **Remove professional suffix phrases** — whole-phrase removal of: `LAW
+OFFICE OF`, `AND ASSOCIATES`, `& ASSOCIATES`, `AND ASSOC`, `ATTORNEY AT
+LAW`, `ATTORNEY@LAW`, `ATTORNET AT LAW`, `AND PARTNERS`, `PUBLIC POLICY
+GROUP`, `LEGISLATIVE SERVICES`, `POLICY GROUP`, `ASSOCIATES`, `COUNSELLORS
+AT LAW`.
+10. **Collapse whitespace** — replace runs of whitespace with a single space and
+    strip leading/trailing whitespace.
+
+### Usage
+
+`entityNameNorm` and `clientNameNorm` are stored on every document and filing.
+They should be used for grouping, deduplication, and display-level matching.
+Raw names are preserved for provenance and audit.
+
+---
+
+## Deduplication and Amount Aggregation
+
+### Does lobbying the same bill multiple times mean we should sum amounts?
+
+The portal collects two semi-annual disclosure filings per registrant per year
+(one for each 6-month period). In theory, a registrant could report the same
+bill in both H1 and H2 filings with separate compensation amounts that should
+be summed. Analysis of the actual data shows this does not occur: after
+processing, zero rows share the same `(entityName, clientName, year,
+generalCourt, billId, position)` — each (registrant, client, bill, year)
+combination appears exactly once. The semi-annual periods report different
+activity, not the same activity twice.
+
+The same registrant can lobby the same bill across multiple General Courts
+(observed up to 6 times across years). These are stored as separate documents
+per `generalCourt` and should not be summed — each court is a distinct
+legislative session.
+
+### Null-bill row deduplication
+
+The one real duplication artifact in the portal data is **null-bill rows** —
+entries filed when a registrant had no specific bills to report for a client in
+a period. These appear in both the H1 and H2 disclosures as identical rows and
+should be collapsed. During ingest, if the same `(entityName, clientName, year,
+generalCourt, chamber, position)` with a null `billId` is encountered more than
+once, keep the row with the highest `amount` so no spend is lost if the two
+copies carry different values (in practice amounts are usually both zero).
+
+### Ingest strategy
+
+When processing multiple disclosure URLs for the same registrant+year, write
+`lobbyingFilings` documents using the logical key as the document ID. A
+subsequent disclosure URL that produces the same document ID will naturally
+upsert (overwrite) rather than duplicate. For null-bill rows, since `billId` is
+null, include `chamber` in the document ID to avoid false merges between
+executive and legislative null rows.
+
+---
+
+## Scraper Architecture
+
+The lobbying portal is an HTML website, not a REST API. It does not fit the
+`createScraper` factory (which assumes list-IDs → fetch-per-ID against the MA
+Legislature API). Instead, we use a custom scheduled function following the
+`scrapeEvents` pattern.
+
+### Cloud Function: `scrapeLobbying`
+
+**File:** `functions/src/lobbying/scrapeLobbying.ts`
+
+- Schedule: `every 24 hours`
+- Scrapes the current year and prior year (new filings arrive semi-annually)
+- Persists a cursor in `/scrapers/lobbying`:
+  - `lastFetchedAt: Timestamp`
+  - `processedDiscUrls: string[]` — already-fetched disclosure URLs (skipped on
+    re-runs)
+- For each new disclosure URL:
+  - Parse registrant + client compensation rows → upsert `lobbyingRegistrants`
+    doc
+  - Parse bill activity rows → batch-write `lobbyingFilings` docs
+- Uses `axios` (existing dependency) with an iPad `User-Agent` header to match
+  portal expectations
+- Uses `jsdom` for HTML table parsing (already a dependency; used by events scraper)
+- 1s delay between requests; exponential backoff on failure (matching existing
+  scraper retry pattern)
+- Function timeout: 540s
+
+### Incremental Strategy
+
+Processed disclosure URLs are stored in `/scrapers/lobbying.processedDiscUrls`.
+At ~2 disclosure URLs per registrant × ~500 registrants per year, the
+current+prior year window stays well within Firestore document limits.
+Historical years beyond current-1 are stable (filings are frozen after year
+closes) and are handled by the backfill script only.
+
+The backfill script keeps its own cursor in a separate Firestore subcollection
+(`/scrapers/lobbyingBackfill/processedUrls`) so it does not interfere with the
+live scraper's cursor document.
+
+### Legacy Format (pre-2013)
+
+The portal uses a different HTML layout for filings before ~2013: total salary
+is not broken down by client, and all bill activity is in a single table. These
+are stored with `clientName: "_total_salary_"` so callers can detect and filter
+them. No bill-level compensation amount is available for these years.
+
+---
+
+## New Files
+
+```
+functions/src/lobbying/
+  types.ts            — Runtypes definitions for LobbyingRegistrant, LobbyingFiling
+  scrapeLobbying.ts   — Scheduled Cloud Function + shared parsing/normalization logic
+  index.ts            — Re-exports
+```
+
+---
+
+## Firebase Admin Script
+
+**File:** `scripts/firebase-admin/backfillLobbying.ts`
+
+Ingests all historical filings from 2005 to the present. This is the primary
+path for all data before the current and prior year. Accepts `--year` and
+`--limit` CLI args for targeted re-runs or testing. Calls the same parsing
+logic exported from `functions/src/lobbying/scrapeLobbying.ts` and writes
+directly to Firestore via the firebase-admin SDK.
+
+```bash
+GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
+  yarn firebase-admin run-script backfillLobbying --env dev
+```
+
+---
+
+## Firestore Rules
+
+Add read-only public rules alongside the existing `generalCourts` rule:
+
+```
+match /lobbyingRegistrants/{doc} { allow read: if true; }
+match /lobbyingFilings/{doc}     { allow read: if true; }
+```
+
+---
+
+## Firestore Indexes
+
+Add composite indexes for common query patterns:
+
+| Collection        | Fields                                 | Use case                                 |
+| ----------------- | -------------------------------------- | ---------------------------------------- |
+| `lobbyingFilings` | `generalCourt ASC, billId ASC`         | Fetch all legislative filings for a bill |
+| `lobbyingFilings` | `generalCourt ASC, chamber ASC`        | Filter by chamber within a court         |
+| `lobbyingFilings` | `generalCourt ASC, entityNameNorm ASC` | Fetch all filings for a registrant       |
+| `lobbyingFilings` | `generalCourt ASC, clientNameNorm ASC` | Fetch all filings for a client           |
+
+Note: bill-join queries should always filter on `chamber` (or check
+`billId !== null`) to exclude `Executive` and `Other` rows before treating
+`billId` as a MAPLE bill reference.
+
+---
+
+## Function Export
+
+Add to `functions/src/index.ts`:
+
+```typescript
+export { scrapeLobbying } from "./lobbying"
+```
+
+---
+
+## Design Decisions
+
+| Decision                    | Choice                                                                       | Rationale                                                                                                                                                                |
+| --------------------------- | ---------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| Collection placement        | Top-level `/lobbyingRegistrants`, `/lobbyingFilings`                         | Lobbying data spans multiple General Courts and is not scoped to a single court like bills/members                                                                       |
+| Single registrant model     | One type, `regType: "Lobbyist" \| "Employer"`                                | Individual lobbyists and firms share the same portal schema; per-bill individual attribution is not available                                                            |
+| `billId` construction       | `{chamberPrefix}{billNumber}` at ingest time                                 | Raw portal data stores chamber and integer separately; the composite is what matches MAPLE's `Bill.id`                                                                   |
+| `billId` null for Executive | `null` instead of agency name                                                | Prevents accidental bill lookups; makes join guard explicit at the type level                                                                                            |
+| Normalized name fields      | Store both raw and `*Norm` fields                                            | Raw names preserved for provenance; normalized names used for grouping and matching                                                                                      |
+| HTML parser                 | `jsdom`                                                                      | Already in `functions/package.json` (used by events scraper); no need to add cheerio                                                                                     |
+| Live scraper cursor         | Array in `/scrapers/lobbying` doc                                            | ~1,000 URLs/year fits well within the 1 MB Firestore doc limit; simple and atomic with other scraper state                                                               |
+| Backfill cursor             | Firestore subcollection `/scrapers/lobbyingBackfill/processedUrls/{urlHash}` | Full 2005-present history (~50,000 URLs) would exceed the 1 MB doc limit; subcollection scales without bound and is durable, inspectable, and resumable from any machine |
+| Incremental strategy        | Skip already-processed disclosure URLs; write docs by logical key (upsert)   | Survives function restarts and re-runs without re-fetching already-scraped pages; natural upsert prevents duplicates without an explicit dedup pass                      |
+| Legacy format (pre-2013)    | Store with `clientName: "_total_salary_"` sentinel                           | Preserves data completeness; callers can filter on this value                                                                                                            |
+| Historical data             | Admin backfill script (2005 → present)                                       | Full history is ingested once; Cloud Function maintains current+prior year going forward                                                                                 |

From 774568e3371a20e09abf58a3a87ff8d588052b61 Mon Sep 17 00:00:00 2001
From: Nathan <sandersn@gmail.com>
Date: Thu, 4 Jun 2026 08:38:21 -0400
Subject: [PATCH 2/4] feat: add lobbying disclosure ingestion pipeline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Scrapes the MA Secretary of State lobbying portal (sec.state.ma.us/LobbyistPublicSearch)
and writes structured data to Firestore for joining with MAPLE bill data.

New collections:
- /lobbyingRegistrants — one doc per (registrant, year), regType Lobbyist|Employer
- /lobbyingFilings — one doc per (registrant, client, bill, court), with billId
  null for Executive/Other chambers so the join guard is type-level

Key design points:
- billId is constructed as {chamberPrefix}{integer} (e.g. H1234, SD56) to match
  Bill.id in the existing bills collection; raw integer + chamber stored separately
- Entity name normalization pipeline ported from reference implementation (10 steps:
  d/b/a stripping, legal entity words, punctuation, THE, ampersand, typo fix, etc.)
- Both raw and *Norm name fields stored for provenance and grouping
- Live Cloud Function scrapes current+prior year on a 24h schedule with a
  summaryDiscCache to avoid re-fetching summary pages in steady state
- Backfill admin script handles full 2005-present history with a Firestore
  subcollection cursor (/scrapers/lobbyingBackfill/processedUrls) that scales
  to ~50k URLs and is safely resumable

Files:
- functions/src/lobbying/{types,normalize,portal,scrapeLobbying,index}.ts
- scripts/firebase-admin/backfillLobbying.ts
- firestore.rules + firestore.indexes.json updated
- docs/lobbying-disclosure-ingestion.md: full plan, test plan, future work

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 docs/lobbying-disclosure-ingestion.md      | 240 ++++++++++
 firestore.indexes.json                     |  91 +++-
 firestore.rules                            |   8 +
 functions/src/index.ts                     |   2 +
 functions/src/lobbying/index.ts            |  12 +
 functions/src/lobbying/normalize.ts        |  73 +++
 functions/src/lobbying/portal.ts           | 491 +++++++++++++++++++++
 functions/src/lobbying/scrapeLobbying.ts   | 274 ++++++++++++
 functions/src/lobbying/types.ts            | 101 +++++
 scripts/firebase-admin/backfillLobbying.ts | 156 +++++++
 10 files changed, 1441 insertions(+), 7 deletions(-)
 create mode 100644 functions/src/lobbying/index.ts
 create mode 100644 functions/src/lobbying/normalize.ts
 create mode 100644 functions/src/lobbying/portal.ts
 create mode 100644 functions/src/lobbying/scrapeLobbying.ts
 create mode 100644 functions/src/lobbying/types.ts
 create mode 100644 scripts/firebase-admin/backfillLobbying.ts

diff --git a/docs/lobbying-disclosure-ingestion.md b/docs/lobbying-disclosure-ingestion.md
index 646cc4415..ad67fe397 100644
--- a/docs/lobbying-disclosure-ingestion.md
+++ b/docs/lobbying-disclosure-ingestion.md
@@ -346,6 +346,246 @@ export { scrapeLobbying } from "./lobbying"
 
 ---
 
+## Implementation Status
+
+| File                                         | Status  |
+| -------------------------------------------- | ------- |
+| `functions/src/lobbying/types.ts`            | ✅ Done |
+| `functions/src/lobbying/normalize.ts`        | ✅ Done |
+| `functions/src/lobbying/portal.ts`           | ✅ Done |
+| `functions/src/lobbying/scrapeLobbying.ts`   | ✅ Done |
+| `functions/src/lobbying/index.ts`            | ✅ Done |
+| `scripts/firebase-admin/backfillLobbying.ts` | ✅ Done |
+| `functions/src/index.ts` (export)            | ✅ Done |
+| `firestore.rules`                            | ✅ Done |
+| `firestore.indexes.json`                     | ✅ Done |
+
+### Document ID scheme
+
+As implemented, `registrantId` and `filingId` are not slugified strings (as
+originally planned above) but SHA-256 hashes (first 40 hex chars) of their
+logical keys: entity and client names contain arbitrary Unicode and punctuation
+that would require aggressive sanitization to fit Firestore ID constraints. The
+hash is stable across runs for the same logical record.
+
+---
+
+## Future Work (Subsequent PRs)
+
+### Frontend
+
+- **Dedicated lobbying pages**
+
+  - `/lobbyists` index: searchable list of registrants with total compensation,
+    client count, and year filter
+  - `/lobbyists/{registrantId}` profile: full client list, all bills lobbied,
+    compensation over time
+  - `/clients/{clientNameNorm}` profile: registrants hired, bills lobbied,
+    total spend per year
+
+- **Bill page integration** (`/bills/{court}/{billId}`)
+
+  - "Lobbying activity" section listing registrants + clients that lobbied this
+    bill, with position (Support / Oppose / Neutral) and compensation where
+    available
+  - Link to registrant profile pages
+
+- **Organization profile page integration**
+  - If an organization's normalized name matches a `clientNameNorm` in
+    `lobbyingFilings`, surface a "Lobbying history" panel showing which bills
+    they lobbied and which registrants they hired
+
+### MCP Tools
+
+Expose lobbying data via the MAPLE MCP server so that AI agents and Claude can
+answer questions like "who lobbied bill H1234?" or "what did Acme Corp lobby
+for in 2024?".
+
+- **`get_lobbying_filings_for_bill`** — given `generalCourt` + `billId`, return
+  all `lobbyingFilings` for that bill with registrant, client, position, and
+  amount
+- **`get_lobbying_registrant`** — given `registrantId`, return the registrant
+  document with client list and disclosure URLs
+- **`search_lobbying_by_client`** — given a client name (raw or normalized),
+  return matching filings across all courts
+- **`get_lobbying_summary_for_bill`** — aggregate view: unique registrant count,
+  unique client count, total compensation (where non-null), position breakdown
+
+---
+
+## Incremental Test Plan
+
+Testing proceeds from the inside out: unit logic first, then live portal
+fetches against the real site, then a small Firestore write, then a full
+backfill year, then steady-state function operation.
+
+### Step 1 — Unit test: normalization
+
+Run the normalization pipeline against known inputs and verify the outputs match
+the reference implementation.
+
+```bash
+# In a Node REPL or ts-node session:
+conda run -n maple-2025 ts-node -P tsconfig.script.json -e "
+const { normalizeEntityName } = require('./functions/src/lobbying/normalize')
+console.log(normalizeEntityName('Acme Corp., Inc. d/b/a Acme Consulting'))
+// Expected: 'ACME'
+console.log(normalizeEntityName('LAN-TEL COMMUNICATIONS, INC.'))
+// Expected: 'LAN TEL COMMUNICATIONS'
+console.log(normalizeEntityName('Law Office of Jane Smith, LLC'))
+// Expected: 'JANE SMITH'
+"
+```
+
+### Step 2 — Unit test: chamber normalization and billId construction
+
+```bash
+conda run -n maple-2025 ts-node -P tsconfig.script.json -e "
+const { normalizeChamber, constructBillId } = require('./functions/src/lobbying/portal')
+console.log(normalizeChamber('HB'))           // House Bill
+console.log(normalizeChamber('SB'))           // Senate Bill
+console.log(normalizeChamber('Executive'))    // Executive
+console.log(normalizeChamber('FY2024'))       // Other
+console.log(constructBillId('House Bill', '1234'))   // H1234
+console.log(constructBillId('Senate Bill', '567'))   // S567
+console.log(constructBillId('House Docket', '89'))   // HD89
+console.log(constructBillId('Executive', 'EOEEA'))   // null
+"
+```
+
+### Step 3 — Live portal fetch: summary links
+
+Verify the portal is reachable and returns results for a recent year (2024
+below). Only `fetchSummaryLinks` is called, so no disclosure pages are fetched.
+
+```bash
+conda run -n maple-2025 ts-node -P tsconfig.script.json -e "
+const { makePortalClient, fetchSummaryLinks } = require('./functions/src/lobbying/portal')
+const client = makePortalClient()
+fetchSummaryLinks(client, 2024).then(urls => {
+  console.log('Summary links for 2024:', urls.length)
+  console.log('First URL:', urls[0])
+}).catch(console.error)
+"
+```
+
+Expected: ~400–600 URLs, each containing `Summary.aspx`.
+
+### Step 4 — Live portal fetch: summary meta + one disclosure
+
+Pick the first summary URL from Step 3 and fetch its meta and first disclosure.
+
+```bash
+conda run -n maple-2025 ts-node -P tsconfig.script.json -e "
+const { makePortalClient, fetchSummaryLinks, fetchDisclosureMeta, fetchDisclosureDetail } = require('./functions/src/lobbying/portal')
+async function main() {
+  const client = makePortalClient()
+  const [summaryUrl] = await fetchSummaryLinks(client, 2024)
+  const meta = await fetchDisclosureMeta(client, summaryUrl)
+  console.log('Meta:', JSON.stringify(meta, null, 2))
+  if (meta.disclosureUrls[0]) {
+    const detail = await fetchDisclosureDetail(client, meta.disclosureUrls[0], 2024)
+    console.log('Compensation rows:', detail.compensation.length)
+    console.log('Bill rows:', detail.bills.length)
+    console.log('First bill:', detail.bills[0])
+  }
+}
+main().catch(console.error)
+"
+```
+
+Verify: `meta.entityName` is non-empty, `meta.regType` is `"Lobbyist"` or
+`"Employer"`, bill rows have `billId` set correctly for legislative chambers.
+
+### Step 5 — Backfill: single year, small limit against dev Firestore
+
+Write a small batch to the local Firestore emulator or the dev project.
+
+```bash
+# Against local emulator:
+conda run -n maple-2025 yarn firebase-admin run-script backfillLobbying \
+  --env local -- --year 2024 --limit 3
+
+# Against dev project (writes real Firestore):
+GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
+  conda run -n maple-2025 yarn firebase-admin run-script backfillLobbying \
+  --env dev -- --year 2024 --limit 3
+```
+
+Verify in Firestore console or emulator UI:
+
+- `lobbyingRegistrants` has 3 documents with `entityName`, `entityNameNorm`,
+  `regType`, `clients`, `generalCourt`
+- `lobbyingFilings` has documents with `billId` non-null for legislative rows
+  and null for Executive rows
+- `/scrapers/lobbyingBackfill/processedUrls` has entries with `url` and
+  `processedAt` fields
+- Re-running the same command skips already-processed URLs (output shows 0 new
+  disclosures)
+
+### Step 6 — Spot-check: bill join
+
+Pick a `lobbyingFiling` document with a non-null `billId` and a `generalCourt`
+≥ 192. Verify the bill exists in MAPLE:
+
+```
+/generalCourts/{filing.generalCourt}/bills/{filing.billId}
+```
+
+If the bill is found, the join key is correct. If not found, check: (a) whether
+MAPLE has data for that court, (b) whether the bill number format matches
+(prefix + integer, no leading zeros).
+
+### Step 7 — Backfill: full current year
+
+Once Step 5 passes, run without `--limit` for the current year:
+
+```bash
+GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
+  conda run -n maple-2025 yarn firebase-admin run-script backfillLobbying \
+  --env dev -- --year 2024
+```
+
+Monitor progress via console output. Expected: ~500–600 registrants, ~1,000
+disclosure pages, several thousand filing documents written.
+
+### Step 8 — Backfill: full history (2005–present)
+
+Run without `--year` to process all years. Can be interrupted and resumed:
+
+```bash
+GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
+  conda run -n maple-2025 yarn firebase-admin run-script backfillLobbying \
+  --env dev
+```
+
+Expected runtime: several hours at 1s/request. The subcollection cursor at
+`/scrapers/lobbyingBackfill/processedUrls` allows safe interruption and
+resumption.
+
+### Step 9 — Deploy and verify Cloud Function
+
+Deploy the function to the dev project:
+
+```bash
+conda run -n maple-2025 firebase deploy \
+  --only functions:maple:scrapeLobbying \
+  --project digital-testimony-dev
+```
+
+Trigger a manual run via the Firebase console or:
+
+```bash
+conda run -n maple-2025 yarn firebase-admin run-script runScrapers \
+  --env local --targets scrapeLobbying
+```
+
+Verify: Cloud Function logs show the expected number of new disclosures (should
+be near zero if backfill completed, since current+prior year are already
+processed).
+
+---
+
 ## Design Decisions
 
 | Decision                    | Choice                                                                       | Rationale                                                                                                                                                                |
diff --git a/firestore.indexes.json b/firestore.indexes.json
index 83cb3fa6d..c267a6868 100644
--- a/firestore.indexes.json
+++ b/firestore.indexes.json
@@ -788,25 +788,46 @@
       "collectionGroup": "ballotQuestions",
       "queryScope": "COLLECTION",
       "fields": [
-        { "fieldPath": "electionYear", "order": "ASCENDING" },
-        { "fieldPath": "ballotStatus", "order": "ASCENDING" }
+        {
+          "fieldPath": "electionYear",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "ballotStatus",
+          "order": "ASCENDING"
+        }
       ]
     },
     {
       "collectionGroup": "publishedTestimony",
       "queryScope": "COLLECTION_GROUP",
       "fields": [
-        { "fieldPath": "ballotQuestionId", "order": "ASCENDING" },
-        { "fieldPath": "publishedAt", "order": "DESCENDING" }
+        {
+          "fieldPath": "ballotQuestionId",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "publishedAt",
+          "order": "DESCENDING"
+        }
       ]
     },
     {
       "collectionGroup": "publishedTestimony",
       "queryScope": "COLLECTION",
       "fields": [
-        { "fieldPath": "billId", "order": "ASCENDING" },
-        { "fieldPath": "court", "order": "ASCENDING" },
-        { "fieldPath": "ballotQuestionId", "order": "ASCENDING" }
+        {
+          "fieldPath": "billId",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "court",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "ballotQuestionId",
+          "order": "ASCENDING"
+        }
       ]
     },
     {
@@ -898,6 +919,62 @@
           }
         }
       ]
+    },
+    {
+      "collectionGroup": "lobbyingFilings",
+      "queryScope": "COLLECTION",
+      "fields": [
+        {
+          "fieldPath": "generalCourt",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "billId",
+          "order": "ASCENDING"
+        }
+      ]
+    },
+    {
+      "collectionGroup": "lobbyingFilings",
+      "queryScope": "COLLECTION",
+      "fields": [
+        {
+          "fieldPath": "generalCourt",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "chamber",
+          "order": "ASCENDING"
+        }
+      ]
+    },
+    {
+      "collectionGroup": "lobbyingFilings",
+      "queryScope": "COLLECTION",
+      "fields": [
+        {
+          "fieldPath": "generalCourt",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "entityNameNorm",
+          "order": "ASCENDING"
+        }
+      ]
+    },
+    {
+      "collectionGroup": "lobbyingFilings",
+      "queryScope": "COLLECTION",
+      "fields": [
+        {
+          "fieldPath": "generalCourt",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "clientNameNorm",
+          "order": "ASCENDING"
+        }
+      ]
     }
   ],
   "fieldOverrides": [
diff --git a/firestore.rules b/firestore.rules
index a95586279..42db67276 100644
--- a/firestore.rules
+++ b/firestore.rules
@@ -103,6 +103,14 @@ service cloud.firestore {
       allow read: if true;
       allow write: if false;
     }
+    match /lobbyingRegistrants/{id} {
+      allow read: if true;
+      allow write: if false;
+    }
+    match /lobbyingFilings/{id} {
+      allow read: if true;
+      allow write: if false;
+    }
     match /transcriptions/{tid} {
       // public, read-only
       allow read: if true
diff --git a/functions/src/index.ts b/functions/src/index.ts
index 641255bf4..6c52b78c1 100644
--- a/functions/src/index.ts
+++ b/functions/src/index.ts
@@ -60,6 +60,8 @@ export {
 
 export { transcription } from "./webhooks"
 
+export { scrapeLobbying } from "./lobbying"
+
 export * from "./triggerPubsubFunction"
 
 // Export the health check last so it is loaded last.
diff --git a/functions/src/lobbying/index.ts b/functions/src/lobbying/index.ts
new file mode 100644
index 000000000..5e594cb34
--- /dev/null
+++ b/functions/src/lobbying/index.ts
@@ -0,0 +1,12 @@
+export { scrapeLobbying } from "./scrapeLobbying"
+export * from "./types"
+export { normalizeEntityName } from "./normalize"
+export {
+  constructBillId,
+  fetchDisclosureDetail,
+  fetchDisclosureMeta,
+  fetchSummaryLinks,
+  makePortalClient,
+  normalizeChamber,
+  yearToGeneralCourt
+} from "./portal"
diff --git a/functions/src/lobbying/normalize.ts b/functions/src/lobbying/normalize.ts
new file mode 100644
index 000000000..8d3d0a0ba
--- /dev/null
+++ b/functions/src/lobbying/normalize.ts
@@ -0,0 +1,73 @@
+/**
+ * Entity name normalization pipeline.
+ *
+ * The SoS portal does not enforce consistent name formatting. The same client or
+ * registrant may appear as "Acme Corp.", "ACME CORPORATION", "Acme, Inc. d/b/a
+ * Acme Consulting", etc. across filings and years.
+ *
+ * This pipeline is a direct port of the reference implementation used in the
+ * companion data analysis project. The steps must be applied in the exact order
+ * listed here; changing the order produces different (incorrect) output.
+ */
+
+// Step 2: strip d/b/a trade-name suffix before any other transforms so the
+// trade name doesn't bleed into the canonical form.
+const DBA_RE = /\s+D\s*\/+B\s*\/+A?\s+.*|\s+DBA\s+.*/i
+
+// Step 5: remove legal entity type words with whole-word matching so
+// "INCORPORATED" and "CORP" are caught in addition to "LLC"/"INC".
+const LEGAL_ENTITY_RE =
+  /\b(LLC|LLP|INC|INCORPORATED|CORPORATION|CORP|LTD|LIMITED|PC|PLLC)\b/g
+
+// Step 6: remove "THE" as a whole word anywhere (not just as a leading prefix).
+const THE_RE = /\bTHE\b/g
+
+// Step 8: known portal misspelling. The pattern is global because a plain
+// string pattern passed to String.replace only rewrites the first occurrence,
+// which would leave later misspellings behind for step 9 to miss.
+const ASSOCIATES_TYPO_RE = /ASSICIATES/g
+
+// Step 9: professional suffix phrases to remove wholesale. Order matters:
+// longer phrases precede the shorter phrases they contain.
+const MISC_PHRASES = [
+  "LAW OFFICE OF",
+  "AND ASSOCIATES",
+  // Never matches in practice: step 7 has already turned every "&" into
+  // "AND" by the time this list is applied. Kept for parity with the list.
+  "& ASSOCIATES",
+  "AND ASSOC",
+  "ATTORNEY AT LAW",
+  "ATTORNEY@LAW",
+  "ATTORNET AT LAW", // known portal typo
+  "AND PARTNERS",
+  "PUBLIC POLICY GROUP",
+  "LEGISLATIVE SERVICES",
+  "POLICY GROUP",
+  "ASSOCIATES",
+  "COUNSELLORS AT LAW"
+]
+
+/**
+ * Reduce a raw registrant or client name to its canonical comparison form.
+ *
+ * The ten steps run in a fixed order; later steps rely on the output of
+ * earlier ones (for example step 4 turns ",INC" into " INC" so that step 5's
+ * whole-word match can remove it).
+ *
+ * @param raw Name as scraped from the portal. `null`, `undefined` and the
+ *   empty string all normalize to "".
+ * @returns The upper-cased name with trade-name suffix, punctuation, legal
+ *   entity words, "THE" and professional suffix phrases removed and runs of
+ *   whitespace collapsed to single spaces.
+ */
+export function normalizeEntityName(raw: string | null | undefined): string {
+  if (!raw) return ""
+
+  let x = raw.toUpperCase() // Step 1: uppercase
+
+  x = x.replace(DBA_RE, "") // Step 2: strip d/b/a suffix
+
+  x = x.replace(/-/g, " ") // Step 3: hyphen → space
+
+  // Step 4: punctuation → space (not empty string, so ",INC" → " INC" → caught
+  // by step 5's whole-word removal).
+  for (const ch of [",", ".", "'", "‘", "’", "(", ")"]) {
+    x = x.split(ch).join(" ")
+  }
+
+  x = x.replace(LEGAL_ENTITY_RE, " ") // Step 5: remove legal entity type words
+
+  x = x.replace(THE_RE, " ") // Step 6: remove THE anywhere
+
+  x = x.replace(/&/g, "AND") // Step 7: ampersand → AND
+
+  x = x.replace(ASSOCIATES_TYPO_RE, "ASSOCIATES") // Step 8: fix every typo
+
+  // Step 9: remove professional suffix phrases
+  for (const phrase of MISC_PHRASES) {
+    x = x.split(phrase).join(" ")
+  }
+
+  x = x.replace(/\s+/g, " ").trim() // Step 10: collapse whitespace
+
+  return x
+}
diff --git a/functions/src/lobbying/portal.ts b/functions/src/lobbying/portal.ts
new file mode 100644
index 000000000..e441522b8
--- /dev/null
+++ b/functions/src/lobbying/portal.ts
@@ -0,0 +1,491 @@
+/**
+ * HTTP client and HTML parser for the MA Secretary of State lobbying portal.
+ *
+ * Portal: https://www.sec.state.ma.us/LobbyistPublicSearch/
+ *
+ * Page flow:
+ *   1. Search POST  → grdvSearchResultByTypeAndCategory table
+ *                     One row per registrant; each row has a Summary.aspx link.
+ *   2. Summary.aspx → registrant name/year/type + CompleteDisclosure links
+ *   3. CompleteDisclosure.aspx → per-client compensation + per-client bill activity
+ *
+ * Two disclosure HTML formats exist:
+ *   Modern (≥~2013): per-client compensation in grdvClientPaidToEntity;
+ *     per-client bill tables as grdvActivitiesNew{year}_{n}.
+ *   Legacy (<~2013): total salary in grdvSalaryPaid (no client breakdown);
+ *     all bill activity in a single grdvActivities table.
+ */
+
+import axios, { AxiosInstance } from "axios"
+import { JSDOM } from "jsdom"
+import { sha256 } from "js-sha256"
+import {
+  CHAMBER_PREFIXES,
+  LEGACY_CHAMBER_MAP,
+  LEGACY_TOTAL_CLIENT,
+  LobbyingChamber
+} from "./types"
+
+// ─── Constants ──────────────────────────────────────────────────────────────
+
+const BASE_URL = "https://www.sec.state.ma.us/LobbyistPublicSearch/"
+const SEARCH_URL = BASE_URL + "Default.aspx"
+const REQUEST_DELAY_MS = 1000
+const MAX_RETRIES = 5
+
+const IPAD_UA =
+  "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) " +
+  "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"
+
+const FIRST_GC = 183
+const FIRST_GC_START_YEAR = 2003
+
+// ─── Public types ───────────────────────────────────────────────────────────
+
+export interface RawCompensation {
+  clientName: string
+  amount: number | null
+}
+
+export interface RawBillActivity {
+  clientName: string
+  chamber: LobbyingChamber
+  rawBillNumber: string
+  billId: string | null // pre-computed from chamber + rawBillNumber
+  activityTitle: string
+  position: string
+  amount: number | null
+}
+
+export interface DisclosureMeta {
+  entityName: string
+  year: number | null
+  /** Portal reg_type mapped to our vocabulary */
+  regType: "Lobbyist" | "Employer"
+  disclosureUrls: string[]
+}
+
+export interface DisclosureDetail {
+  compensation: RawCompensation[]
+  bills: RawBillActivity[]
+}
+
+// ─── HTTP helpers ────────────────────────────────────────────────────────────
+
+/**
+ * Build the axios instance used for every portal request: it sends the iPad
+ * user-agent string and applies a 60 second default timeout.
+ */
+export function makePortalClient(): AxiosInstance {
+  const defaultHeaders = { "User-Agent": IPAD_UA }
+  return axios.create({ timeout: 60_000, headers: defaultHeaders })
+}
+
+/** Resolve after `ms` milliseconds. */
+function sleep(ms: number): Promise<void> {
+  return new Promise<void>(done => {
+    setTimeout(done, ms)
+  })
+}
+
+/**
+ * GET `url` and parse the response body into a DOM document.
+ *
+ * Every attempt is preceded by a delay of REQUEST_DELAY_MS * 2^attempt, so the
+ * first attempt waits REQUEST_DELAY_MS and each retry doubles the wait. Only
+ * axios errors are retried; any other error, and the axios error from the
+ * final attempt, is rethrown. The trailing "unreachable" error is only thrown
+ * when `retries` is less than 1.
+ */
+async function getHtml(
+  client: AxiosInstance,
+  url: string,
+  retries = MAX_RETRIES
+): Promise<Document> {
+  let attempt = 0
+  while (attempt < retries) {
+    // 2 ** 0 === 1, so attempt 0 waits exactly REQUEST_DELAY_MS.
+    await sleep(REQUEST_DELAY_MS * 2 ** attempt)
+    try {
+      const { data } = await client.get<string>(url, {
+        responseType: "text",
+        headers: { Accept: "text/html" }
+      })
+      return new JSDOM(data).window.document
+    } catch (err) {
+      const isLastAttempt = attempt === retries - 1
+      if (isLastAttempt || !axios.isAxiosError(err)) throw err
+    }
+    attempt += 1
+  }
+  throw new Error("unreachable")
+}
+
+/**
+ * POST `data` as a URL-encoded form to `url` and parse the response body into
+ * a DOM document.
+ *
+ * Uses the same delay-then-try schedule as the GET helper (REQUEST_DELAY_MS *
+ * 2^attempt before each attempt) but with a 180 second per-request timeout.
+ * Only axios errors are retried; any other error, and the axios error from
+ * the final attempt, is rethrown. The trailing "unreachable" error is only
+ * thrown when `retries` is less than 1.
+ */
+async function postHtml(
+  client: AxiosInstance,
+  url: string,
+  data: Record<string, string>,
+  retries = MAX_RETRIES
+): Promise<Document> {
+  const encodedForm = new URLSearchParams(data).toString()
+  let attempt = 0
+  while (attempt < retries) {
+    // 2 ** 0 === 1, so attempt 0 waits exactly REQUEST_DELAY_MS.
+    await sleep(REQUEST_DELAY_MS * 2 ** attempt)
+    try {
+      const response = await client.post<string>(url, encodedForm, {
+        responseType: "text",
+        headers: {
+          "Content-Type": "application/x-www-form-urlencoded",
+          Accept: "text/html"
+        },
+        timeout: 180_000
+      })
+      return new JSDOM(response.data).window.document
+    } catch (err) {
+      const isLastAttempt = attempt === retries - 1
+      if (isLastAttempt || !axios.isAxiosError(err)) throw err
+    }
+    attempt += 1
+  }
+  throw new Error("unreachable")
+}
+
+// ─── Year / General Court helpers ────────────────────────────────────────────
+
+/**
+ * Map a calendar year to its General Court number, counting two-year terms
+ * from FIRST_GC, whose term began in FIRST_GC_START_YEAR.
+ */
+export function yearToGeneralCourt(year: number): number {
+  const termsSinceFirst = Math.floor((year - FIRST_GC_START_YEAR) / 2)
+  return FIRST_GC + termsSinceFirst
+}
+
+// ─── Chamber normalization ────────────────────────────────────────────────────
+
+/**
+ * Normalize raw portal chamber string to a canonical LobbyingChamber value.
+ *
+ * Surrounding whitespace is ignored. Legacy short codes listed in
+ * LEGACY_CHAMBER_MAP are translated, the five canonical labels pass through
+ * unchanged, and any other text becomes "Other".
+ */
+export function normalizeChamber(raw: string): LobbyingChamber {
+  const trimmed = raw.trim()
+  // Own-property check: a bare `map[key]` lookup on a plain object also finds
+  // inherited members such as "constructor" or "toString" and would return a
+  // function where a chamber string is expected.
+  if (Object.prototype.hasOwnProperty.call(LEGACY_CHAMBER_MAP, trimmed)) {
+    return LEGACY_CHAMBER_MAP[trimmed]
+  }
+  const known: LobbyingChamber[] = [
+    "House Bill",
+    "Senate Bill",
+    "House Docket",
+    "Senate Docket",
+    "Executive"
+  ]
+  if (known.includes(trimmed as LobbyingChamber))
+    return trimmed as LobbyingChamber
+  return "Other"
+}
+
+/**
+ * Build the MAPLE-compatible billId for a portal bill row.
+ *
+ * The portal lists bill numbers as bare integers, so the chamber supplies the
+ * letter prefix that tells H1234 apart from S1234. The number is read with
+ * base-10 parseInt, which drops leading zeros.
+ *
+ * @returns The prefixed id, or null when the chamber has no prefix (Executive
+ *   and Other, where no bill join is possible) or the number does not parse.
+ */
+export function constructBillId(
+  chamber: LobbyingChamber,
+  rawBillNumber: string
+): string | null {
+  const chamberPrefix = CHAMBER_PREFIXES[chamber]
+  const billNumber = parseInt(rawBillNumber, 10)
+  return chamberPrefix && !isNaN(billNumber)
+    ? `${chamberPrefix}${billNumber}`
+    : null
+}
+
+// ─── Document ID generation ───────────────────────────────────────────────────
+
+/**
+ * Stable Firestore document ID for a registrant: the first 40 hex characters
+ * of the SHA-256 of "year|entityName".
+ */
+export function registrantId(entityName: string, year: number): string {
+  const digest = sha256([year, entityName].join("|"))
+  return digest.substring(0, 40)
+}
+
+/**
+ * Stable Firestore document ID for a filing.
+ *
+ * The ID is the first 40 hex characters of the SHA-256 of the "|"-joined
+ * deduplication key. The chamber is always part of the key, which keeps
+ * null-bill rows from different chambers (e.g. executive vs. legislative)
+ * from hashing to the same document; a null billId is encoded as "__null__".
+ */
+export function filingId(
+  entityName: string,
+  clientName: string,
+  chamber: LobbyingChamber,
+  billId: string | null,
+  generalCourt: number,
+  position: string
+): string {
+  const billPart = billId ?? "__null__"
+  const keyParts: Array<string | number> = [
+    entityName,
+    clientName,
+    chamber,
+    billPart,
+    generalCourt,
+    position
+  ]
+  const digest = sha256(keyParts.join("|"))
+  return digest.slice(0, 40)
+}
+
+// ─── Amount parsing ───────────────────────────────────────────────────────────
+
+/**
+ * Parse a currency cell such as "$1,234.50" into a number.
+ * Dollar signs and thousands separators are dropped first; text that does not
+ * start with a number yields null.
+ */
+function parseAmount(text: string): number | null {
+  const value = parseFloat(text.replace(/[$,]/g, "").trim())
+  if (isNaN(value)) return null
+  return value
+}
+
+// ─── Portal scraping functions ────────────────────────────────────────────────
+
+/**
+ * Collect every named hidden input on the page as a name → value map. On the
+ * ASP.NET WebForms search page these carry the ViewState fields that must be
+ * echoed back in the search POST.
+ */
+function extractViewState(doc: Document): Record<string, string> {
+  const fields: Record<string, string> = {}
+  const hiddenInputs = doc.querySelectorAll('input[type="hidden"]')
+  for (const el of Array.from(hiddenInputs)) {
+    const { name, value } = el as HTMLInputElement
+    if (name) fields[name] = value ?? ""
+  }
+  return fields
+}
+
+/**
+ * Fetch all Summary.aspx URLs for a given year.
+ *
+ * Loads the search page to pick up its hidden form fields, then submits one
+ * search POST with the page size set to 20000 so all registrants come back in
+ * a single response.
+ *
+ * @returns One URL per Summary.aspx link found in the results grid, or an
+ *   empty array when the results grid is missing from the response. Links
+ *   that do not already start with "http" are prefixed with BASE_URL.
+ */
+export async function fetchSummaryLinks(
+  client: AxiosInstance,
+  year: number
+): Promise<string[]> {
+  const searchPage = await getHtml(client, SEARCH_URL)
+
+  // Hidden fields first, so the explicit search fields below win on conflict.
+  const postData: Record<string, string> = {
+    ...extractViewState(searchPage),
+    __EVENTTARGET: "",
+    __EVENTARGUMENT: "",
+    ctl00$ContentPlaceHolder1$Search: "rdbSearchByType",
+    ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$ddlYear: String(year),
+    ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$txtN_ame: "",
+    ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$lddSearchType$DropDown:
+      "3",
+    ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$drpType: "L",
+    ctl00$ContentPlaceHolder1$drpPageSize: "20000",
+    ctl00$ContentPlaceHolder1$btnSearch: "Search"
+  }
+
+  const resultsPage = await postHtml(client, SEARCH_URL, postData)
+  const resultsGrid = resultsPage.querySelector(
+    '[id*="grdvSearchResultByTypeAndCategory"]'
+  )
+  if (!resultsGrid) return []
+
+  return Array.from(resultsGrid.querySelectorAll("a[href]"))
+    .map(anchor => (anchor as HTMLAnchorElement).href)
+    .filter(href => href && href.includes("Summary.aspx"))
+    .map(href => (href.startsWith("http") ? href : BASE_URL + href))
+}
+
+/**
+ * Fetch a Summary.aspx page and return the registrant metadata + disclosure URLs.
+ *
+ * The name, year and registration type are read from three label elements;
+ * a missing label reads as "". A registration type containing "Entity" maps
+ * to "Employer", anything else to "Lobbyist". `year` is null when the year
+ * label is not an integer. Every link whose href contains
+ * "CompleteDisclosure" is returned, prefixed with BASE_URL unless it already
+ * starts with "http".
+ */
+export async function fetchDisclosureMeta(
+  client: AxiosInstance,
+  summaryUrl: string
+): Promise<DisclosureMeta> {
+  const doc = await getHtml(client, summaryUrl)
+
+  const labelText = (id: string): string =>
+    doc.getElementById(id)?.textContent?.trim() ?? ""
+
+  const entityName = labelText("ContentPlaceHolder1_lblRegistrantName")
+  const parsedYear = parseInt(labelText("ContentPlaceHolder1_lblYear"), 10)
+  const isEntity = labelText("ContentPlaceHolder1_lblRegType").includes(
+    "Entity"
+  )
+
+  const disclosureUrls = Array.from(doc.querySelectorAll("a[href]"))
+    .map(anchor => anchor.getAttribute("href") ?? "")
+    .filter(href => href.includes("CompleteDisclosure"))
+    .map(href => (href.startsWith("http") ? href : BASE_URL + href))
+
+  return {
+    entityName,
+    year: isNaN(parsedYear) ? null : parsedYear,
+    regType: isEntity ? "Employer" : "Lobbyist",
+    disclosureUrls
+  }
+}
+
+/**
+ * Parse a CompleteDisclosure.aspx page.
+ *
+ * Handles both modern (≥~2013) and legacy (<~2013) HTML layouts. The modern
+ * markers are looked for first; if the modern compensation table exists or
+ * any modern bill rows were found, the result is returned without looking at
+ * the legacy tables.
+ *
+ * @param client  Axios instance used to GET the page.
+ * @param discUrl URL of the CompleteDisclosure page.
+ * @param year    NOTE(review): not referenced anywhere in the body; the page
+ *                layout is detected from the HTML alone.
+ * @returns Compensation rows and bill-activity rows. In the legacy layout
+ *   compensation is at most one row under the LEGACY_TOTAL_CLIENT sentinel,
+ *   and every bill row has `amount: null`.
+ */
+export async function fetchDisclosureDetail(
+  client: AxiosInstance,
+  discUrl: string,
+  year: number
+): Promise<DisclosureDetail> {
+  const doc = await getHtml(client, discUrl)
+  const compensation: RawCompensation[] = []
+  const bills: RawBillActivity[] = []
+
+  // ── Modern format ──────────────────────────────────────────────────────────
+  // Per-client compensation: first cell is the client, second the amount.
+  const compTable = doc.querySelector('[id*="grdvClientPaidToEntity"]')
+  if (compTable) {
+    compTable
+      .querySelectorAll("tr.GridRow, tr.GridAlternatingRow")
+      .forEach(row => {
+        const cells = Array.from(row.querySelectorAll("td")).map(
+          td => td.textContent?.trim() ?? ""
+        )
+        if (cells.length >= 2) {
+          compensation.push({
+            clientName: cells[0],
+            amount: parseAmount(cells[1])
+          })
+        }
+      })
+  }
+
+  // Bill activity tables — one per client per reporting period. Two ID patterns:
+  //   2014–2018: …rptActivityNew_grdvActivitiesNew_0      (no year suffix)
+  //   2019+:     …rptActivityNew2020_grdvActivitiesNew2020_0 (year suffix)
+  // The substring selector matches any element whose id contains the marker;
+  // matches that contain no GridRow/GridAlternatingRow rows contribute nothing.
+  doc.querySelectorAll('[id*="grdvActivitiesNew"]').forEach(actTable => {
+    // Find the client name by walking backwards/upwards from the table: step
+    // to the previous sibling, or to the parent when there is none, and stop
+    // at the first node that is, or contains, an element whose id includes
+    // "lblClientName". querySelector returns the first such descendant of the
+    // visited node in document order.
+    let clientName = ""
+    let node: Element | null = actTable
+    while ((node = node.previousElementSibling ?? node.parentElement)) {
+      const span = node.id?.includes("lblClientName")
+        ? node
+        : node.querySelector?.('[id*="lblClientName"]')
+      if (span) {
+        clientName = span.textContent?.trim() ?? ""
+        break
+      }
+      // NOTE(review): an element is never its own parent, so this looks like a
+      // no-op; the walk actually ends when the root is passed and `node`
+      // becomes null.
+      if (node === node.parentElement) break
+    }
+
+    actTable
+      .querySelectorAll("tr.GridRow, tr.GridAlternatingRow")
+      .forEach(row => {
+        const cells = Array.from(row.querySelectorAll("td")).map(
+          td => td.textContent?.trim() ?? ""
+        )
+        // Columns: House/Senate, Bill Number, Bill title, Position, Amount, Direct business
+        if (cells.length < 4) return
+        const chamber = normalizeChamber(cells[0])
+        const rawBillNumber = cells[1]
+        const billId = constructBillId(chamber, rawBillNumber)
+        bills.push({
+          clientName,
+          chamber,
+          rawBillNumber,
+          billId,
+          activityTitle: cells[2] ?? "",
+          position: cells[3] ?? "",
+          amount: cells.length > 4 ? parseAmount(cells[4]) : null
+        })
+      })
+  })
+
+  // Any modern marker means this is a modern page; skip the legacy parsing.
+  if (compTable || bills.length > 0) {
+    return { compensation, bills }
+  }
+
+  // ── Legacy format (<~2013) ─────────────────────────────────────────────────
+  // Salary table: sum the second cell of every row whose first cell does not
+  // contain "Total", and report the sum as one sentinel-client row. Nothing is
+  // recorded when the sum is not positive.
+  const salaryTable = doc.querySelector('[id*="grdvSalaryPaid"]')
+  if (salaryTable) {
+    let total = 0
+    salaryTable.querySelectorAll("tr").forEach(row => {
+      const cells = Array.from(row.querySelectorAll("td")).map(
+        td => td.textContent?.trim() ?? ""
+      )
+      if (cells.length >= 2 && !cells[0].includes("Total")) {
+        const amt = parseAmount(cells[1])
+        if (amt !== null) total += amt
+      }
+    })
+    if (total > 0) {
+      compensation.push({ clientName: LEGACY_TOTAL_CLIENT, amount: total })
+    }
+  }
+
+  // Legacy bill activity: single grdvActivities table. Three known column layouts:
+  //   2009 4-col:               Date | Bill+Title | Lobbyist | Client
+  //   2010+ individual 5-col:   Activity | Position | DirectBiz | Client | Compensation
+  //   2010+ entity 6-col:       Activity | Lobbyist | Position | DirectBiz | Client | Compensation
+  // The `$=` selector requires the id to END with "grdvActivities", so the
+  // modern "grdvActivitiesNew…" tables are not matched here.
+  const actTable = doc.querySelector('[id$="grdvActivities"]')
+  if (actTable) {
+    const allRows = Array.from(actTable.querySelectorAll("tr"))
+    const headerCells = Array.from(
+      allRows[0]?.querySelectorAll("th, td") ?? []
+    ).map(el => el.textContent?.trim() ?? "")
+
+    // Defaults describe the 4-col layout, which has no position column.
+    let billCol = 1
+    let positionCol: number | null = null
+    let clientCol = 3
+
+    // The first two header cells tell the 5-col and 6-col layouts apart.
+    if (headerCells[0]?.includes("Activity")) {
+      if (headerCells[1]?.includes("Lobbyist")) {
+        // 6-col entity layout
+        billCol = 0
+        positionCol = 2
+        clientCol = 4
+      } else {
+        // 5-col individual layout
+        billCol = 0
+        positionCol = 1
+        clientCol = 3
+      }
+    }
+
+    // Letter prefix of a legacy bill number → chamber.
+    const chamberMap: Record<string, LobbyingChamber> = {
+      H: "House Bill",
+      S: "Senate Bill",
+      HD: "House Docket",
+      SD: "Senate Docket"
+    }
+
+    // Row 0 is treated as the header and skipped.
+    allRows.slice(1).forEach(row => {
+      const cells = Array.from(row.querySelectorAll("td")).map(
+        td => td.textContent?.trim() ?? ""
+      )
+      // Skip rows too short to hold both the bill and the client column.
+      if (cells.length <= Math.max(billCol, clientCol)) return
+
+      const billCell = cells[billCol]
+      const skipValues = new Set([
+        "Activity or Bill No and Title",
+        "N/A",
+        "None",
+        "",
+        "Total amount"
+      ])
+      if (!billCell || skipValues.has(billCell)) return
+
+      // The first whitespace-delimited token must look like "H1234" (capital
+      // letters followed by digits); the rest of the cell is the title. Rows
+      // whose first token has any other shape are dropped.
+      const parts = billCell.split(/\s+/)
+      const billNo = parts[0]
+      const activityTitle = parts.slice(1).join(" ")
+      const match = billNo.match(/^([A-Z]+)(\d+)$/)
+      if (!match) return
+
+      const [, prefix, number] = match
+      const chamber: LobbyingChamber = chamberMap[prefix] ?? "Other"
+      const billId = constructBillId(chamber, number)
+      const position = positionCol !== null ? cells[positionCol] ?? "" : ""
+      const clientName = cells[clientCol] ?? ""
+
+      bills.push({
+        clientName,
+        chamber,
+        rawBillNumber: number,
+        billId,
+        activityTitle,
+        position,
+        amount: null
+      })
+    })
+  }
+
+  return { compensation, bills }
+}
diff --git a/functions/src/lobbying/scrapeLobbying.ts b/functions/src/lobbying/scrapeLobbying.ts
new file mode 100644
index 000000000..7a6140e8e
--- /dev/null
+++ b/functions/src/lobbying/scrapeLobbying.ts
@@ -0,0 +1,274 @@
+import { logger } from "firebase-functions"
+import { runWith } from "firebase-functions/v1"
+import { db, Timestamp } from "../firebase"
+import type { Database } from "../types"
+import { normalizeEntityName } from "./normalize"
+import {
+  fetchDisclosureDetail,
+  fetchDisclosureMeta,
+  fetchSummaryLinks,
+  filingId,
+  makePortalClient,
+  registrantId,
+  yearToGeneralCourt
+} from "./portal"
+import {
+  FILINGS_COLLECTION,
+  FIRST_LOBBYING_YEAR,
+  LobbyingFiling,
+  LobbyingRegistrant,
+  REGISTRANTS_COLLECTION,
+  SCRAPER_DOC
+} from "./types"
+
+/**
+ * Scraper state stored in Firestore at /scrapers/lobbying.
+ *
+ * processedDiscUrls: disc URLs already fetched; skip on re-runs.
+ * summaryDiscCache:  maps summaryUrl → its known disc URLs so we can skip
+ *                    summary page GETs for registrants with no new filings.
+ */
+interface ScraperState {
+  processedDiscUrls: string[]
+  summaryDiscCache: Record<string, string[]>
+}
+
+/**
+ * Maximum number of new disclosure pages to fetch per function invocation.
+ * Remaining work is picked up on the next scheduled run.
+ *
+ * NOTE(review): processDisclosure issues two portal GETs per disclosure (the
+ * summary page and the disclosure page) and each GET waits at least one
+ * second before it is sent, so budget 2s or more per disclosure. 200
+ * disclosures is then ~400s of the 540s timeout before counting the
+ * summary-page fetches made by the scraper loop itself — confirm the headroom.
+ */
+const MAX_DISCLOSURES_PER_RUN = 200
+
+/**
+ * Scrape lobbying disclosure data for the current and prior calendar year.
+ *
+ * Runs every 24 hours. New filers arrive semi-annually so daily polling is
+ * more than sufficient for steady-state freshness. For initial historical
+ * ingestion (2005-present) use the backfillLobbying admin script instead.
+ *
+ * Progress is checkpointed to Firestore after every disclosure page so the
+ * function is fully resumable if it times out or is interrupted.
+ *
+ * Failures are logged and skipped rather than aborting the run: a failed
+ * year search skips that year, a failed summary page skips that registrant,
+ * and a failed disclosure is not marked processed, so a later run sees it
+ * as new again.
+ */
+export const scrapeLobbying = runWith({ timeoutSeconds: 540, maxInstances: 1 })
+  .pubsub.schedule("every 24 hours")
+  .onRun(async () => {
+    // Current year first, then the prior year.
+    const currentYear = new Date().getFullYear()
+    const years = [currentYear, currentYear - 1]
+
+    // Load the checkpoint; a missing document or field starts empty.
+    const scraperRef = db.doc(SCRAPER_DOC)
+    const scraperDoc = await scraperRef.get()
+    const state: ScraperState = {
+      processedDiscUrls: scraperDoc.data()?.processedDiscUrls ?? [],
+      summaryDiscCache: scraperDoc.data()?.summaryDiscCache ?? {}
+    }
+    const processedSet = new Set<string>(state.processedDiscUrls)
+    const summaryCache: Record<string, string[]> = state.summaryDiscCache
+
+    const client = makePortalClient()
+    // Disclosures successfully processed in this invocation.
+    let newDiscCount = 0
+
+    for (const year of years) {
+      if (newDiscCount >= MAX_DISCLOSURES_PER_RUN) break
+
+      logger.info(`scrapeLobbying: fetching summary links for ${year}`)
+      let summaryUrls: string[]
+      try {
+        summaryUrls = await fetchSummaryLinks(client, year)
+      } catch (e) {
+        logger.error(
+          `scrapeLobbying: failed to fetch summary links for ${year}`,
+          e
+        )
+        continue
+      }
+      logger.info(
+        `scrapeLobbying: ${summaryUrls.length} registrants for ${year}`
+      )
+
+      for (const summaryUrl of summaryUrls) {
+        if (newDiscCount >= MAX_DISCLOSURES_PER_RUN) break
+
+        // Use cached disc URLs when available to avoid re-fetching summary pages.
+        // For current year we always re-check (new filings arrive mid-year).
+        let discUrls = summaryCache[summaryUrl]
+        if (!discUrls || year === currentYear) {
+          try {
+            const meta = await fetchDisclosureMeta(client, summaryUrl)
+            discUrls = meta.disclosureUrls
+
+            // Upsert the registrant doc and wait for the write to finish.
+            // Registrants whose name or year could not be read from the
+            // summary page are not written. The scraper state is checkpointed
+            // separately below so interruptions are recoverable.
+            if (meta.entityName && meta.year) {
+              await writeRegistrant(
+                db,
+                meta.entityName,
+                meta.year,
+                meta.regType,
+                discUrls
+              )
+            }
+
+            // Checkpoint the summary cache; the whole map is sent each time.
+            summaryCache[summaryUrl] = discUrls
+            await scraperRef.set(
+              { summaryDiscCache: summaryCache },
+              { merge: true }
+            )
+          } catch (e) {
+            logger.warn(
+              `scrapeLobbying: failed to fetch summary ${summaryUrl}`,
+              e
+            )
+            continue
+          }
+        }
+
+        // Only disclosures not yet recorded as processed need fetching.
+        const newDiscUrls = discUrls.filter(u => !processedSet.has(u))
+        if (newDiscUrls.length === 0) continue
+
+        for (const discUrl of newDiscUrls) {
+          if (newDiscCount >= MAX_DISCLOSURES_PER_RUN) break
+          try {
+            await processDisclosure(db, client, summaryUrl, discUrl, year)
+            processedSet.add(discUrl)
+            newDiscCount++
+
+            // Checkpoint after every disclosure so restarts lose at most one page
+            await scraperRef.set(
+              { processedDiscUrls: Array.from(processedSet) },
+              { merge: true }
+            )
+          } catch (e) {
+            // Not added to processedSet, so a later run sees it as new again.
+            logger.warn(
+              `scrapeLobbying: failed to process disclosure ${discUrl}`,
+              e
+            )
+          }
+        }
+      }
+    }
+
+    logger.info(`scrapeLobbying: processed ${newDiscCount} new disclosures`)
+  })
+
+// ─── Shared write helpers (also used by backfillLobbying) ────────────────────
+
+/**
+ * Upsert the summary-level fields of a LobbyingRegistrant document.
+ *
+ * The `clients` field is deliberately left out of the payload and the write
+ * is a merge, so a client list already stored on the document is not touched
+ * by this call.
+ */
+export async function writeRegistrant(
+  database: Database,
+  entityName: string,
+  year: number,
+  regType: "Lobbyist" | "Employer",
+  disclosureUrls: string[]
+): Promise<void> {
+  const id = registrantId(entityName, year)
+  const summaryFields: Omit<LobbyingRegistrant, "clients" | "fetchedAt"> & {
+    fetchedAt: FirebaseFirestore.Timestamp
+  } = {
+    registrantId: id,
+    entityName,
+    entityNameNorm: normalizeEntityName(entityName),
+    year,
+    generalCourt: yearToGeneralCourt(year),
+    regType,
+    disclosureUrls,
+    fetchedAt: Timestamp.now()
+  }
+  await database
+    .collection(REGISTRANTS_COLLECTION)
+    .doc(id)
+    .set(summaryFields, { merge: true })
+}
+
+/**
+ * Fetch one CompleteDisclosure page and write LobbyingFiling documents.
+ * Also updates the registrant's client list.
+ *
+ * The summary page is fetched again here to obtain the registrant's name,
+ * registration type and full list of disclosure URLs.
+ *
+ * @param database   Firestore handle to write to.
+ * @param client     Portal HTTP client.
+ * @param summaryUrl Summary.aspx URL of the registrant that owns the filing.
+ * @param discUrl    CompleteDisclosure.aspx URL to ingest.
+ * @param year       Registration year; determines the General Court.
+ * @throws Whatever the portal fetches or Firestore writes throw; callers
+ *   treat a throw as "not processed".
+ */
+export async function processDisclosure(
+  database: Database,
+  client: ReturnType<typeof makePortalClient>,
+  summaryUrl: string,
+  discUrl: string,
+  year: number
+): Promise<void> {
+  const meta = await fetchDisclosureMeta(client, summaryUrl)
+  const detail = await fetchDisclosureDetail(client, discUrl, year)
+
+  const { entityName, regType } = meta
+  const gc = yearToGeneralCourt(year)
+  const entityNameNorm = normalizeEntityName(entityName)
+  const now = Timestamp.now()
+
+  // Update registrant's client list
+  if (entityName && year) {
+    const regId = registrantId(entityName, year)
+    const regRef = database.collection(REGISTRANTS_COLLECTION).doc(regId)
+
+    const clients = detail.compensation.map(c => ({
+      clientName: c.clientName,
+      clientNameNorm: normalizeEntityName(c.clientName),
+      compensation: c.amount
+    }))
+
+    // A merge write replaces array fields wholesale, so store the registrant's
+    // complete disclosure list from the summary page rather than just this one
+    // URL; writing [discUrl] would drop every other filing's URL from the doc.
+    // discUrl is appended if the summary page somehow does not list it.
+    const disclosureUrls = meta.disclosureUrls.includes(discUrl)
+      ? meta.disclosureUrls
+      : [...meta.disclosureUrls, discUrl]
+
+    // NOTE(review): `clients` is likewise replaced on every call, so after a
+    // registrant's second disclosure is processed the document holds only that
+    // disclosure's clients. Confirm whether the lists should be combined.
+    await regRef.set(
+      {
+        registrantId: regId,
+        entityName,
+        entityNameNorm,
+        year,
+        generalCourt: gc,
+        regType: regType ?? "Lobbyist",
+        clients,
+        disclosureUrls,
+        fetchedAt: now
+      },
+      { merge: true }
+    )
+  }
+
+  // Write one LobbyingFiling doc per bill row
+  if (detail.bills.length === 0) return
+
+  const writer = database.bulkWriter()
+  for (const bill of detail.bills) {
+    const fid = filingId(
+      entityName,
+      bill.clientName,
+      bill.chamber,
+      bill.billId,
+      gc,
+      bill.position
+    )
+    const doc: LobbyingFiling = {
+      filingId: fid,
+      entityName,
+      entityNameNorm,
+      clientName: bill.clientName,
+      clientNameNorm: normalizeEntityName(bill.clientName),
+      year,
+      generalCourt: gc,
+      chamber: bill.chamber,
+      billId: bill.billId,
+      activityTitle: bill.activityTitle,
+      position: bill.position,
+      amount: bill.amount,
+      fetchedAt: now
+    }
+    // merge: false — a re-ingested row with the same filingId fully replaces
+    // the stored document.
+    writer.set(database.collection(FILINGS_COLLECTION).doc(fid), doc, {
+      merge: false
+    })
+  }
+  // close() flushes the queued writes and waits for them to complete.
+  await writer.close()
+}
+
+/**
+ * Every year from FIRST_LOBBYING_YEAR through the current calendar year, in
+ * ascending order. Used by the backfill script.
+ */
+export function allLobbyingYears(): number[] {
+  const current = new Date().getFullYear()
+  const count = Math.max(0, current - FIRST_LOBBYING_YEAR + 1)
+  return Array.from({ length: count }, (_, i) => FIRST_LOBBYING_YEAR + i)
+}
diff --git a/functions/src/lobbying/types.ts b/functions/src/lobbying/types.ts
new file mode 100644
index 000000000..83eaab761
--- /dev/null
+++ b/functions/src/lobbying/types.ts
@@ -0,0 +1,101 @@
+import {
+  Array,
+  InstanceOf,
+  Literal,
+  Number,
+  Null,
+  Record,
+  Static,
+  String,
+  Union
+} from "runtypes"
+import { Timestamp } from "../firebase"
+
+export type LobbyingChamber = Static<typeof LobbyingChamber>
+export const LobbyingChamber = Union(
+  Literal("House Bill"),
+  Literal("Senate Bill"),
+  Literal("House Docket"),
+  Literal("Senate Docket"),
+  Literal("Executive"),
+  Literal("Other")
+)
+
+export type LobbyingClient = Static<typeof LobbyingClient>
+export const LobbyingClient = Record({
+  clientName: String,
+  clientNameNorm: String,
+  compensation: Null.Or(Number)
+})
+
+export type LobbyingRegistrant = Static<typeof LobbyingRegistrant>
+export const LobbyingRegistrant = Record({
+  registrantId: String,
+  entityName: String,
+  entityNameNorm: String,
+  year: Number,
+  generalCourt: Number,
+  regType: Union(Literal("Lobbyist"), Literal("Employer")),
+  clients: Array(LobbyingClient),
+  disclosureUrls: Array(String),
+  fetchedAt: InstanceOf(Timestamp)
+})
+
+export type LobbyingFiling = Static<typeof LobbyingFiling>
+export const LobbyingFiling = Record({
+  filingId: String,
+  entityName: String,
+  entityNameNorm: String,
+  clientName: String,
+  clientNameNorm: String,
+  year: Number,
+  generalCourt: Number,
+  chamber: LobbyingChamber,
+  // Non-null only for legislative chambers (House Bill, Senate Bill, House Docket,
+  // Senate Docket). For Executive and Other, no bill join should be attempted.
+  billId: Null.Or(String),
+  activityTitle: String,
+  position: String,
+  amount: Null.Or(Number),
+  fetchedAt: InstanceOf(Timestamp)
+})
+
+/** Firestore path for lobbying registrant documents */
+export const REGISTRANTS_COLLECTION = "lobbyingRegistrants"
+
+/** Firestore path for lobbying filing documents */
+export const FILINGS_COLLECTION = "lobbyingFilings"
+
+/** Firestore path for the live scraper cursor document */
+export const SCRAPER_DOC = "/scrapers/lobbying"
+
+/** Firestore path for the backfill cursor subcollection */
+export const BACKFILL_DOC = "/scrapers/lobbyingBackfill"
+export const BACKFILL_URLS_COLLECTION = "processedUrls"
+
+/** Earliest year with portal data */
+export const FIRST_LOBBYING_YEAR = 2005
+
+/**
+ * Sentinel clientName used for pre-2013 legacy filings where compensation is
+ * reported as a single total rather than broken down per client.
+ */
+export const LEGACY_TOTAL_CLIENT = "_total_salary_"
+
+/**
+ * Chamber prefix map for constructing billId values that match MAPLE's Bill.id.
+ * Typed with a string index signature whose values may be undefined, so
+ * portal.ts can index it with any LobbyingChamber: chambers that have no
+ * entry here (Executive, Other) yield undefined instead of a "Property X does
+ * not exist" compile error.
+ */
+export const CHAMBER_PREFIXES: { [chamber: string]: string | undefined } = {
+  "House Bill": "H",
+  "Senate Bill": "S",
+  "House Docket": "HD",
+  "Senate Docket": "SD"
+}
+
+/** Canonical chamber values for legacy short-form codes found in older filings */
+export const LEGACY_CHAMBER_MAP: { [raw: string]: LobbyingChamber } = {
+  HB: "House Bill",
+  SB: "Senate Bill"
+}
diff --git a/scripts/firebase-admin/backfillLobbying.ts b/scripts/firebase-admin/backfillLobbying.ts
new file mode 100644
index 000000000..f7914dd84
--- /dev/null
+++ b/scripts/firebase-admin/backfillLobbying.ts
@@ -0,0 +1,156 @@
+/**
+ * Backfill lobbying disclosure data from 2005 to the present.
+ *
+ * This script is the primary ingestion path for all historical data. The live
+ * Cloud Function (scrapeLobbying) only handles the current and prior year in
+ * steady state. Run this once to populate the full history, and re-run with
+ * --year to refresh specific years.
+ *
+ * Usage:
+ *   GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
+ *     yarn firebase-admin run-script backfillLobbying --env dev
+ *
+ * Options:
+ *   --year  NUMBER   Only process this year (useful for testing or re-runs)
+ *   --limit NUMBER   Max registrants to process per year (for testing)
+ *
+ * Cursor storage:
+ *   Processed disclosure URLs are stored as documents in the Firestore
+ *   subcollection /scrapers/lobbyingBackfill/processedUrls/{urlHash}.
+ *   This scales to the full historical URL set (~50,000+) without hitting the
+ *   1MB Firestore document size limit. Restart the script at any time; it will
+ *   resume from where it left off.
+ */
+
+import { createHash } from "crypto"
+import { z } from "zod"
+import {
+  allLobbyingYears,
+  processDisclosure,
+  writeRegistrant
+} from "../../functions/src/lobbying/scrapeLobbying"
+import {
+  fetchDisclosureMeta,
+  fetchSummaryLinks,
+  makePortalClient
+} from "../../functions/src/lobbying/portal"
+import {
+  BACKFILL_DOC,
+  BACKFILL_URLS_COLLECTION,
+  FIRST_LOBBYING_YEAR
+} from "../../functions/src/lobbying/types"
+import { Script } from "./types"
+
+const Args = z
+  .object({
+    year: z.number().int().min(FIRST_LOBBYING_YEAR).optional(),
+    limit: z.number().int().positive().optional()
+  })
+  .passthrough()
+
+export const script: Script = async ({ db, args }) => {
+  const { year: onlyYear, limit } = Args.parse(args)
+
+  const years = onlyYear ? [onlyYear] : allLobbyingYears()
+  console.log(
+    `backfillLobbying: processing years ${years[0]}–${years[years.length - 1]}`
+  )
+
+  // Load already-processed disc URLs from the subcollection cursor.
+  const backfillRef = db.doc(BACKFILL_DOC)
+  const processedSnap = await backfillRef
+    .collection(BACKFILL_URLS_COLLECTION)
+    .select() // fetch only doc IDs (the URL hash), no field data needed
+    .get()
+  const processedHashes = new Set(processedSnap.docs.map(d => d.id))
+  console.log(
+    `backfillLobbying: ${processedHashes.size} disc URLs already processed`
+  )
+
+  const client = makePortalClient()
+  let totalNew = 0
+
+  for (const year of years) {
+    console.log(`\n── ${year} ──`)
+
+    let summaryUrls: string[]
+    try {
+      summaryUrls = await fetchSummaryLinks(client, year)
+    } catch (e) {
+      console.error(`  Failed to fetch summary links for ${year}:`, e)
+      continue
+    }
+
+    if (limit) summaryUrls = summaryUrls.slice(0, limit)
+    console.log(`  ${summaryUrls.length} registrants on portal`)
+
+    let yearNew = 0
+
+    for (let i = 0; i < summaryUrls.length; i++) {
+      const summaryUrl = summaryUrls[i]
+      let meta: Awaited<ReturnType<typeof fetchDisclosureMeta>>
+
+      try {
+        meta = await fetchDisclosureMeta(client, summaryUrl)
+      } catch (e) {
+        console.warn(
+          `  [${i + 1}/${
+            summaryUrls.length
+          }] Failed to fetch summary: ${summaryUrl}`,
+          e
+        )
+        continue
+      }
+
+      if (meta.entityName && meta.year) {
+        try {
+          await writeRegistrant(
+            db,
+            meta.entityName,
+            meta.year,
+            meta.regType,
+            meta.disclosureUrls
+          )
+        } catch (e) {
+          console.warn(`  Failed to write registrant ${meta.entityName}:`, e)
+        }
+      }
+
+      for (const discUrl of meta.disclosureUrls) {
+        const urlHash = createHash("sha256")
+          .update(discUrl)
+          .digest("hex")
+          .slice(0, 40)
+        if (processedHashes.has(urlHash)) continue
+
+        try {
+          await processDisclosure(db, client, summaryUrl, discUrl, year)
+
+          // Mark as processed in the subcollection cursor
+          await backfillRef
+            .collection(BACKFILL_URLS_COLLECTION)
+            .doc(urlHash)
+            .set({ url: discUrl, processedAt: new Date().toISOString() })
+
+          processedHashes.add(urlHash)
+          totalNew++
+          yearNew++
+        } catch (e) {
+          console.warn(`  Failed to process disclosure ${discUrl}:`, e)
+        }
+      }
+
+      if ((i + 1) % 50 === 0 || i + 1 === summaryUrls.length) {
+        console.log(
+          `  [${i + 1}/${
+            summaryUrls.length
+          }] ${yearNew} new disclosures this year`
+        )
+      }
+    }
+
+    console.log(`  ${year} complete: ${yearNew} new disclosures`)
+  }
+
+  console.log(`\nbackfillLobbying complete: ${totalNew} new disclosures total`)
+}

From 0074294861aece13daea9c90be3af634afd407fa Mon Sep 17 00:00:00 2001
From: Nathan <sandersn@gmail.com>
Date: Fri, 5 Jun 2026 16:34:07 -0400
Subject: [PATCH 3/4] feat: add Python Cloud Run scraper for lobbying
 disclosures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The MA SoS portal is protected by Imperva WAF, which uses TLS fingerprinting
to classify HTTP clients before examining headers. Python's requests library
produces a fingerprint that Imperva allows through; Node.js does not. A
standalone Cloud Run container (Python 3.12) is therefore used for the
scheduled ingestion instead of a Cloud Function.

lobbying-scraper/ — Cloud Run container (3 pip deps: requests, beautifulsoup4,
google-cloud-firestore):
- scrape.py: entry point with --mode weekly (incremental, fast exit if nothing
  new) and --mode backfill (full 2005-present history, resumable subcollection
  cursor). Weekly mode caches summary URL→disc URL mappings so prior-year
  registrants with no new filings require zero additional HTTP requests.
- portal.py: HTTP session management + HTML parsing for all three portal page
  levels (search POST, summary GET, disclosure GET). Handles both modern
  (>=2013) and legacy (<2013) disclosure formats.
- normalize.py: port of functions/src/lobbying/normalize.ts — 10-step entity
  name normalization pipeline, must match the TypeScript version exactly.
- writer.py: Firestore document construction and batch writes. Schema matches
  types.ts (lobbyingRegistrants, lobbyingFilings collections).

scripts/firebase-admin/backfillLobbying.ts — simplified to spawn scrape.py
as a subprocess; all HTTP and Firestore logic moved to Python.

functions/src/lobbying/http/ — thin Python HTTP helper kept for reference;
not used in the current architecture.

Note: server-side IP reputation behavior with Imperva untested. Build and run
the container on Cloud Run with --dry-run to validate before full deploy.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 docs/lobbying-disclosure-ingestion.md         | 167 +++++---
 functions/src/lobbying/http/.gitignore        |   3 +
 functions/src/lobbying/http/fetch.py          |  81 ++++
 functions/src/lobbying/http/requirements.txt  |   1 +
 functions/src/lobbying/normalize.ts           |   3 +-
 functions/src/lobbying/portal.ts              |  96 ++++-
 lobbying-scraper/.dockerignore                |   4 +
 lobbying-scraper/Dockerfile                   |  14 +
 .../__pycache__/normalize.cpython-37.pyc      | Bin 0 -> 1412 bytes
 .../__pycache__/portal.cpython-37.pyc         | Bin 0 -> 11941 bytes
 lobbying-scraper/normalize.py                 |  50 +++
 lobbying-scraper/portal.py                    | 376 ++++++++++++++++++
 lobbying-scraper/requirements.txt             |   3 +
 lobbying-scraper/scrape.py                    | 269 +++++++++++++
 lobbying-scraper/writer.py                    | 126 ++++++
 scripts/firebase-admin/backfillLobbying.ts    | 168 ++------
 16 files changed, 1157 insertions(+), 204 deletions(-)
 create mode 100644 functions/src/lobbying/http/.gitignore
 create mode 100644 functions/src/lobbying/http/fetch.py
 create mode 100644 functions/src/lobbying/http/requirements.txt
 create mode 100644 lobbying-scraper/.dockerignore
 create mode 100644 lobbying-scraper/Dockerfile
 create mode 100644 lobbying-scraper/__pycache__/normalize.cpython-37.pyc
 create mode 100644 lobbying-scraper/__pycache__/portal.cpython-37.pyc
 create mode 100644 lobbying-scraper/normalize.py
 create mode 100644 lobbying-scraper/portal.py
 create mode 100644 lobbying-scraper/requirements.txt
 create mode 100644 lobbying-scraper/scrape.py
 create mode 100644 lobbying-scraper/writer.py

diff --git a/docs/lobbying-disclosure-ingestion.md b/docs/lobbying-disclosure-ingestion.md
index ad67fe397..264c77c52 100644
--- a/docs/lobbying-disclosure-ingestion.md
+++ b/docs/lobbying-disclosure-ingestion.md
@@ -233,43 +233,57 @@ executive and legislative null rows.
 
 ## Scraper Architecture
 
-The lobbying portal is an HTML scraper, not a REST API. It does not fit the
-`createScraper` factory (which assumes list-IDs → fetch-per-ID against the MA
-Legislature API). Instead, we use a custom scheduled function following the
-`scrapeEvents` pattern.
-
-### Cloud Function: `scrapeLobbying`
-
-**File:** `functions/src/lobbying/scrapeLobbying.ts`
-
-- Schedule: `every 24 hours`
-- Scrapes the current year and prior year (new filers arrive semi-annually)
+### Why a standalone Cloud Run container
+
+The MA SoS portal is protected by Imperva WAF, which uses TLS fingerprinting to
+classify HTTP clients at the network layer before examining any headers. Node.js
+produces a TLS fingerprint that Imperva challenges with a JavaScript
+verification page; Python's `requests` library produces a fingerprint that
+Imperva allows through without challenge. This is a runtime-level constraint
+that cannot be addressed by header configuration or cipher reordering alone.
+
+The scraper therefore runs as a standalone **Cloud Run container** written in
+Python, deployed alongside the existing MCP server container. All data modeling,
+Firestore collection/field names, and normalization logic are documented here and
+kept consistent between the Python container and the TypeScript type definitions
+in `functions/src/lobbying/types.ts`.
+
+### Cloud Run container: `lobbying-scraper/`
+
+**Files:** `lobbying-scraper/{scrape,portal,normalize,writer}.py`
+
+- Scheduled weekly by Cloud Scheduler
+- Runs an incremental check: fetches the current and prior year's summary links
+  (one POST), compares disc URLs against the Firestore cursor, and **exits
+  immediately if nothing is new** (fast path, typically seconds)
+- When new or updated disclosures are found, fetches and processes them
 - Persists a cursor in `/scrapers/lobbying`:
-  - `lastFetchedAt: Timestamp`
-  - `processedDiscUrls: string[]` — already-fetched disclosure URLs (skipped on
-    re-runs)
+  - `processedDiscUrls: string[]` — disc URLs already written; skipped on
+    re-runs
+  - `summaryDiscCache: {[summaryUrl]: string[]}` — maps summary page URLs to
+    their disc URLs so summary page GETs are skipped for prior-year registrants
+    whose disclosures are all already processed
 - For each new disclosure URL:
   - Parse registrant + client compensation rows → upsert `lobbyingRegistrants`
-    doc
-  - Parse bill activity rows → batch-write `lobbyingFilings` docs
-- Uses `axios` (existing dependency) with an iPad `User-Agent` header to match
-  portal expectations
-- Uses `jsdom` for HTML table parsing (already a dependency; used by events scraper)
-- 1s delay between requests; exponential backoff on failure (matching existing
-  scraper retry pattern)
-- Function timeout: 540s
-
-### Incremental Strategy
-
-Processed disclosure URLs are stored in `/scrapers/lobbying.processedDiscUrls`.
-At ~2 disclosure URLs per registrant × ~500 registrants per year, the
-current+prior year window stays well within Firestore document limits.
-Historical years beyond current-1 are stable (filings are frozen after year
-closes) and are handled by the backfill script only.
-
-The backfill script uses a separate Firestore document
-(`/scrapers/lobbyingBackfill`) for its own cursor so it does not interfere with
-the live scraper.
+  - Parse bill activity rows → batch-write `lobbyingFilings`
+- 1s delay between requests; exponential backoff on transient failures
+
+### Incremental strategy
+
+In steady state (after the initial backfill), each weekly run:
+
+1. One POST to fetch all summary links for current + prior year
+2. For prior-year registrants with all disc URLs in the cursor: zero GETs
+3. For current-year registrants: one GET per summary page to check for new
+   disclosure periods
+4. For any new disc URLs: one GET per disclosure page
+
+New filings arrive twice a year (semi-annual reporting periods). Between
+periods, the run completes in under a minute.
+
+The backfill script (`--mode backfill`) uses a separate subcollection cursor at
+`/scrapers/lobbyingBackfill/processedUrls/{urlHash}` so it does not interfere
+with the live scraper state.
 
 ### Legacy Format (pre-2013)
 
@@ -284,26 +298,64 @@ them. No bill-level compensation amount is available for these years.
 
 ```
 functions/src/lobbying/
-  types.ts            — Runtypes definitions for LobbyingRegistrant, LobbyingFiling
-  scrapeLobbying.ts   — Scheduled Cloud Function + shared parsing/normalization logic
-  index.ts            — Re-exports
+  types.ts          — Runtypes definitions for LobbyingRegistrant, LobbyingFiling
+  normalize.ts      — Entity name normalization pipeline
+  portal.ts         — Reference implementation (HTTP layer not used in production)
+  scrapeLobbying.ts — Reference implementation (superseded by Cloud Run container)
+  index.ts          — Re-exports
+
+lobbying-scraper/
+  scrape.py         — Entry point: --mode weekly (incremental) | --mode backfill
+  portal.py         — HTTP + HTML parsing
+  normalize.py      — Port of normalize.ts
+  writer.py         — Firestore document construction + writes
+  requirements.txt  — requests, beautifulsoup4, google-cloud-firestore
+  Dockerfile        — Python 3.12-slim image
 ```
 
 ---
 
-## Firebase Admin Script
+## Deploying the Cloud Run Container
+
+Follows the same pattern as the MCP server. Requires the `maple-lobbying`
+Artifact Registry repository (the one named in `IMAGE` below) to exist.
+
+```bash
+cd lobbying-scraper
+IMAGE=us-central1-docker.pkg.dev/digital-testimony-dev/maple-lobbying/scraper:latest
+docker build -t $IMAGE . && docker push $IMAGE
+
+gcloud run jobs create maple-lobbying-scraper \
+  --image=$IMAGE \
+  --project=digital-testimony-dev \
+  --region=us-central1 \
+  --service-account=<scraper-sa>@digital-testimony-dev.iam.gserviceaccount.com
+
+# Schedule weekly via Cloud Scheduler
+gcloud scheduler jobs create http maple-lobbying-weekly \
+  --schedule="0 6 * * 1" \
+  --uri="https://us-central1-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/digital-testimony-dev/jobs/maple-lobbying-scraper:run" \
+  --http-method=POST \
+  --oauth-service-account-email=<scheduler-sa>@digital-testimony-dev.iam.gserviceaccount.com \
+  --location=us-central1
+```
 
-**File:** `scripts/firebase-admin/backfillLobbying.ts`
+## Historical Backfill (Admin Script)
 
-Ingests all historical filings from 2005 to the present. This is the primary
-path for all data before the current and prior year. Accepts `--year` and
-`--limit` CLI args for targeted re-runs or testing. Calls the same parsing
-logic exported from `functions/src/lobbying/scrapeLobbying.ts` and writes
-directly to Firestore via the firebase-admin SDK.
+Ingests all historical filings from 2005 to the present. Delegates to
+`scrape.py --mode backfill` via subprocess. Resumable — the subcollection
+cursor at `/scrapers/lobbyingBackfill/processedUrls` tracks what has been
+processed. Run it on your local machine (requires the `lobbying-scraper/`
+deps to be installed, or the `maple-2025` conda environment).
 
 ```bash
 GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
   yarn firebase-admin run-script backfillLobbying --env dev
+
+# Or call scrape.py directly for more control:
+cd lobbying-scraper
+python3 scrape.py --mode backfill --year 2024 --limit 3 --dry-run
+python3 scrape.py --mode backfill --year 2024
 ```
 
 ---
@@ -348,17 +400,22 @@ export { scrapeLobbying } from "./lobbying"
 
 ## Implementation Status
 
-| File                                         | Status  |
-| -------------------------------------------- | ------- |
-| `functions/src/lobbying/types.ts`            | ✅ Done |
-| `functions/src/lobbying/normalize.ts`        | ✅ Done |
-| `functions/src/lobbying/portal.ts`           | ✅ Done |
-| `functions/src/lobbying/scrapeLobbying.ts`   | ✅ Done |
-| `functions/src/lobbying/index.ts`            | ✅ Done |
-| `scripts/firebase-admin/backfillLobbying.ts` | ✅ Done |
-| `functions/src/index.ts` (export)            | ✅ Done |
-| `firestore.rules`                            | ✅ Done |
-| `firestore.indexes.json`                     | ✅ Done |
+| File                                         | Status  | Notes                                                      |
+| -------------------------------------------- | ------- | ---------------------------------------------------------- |
+| `functions/src/lobbying/types.ts`            | ✅ Done | TypeScript type definitions; source of truth for schema    |
+| `functions/src/lobbying/normalize.ts`        | ✅ Done | Normalization pipeline (also ported to `normalize.py`)     |
+| `functions/src/lobbying/portal.ts`           | ✅ Done | Kept for reference; HTTP layer not used (see architecture) |
+| `functions/src/lobbying/scrapeLobbying.ts`   | ✅ Done | Not deployed; superseded by Cloud Run container            |
+| `functions/src/lobbying/index.ts`            | ✅ Done |                                                            |
+| `functions/src/index.ts` (export)            | ✅ Done |                                                            |
+| `firestore.rules`                            | ✅ Done |                                                            |
+| `firestore.indexes.json`                     | ✅ Done |                                                            |
+| `lobbying-scraper/normalize.py`              | ✅ Done | Port of normalize.ts                                       |
+| `lobbying-scraper/portal.py`                 | ✅ Done | HTTP + HTML parsing                                        |
+| `lobbying-scraper/writer.py`                 | ✅ Done | Firestore document construction                            |
+| `lobbying-scraper/scrape.py`                 | ✅ Done | Entry point; `--mode weekly` and `--mode backfill`         |
+| `lobbying-scraper/Dockerfile`                | ✅ Done | Python 3.12 slim                                           |
+| `scripts/firebase-admin/backfillLobbying.ts` | ✅ Done | Calls `scrape.py --mode backfill` as subprocess            |
 
 ### Document ID scheme
 
diff --git a/functions/src/lobbying/http/.gitignore b/functions/src/lobbying/http/.gitignore
new file mode 100644
index 000000000..d0ee3b17c
--- /dev/null
+++ b/functions/src/lobbying/http/.gitignore
@@ -0,0 +1,3 @@
+venv/
+__pycache__/
+*.pyc
diff --git a/functions/src/lobbying/http/fetch.py b/functions/src/lobbying/http/fetch.py
new file mode 100644
index 000000000..4e6c2c4ec
--- /dev/null
+++ b/functions/src/lobbying/http/fetch.py
@@ -0,0 +1,81 @@
+"""Minimal HTTP fetch helper for the lobbying portal.
+
+Handles the portal's session cookie requirements that standard Node.js HTTP
+clients cannot satisfy due to TLS-layer constraints.
+
+Usage:
+    python3 fetch.py --url URL [--method GET|POST] [--jar PATH]
+
+POST body is read from stdin as application/x-www-form-urlencoded.
+Cookies are persisted to/from the JSON file at --jar so the session survives
+across multiple subprocess invocations.
+HTML response is written to stdout. Errors go to stderr with exit code 1.
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+import requests
+
+_UA = (
+    "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) "
+    "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"
+)
+
+
+def main() -> None:
+    p = argparse.ArgumentParser()
+    p.add_argument("--url", required=True)
+    p.add_argument("--method", default="GET", choices=["GET", "POST"])
+    p.add_argument("--jar", default=None, help="Path to JSON cookie-jar file")
+    args = p.parse_args()
+
+    session = requests.Session()
+    session.headers.update(
+        {
+            "User-Agent": _UA,
+            "Accept": "*/*",
+            "Accept-Encoding": "gzip, deflate, br",
+            "Connection": "keep-alive",
+        }
+    )
+
+    if args.jar:
+        jar = Path(args.jar)
+        if jar.exists():
+            try:
+                session.cookies.update(json.loads(jar.read_text()))
+            except Exception as e:
+                print(f"warning: could not read cookie jar: {e}", file=sys.stderr)
+
+    try:
+        if args.method == "POST":
+            body = sys.stdin.buffer.read()
+            resp = session.post(
+                args.url,
+                data=body,
+                headers={"Content-Type": "application/x-www-form-urlencoded"},
+                timeout=180,
+            )
+        else:
+            resp = session.get(args.url, timeout=60)
+
+        resp.raise_for_status()
+
+        if args.jar:
+            Path(args.jar).write_text(json.dumps(dict(session.cookies)))
+
+        sys.stdout.buffer.write(resp.content)
+
+    except requests.exceptions.HTTPError as e:
+        print(f"HTTP error {e.response.status_code}: {args.url}", file=sys.stderr)
+        sys.exit(1)
+    except requests.exceptions.RequestException as e:
+        print(f"request failed: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/functions/src/lobbying/http/requirements.txt b/functions/src/lobbying/http/requirements.txt
new file mode 100644
index 000000000..b18d51347
--- /dev/null
+++ b/functions/src/lobbying/http/requirements.txt
@@ -0,0 +1 @@
+requests>=2.28
diff --git a/functions/src/lobbying/normalize.ts b/functions/src/lobbying/normalize.ts
index 8d3d0a0ba..a7beb338f 100644
--- a/functions/src/lobbying/normalize.ts
+++ b/functions/src/lobbying/normalize.ts
@@ -5,8 +5,7 @@
  * registrant may appear as "Acme Corp.", "ACME CORPORATION", "Acme, Inc. d/b/a
  * Acme Consulting", etc. across filings and years.
  *
- * This pipeline is a direct port of the reference implementation used in the
- * companion data analysis project. The steps must be applied in the exact order
+ * The steps must be applied in the exact order
  * listed here; changing the order produces different (incorrect) output.
  */
 
diff --git a/functions/src/lobbying/portal.ts b/functions/src/lobbying/portal.ts
index e441522b8..64d65831b 100644
--- a/functions/src/lobbying/portal.ts
+++ b/functions/src/lobbying/portal.ts
@@ -19,6 +19,7 @@
 import axios, { AxiosInstance } from "axios"
 import { JSDOM } from "jsdom"
 import { sha256 } from "js-sha256"
+import { CookieJar } from "tough-cookie"
 import {
   CHAMBER_PREFIXES,
   LEGACY_CHAMBER_MAP,
@@ -72,19 +73,68 @@ export interface DisclosureDetail {
 
 // ─── HTTP helpers ────────────────────────────────────────────────────────────
 
-export function makePortalClient(): AxiosInstance {
-  return axios.create({
-    headers: { "User-Agent": IPAD_UA },
-    timeout: 60_000
+/**
+ * An axios instance paired with the cookie jar that holds its portal session.
+ *
+ * getHtml / postHtml read the jar into a Cookie header before each request and
+ * store any Set-Cookie headers from the response, so session cookies persist
+ * across the GET → POST page flow without the axios-cookiejar-support package.
+ */
+export interface PortalClient {
+  jar: CookieJar
+  client: AxiosInstance
+}
+
+/**
+ * Create a portal client pre-configured for the MA SoS portal.
+ *
+ * Axios follows ordinary redirects itself (maxRedirects: 10), and 4xx responses
+ * are returned rather than thrown (validateStatus) so getHtml / postHtml can
+ * save their cookies before raising. Caveat: the portal sits behind Imperva
+ * (Incapsula), which may answer first contact with a 302 challenge. Set-Cookie
+ * headers on an intermediate redirect are not seen by getHtml / postHtml, so
+ * cookies from such a challenge response are not captured in the jar.
+ */
+export function makePortalClient(): PortalClient {
+  const jar = new CookieJar()
+  const client = axios.create({
+    headers: {
+      "User-Agent": IPAD_UA,
+      Accept: "*/*",
+      "Accept-Encoding": "gzip, deflate, br",
+      Connection: "keep-alive"
+    },
+    timeout: 60_000,
+    maxRedirects: 10, // let axios handle ordinary redirects; only Incapsula challenges need manual handling
+    validateStatus: s => s < 500 // surface 4xx so we can log them
   })
+  return { jar, client }
 }
 
 function sleep(ms: number): Promise<void> {
   return new Promise(resolve => setTimeout(resolve, ms))
 }
 
+function cookieHeader(jar: CookieJar, url: string): string {
+  return jar
+    .getCookiesSync(url)
+    .map(c => c.cookieString())
+    .join("; ")
+}
+
+function saveCookies(
+  jar: CookieJar,
+  url: string,
+  headers: Record<string, string | string[] | undefined>
+): void {
+  const raw = headers["set-cookie"]
+  if (!raw) return
+  const list = Array.isArray(raw) ? raw : [raw]
+  for (const c of list) jar.setCookieSync(c, url)
+}
+
 async function getHtml(
-  client: AxiosInstance,
+  pc: PortalClient,
   url: string,
   retries = MAX_RETRIES
 ): Promise<Document> {
@@ -93,10 +143,16 @@ async function getHtml(
       attempt === 0 ? REQUEST_DELAY_MS : REQUEST_DELAY_MS * 2 ** attempt
     )
     try {
-      const res = await client.get<string>(url, {
+      const res = await pc.client.get<string>(url, {
         responseType: "text",
-        headers: { Accept: "text/html" }
+        headers: { Cookie: cookieHeader(pc.jar, url) }
       })
+      saveCookies(
+        pc.jar,
+        url,
+        res.headers as Record<string, string | string[] | undefined>
+      )
+      if (res.status >= 400) throw new Error(`HTTP ${res.status} for ${url}`)
       return new JSDOM(res.data).window.document
     } catch (e) {
       if (attempt === retries - 1) throw e
@@ -108,7 +164,7 @@ async function getHtml(
 }
 
 async function postHtml(
-  client: AxiosInstance,
+  pc: PortalClient,
   url: string,
   data: Record<string, string>,
   retries = MAX_RETRIES
@@ -119,14 +175,20 @@ async function postHtml(
       attempt === 0 ? REQUEST_DELAY_MS : REQUEST_DELAY_MS * 2 ** attempt
     )
     try {
-      const res = await client.post<string>(url, body, {
+      const res = await pc.client.post<string>(url, body, {
         responseType: "text",
         headers: {
           "Content-Type": "application/x-www-form-urlencoded",
-          Accept: "text/html"
+          Cookie: cookieHeader(pc.jar, url)
         },
         timeout: 180_000
       })
+      saveCookies(
+        pc.jar,
+        url,
+        res.headers as Record<string, string | string[] | undefined>
+      )
+      if (res.status >= 400) throw new Error(`HTTP ${res.status} for ${url}`)
       return new JSDOM(res.data).window.document
     } catch (e) {
       if (attempt === retries - 1) throw e
@@ -237,10 +299,10 @@ function extractViewState(doc: Document): Record<string, string> {
  * Sends a single search POST with page size 20000 to get all registrants at once.
  */
 export async function fetchSummaryLinks(
-  client: AxiosInstance,
+  pc: PortalClient,
   year: number
 ): Promise<string[]> {
-  const searchPage = await getHtml(client, SEARCH_URL)
+  const searchPage = await getHtml(pc, SEARCH_URL)
   const vs = extractViewState(searchPage)
 
   const postData: Record<string, string> = {
@@ -257,7 +319,7 @@ export async function fetchSummaryLinks(
     ctl00$ContentPlaceHolder1$btnSearch: "Search"
   }
 
-  const resultsPage = await postHtml(client, SEARCH_URL, postData)
+  const resultsPage = await postHtml(pc, SEARCH_URL, postData)
 
   const table = resultsPage.querySelector(
     '[id*="grdvSearchResultByTypeAndCategory"]'
@@ -280,10 +342,10 @@ export async function fetchSummaryLinks(
  * Fetch a Summary.aspx page and return the registrant metadata + disclosure URLs.
  */
 export async function fetchDisclosureMeta(
-  client: AxiosInstance,
+  pc: PortalClient,
   summaryUrl: string
 ): Promise<DisclosureMeta> {
-  const doc = await getHtml(client, summaryUrl)
+  const doc = await getHtml(pc, summaryUrl)
 
   const text = (id: string) => {
     const el = doc.getElementById(id)
@@ -322,11 +384,11 @@ export async function fetchDisclosureMeta(
  * Handles both modern (≥~2013) and legacy (<~2013) HTML layouts.
  */
 export async function fetchDisclosureDetail(
-  client: AxiosInstance,
+  pc: PortalClient,
   discUrl: string,
   year: number
 ): Promise<DisclosureDetail> {
-  const doc = await getHtml(client, discUrl)
+  const doc = await getHtml(pc, discUrl)
   const compensation: RawCompensation[] = []
   const bills: RawBillActivity[] = []
 
diff --git a/lobbying-scraper/.dockerignore b/lobbying-scraper/.dockerignore
new file mode 100644
index 000000000..9460c99c4
--- /dev/null
+++ b/lobbying-scraper/.dockerignore
@@ -0,0 +1,4 @@
+__pycache__/
+*.pyc
+*.pyo
+.env
diff --git a/lobbying-scraper/Dockerfile b/lobbying-scraper/Dockerfile
new file mode 100644
index 000000000..738293459
--- /dev/null
+++ b/lobbying-scraper/Dockerfile
@@ -0,0 +1,14 @@
+FROM python:3.12-slim
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY normalize.py portal.py writer.py scrape.py ./
+
+# Cloud Run sets PORT; we don't use it (this is a job, not a server).
+# NOTE: scrape.py runs once and exits; it cannot answer a Cloud Scheduler HTTP POST to /.
+ENV PYTHONUNBUFFERED=1
+
+CMD ["python3", "scrape.py", "--mode", "weekly"]
diff --git a/lobbying-scraper/__pycache__/normalize.cpython-37.pyc b/lobbying-scraper/__pycache__/normalize.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..47c3ba707ebdca4ef16776e15507d9f385437cab
GIT binary patch
literal 1412
zcmZux-EP}96c#1_Dt4N-N!xWp!EHbnrK|k{!&XIB5^bt&SWbfq83b8mVimHaQqr0j
z3ebx^!Y;O97tZA#W3Qph4j8ak@YN2LIL*aK@bKd~JRc7ae|Fn#S%C87pU>a?UK51B
z`CwJr0A2_iLh=&;BG>{!L=4bOv_-_R6iBv2rA38QsPv|KDcZGiRJ}xYy+k!YjS|%X
zHA#asiA>f=i?qo)xkEO{U2>1yC!42jTP9oN0eSd`NFI?-ey-SSM6v|sarPH1u7|0g
zp5xG+Q5;5W<_7-UP5mguu^-dG4{1l1dp@IHisOi-I6A>6S?KX8NjG6$H;5*ab3Z)o
zuH0!SO*+_0X`J9$mZW$>u^Y#MPYL!zIi30mre`h?MvPE~KfM11ds8<&1rnT2>9sm$
z5y?E7VB(*gP)5TPZ~LJaF&>M8qb!ZHv?Kq^bE=?X%?-mSEkjTG^DmB*XT9TOr~7QT
zl<Ie&=<MWNy{ndZt{qRdO;gKFbC?@_jkm#QI2fs8y_c_`G3b{#H^;r)G+r9qIMnhX
zcq+$}dHr}YKF|S3Kss-m>Jc97?HQU5-F#i`_poYN1I++ZYu<W#d(^zK&hMz>@nF=~
zUt@KQfn(lUDPHi=$~^vP2I8SQ8u#^)HNQVR+%*ji4+kJ$<NeX#a5&#I^?k!K)v@tP
z$Cf^N1u;Pcx91e{t=R5qgTub1o919-t(K9Kl;IrrqmUL7U_~q>=DsUxjHVe2J;ZYl
zr2?%AP2*R30U#HW20<3|3*l0@Kq*>?1m);bya2h7h)5)k&mNIVPxx`;Lgd=~TKl$h
zB%~Gakghz^>PK99S4r)LsZJVKOlhrT0v}+KuK3@c0aJbDy=d@Qa)?W5_6uQ&o-Ywr
z>Pz%>fjUd{RVG2`AKw3Z-T$^k+e@S@(NkN3qq2FnjH{Dme5TSYs#zRUR@5E1RL)3;
zBxRGL&S)IC9xa*<oPla`L*0Q}!a>`4X;_*wJQ%5#ZWT4>$l~fMWY>?C3gc-k(3vfI
zQ@a*VnVZl*1@;)U%kR6>XhypsO}f3v%VzK*lJ0?NIlDh-&K?bd=v^}P<L=Cj1A6l;
zpC=x3A;??LGd?f2ZXT-xH1GpB{0I0FmPwv~5Kskb1GS*aXrt^`cmuJ|V209yO*tg;
z9LJU&=Olw0K^=#QU@JsMi@F!hVn3j^Z0z?3BVB_Hv%6qrn_O&h@sNuLTyV;Ao43TL
lpeq{R%-|mg=(jvG$qtB$*b*D!Is{k~Wh6@?`u|$1{09r)g5CfC

literal 0
HcmV?d00001

diff --git a/lobbying-scraper/__pycache__/portal.cpython-37.pyc b/lobbying-scraper/__pycache__/portal.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..413885e3d657f24ebf306be5b8d9a45a80835a47
GIT binary patch
literal 11941
zcmcIq+ix7#d7tavxJ#}mN}{eF-7Kvxq$JCZE6cJek`^6PE1IM%X>4UQ+%sH`ID65V
zS&3ZFx@qLL2~rh`YXm`~6lBqd)Q?4g0xi&oqJKe)2F0L2ffNm(zO+D#JQ#iG?>n=*
zyogMiq}j!sIp;f<Ip@2b@B7ZVn#*M*{EGi}`2M$Fm85^ALg$x4=4~l2`Ts3T(oKm;
zro?2XcydKC6`As?r<$sGYo;dNx~b!>d5KEWOjc56O0?6xbR}bE+I@P=9<)h#*-FmL
zReH@{z>{X5)5lVKBs0&_<_?xI`&o~<lV!~TmNR#;UUN6=GxxAO+rj$TPB!pNGY8o&
zXD{1*Sz>$GAlv&)G4~0`zRQx6ye6^z?7*iIJMdU}ESviU<RCi)$RPnaARsR>0}w+%
z4hqO&b_9?k0&<8Q#sAndg&k)vf2x@;u@m^8M7h9Dp=^lqX?EtB%FePOKn??P6wfdq
z=h%5bj)*oV*l7Vj!7e;YvRBwe)Es56ve%x;>?`bbl#Zcvg8de|gr0A(%YYmgkT(Il
z@=RlIv9|$x8KpPb2xf4VjiP)4ZH}@rHjZBJuxo&v6uk^K0m!@TtAG?R?t5$!eXg@2
z%BR>3HpSjYd790jd|LE4%8syEeCH<n0I)MmoskN+8fit6CT3@+jFRU%)xfZ;%$S(H
zK55i#?mOI=uW=(-aE$9C#!PL-@M?2&%Wk!7)N4Gjy`fBIN@Ok>3qer#FAWbrdh}?>
zcS=KkU<b}n#U5(-!;_+yA51moJhwFC*u1nbjIMUsG3LG6qe~gXI6q{>6~@$!nOVd5
z$?tt;_>D@%=F5iXRu_FEu;)AnEia&DTrp((^(E1gJ7w1oxQ!K5?TRzJjBgAF%XP;%
zZH(3`b<YW$G1o77HNU}~ct8yG%Fq_IYopd3ezuL55(aUqz8$!=D#lz_Gv|7qVV45;
zp&Kk`GP93r23z}*z$K7au>+r$dA-IQUNuf$dGnJC=gz-UxMY+$dpIhPKV`dYwl-c3
zFc7{$6(jMuuHzS-M<3IwTGq$aR*VaO2m?+!WxKR&oV?r_PxPCyJs^$-G)@LJP|B!P
zz2(9SqZ%h|%$}Cu7lty=Ni!7WsK>5W(TUdT7Yj<1W_DngJlpppecp9EhBm1ib*i^L
z)aR}`b|Y}-8{SN<QIFJFyWBkgOEW|>JLb&W4KEM`XtL|ICgx=izdCfzIO$H=><wdd
z>ZWmH#yEe$x&Rt3f-UV50d5zJkvfQc%b9!64TfJk_v+C3q4TdAC*LFdpNZREb4rV~
z0w{pxI>YB*9Y25Z^>VBW|MRWqI&$tmCMD_lUsCQLkTZKNfji3=*t8;fB9dn!WnwH+
zX2uFyGc!?Z_>OTE(`@$4I8|_bT*$^1W3|$v6Et(}s&=uLxuxJkaO)WZ$=xVFIx%wn
z>iD!ZH9h{$ySK+@wjCF!Rwk~R%FNZMe{%fV$mkub)A#zwR4HCdoT_*U-F^reM>1t-
zcLh3Lc2uT7qpEmoc<ayti6~1l(qZpN#k8WHnDH$Vr)FP=*dIb%?ci>duq(AjH7FLe
zC}oK@mK9|zt5RbP&p|$GSq~bvN4-(%v#fP2Wm$hN@x7RrNWQ!_yikM4RvmvB7;03Y
zhWz1)$r<bFd!yDnjt2$dFSzyLiVX@6cQnvhzr<~b;c#0M)tC7W4EYuwUqg~nQnG+-
zjh1)NWDB$qdQ_l=-U2;%C)(v?`<=o&4R{9c9-uE9Wr+jYqPoocF*M(aBubVR?8=<O
zqdsmwvPjCTYJ&jHTo<8&1(Kr9Wl?WOkyuayo)e|&HQyyUdw~Re0AB$9`7Zk0Zc6r0
zGDt}mxp8hE!S*8=$K&@RA?&REMcvkTdL50oz*C2P2v0h;W6jZ7;)!e~SP~F{Az&sG
z^{y%1>rP+`+(bQ~FjQR3T5Fhat6(gRbX=5zKDS7HL(%lHSSwibz)g)WH`5+eJ;~2q
z)O9tj)!-hKRdP#w5JN;7_2VxiyCr@Oqhbn=Pi9dEKb)XjW99;C)-dC!4h$q<Kxc}R
zARg*Ml1u{xSYip_BpKz`a54s*xL!f#$1zy5Uo33BDBg{->o=Zui}l=gH`0lBu=3RA
zmD8g62}%l-2*x-?xvjY4Br0y>Y0FN(%n1^yAWA{uM=77+PSD`h5{yOhqqmA@luXDx
zGJzCIfjkK5*(dFhuvVq`9OebyQR<y{r)OrZYopQ5PR^Q{9hsiB?u?I20|E4fbrU0w
zjdl?)Km}W6r|NLqvr4rFtPWK9G^`}^b4UeG;S(xkjGqCZJ3*QpsdX{I*N_RN6=_v!
z$)Oy`iwQm$Ah#D2g|(_M2t3ml^p+CJp@P~Ap^O?qvf{QHx%u-neUu(wa!S}3A3BkK
z16CBrG<EQxTjzwYg38b170H3AFIH=hs+$q8qe_N@=QbUytw|iT_k~5@7T;c()5<m}
z6pVZTP=1gSffIo+ln5tjJOU?gBNIp~awx6JShwQ$Gpu<zA1dFM0^s4X{C(*M%8aB*
zff{J@%3_9Jk|p|6yW1iyD2+jI!YE9IfHz7(+ghBO96w8}3nKzno3Jq6WyU<ORgCsh
zV2gla1{lmKI~<mzf#0-XG2bZGsxS><dv-A>EK`BR&H%pg2q<-oYE2Ap+^@M+!}VdB
z`aXS>tRusiIDg@li=y*PoI?+2jliv9u0nJS=b=-@v}=v>0?mHe@E2^fBD2@W{?(bH
z@sqEAE{3FvI(O#XCHfq99yDC;Fe4C#9~8b@ZjkLO+7=5sCxsiSWhdaLDVO=c_8QJO
z=QSRsXQAXoc0I|WL_GWp{7~v(Io?cMR>6Qmp-ah(tl_W7c^Sg{2H+U+2wt&hy}$&d
z9Rw4QoQEJZ0~>(cYD*C;p{yvYV2c%X6^<cOepewuDX7IS<?uOm!Oovtu>A$kor@Ac
z*o9YLixN(?1p78hFE~rgEjxa&eh!hk=qztuQf|$1w_Mi3z(NB_G8lW5K9h_WLHlXU
z(`3*@xeY3T8&oTnm_UwxFyEvU9<fB)#F)GS`6t;9)q0*OkbUj39O~r+cs~&+D{7cn
z(ZrH<v1GNqWUa_AqrXXZP6Io*#ot5`Mw3Pi7w`600O4eM@V6-$L9+G4`54t3lw74`
zO%)4`m!5aw>y&|sZ=lu}i%I1P7i6zA^Tcbwsy{U2fX8c9C{D3zREi&|WLmjcH4&dk
z35|!9-^U6A^5KQ1d#0_*GN2%-UTmtThEFwjl$&n-jDg+O1NWXW=6Ex+=s5MWP(TkI
zvUYF6Av`+*%m-#-q*QY1L9;J@pB*Q{W9DWCilFMiYr*7n+Q;Jwo5}W!!8DEy7s!ti
zjXK;4CsM7OBiQC;e={<&6?@UK{CK6GVKFhIL-8fR=Jd}fd=hX@cq*t-vfVaPVGC?j
zd$bMo;1P)VAu>yU8ep$_ERRWdzYC5h@+<rZAt+RKN&`|=2?UCkj{tRgDA4vqX{n*Q
zl30ZT43eRGKgG1iv3gA3&)n|`D9s8r`6sf}(qyT5Dg>i3eNb9d_``2VL5?Lth3R`B
zX)GCPkT)=)EZrg*Lyu3i+tMBBLFP6df`L4#B0r1yJ|`Xonv#KAacYgA*|QK-yrI}j
ze@=97W)0)o_$>5M0GGL`Trvv1kq(!*?7$koK62Zd9-p0lcYG$&Xbfyb9;h4j0{s5X
zacs_G<C7zI_*XDgl;^hVI~EkMMSgt47br$YXDQ&<s71zEA_f4N{U|vb&nfC#BYv#j
z^g1U4Lg&}fhZm`1(hdTrQV$}TN3ye-<{I|;;0I{ulX**8U?1q*3z}!;19%jf-vd0-
z$cKXQyEd*yK59&K+T-X)Y|$n97bq<KS(oVFYpH_h(2;`Z>IR}8i-`_WL0X=uEiKf#
zWY>NU*;hbzU65Vvl6|R7b{NWSvg@13u79F$OZFdO+AzT8@ZwWQHqf2FPmr7uhfLNJ
zo=;PU8A@g;xk<?fNK6geW591wnefBO=o2L6cPM94@*yQ3At@x*FdWa}E>(Sv5{r^e
z`b)QHbVBt8`sLTtkD(TJkme&)5Yvmemwxo_+G}L4lD7JkbQk<pvd*3;W71<-9)(1)
zYvd_tM+=!DE0i`S7ai^zWaO<tf>vd&i5!Mng-OGtz0w~k4>a1#B8}YBNPkHCaRliS
z%#XP#L+4CbzV%JZ`!aKx*pAGaD}RkIiJdB;D01_V_0V-5#UZcDUae$%{*|G%#*vP}
z8qk^xE@KX~HY6!_Zc_7Zm05_v;NxW5ZTccy>h8dQ)%l_%Clp>qX`Ls8eO8FVQ5s{z
zUcW&IJXFV4`eTK(UWm{skPO@eDw%u82}CpaanI7<g}w_kvgt$hu@Hj$I!m!M%OHT-
z18t~>x<6Y6>RFcMKq&owg7vaKmhTd3NAp^2(is$I+7Pz8IX&qM6VEov*kGy#S^LIA
z*EVd!$5sNX(FdfoAh>&oFJSIwA7^uMnDFW{nNE@XN+eI3`h|1&nTfcnkT!FcHU7bP
zaTYEHbfSs)gD9eO^E$vyqZD}O&K-lk3b36`d3MQ}sCn3z&L4}%XuiIEi$*(yIm+F@
z;jSG|<rrh$9Zb0STfcPAU@0hC6lb~iOLz4ciziIjI5x&>^)WaSzsl@+orY*0-+l^=
z*C}K>1AnA>bo*v=K{cLIq>GSkS5!&9NnC}fW|0Y{T87NdMe?$gqIsA?o^ju4$44YE
z#_ULyjlxnW!}%BTLDs_3W=Tn1_F#H5`^w*A;QzuSHIL6bL21EiN0BWNMIKrc&vtu^
zrnsy-Ntz%jAu-8I$dpN+TxdaaKzqPlglh<mLOJLcWmWcv$c?1(9;up$bKhxem}Yi8
z%SYM*yauEQl|##9pf7mfKXQYGD0OvYX54~EK-atYXe8U476|1-;Q^t1evB5e^1+7K
zJn;*b+3Cfb=r#*Q^AK;Cr}yxRGBfbMM<?jhLZ`&`Ea~V-Bi+O+)VRG32<;-&6zxZ5
zk_#gF`~gavrW@_tVpe~PcK#70l9Ew6MqKAj=3ExO3%nTA@lVjIt<|EQ4hdhd>&TTQ
zGhZd`|2Z<G-Jr>oZ_(xfz4lD`eQ923wNJC;;xhfQ3@c7y3A{DDlX&aP+1MRuC2mR8
zLAbl*>Y8m=S865hz#fFVn+kPMe}Sc|3d^*TtcPVkmA)z6S6PnqUaa+6Dd<c%6Jat;
zK@-wWv!mDvQs|Bc<9_3|M;u2c%T8K@fec2(Dkz#nyBI$Z%J;Dn7$VsE7>%bFLB$4I
z5;ZU4EtDfzhy14~1ry35OAUap17Ir7#_eSv^`L$KU+}cut*qQU8dGdtNUehK)SAH1
zkmm6%RHo(4dDGnkm?9LJxqq8xq#(^h+f-r<QbA^7T*L>>)Hn`qYRib9raC7`zZo$#
z`$w^-Z#*LsQo<hrLqOvaVQe|hlXwbI+QjkvV^sVHo;ArV`~r-wN%Th9)Yduf$LMn}
zBY_otkMa~l;IC6cqA3EDg>=08|26@Lu%f@2j^k%UwrnS$YEhI-$T1CUrit1jbShTU
zu|kMLqFr0g{-045%P{11Kodb^m_S(==@})jCggrCcFtg`KStXZ0@pyNn4+*!I;J1V
z0LiiN&q(W1sE*`!Rlyra7I>2*D=(+wz*HQRf|OqDNT~=&L2VZ_#3+sliTF}JLtzT;
z)M<kkl7a&)I>8VKY|q3@mORI<I&2d`qnst&E9}g4*6}}>RXg-H2(_Ccul$>UadNoZ
zrWg*s?qXD-4>zoj)~VqU>+45G7_~yGVM}RG;6*~JBPHNn4&Sg_4u~!_#OG+Y#9P5T
ziB4+K)UI)tnTZMESGUO+$)!!J;Xa0dZzAykndSkjj9wPU*~*%Pcec<10{P;&&jLe;
z4M5&f$7j|CfwS9<%6=OkX!~YSs>5LG!ce5_=yw<o#l{3uUIThM%DbhO;SA%^p_XBy
zwi0l*6=Dab#^`D#!zBOuQW0h=6gZsttF08n5d@6T)QZNEtAL?jI?%&3%-YnV&M%P<
z2e&=V(tD*Kg?DBz+K?ejv5*IsZb|qGMM$#UuVkLeOZhM}COwt!CR#l#*UE-Ht=!Vp
zFvtHr%&nw_xtlDjh#}>|od0)I5}+BDhuiT5+Y#niUKk8tgt_}Ybguy_Jv${m_~Mqd
z^dro@N6h_)nESxCbKkXyuk%9>eimfoxno_u@$6fDK`!hK`(VuHLmjK@4YO1Z^WAl8
z=z9;XQ+u(6v0NLzdy(KvfbZS_-zDIm3+(J7?4S>BUpxUmfuWz>k{<jqu#|i%`+pJY
z7~yY%zEBHwHb@NIrqSLm_I89jo+^Br?Q8WfeJ$Jpy6h*q+?~RD4{W<$&<Z2}lpTy$
zyNIvxpRUoXwKLpFkgz}8`4y?v|5SePQ_QKKz7$h9$cMOF5>vH3r$gO2^@Kf7HGV4W
ziD%OqU@wW8450sK;Q;XlY7CYvgDa{^SrUA-E8K<ppRmK>E^4uupb~JoE9Nbfe$0-b
z4jd)we$0-3q_%dq_Ou488Fp-4{&*+PUdFABlXTDInbz8i+aqUi!z9Osa00OpTz3d_
z$uNmxkb?WNz}fJ2I16@!yTif9%A)*K<zIcOJ{SzsAt>A0AMOWaFx(&R!@N{JFY5NM
z^n<FIRi$+xJV5Jd9SnA^46t+ILBI}$hn}kbQ_=F!3f8nLhxAUZ%8Jx_sbz$FS2E#C
z>^$1`3jV|GnhyWP{1py@k5v9on8jesU*JkZaODADEF)@vhHnF3#FgP5CM~8xm%(=b
zJ!pXv`tONbtZ^aD>z1^h?+O=+LOX4(Iq9GUenu~8&U%*NWGB{|ySKh~A@+^VUdYjR
zD>b^Kt43ek;h8wn+e)Vji>*6J+VQ%6#vls9&&ZR-hj53m{@PxI(y{WOjDH6WvB3xz
zK?yeRSi%Gu=xE}E)I9yd7XIS&y6(XfTJ2d%FoGmvOZe1I%G37KojKLgUM4)G&c;N6
zW`Saa)>mmaQ;W@mFWg```4Rj0Vyp0$^$|PWwpk;EJ2*fPo<0`19m)^lt77L8-4H#h
z+f`gma~}$<nmL%D@g*8EKRXEPPWjhHHW$d*80`|+KC_1H=Y*8!w7J2#5@<Vse0&B+
zhZg)LN(@Q}Yrt)R@~4n=CrY@b$qL+V$GW}3P43?C`9HDj+a3IsF}nEiZ{gMb@NW)7
zV`~^P^;x=$)cq)}mhdH3OO^lxH5*%gs}$x#NI<N0?G_ZA^^{X(!fF0F+%nl&ZTC~Z
z_WEF(Eo{~WWbxnVE>Gs4qim+S7eT&(B3xKnWh__USqFHZpr6G6$`?i9S378iv6Z3W
zDA>&@u)=dlng=^GF>sJdmj-aGDo#RY>266=Ee?-F8lAK@vw|*0Z0P>~;cQ!}xz|M+
z{DZHZ{BY!~dG~Yy+TnEJ7=HzGFYM;`XdUM$A+MH`Q7LTENcEg5E+Ex$VTp+wKEez)
zb0U<AgVTpDZe@v3u27fad|Vueifb<My@s9b>r=BgW=AHi(aCoa6pYl0y@X?ty5|P`
z5AnH3$F056LZp{DV%3Giv6J-&G?Al}h)76RhE1hhGBXsPu|(L*RAF43dI=Ypd^3Z)
zD>2wN)7`w#5~fzAJAtGt5K+NBAzTsSq%X{D``%eRoG$L7nW|k0%oL)j!gl8q)G$To
z*ti|xnMuUoMJCg}zD2;E_KCZN8&uGoi*DUa$2~B&NQaXX_-1?ssalKV3MU85OvG1}
zHt~G4XPYer7l9@JN9ZW2@kw+_rC3g{JOJM?f&XrmPN%a<uYw2<%6X)UyqjtjYSl}%
zaH}%i<2^+l#Q%(%m9rQtf%bW{qhp{1TEo*+2uke@Wk3O3$s(?#0n)4NM#};8NQv*z
zxYSD%-=iMn!qI<FJ|gE;zJhrd&PGYgVzp9ym2JL(+b+1g#z}SXE0hpZi1QR&aPHiQ
zbbL-KjFa+@62WpEp=(YXyEs%a)j9uSe9%H}qKF~zkaAz8gid6{DUF$luf@=97(PVM
zVM@M72`R?-_6}_bF~{$x++j*iQX<6UHOjp~$tWf7P%=r$JxUlQRZ1wbCS3E_eHDVW
zLa?>%P2@xnAayyu(szY4ia(7c0W8N?$MV29@T&pKss`$chthTgAg|*#LZUy>-_xJ!
cPv^CKsz2Ya@5}f0^ydchDDCSv@~OP`zs>ea9{>OV

literal 0
HcmV?d00001

diff --git a/lobbying-scraper/normalize.py b/lobbying-scraper/normalize.py
new file mode 100644
index 000000000..6e6f7418e
--- /dev/null
+++ b/lobbying-scraper/normalize.py
@@ -0,0 +1,50 @@
+"""Entity name normalization pipeline.
+
+Direct port of functions/src/lobbying/normalize.ts. Steps must be applied in
+this exact order — changing the order produces different (incorrect) output.
+"""
+
+from __future__ import annotations
+
+import re
+
+_DBA_RE = re.compile(r"\s+D\s*/+B\s*/+A?\s+.*|\s+DBA\s+.*", re.IGNORECASE)  # d/b/a marker plus the text after it
+_LEGAL_RE = re.compile(
+    r"\b(LLC|LLP|INC|INCORPORATED|CORPORATION|CORP|LTD|LIMITED|PC|PLLC)\b"
+)  # no IGNORECASE: normalize_entity_name uppercases its input before using this
+_THE_RE = re.compile(r"\bTHE\b")
+_WS_RE = re.compile(r"\s+")
+
+_MISC_PHRASES = [  # removed by plain substring replacement, in list order
+    "LAW OFFICE OF",
+    "AND ASSOCIATES",
+    "& ASSOCIATES",  # cannot match as applied: step 7 has already turned "&" into "AND"
+    "AND ASSOC",
+    "ATTORNEY AT LAW",
+    "ATTORNEY@LAW",
+    "ATTORNET AT LAW",  # known portal typo
+    "AND PARTNERS",
+    "PUBLIC POLICY GROUP",
+    "LEGISLATIVE SERVICES",
+    "POLICY GROUP",
+    "ASSOCIATES",
+    "COUNSELLORS AT LAW",
+]
+
+
+def normalize_entity_name(raw: str | None) -> str:
+    """Canonicalize an entity name for matching; the step order is significant."""
+    if not raw:
+        return ""
+    name = _DBA_RE.sub("", raw.upper())       # 1-2. uppercase, strip d/b/a suffix
+    for mark in ("-", ",", ".", "'", "‘", "’", "(", ")"):
+        name = name.replace(mark, " ")        # 3-4. hyphen and punctuation → space
+    name = _LEGAL_RE.sub(" ", name)           # 5. remove legal entity words
+    name = _THE_RE.sub(" ", name)             # 6. remove THE anywhere
+    name = name.replace("&", "AND")           # 7. ampersand → AND
+    name = name.replace("ASSICIATES", "ASSOCIATES")  # 8. fix known typo
+    for phrase in _MISC_PHRASES:              # 9. remove professional suffix phrases
+        name = name.replace(phrase, " ")
+    # 10. collapse whitespace runs, then trim both ends
+    squeezed = _WS_RE.sub(" ", name)
+    return squeezed.strip()
diff --git a/lobbying-scraper/portal.py b/lobbying-scraper/portal.py
new file mode 100644
index 000000000..257721991
--- /dev/null
+++ b/lobbying-scraper/portal.py
@@ -0,0 +1,376 @@
+"""HTTP client and HTML parser for the MA SoS lobbying portal.
+
+Portal: https://www.sec.state.ma.us/LobbyistPublicSearch/
+
+Page flow:
+  1. Search POST  → summary links table
+  2. Summary.aspx → registrant name/year/type + CompleteDisclosure links
+  3. CompleteDisclosure.aspx → per-client compensation + per-client bill activity
+
+Two disclosure HTML formats:
+  Modern (>=~2013): grdvClientPaidToEntity + grdvActivitiesNew{year}_{n} tables.
+  Legacy (<~2013):  grdvSalaryPaid (total only) + grdvActivities (all bills).
+"""
+
+from __future__ import annotations
+
+import hashlib
+import re
+import time
+from dataclasses import dataclass, field
+from typing import Optional
+
+import requests
+from bs4 import BeautifulSoup, Tag
+
+# ── Constants ─────────────────────────────────────────────────────────────────
+
+BASE_URL = "https://www.sec.state.ma.us/LobbyistPublicSearch/"
+SEARCH_URL = BASE_URL + "Default.aspx"
+
+_UA = (
+    "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) "
+    "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"
+)
+_REQUEST_DELAY = 1.0  # seconds slept before every request; doubled on each retry
+_MAX_RETRIES = 5  # total attempts per request, including the first one
+
+# Lobby disclosure data begins in 2005; GC 183 started Jan 2003.
+FIRST_YEAR = 2005
+FIRST_GC = 183
+FIRST_GC_START_YEAR = 2003
+
+# clientName sentinel for pre-2013 filings where compensation is a single total
+LEGACY_TOTAL_CLIENT = "_total_salary_"
+
+# Maps canonical chamber names to the bill-ID prefix used in MAPLE's Bill.id
+CHAMBER_PREFIXES: dict[str, str] = {
+    "House Bill": "H",
+    "Senate Bill": "S",
+    "House Docket": "HD",
+    "Senate Docket": "SD",
+}
+
+# Legacy short-form chamber codes found in older filings
+LEGACY_CHAMBER_MAP: dict[str, str] = {
+    "HB": "House Bill",
+    "SB": "Senate Bill",
+}
+
+# ── Data types ────────────────────────────────────────────────────────────────
+
+
+@dataclass
+class Compensation:  # one compensation entry parsed from a disclosure page
+    client_name: str  # LEGACY_TOTAL_CLIENT for the summed total of a legacy filing
+    amount: Optional[float]  # None when the cell text does not parse as a number
+
+
+@dataclass
+class BillActivity:  # one bill/activity row parsed from a disclosure page
+    client_name: str
+    chamber: str          # canonical LobbyingChamber value
+    raw_bill_number: str  # as scraped; legacy-layout rows keep only the digits
+    bill_id: Optional[str]  # e.g. "H1234"; None when construct_bill_id cannot build one
+    activity_title: str
+    position: str
+    amount: Optional[float]  # always None for legacy-layout rows
+
+
+@dataclass
+class DisclosureMeta:  # registrant details read from a Summary.aspx page
+    entity_name: str
+    year: Optional[int]   # None when the page's year label is not an integer
+    reg_type: str         # "Lobbyist" | "Employer"
+    disclosure_urls: list[str] = field(default_factory=list)  # CompleteDisclosure links
+
+
+@dataclass
+class DisclosureDetail:  # everything parsed from one CompleteDisclosure page
+    compensation: list[Compensation] = field(default_factory=list)  # per-client, or one legacy total
+    bills: list[BillActivity] = field(default_factory=list)  # one entry per parsed bill row
+
+
+# ── Derived-value helpers ─────────────────────────────────────────────────────
+
+
+def year_to_general_court(year: int) -> int:  # e.g. 2003 and 2004 -> 183, 2021 -> 192
+    return FIRST_GC + (year - FIRST_GC_START_YEAR) // 2  # the number advances every two years
+
+
+def normalize_chamber(raw: str) -> str:
+    """Map a raw portal chamber label to its canonical name, or "Other"."""
+    label = raw.strip()
+    canonical = LEGACY_CHAMBER_MAP.get(label, label)  # expand legacy HB/SB codes
+    recognized = ("House Bill", "Senate Bill", "House Docket", "Senate Docket", "Executive")
+    return canonical if canonical in recognized else "Other"
+
+
+def construct_bill_id(chamber: str, raw_bill_number: str) -> Optional[str]:
+    """Build the MAPLE-compatible billId from a canonical chamber and bill number.
+
+    The prefix is what tells H1234 and S1234 apart: the two are different bills
+    that happen to share an integer. None is returned when the chamber has no
+    prefix (Executive, Other) or when the bill number is not an integer.
+    """
+    prefix = CHAMBER_PREFIXES.get(chamber, "")
+    if not prefix:
+        return None
+    try:
+        return prefix + str(int(raw_bill_number))
+    except (ValueError, TypeError):
+        return None
+
+
+def registrant_id(entity_name: str, year: int) -> str:
+    digest = hashlib.sha256(f"{year}|{entity_name}".encode())  # key is "<year>|<name as given>"
+    return digest.hexdigest()[:40]  # first 40 hex characters of the SHA-256
+
+
+def filing_id(
+    entity_name: str,
+    client_name: str,
+    chamber: str,
+    bill_id: Optional[str],
+    general_court: int,
+    position: str,
+) -> str:
+    fields = (entity_name, client_name, chamber, bill_id or "__null__", str(general_court), position)
+    digest = hashlib.sha256("|".join(fields).encode())  # "|"-joined key; a missing bill is "__null__"
+    return digest.hexdigest()[:40]
+
+
+# ── HTTP session ──────────────────────────────────────────────────────────────
+
+
+def make_session() -> requests.Session:
+    """Return a requests.Session carrying the scraper's default headers."""
+    s = requests.Session()
+    s.headers.update({
+        "User-Agent": _UA, "Accept": "*/*", "Connection": "keep-alive",
+        # No "br": requests can only decode Brotli when an optional brotli package is installed.
+        "Accept-Encoding": "gzip, deflate",
+    })
+    return s
+
+
+def _get(session: requests.Session, url: str) -> BeautifulSoup:
+    for attempt in range(_MAX_RETRIES):  # timeouts/connection errors retry; the last one re-raises
+        time.sleep(_REQUEST_DELAY * 2 ** attempt)  # 1x, 2x, 4x, ... (2 ** 0 == 1 on the first try)
+        try:
+            response = session.get(url, timeout=60)
+            response.raise_for_status()  # an HTTP error status propagates without a retry
+            return BeautifulSoup(response.text, "html.parser")
+        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as err:
+            if attempt + 1 >= _MAX_RETRIES:
+                raise
+            print(f"  GET retry {attempt + 1}: {err}")
+
+
+def _post(session: requests.Session, url: str, data: dict) -> BeautifulSoup:
+    for attempt in range(_MAX_RETRIES):  # timeouts/connection errors retry; the last one re-raises
+        time.sleep(_REQUEST_DELAY * 2 ** attempt)  # 1x, 2x, 4x, ... (2 ** 0 == 1 on the first try)
+        try:
+            response = session.post(url, data=data, timeout=180)
+            response.raise_for_status()  # an HTTP error status propagates without a retry
+            return BeautifulSoup(response.text, "html.parser")
+        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as err:
+            if attempt + 1 >= _MAX_RETRIES:
+                raise
+            print(f"  POST retry {attempt + 1}: {err}")
+
+
+# ── Portal scraping ───────────────────────────────────────────────────────────
+
+
+def _viewstate(soup: BeautifulSoup) -> dict:
+    """Collect every named hidden <input> as a name → value dict (missing value → "")."""
+    hidden_inputs = soup.find_all("input", type="hidden")
+    named = (field for field in hidden_inputs if field.get("name"))
+    pairs = ((field["name"], field.get("value", "")) for field in named)
+    return dict(pairs)
+
+
+def fetch_summary_links(session: requests.Session, year: int) -> list[str]:
+    """Run one search POST for `year` and return every Summary.aspx link it lists."""
+    form = _viewstate(_get(session, SEARCH_URL))  # hidden form fields from the search page
+    form.update({
+        "__EVENTTARGET": "",
+        "__EVENTARGUMENT": "",
+        "ctl00$ContentPlaceHolder1$Search": "rdbSearchByType",
+        "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$ddlYear": str(year),
+        "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$txtN_ame": "",
+        "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$lddSearchType$DropDown": "3",
+        "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$drpType": "L",
+        "ctl00$ContentPlaceHolder1$drpPageSize": "20000",
+        "ctl00$ContentPlaceHolder1$btnSearch": "Search",
+    })
+    results = _post(session, SEARCH_URL, form)
+    table = results.find("table", id=lambda x: x and "grdvSearchResultByTypeAndCategory" in x)
+    if not table:
+        return []
+    links: list[str] = []
+    for anchor in table.find_all("a", href=True):
+        href = anchor["href"]
+        if "Summary.aspx" in href:
+            links.append(href if href.startswith("http") else BASE_URL + href)
+    return links
+
+
+def fetch_disclosure_meta(session: requests.Session, summary_url: str) -> DisclosureMeta:
+    """Fetch a Summary.aspx page; return registrant name/year/type and disclosure URLs."""
+    soup = _get(session, summary_url)
+
+    def label_text(el_id: str) -> str:
+        found = soup.find(id=el_id)
+        if not found:
+            return ""
+        return found.get_text(strip=True)
+
+    year_label = label_text("ContentPlaceHolder1_lblYear")
+    # A year label that is not an integer yields year=None.
+    try:
+        year = int(year_label)
+    except ValueError:
+        year = None
+
+    links = []
+    for anchor in soup.find_all("a", href=True):
+        href = anchor["href"]
+        if "CompleteDisclosure" in href:
+            links.append(href if href.startswith("http") else BASE_URL + href)
+
+    is_entity = "Entity" in label_text("ContentPlaceHolder1_lblRegType")
+    return DisclosureMeta(
+        entity_name=label_text("ContentPlaceHolder1_lblRegistrantName"),
+        year=year,
+        reg_type="Employer" if is_entity else "Lobbyist",
+        disclosure_urls=links,
+    )
+
+
+def _parse_amount(text: str) -> Optional[float]:  # "$1,234.50" -> 1234.5; unparsable -> None
+    stripped = text.translate(str.maketrans("", "", "$,")).strip()
+    try:
+        return float(stripped)
+    except ValueError:
+        return None
+
+
+def _grid_rows(table: Tag) -> list[Tag]:  # rows whose class mentions "Grid" but not "Header"
+    return table.find_all("tr", class_=lambda css: css and "Grid" in css and "Header" not in css)
+
+
+def fetch_disclosure_detail(
+    session: requests.Session, disc_url: str, year: int
+) -> DisclosureDetail:
+    """Parse a CompleteDisclosure page in either layout; `year` is accepted but not used."""
+    soup = _get(session, disc_url)
+    compensation: list[Compensation] = []
+    bills: list[BillActivity] = []
+
+    # ── Modern format (>=~2013) ───────────────────────────────────────────────
+    comp_table = soup.find("table", id=lambda x: x and "grdvClientPaidToEntity" in (x or ""))
+    if comp_table:
+        for row in _grid_rows(comp_table):
+            cells = [td.get_text(strip=True) for td in row.find_all("td")]
+            if len(cells) >= 2:
+                compensation.append(Compensation(
+                    client_name=cells[0],
+                    amount=_parse_amount(cells[1]),
+                ))
+
+    act_tables = soup.find_all(
+        "table",
+        id=lambda x: x and re.search(r"grdvActivitiesNew(\d{4})?_\d+", x or ""),
+    )
+    for act_table in act_tables:
+        # Walk backwards to find the nearest lblClientName span
+        client_name = ""
+        node = act_table
+        while node:
+            node = node.find_previous(["span", "div", "td"])
+            if not node:
+                break
+            if node.get("id") and "lblClientName" in node["id"]:
+                client_name = node.get_text(strip=True)
+                break
+
+        for row in _grid_rows(act_table):
+            cells = [td.get_text(strip=True) for td in row.find_all("td")]
+            if len(cells) < 4:
+                continue
+            chamber = normalize_chamber(cells[0])
+            raw_num = cells[1]
+            bill_id = construct_bill_id(chamber, raw_num)
+            bills.append(BillActivity(
+                client_name=client_name,
+                chamber=chamber,
+                raw_bill_number=raw_num,
+                bill_id=bill_id,
+                activity_title=cells[2],  # in range: rows with fewer than 4 cells were skipped
+                position=cells[3],
+                amount=_parse_amount(cells[4]) if len(cells) > 4 else None,
+            ))
+
+    if comp_table or bills:  # modern layout recognised: skip the legacy parser
+        return DisclosureDetail(compensation=compensation, bills=bills)
+
+    # ── Legacy format (<~2013) ────────────────────────────────────────────────
+    salary_table = soup.find("table", id=lambda x: x and "grdvSalaryPaid" in (x or ""))
+    if salary_table:
+        total = 0.0
+        for row in salary_table.find_all("tr"):
+            cells = [td.get_text(strip=True) for td in row.find_all("td")]
+            if len(cells) >= 2 and "Total" not in cells[0]:
+                amt = _parse_amount(cells[1])
+                if amt:
+                    total += amt
+        if total:  # a zero total is not recorded at all
+            compensation.append(Compensation(client_name=LEGACY_TOTAL_CLIENT, amount=total))
+
+    act_table = soup.find("table", id=lambda x: x and x.endswith("grdvActivities"))
+    if act_table:
+        all_rows = act_table.find_all("tr")
+        headers = [th.get_text(strip=True)
+                   for th in (all_rows[0].find_all(["th", "td"]) if all_rows else [])]
+
+        if headers and "Activity" in headers[0]:
+            # 6-col entity layout has Lobbyist as second header
+            if len(headers) >= 2 and "Lobbyist" in headers[1]:
+                bill_col, pos_col, client_col = 0, 2, 4
+            else:
+                bill_col, pos_col, client_col = 0, 1, 3
+        else:
+            bill_col, pos_col, client_col = 1, None, 3
+
+        chamber_map = {"H": "House Bill", "S": "Senate Bill",
+                       "HD": "House Docket", "SD": "Senate Docket"}
+        skip = {"Activity or Bill No and Title", "N/A", "None", "", "Total amount"}
+
+        for row in all_rows[1:]:
+            cells = [td.get_text(strip=True) for td in row.find_all("td")]
+            if len(cells) <= max(bill_col, client_col):
+                continue
+            bill_cell = cells[bill_col]
+            if not bill_cell or bill_cell in skip:
+                continue
+            parts = bill_cell.split(None, 1)
+            bill_no = parts[0]
+            m = re.match(r"^([A-Z]+)(\d+)$", bill_no)
+            if not m:
+                continue
+            prefix, number = m.group(1), m.group(2)
+            chamber = chamber_map.get(prefix, "Other")
+            bill_id = construct_bill_id(chamber, number)
+            bills.append(BillActivity(
+                client_name=cells[client_col],  # in range: shorter rows were skipped above
+                chamber=chamber,
+                raw_bill_number=number,
+                bill_id=bill_id,
+                activity_title=parts[1] if len(parts) > 1 else "",
+                position=cells[pos_col] if pos_col is not None else "",  # pos_col < client_col
+                amount=None,
+            ))
+
+    return DisclosureDetail(compensation=compensation, bills=bills)
diff --git a/lobbying-scraper/requirements.txt b/lobbying-scraper/requirements.txt
new file mode 100644
index 000000000..5e7b4bcc7
--- /dev/null
+++ b/lobbying-scraper/requirements.txt
@@ -0,0 +1,3 @@
+requests>=2.28
+beautifulsoup4>=4.12
+google-cloud-firestore>=2.14
diff --git a/lobbying-scraper/scrape.py b/lobbying-scraper/scrape.py
new file mode 100644
index 000000000..fb985e05f
--- /dev/null
+++ b/lobbying-scraper/scrape.py
@@ -0,0 +1,269 @@
+"""Lobbying disclosure scraper — Cloud Run entry point.
+
+Runs on a weekly Cloud Scheduler trigger. Checks for new or amended disclosures
+and exits immediately if none are found (fast path). When new disclosures exist,
+fetches and writes them to Firestore.
+
+Also serves as the library used by the TypeScript backfill admin script via
+subprocess.
+
+Environment variables:
+  GOOGLE_CLOUD_PROJECT  — GCP project ID (set automatically in Cloud Run)
+  FIRESTORE_EMULATOR_HOST — set to use the local emulator (e.g. localhost:8080)
+
+CLI flags (for local / backfill use; --mode weekly|backfill picks the run type, default weekly):
+  --year YEAR     Only process this year (default: weekly = current + prior; backfill = all since 2005)
+  --limit N       Max registrants per year (for testing)
+  --dry-run       Fetch and parse but do not write to Firestore
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import sys
+from datetime import datetime, timezone
+
+from google.cloud import firestore
+
+from portal import (
+    FIRST_YEAR,
+    fetch_disclosure_detail,
+    fetch_disclosure_meta,
+    fetch_summary_links,
+    make_session,
+)
+from writer import (
+    BACKFILL_DOC,
+    BACKFILL_URLS_COLLECTION,
+    SCRAPER_DOC,
+    write_filings,
+    write_registrant,
+)
+
+
+# ── Cursor helpers ────────────────────────────────────────────────────────────
+
+
+def _load_live_cursor(db: firestore.Client) -> tuple[set[str], dict[str, list[str]]]:
+    """Read (processedDiscUrls, summaryDiscCache) from SCRAPER_DOC; empty when unset."""
+    snapshot = db.document(SCRAPER_DOC).get()
+    stored = snapshot.to_dict() or {}  # a falsy to_dict() result is treated as "nothing stored"
+    processed_urls = set(stored.get("processedDiscUrls", []))
+    disc_cache = stored.get("summaryDiscCache", {})
+    # processed_urls: disclosure URLs already handled; disc_cache: summary URL -> disclosure URLs
+    return processed_urls, disc_cache
+
+
+def _save_live_cursor(
+    db: firestore.Client,
+    processed: set[str],
+    cache: dict[str, list[str]],
+) -> None:
+    """Merge the in-memory cursor state into SCRAPER_DOC (the set is written as a list)."""
+    # NOTE(review): both fields grow with every URL seen — confirm the doc stays under Firestore's size cap.
+    payload = {"processedDiscUrls": list(processed), "summaryDiscCache": cache}
+    db.document(SCRAPER_DOC).set(payload, merge=True)
+
+
+def _is_backfill_processed(db: firestore.Client, disc_url: str) -> bool:
+    key = hashlib.sha256(disc_url.encode()).hexdigest()[:40]  # doc ID: truncated SHA-256 of the URL
+    return db.document(BACKFILL_DOC).collection(BACKFILL_URLS_COLLECTION).document(key).get().exists
+
+
+def _mark_backfill_processed(db: firestore.Client, disc_url: str) -> None:
+    key = hashlib.sha256(disc_url.encode()).hexdigest()[:40]  # same ID scheme as _is_backfill_processed
+    url_docs = db.document(BACKFILL_DOC).collection(BACKFILL_URLS_COLLECTION)
+    marker = {"url": disc_url, "processedAt": datetime.now(tz=timezone.utc).isoformat()}
+    url_docs.document(key).set(marker)
+
+
+# ── Core processing ───────────────────────────────────────────────────────────
+
+
+def process_disclosure(
+    db: firestore.Client | None,
+    session,
+    summary_url: str,
+    disc_url: str,
+    year: int,
+    dry_run: bool = False,
+) -> tuple[int, int]:
+    """Scrape one disclosure and, unless this is a dry run, persist it.
+
+    Returns (compensation_rows, filing_rows). In a dry run (or with no db)
+    filing_rows counts the parsed bill rows; otherwise it is write_filings' result.
+    """
+    meta = fetch_disclosure_meta(session, summary_url)
+    detail = fetch_disclosure_detail(session, disc_url, year)
+    n_compensation = len(detail.compensation)
+    n_filings = len(detail.bills)
+    if not dry_run and db is not None:
+        write_registrant(db, meta, detail, disc_url)
+        n_filings = write_filings(db, meta, detail)
+    return n_compensation, n_filings
+
+
+# ── Weekly incremental run ────────────────────────────────────────────────────
+
+
+def run_weekly(
+    db: "firestore.Client | None",
+    years: list[int],
+    limit: int | None = None,
+    dry_run: bool = False,
+) -> int:
+    """Incremental check of `years`; returns how many new disclosures were processed."""
+    current_year = datetime.now(tz=timezone.utc).year
+    processed, cache = _load_live_cursor(db) if db is not None else (set(), {})
+    dry_run = dry_run or db is None  # no db means nowhere to write (as in process_disclosure)
+    session = make_session()
+    new_count = 0
+
+    for year in years:
+        print(f"\n── {year} ──")
+        try:
+            summary_urls = fetch_summary_links(session, year)
+        except Exception as e:
+            print(f"  failed to fetch summary links: {e}", file=sys.stderr)
+            continue
+
+        if limit:
+            summary_urls = summary_urls[:limit]
+
+        print(f"  {len(summary_urls)} registrants on portal")
+
+        for summary_url in summary_urls:
+            # Use cached disc URLs for prior years; always re-check current year
+            disc_urls = cache.get(summary_url)
+            if disc_urls is None or year == current_year:
+                try:
+                    meta = fetch_disclosure_meta(session, summary_url)
+                    changed = meta.disclosure_urls != disc_urls
+                    disc_urls = cache[summary_url] = meta.disclosure_urls
+                    if changed and not dry_run:  # an unchanged list needs no cursor write
+                        _save_live_cursor(db, processed, cache)
+                except Exception as e:
+                    print(f"  failed to fetch summary {summary_url}: {e}", file=sys.stderr)
+                    continue
+
+            new_disc_urls = [u for u in disc_urls if u not in processed]
+            if not new_disc_urls:
+                continue
+
+            for disc_url in new_disc_urls:
+                try:
+                    comp_n, filing_n = process_disclosure(
+                        db, session, summary_url, disc_url, year, dry_run=dry_run
+                    )
+                    processed.add(disc_url)
+                    new_count += 1
+                    print(f"  processed: {comp_n} clients, {filing_n} filings")
+                    if not dry_run:  # persist after each disclosure so a crash loses little
+                        _save_live_cursor(db, processed, cache)
+                except Exception as e:
+                    print(f"  failed to process {disc_url}: {e}", file=sys.stderr)
+
+    return new_count
+
+
+# ── Historical backfill ───────────────────────────────────────────────────────
+
+
+def run_backfill(
+    db: "firestore.Client | None",
+    years: list[int],
+    limit: int | None = None,
+    dry_run: bool = False,
+) -> int:
+    """Full historical backfill using the subcollection cursor. Resumable."""
+    session = make_session()
+    total_new = 0
+    dry_run = dry_run or db is None  # no db means nowhere to write (as in process_disclosure)
+    for year in years:
+        print(f"\n── {year} ──")
+        try:
+            summary_urls = fetch_summary_links(session, year)
+        except Exception as e:
+            print(f"  failed to fetch summary links: {e}", file=sys.stderr)
+            continue
+
+        if limit:
+            summary_urls = summary_urls[:limit]
+
+        print(f"  {len(summary_urls)} registrants on portal")
+        year_new = 0
+
+        for i, summary_url in enumerate(summary_urls):
+            try:
+                meta = fetch_disclosure_meta(session, summary_url)
+            except Exception as e:
+                print(f"  [{i+1}/{len(summary_urls)}] failed to fetch summary: {e}", file=sys.stderr)
+                continue
+
+            for disc_url in meta.disclosure_urls:
+                if not dry_run and _is_backfill_processed(db, disc_url):  # handled by an earlier run
+                    continue
+                try:
+                    comp_n, filing_n = process_disclosure(
+                        db, session, summary_url, disc_url, year, dry_run=dry_run
+                    )
+                    if not dry_run:
+                        _mark_backfill_processed(db, disc_url)
+                    total_new += 1
+                    year_new += 1
+                except Exception as e:
+                    print(f"  failed to process {disc_url}: {e}", file=sys.stderr)
+
+            if (i + 1) % 50 == 0 or i + 1 == len(summary_urls):
+                print(f"  [{i+1}/{len(summary_urls)}] {year_new} new disclosures so far")
+
+        print(f"  {year} complete: {year_new} new disclosures")
+
+    return total_new
+
+
+# ── Entry point ───────────────────────────────────────────────────────────────
+
+
+def main() -> None:
+    """Parse CLI flags, run the selected mode, and report the result."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--year", type=int, default=None)
+    parser.add_argument("--limit", type=int, default=None)
+    parser.add_argument("--dry-run", action="store_true")
+    parser.add_argument(
+        "--mode",
+        choices=["weekly", "backfill"],
+        default="weekly",
+        help="weekly: incremental check; backfill: full history with subcollection cursor",
+    )
+    args = parser.parse_args()
+    weekly = args.mode == "weekly"
+    this_year = datetime.now(tz=timezone.utc).year
+
+    # Year selection: an explicit --year wins; weekly covers the current and
+    # previous year; backfill walks every year from FIRST_YEAR onwards.
+    if args.year:
+        years = [args.year]
+    elif weekly:
+        years = [this_year, this_year - 1]
+    else:
+        years = list(range(FIRST_YEAR, this_year + 1))
+
+    db = None if args.dry_run else firestore.Client()  # dry runs never open Firestore
+    runner = run_weekly if weekly else run_backfill
+    n = runner(db, years, limit=args.limit, dry_run=args.dry_run)
+    if not weekly:
+        print(f"\nBackfill complete: {n} new disclosures written.")
+    elif n == 0:
+        print("\nNo new disclosures found.")
+    else:
+        print(f"\nDone: {n} new disclosures written.")
+    # Emit structured result for callers (e.g. TypeScript backfill script)
+    print(json.dumps({"newDisclosures": n}), file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lobbying-scraper/writer.py b/lobbying-scraper/writer.py
new file mode 100644
index 000000000..a6804f401
--- /dev/null
+++ b/lobbying-scraper/writer.py
@@ -0,0 +1,126 @@
+"""Firestore document construction and write helpers.
+
+Mirrors the data model in functions/src/lobbying/types.ts. All collection
+names and field names must stay in sync with that file.
+"""
+
+from __future__ import annotations
+
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING
+
+from normalize import normalize_entity_name
+from portal import (
+    BillActivity,
+    Compensation,
+    DisclosureDetail,
+    DisclosureMeta,
+    filing_id,
+    registrant_id,
+    year_to_general_court,
+)
+
+if TYPE_CHECKING:
+    from google.cloud import firestore
+
+REGISTRANTS_COLLECTION = "lobbyingRegistrants"
+FILINGS_COLLECTION = "lobbyingFilings"
+SCRAPER_DOC = "/scrapers/lobbying"
+BACKFILL_DOC = "/scrapers/lobbyingBackfill"
+BACKFILL_URLS_COLLECTION = "processedUrls"
+
+
+def _now() -> datetime:
+    return datetime.now(tz=timezone.utc)
+
+
+def write_registrant(
+    db: firestore.Client,
+    meta: DisclosureMeta,
+    detail: DisclosureDetail,
+    disc_url: str,
+) -> None:
+    """Upsert a LobbyingRegistrant document."""
+    if not meta.entity_name or meta.year is None:
+        return
+
+    doc_id = registrant_id(meta.entity_name, meta.year)
+    ref = db.collection(REGISTRANTS_COLLECTION).document(doc_id)
+
+    clients = [
+        {
+            "clientName": c.client_name,
+            "clientNameNorm": normalize_entity_name(c.client_name),
+            "compensation": c.amount,
+        }
+        for c in detail.compensation
+    ]
+
+    data = {
+        "registrantId": doc_id,
+        "entityName": meta.entity_name,
+        "entityNameNorm": normalize_entity_name(meta.entity_name),
+        "year": meta.year,
+        "generalCourt": year_to_general_court(meta.year),
+        "regType": meta.reg_type,
+        "clients": clients,
+        "disclosureUrls": firestore.ArrayUnion([disc_url]),
+        "fetchedAt": _now(),
+    }
+    ref.set(data, merge=True)
+
+
+def write_filings(
+    db: firestore.Client,
+    meta: DisclosureMeta,
+    detail: DisclosureDetail,
+) -> int:
+    """Batch-write LobbyingFiling documents. Returns the number written."""
+    if not meta.entity_name or meta.year is None or not detail.bills:
+        return 0
+
+    gc = year_to_general_court(meta.year)
+    entity_name = meta.entity_name
+    entity_norm = normalize_entity_name(entity_name)
+    now = _now()
+
+    batch = db.batch()
+    count = 0
+
+    for bill in detail.bills:
+        fid = filing_id(
+            entity_name,
+            bill.client_name,
+            bill.chamber,
+            bill.bill_id,
+            gc,
+            bill.position,
+        )
+        ref = db.collection(FILINGS_COLLECTION).document(fid)
+        doc = {
+            "filingId": fid,
+            "entityName": entity_name,
+            "entityNameNorm": entity_norm,
+            "clientName": bill.client_name,
+            "clientNameNorm": normalize_entity_name(bill.client_name),
+            "year": meta.year,
+            "generalCourt": gc,
+            "chamber": bill.chamber,
+            "billId": bill.bill_id,
+            "activityTitle": bill.activity_title,
+            "position": bill.position,
+            "amount": bill.amount,
+            "fetchedAt": now,
+        }
+        batch.set(ref, doc)
+        count += 1
+
+        # Firestore batch limit is 500 writes
+        if count % 400 == 0:
+            batch.commit()
+            batch = db.batch()
+
+    if count % 400 != 0:
+        batch.commit()
+
+    return count
diff --git a/scripts/firebase-admin/backfillLobbying.ts b/scripts/firebase-admin/backfillLobbying.ts
index f7914dd84..a2a66330e 100644
--- a/scripts/firebase-admin/backfillLobbying.ts
+++ b/scripts/firebase-admin/backfillLobbying.ts
@@ -1,156 +1,64 @@
 /**
  * Backfill lobbying disclosure data from 2005 to the present.
  *
- * This script is the primary ingestion path for all historical data. The live
- * Cloud Function (scrapeLobbying) only handles the current and prior year in
- * steady state. Run this once to populate the full history, and re-run with
- * --year to refresh specific years.
+ * Delegates all HTTP fetching and Firestore writes to the Python scraper in
+ * lobbying-scraper/. The TypeScript layer handles argument parsing and
+ * environment setup only.
  *
  * Usage:
  *   GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
  *     yarn firebase-admin run-script backfillLobbying --env dev
  *
- * Options:
- *   --year  NUMBER   Only process this year (useful for testing or re-runs)
- *   --limit NUMBER   Max registrants to process per year (for testing)
+ * Options (passed through to scrape.py):
+ *   --year  NUMBER   Only process this year
+ *   --limit NUMBER   Max registrants per year (for testing)
  *
- * Cursor storage:
- *   Processed disclosure URLs are stored as documents in the Firestore
- *   subcollection /scrapers/lobbyingBackfill/processedUrls/{urlHash}.
- *   This scales to the full historical URL set (~50,000+) without hitting the
- *   1MB Firestore document size limit. Restart the script at any time; it will
- *   resume from where it left off.
+ * Requires: pip install -r lobbying-scraper/requirements.txt
+ * Or run inside the maple-2025 conda environment.
  */
 
-import { createHash } from "crypto"
+import { spawn } from "child_process"
+import path from "path"
 import { z } from "zod"
-import {
-  allLobbyingYears,
-  processDisclosure,
-  writeRegistrant
-} from "../../functions/src/lobbying/scrapeLobbying"
-import {
-  fetchDisclosureMeta,
-  fetchSummaryLinks,
-  makePortalClient
-} from "../../functions/src/lobbying/portal"
-import {
-  BACKFILL_DOC,
-  BACKFILL_URLS_COLLECTION,
-  FIRST_LOBBYING_YEAR
-} from "../../functions/src/lobbying/types"
 import { Script } from "./types"
 
 const Args = z
   .object({
-    year: z.number().int().min(FIRST_LOBBYING_YEAR).optional(),
+    year: z.number().int().min(2005).optional(),
     limit: z.number().int().positive().optional()
   })
   .passthrough()
 
-export const script: Script = async ({ db, args }) => {
-  const { year: onlyYear, limit } = Args.parse(args)
+const SCRAPER = path.resolve(__dirname, "../../lobbying-scraper/scrape.py")
 
-  const years = onlyYear ? [onlyYear] : allLobbyingYears()
-  console.log(
-    `backfillLobbying: processing years ${years[0]}–${years[years.length - 1]}`
-  )
-
-  // Load already-processed disc URLs from the subcollection cursor.
-  const backfillRef = db.doc(BACKFILL_DOC)
-  const processedSnap = await backfillRef
-    .collection(BACKFILL_URLS_COLLECTION)
-    .select() // fetch only doc IDs (the URL hash), no field data needed
-    .get()
-  const processedHashes = new Set(processedSnap.docs.map(d => d.id))
-  console.log(
-    `backfillLobbying: ${processedHashes.size} disc URLs already processed`
-  )
-
-  const client = makePortalClient()
-  let totalNew = 0
-
-  for (const year of years) {
-    console.log(`\n── ${year} ──`)
-
-    let summaryUrls: string[]
-    try {
-      summaryUrls = await fetchSummaryLinks(client, year)
-    } catch (e) {
-      console.error(`  Failed to fetch summary links for ${year}:`, e)
-      continue
-    }
-
-    if (limit) summaryUrls = summaryUrls.slice(0, limit)
-    console.log(`  ${summaryUrls.length} registrants on portal`)
-
-    let yearNew = 0
-
-    for (let i = 0; i < summaryUrls.length; i++) {
-      const summaryUrl = summaryUrls[i]
-      let meta: Awaited<ReturnType<typeof fetchDisclosureMeta>>
+export const script: Script = async ({ env, args }) => {
+  const { year, limit } = Args.parse(args)
 
-      try {
-        meta = await fetchDisclosureMeta(client, summaryUrl)
-      } catch (e) {
-        console.warn(
-          `  [${i + 1}/${
-            summaryUrls.length
-          }] Failed to fetch summary: ${summaryUrl}`,
-          e
-        )
-        continue
-      }
-
-      if (meta.entityName && meta.year) {
-        try {
-          await writeRegistrant(
-            db,
-            meta.entityName,
-            meta.year,
-            meta.regType,
-            meta.disclosureUrls
-          )
-        } catch (e) {
-          console.warn(`  Failed to write registrant ${meta.entityName}:`, e)
-        }
-      }
-
-      for (const discUrl of meta.disclosureUrls) {
-        const urlHash = createHash("sha256")
-          .update(discUrl)
-          .digest("hex")
-          .slice(0, 40)
-        if (processedHashes.has(urlHash)) continue
-
-        try {
-          await processDisclosure(db, client, summaryUrl, discUrl, year)
-
-          // Mark as processed in the subcollection cursor
-          await backfillRef
-            .collection(BACKFILL_URLS_COLLECTION)
-            .doc(urlHash)
-            .set({ url: discUrl, processedAt: new Date().toISOString() })
-
-          processedHashes.add(urlHash)
-          totalNew++
-          yearNew++
-        } catch (e) {
-          console.warn(`  Failed to process disclosure ${discUrl}:`, e)
-        }
-      }
+  if (env === "local") {
+    throw new Error(
+      "backfillLobbying requires --env dev or --env prod " +
+        "(it writes to a real Firestore project; local emulator not supported yet)"
+    )
+  }
 
-      if ((i + 1) % 50 === 0 || i + 1 === summaryUrls.length) {
-        console.log(
-          `  [${i + 1}/${
-            summaryUrls.length
-          }] ${yearNew} new disclosures this year`
-        )
-      }
-    }
+  const pyArgs = ["--mode", "backfill"]
+  if (year) pyArgs.push("--year", String(year))
+  if (limit) pyArgs.push("--limit", String(limit))
 
-    console.log(`  ${year} complete: ${yearNew} new disclosures`)
-  }
+  console.log(`Running: python3 ${SCRAPER} ${pyArgs.join(" ")}`)
+  console.log(
+    `Firestore project: ${process.env.GCLOUD_PROJECT || "(from ADC)"}`
+  )
 
-  console.log(`\nbackfillLobbying complete: ${totalNew} new disclosures total`)
+  await new Promise<void>((resolve, reject) => {
+    const proc = spawn("python3", [SCRAPER, ...pyArgs], {
+      stdio: ["ignore", "inherit", "inherit"],
+      env: { ...process.env }
+    })
+    proc.on("close", code => {
+      if (code === 0) resolve()
+      else reject(new Error(`scrape.py exited with code ${code}`))
+    })
+    proc.on("error", reject)
+  })
 }

From 2bcb783f2e2e71631b524f49f5b194eb2d551e63 Mon Sep 17 00:00:00 2001
From: Nathan <sandersn@gmail.com>
Date: Mon, 8 Jun 2026 22:05:04 -0400
Subject: [PATCH 4/4] refactor: remove dead TypeScript scraper code from
 lobbying module
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per code review feedback: the TypeScript Firebase Function and backfill
script added no value — the portal's TLS fingerprinting requirements mean
Node.js cannot reach it, so the TS HTTP layer was non-functional and the
backfill script was just a thin subprocess wrapper with no benefit over
calling scrape.py directly.

Removed:
- functions/src/lobbying/scrapeLobbying.ts (broken Cloud Function)
- functions/src/lobbying/portal.ts (non-functional TS HTTP layer)
- functions/src/lobbying/http/ (unused Python fetch helper)
- scripts/firebase-admin/backfillLobbying.ts (shell wrapper, no value)
- scrapeLobbying export from functions/src/index.ts

Kept:
- functions/src/lobbying/types.ts — Firestore schema; imported by frontend
- functions/src/lobbying/normalize.ts — normalization pipeline
- lobbying-scraper/ — the working Cloud Run container (unchanged)

The historical backfill is now run directly:
  python3 lobbying-scraper/scrape.py --mode backfill

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 docs/lobbying-disclosure-ingestion.md        | 108 ++--
 functions/src/index.ts                       |   2 -
 functions/src/lobbying/http/.gitignore       |   3 -
 functions/src/lobbying/http/fetch.py         |  81 ---
 functions/src/lobbying/http/requirements.txt |   1 -
 functions/src/lobbying/index.ts              |  10 -
 functions/src/lobbying/portal.ts             | 553 -------------------
 functions/src/lobbying/scrapeLobbying.ts     | 274 ---------
 lobbying-scraper/Dockerfile                  |   4 +-
 scripts/firebase-admin/backfillLobbying.ts   |  64 ---
 10 files changed, 66 insertions(+), 1034 deletions(-)
 delete mode 100644 functions/src/lobbying/http/.gitignore
 delete mode 100644 functions/src/lobbying/http/fetch.py
 delete mode 100644 functions/src/lobbying/http/requirements.txt
 delete mode 100644 functions/src/lobbying/portal.ts
 delete mode 100644 functions/src/lobbying/scrapeLobbying.ts
 delete mode 100644 scripts/firebase-admin/backfillLobbying.ts

diff --git a/docs/lobbying-disclosure-ingestion.md b/docs/lobbying-disclosure-ingestion.md
index 264c77c52..51719f342 100644
--- a/docs/lobbying-disclosure-ingestion.md
+++ b/docs/lobbying-disclosure-ingestion.md
@@ -298,64 +298,86 @@ them. No bill-level compensation amount is available for these years.
 
 ```
 functions/src/lobbying/
-  types.ts          — Runtypes definitions for LobbyingRegistrant, LobbyingFiling
-  normalize.ts      — Entity name normalization pipeline
-  portal.ts         — Reference implementation (HTTP layer not used in production)
-  scrapeLobbying.ts — Reference implementation (superseded by Cloud Run container)
-  index.ts          — Re-exports
+  types.ts     — Runtypes schema definitions for LobbyingRegistrant, LobbyingFiling
+  normalize.ts — Entity name normalization pipeline (also used client-side)
+  index.ts     — Re-exports
 
 lobbying-scraper/
-  scrape.py         — Entry point: --mode weekly (incremental) | --mode backfill
-  portal.py         — HTTP + HTML parsing
-  normalize.py      — Port of normalize.ts
-  writer.py         — Firestore document construction + writes
-  requirements.txt  — requests, beautifulsoup4, google-cloud-firestore
-  Dockerfile        — Python 3.12-slim image
+  scrape.py        — Entry point: --mode weekly (incremental) | --mode backfill
+  portal.py        — HTTP + HTML parsing
+  normalize.py     — Port of normalize.ts
+  writer.py        — Firestore document construction + writes
+  requirements.txt — requests, beautifulsoup4, google-cloud-firestore
+  Dockerfile       — Python 3.12-slim image
 ```
 
+The TypeScript lobbying module (`functions/src/lobbying/`) contains only the
+schema types and normalization logic. There is no TypeScript scraper or
+Firebase Function — ingestion is handled entirely by the Cloud Run container.
+This follows the same pattern as the MCP server and avoids the complexity of
+running multiple language runtimes in the same Firebase Functions deployment.
+
 ---
 
 ## Deploying the Cloud Run Container
 
-Follows the same pattern as the MCP server. Requires the
-`maple-lobbying-scraper` Artifact Registry repository to exist.
+Follows the same pattern as the MCP server. The Artifact Registry repo
+(`maple-lobbying`) and Cloud Run job (`maple-lobbying-scraper`) are already
+created in `digital-testimony-dev`.
 
 ```bash
 cd lobbying-scraper
 IMAGE=us-central1-docker.pkg.dev/digital-testimony-dev/maple-lobbying/scraper:latest
 docker build -t $IMAGE . && docker push $IMAGE
 
-gcloud run jobs create maple-lobbying-scraper \
+gcloud run jobs update maple-lobbying-scraper \
   --image=$IMAGE \
   --project=digital-testimony-dev \
+  --region=us-central1
+```
+
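+For a new project (prod), first create the repository and job, with `IMAGE` rebuilt and pushed under `<project>` instead of `digital-testimony-dev`: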
+
+```bash
+gcloud artifacts repositories create maple-lobbying \
+  --repository-format=docker --location=us-central1 --project=<project>
+
+gcloud run jobs create maple-lobbying-scraper \
+  --image=$IMAGE \
+  --project=<project> \
   --region=us-central1 \
-  --service-account=<scraper-sa>@digital-testimony-dev.iam.gserviceaccount.com
+  --task-timeout=30m \
+  --max-retries=0
 
-# Schedule weekly via Cloud Scheduler
+# Schedule weekly (Mondays 6am UTC)
 gcloud scheduler jobs create http maple-lobbying-weekly \
   --schedule="0 6 * * 1" \
-  --uri="https://us-central1-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/digital-testimony-dev/jobs/maple-lobbying-scraper:run" \
+  --uri="https://us-central1-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/<project>/jobs/maple-lobbying-scraper:run" \
   --http-method=POST \
-  --oauth-service-account-email=<scheduler-sa>@digital-testimony-dev.iam.gserviceaccount.com \
+  --oauth-service-account-email=<scheduler-sa>@<project>.iam.gserviceaccount.com \
   --location=us-central1
 ```
 
-## Historical Backfill (Admin Script)
+## Historical Backfill
 
-Ingests all historical filings from 2005 to the present. Delegates to
-`scrape.py --mode backfill` via subprocess. Resumable — the subcollection
-cursor at `/scrapers/lobbyingBackfill/processedUrls` tracks what has been
-processed. Run directly on the machine (requires `lobbying-scraper/` deps
-installed or the `maple-2025` conda environment).
+Run `scrape.py --mode backfill` directly. The run is resumable: the cursor
+subcollection `/scrapers/lobbyingBackfill/processedUrls` tracks progress.
+Install `lobbying-scraper/requirements.txt` first, or use the `maple-2025` conda env.
 
 ```bash
+cd lobbying-scraper
+
+# Test a single year with no writes
 GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
-  yarn firebase-admin run-script backfillLobbying --env dev
+  python3 scrape.py --mode backfill --year 2024 --limit 3 --dry-run
 
-# Or call scrape.py directly for more control:
-cd lobbying-scraper
-python3 scrape.py --mode backfill --year 2024 --limit 3 --dry-run
-python3 scrape.py --mode backfill --year 2024
+# Run a single year for real
+GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
+  python3 scrape.py --mode backfill --year 2024
+
+# Full history (2005-present, resumable)
+GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
+  python3 scrape.py --mode backfill
 ```
 
 ---
@@ -400,22 +422,18 @@ export { scrapeLobbying } from "./lobbying"
 
 ## Implementation Status
 
-| File                                         | Status  | Notes                                                      |
-| -------------------------------------------- | ------- | ---------------------------------------------------------- |
-| `functions/src/lobbying/types.ts`            | ✅ Done | TypeScript type definitions; source of truth for schema    |
-| `functions/src/lobbying/normalize.ts`        | ✅ Done | Normalization pipeline (also ported to `normalize.py`)     |
-| `functions/src/lobbying/portal.ts`           | ✅ Done | Kept for reference; HTTP layer not used (see architecture) |
-| `functions/src/lobbying/scrapeLobbying.ts`   | ✅ Done | Not deployed; superseded by Cloud Run container            |
-| `functions/src/lobbying/index.ts`            | ✅ Done |                                                            |
-| `functions/src/index.ts` (export)            | ✅ Done |                                                            |
-| `firestore.rules`                            | ✅ Done |                                                            |
-| `firestore.indexes.json`                     | ✅ Done |                                                            |
-| `lobbying-scraper/normalize.py`              | ✅ Done | Port of normalize.ts                                       |
-| `lobbying-scraper/portal.py`                 | ✅ Done | HTTP + HTML parsing                                        |
-| `lobbying-scraper/writer.py`                 | ✅ Done | Firestore document construction                            |
-| `lobbying-scraper/scrape.py`                 | ✅ Done | Entry point; `--mode weekly` and `--mode backfill`         |
-| `lobbying-scraper/Dockerfile`                | ✅ Done | Python 3.12 slim                                           |
-| `scripts/firebase-admin/backfillLobbying.ts` | ✅ Done | Calls `scrape.py --mode backfill` as subprocess            |
+| File                                  | Status  | Notes                                                    |
+| ------------------------------------- | ------- | -------------------------------------------------------- |
+| `functions/src/lobbying/types.ts`     | ✅ Done | Firestore schema types; imported by future frontend code |
+| `functions/src/lobbying/normalize.ts` | ✅ Done | Normalization pipeline; also ported to `normalize.py`    |
+| `functions/src/lobbying/index.ts`     | ✅ Done | Re-exports types and normalize                           |
+| `firestore.rules`                     | ✅ Done |                                                          |
+| `firestore.indexes.json`              | ✅ Done |                                                          |
+| `lobbying-scraper/normalize.py`       | ✅ Done | Port of normalize.ts                                     |
+| `lobbying-scraper/portal.py`          | ✅ Done | HTTP + HTML parsing                                      |
+| `lobbying-scraper/writer.py`          | ✅ Done | Firestore document construction                          |
+| `lobbying-scraper/scrape.py`          | ✅ Done | Entry point; `--mode weekly` and `--mode backfill`       |
+| `lobbying-scraper/Dockerfile`         | ✅ Done | Python 3.12-slim; deployed to Cloud Run                  |
 
 ### Document ID scheme
 
diff --git a/functions/src/index.ts b/functions/src/index.ts
index 6c52b78c1..641255bf4 100644
--- a/functions/src/index.ts
+++ b/functions/src/index.ts
@@ -60,8 +60,6 @@ export {
 
 export { transcription } from "./webhooks"
 
-export { scrapeLobbying } from "./lobbying"
-
 export * from "./triggerPubsubFunction"
 
 // Export the health check last so it is loaded last.
diff --git a/functions/src/lobbying/http/.gitignore b/functions/src/lobbying/http/.gitignore
deleted file mode 100644
index d0ee3b17c..000000000
--- a/functions/src/lobbying/http/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-venv/
-__pycache__/
-*.pyc
diff --git a/functions/src/lobbying/http/fetch.py b/functions/src/lobbying/http/fetch.py
deleted file mode 100644
index 4e6c2c4ec..000000000
--- a/functions/src/lobbying/http/fetch.py
+++ /dev/null
@@ -1,81 +0,0 @@
-"""Minimal HTTP fetch helper for the lobbying portal.
-
-Handles the portal's session cookie requirements that standard Node.js HTTP
-clients cannot satisfy due to TLS-layer constraints.
-
-Usage:
-    python3 fetch.py --url URL [--method GET|POST] [--jar PATH]
-
-POST body is read from stdin as application/x-www-form-urlencoded.
-Cookies are persisted to/from the JSON file at --jar so the session survives
-across multiple subprocess invocations.
-HTML response is written to stdout. Errors go to stderr with exit code 1.
-"""
-
-import argparse
-import json
-import sys
-from pathlib import Path
-
-import requests
-
-_UA = (
-    "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) "
-    "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"
-)
-
-
-def main() -> None:
-    p = argparse.ArgumentParser()
-    p.add_argument("--url", required=True)
-    p.add_argument("--method", default="GET", choices=["GET", "POST"])
-    p.add_argument("--jar", default=None, help="Path to JSON cookie-jar file")
-    args = p.parse_args()
-
-    session = requests.Session()
-    session.headers.update(
-        {
-            "User-Agent": _UA,
-            "Accept": "*/*",
-            "Accept-Encoding": "gzip, deflate, br",
-            "Connection": "keep-alive",
-        }
-    )
-
-    if args.jar:
-        jar = Path(args.jar)
-        if jar.exists():
-            try:
-                session.cookies.update(json.loads(jar.read_text()))
-            except Exception as e:
-                print(f"warning: could not read cookie jar: {e}", file=sys.stderr)
-
-    try:
-        if args.method == "POST":
-            body = sys.stdin.buffer.read()
-            resp = session.post(
-                args.url,
-                data=body,
-                headers={"Content-Type": "application/x-www-form-urlencoded"},
-                timeout=180,
-            )
-        else:
-            resp = session.get(args.url, timeout=60)
-
-        resp.raise_for_status()
-
-        if args.jar:
-            Path(args.jar).write_text(json.dumps(dict(session.cookies)))
-
-        sys.stdout.buffer.write(resp.content)
-
-    except requests.exceptions.HTTPError as e:
-        print(f"HTTP error {e.response.status_code}: {args.url}", file=sys.stderr)
-        sys.exit(1)
-    except requests.exceptions.RequestException as e:
-        print(f"request failed: {e}", file=sys.stderr)
-        sys.exit(1)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/functions/src/lobbying/http/requirements.txt b/functions/src/lobbying/http/requirements.txt
deleted file mode 100644
index b18d51347..000000000
--- a/functions/src/lobbying/http/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-requests>=2.28
diff --git a/functions/src/lobbying/index.ts b/functions/src/lobbying/index.ts
index 5e594cb34..6d039ae51 100644
--- a/functions/src/lobbying/index.ts
+++ b/functions/src/lobbying/index.ts
@@ -1,12 +1,2 @@
-export { scrapeLobbying } from "./scrapeLobbying"
 export * from "./types"
 export { normalizeEntityName } from "./normalize"
-export {
-  constructBillId,
-  fetchDisclosureDetail,
-  fetchDisclosureMeta,
-  fetchSummaryLinks,
-  makePortalClient,
-  normalizeChamber,
-  yearToGeneralCourt
-} from "./portal"
diff --git a/functions/src/lobbying/portal.ts b/functions/src/lobbying/portal.ts
deleted file mode 100644
index 64d65831b..000000000
--- a/functions/src/lobbying/portal.ts
+++ /dev/null
@@ -1,553 +0,0 @@
-/**
- * HTTP client and HTML parser for the MA Secretary of State lobbying portal.
- *
- * Portal: https://www.sec.state.ma.us/LobbyistPublicSearch/
- *
- * Page flow:
- *   1. Search POST  → grdvSearchResultByTypeAndCategory table
- *                     One row per registrant; each row has a Summary.aspx link.
- *   2. Summary.aspx → registrant name/year/type + CompleteDisclosure links
- *   3. CompleteDisclosure.aspx → per-client compensation + per-client bill activity
- *
- * Two disclosure HTML formats exist:
- *   Modern (≥~2013): per-client compensation in grdvClientPaidToEntity;
- *     per-client bill tables as grdvActivitiesNew{year}_{n}.
- *   Legacy (<~2013): total salary in grdvSalaryPaid (no client breakdown);
- *     all bill activity in a single grdvActivities table.
- */
-
-import axios, { AxiosInstance } from "axios"
-import { JSDOM } from "jsdom"
-import { sha256 } from "js-sha256"
-import { CookieJar } from "tough-cookie"
-import {
-  CHAMBER_PREFIXES,
-  LEGACY_CHAMBER_MAP,
-  LEGACY_TOTAL_CLIENT,
-  LobbyingChamber
-} from "./types"
-
-// ─── Constants ──────────────────────────────────────────────────────────────
-
-const BASE_URL = "https://www.sec.state.ma.us/LobbyistPublicSearch/"
-const SEARCH_URL = BASE_URL + "Default.aspx"
-const REQUEST_DELAY_MS = 1000
-const MAX_RETRIES = 5
-
-const IPAD_UA =
-  "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) " +
-  "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"
-
-const FIRST_GC = 183
-const FIRST_GC_START_YEAR = 2003
-
-// ─── Public types ───────────────────────────────────────────────────────────
-
-export interface RawCompensation {
-  clientName: string
-  amount: number | null
-}
-
-export interface RawBillActivity {
-  clientName: string
-  chamber: LobbyingChamber
-  rawBillNumber: string
-  billId: string | null // pre-computed from chamber + rawBillNumber
-  activityTitle: string
-  position: string
-  amount: number | null
-}
-
-export interface DisclosureMeta {
-  entityName: string
-  year: number | null
-  /** Portal reg_type mapped to our vocabulary */
-  regType: "Lobbyist" | "Employer"
-  disclosureUrls: string[]
-}
-
-export interface DisclosureDetail {
-  compensation: RawCompensation[]
-  bills: RawBillActivity[]
-}
-
-// ─── HTTP helpers ────────────────────────────────────────────────────────────
-
-/**
- * Create an axios instance pre-configured for the MA SoS portal.
- *
- * Includes a cookie jar via interceptors so ASP.NET session state (ViewState,
- * anti-forgery tokens) is preserved across the GET → POST page flow without
- * requiring the axios-cookiejar-support package.
- */
-export interface PortalClient {
-  jar: CookieJar
-  client: AxiosInstance
-}
-
-/**
- * Create a portal client pre-configured for the MA SoS portal.
- *
- * Uses maxRedirects: 0 so our manual redirect loop (inside getHtml / postHtml)
- * can extract Set-Cookie headers at each hop before following. This is necessary
- * because the portal is protected by Incapsula, which issues a 302 challenge on
- * first contact and requires the session cookies to be sent on the retried request.
- * Axios's built-in redirect following happens before response interceptors fire,
- * so the cookies from the challenge response are never captured automatically.
- */
-export function makePortalClient(): PortalClient {
-  const jar = new CookieJar()
-  const client = axios.create({
-    headers: {
-      "User-Agent": IPAD_UA,
-      Accept: "*/*",
-      "Accept-Encoding": "gzip, deflate, br",
-      Connection: "keep-alive"
-    },
-    timeout: 60_000,
-    maxRedirects: 10, // let axios handle ordinary redirects; only Incapsula challenges need manual handling
-    validateStatus: s => s < 500 // surface 4xx so we can log them
-  })
-  return { jar, client }
-}
-
-function sleep(ms: number): Promise<void> {
-  return new Promise(resolve => setTimeout(resolve, ms))
-}
-
-function cookieHeader(jar: CookieJar, url: string): string {
-  return jar
-    .getCookiesSync(url)
-    .map(c => c.cookieString())
-    .join("; ")
-}
-
-function saveCookies(
-  jar: CookieJar,
-  url: string,
-  headers: Record<string, string | string[] | undefined>
-): void {
-  const raw = headers["set-cookie"]
-  if (!raw) return
-  const list = Array.isArray(raw) ? raw : [raw]
-  for (const c of list) jar.setCookieSync(c, url)
-}
-
-async function getHtml(
-  pc: PortalClient,
-  url: string,
-  retries = MAX_RETRIES
-): Promise<Document> {
-  for (let attempt = 0; attempt < retries; attempt++) {
-    await sleep(
-      attempt === 0 ? REQUEST_DELAY_MS : REQUEST_DELAY_MS * 2 ** attempt
-    )
-    try {
-      const res = await pc.client.get<string>(url, {
-        responseType: "text",
-        headers: { Cookie: cookieHeader(pc.jar, url) }
-      })
-      saveCookies(
-        pc.jar,
-        url,
-        res.headers as Record<string, string | string[] | undefined>
-      )
-      if (res.status >= 400) throw new Error(`HTTP ${res.status} for ${url}`)
-      return new JSDOM(res.data).window.document
-    } catch (e) {
-      if (attempt === retries - 1) throw e
-      if (axios.isAxiosError(e)) continue
-      throw e
-    }
-  }
-  throw new Error("unreachable")
-}
-
-async function postHtml(
-  pc: PortalClient,
-  url: string,
-  data: Record<string, string>,
-  retries = MAX_RETRIES
-): Promise<Document> {
-  const body = new URLSearchParams(data).toString()
-  for (let attempt = 0; attempt < retries; attempt++) {
-    await sleep(
-      attempt === 0 ? REQUEST_DELAY_MS : REQUEST_DELAY_MS * 2 ** attempt
-    )
-    try {
-      const res = await pc.client.post<string>(url, body, {
-        responseType: "text",
-        headers: {
-          "Content-Type": "application/x-www-form-urlencoded",
-          Cookie: cookieHeader(pc.jar, url)
-        },
-        timeout: 180_000
-      })
-      saveCookies(
-        pc.jar,
-        url,
-        res.headers as Record<string, string | string[] | undefined>
-      )
-      if (res.status >= 400) throw new Error(`HTTP ${res.status} for ${url}`)
-      return new JSDOM(res.data).window.document
-    } catch (e) {
-      if (attempt === retries - 1) throw e
-      if (axios.isAxiosError(e)) continue
-      throw e
-    }
-  }
-  throw new Error("unreachable")
-}
-
-// ─── Year / General Court helpers ────────────────────────────────────────────
-
-export function yearToGeneralCourt(year: number): number {
-  return FIRST_GC + Math.floor((year - FIRST_GC_START_YEAR) / 2)
-}
-
-// ─── Chamber normalization ────────────────────────────────────────────────────
-
-/** Normalize raw portal chamber string to a canonical LobbyingChamber value. */
-export function normalizeChamber(raw: string): LobbyingChamber {
-  const trimmed = raw.trim()
-  if (LEGACY_CHAMBER_MAP[trimmed]) return LEGACY_CHAMBER_MAP[trimmed]
-  const known: LobbyingChamber[] = [
-    "House Bill",
-    "Senate Bill",
-    "House Docket",
-    "Senate Docket",
-    "Executive"
-  ]
-  if (known.includes(trimmed as LobbyingChamber))
-    return trimmed as LobbyingChamber
-  return "Other"
-}
-
-/**
- * Construct the MAPLE-compatible billId from the portal's chamber + raw integer.
- *
- * The portal stores bill numbers as bare integers; the chamber prefix is what
- * distinguishes H1234 from S1234. Returns null for Executive and Other chambers
- * where no bill join is possible.
- */
-export function constructBillId(
-  chamber: LobbyingChamber,
-  rawBillNumber: string
-): string | null {
-  const prefix = CHAMBER_PREFIXES[chamber]
-  if (!prefix) return null
-  const n = parseInt(rawBillNumber, 10)
-  if (isNaN(n)) return null
-  return `${prefix}${n}`
-}
-
-// ─── Document ID generation ───────────────────────────────────────────────────
-
-/** Stable Firestore document ID for a registrant (entity + year). */
-export function registrantId(entityName: string, year: number): string {
-  return sha256(`${year}|${entityName}`).slice(0, 40)
-}
-
-/**
- * Stable Firestore document ID for a filing.
- *
- * Uses a hash of the logical deduplication key. For null-bill rows (billId is
- * null) the chamber is included in the key to avoid merging executive null rows
- * with legislative null rows.
- */
-export function filingId(
-  entityName: string,
-  clientName: string,
-  chamber: LobbyingChamber,
-  billId: string | null,
-  generalCourt: number,
-  position: string
-): string {
-  const key = [
-    entityName,
-    clientName,
-    chamber,
-    billId ?? "__null__",
-    generalCourt,
-    position
-  ].join("|")
-  return sha256(key).slice(0, 40)
-}
-
-// ─── Amount parsing ───────────────────────────────────────────────────────────
-
-function parseAmount(text: string): number | null {
-  const cleaned = text.replace(/[$,]/g, "").trim()
-  const n = parseFloat(cleaned)
-  return isNaN(n) ? null : n
-}
-
-// ─── Portal scraping functions ────────────────────────────────────────────────
-
-/** Extract ASP.NET WebForms ViewState hidden inputs from a page. */
-function extractViewState(doc: Document): Record<string, string> {
-  const fields: Record<string, string> = {}
-  doc.querySelectorAll('input[type="hidden"]').forEach(el => {
-    const input = el as HTMLInputElement
-    if (input.name) fields[input.name] = input.value ?? ""
-  })
-  return fields
-}
-
-/**
- * Fetch all Summary.aspx URLs for a given year.
- * Sends a single search POST with page size 20000 to get all registrants at once.
- */
-export async function fetchSummaryLinks(
-  pc: PortalClient,
-  year: number
-): Promise<string[]> {
-  const searchPage = await getHtml(pc, SEARCH_URL)
-  const vs = extractViewState(searchPage)
-
-  const postData: Record<string, string> = {
-    ...vs,
-    __EVENTTARGET: "",
-    __EVENTARGUMENT: "",
-    ctl00$ContentPlaceHolder1$Search: "rdbSearchByType",
-    ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$ddlYear: String(year),
-    ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$txtN_ame: "",
-    ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$lddSearchType$DropDown:
-      "3",
-    ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$drpType: "L",
-    ctl00$ContentPlaceHolder1$drpPageSize: "20000",
-    ctl00$ContentPlaceHolder1$btnSearch: "Search"
-  }
-
-  const resultsPage = await postHtml(pc, SEARCH_URL, postData)
-
-  const table = resultsPage.querySelector(
-    '[id*="grdvSearchResultByTypeAndCategory"]'
-  )
-  if (!table) return []
-
-  const links: string[] = []
-  table.querySelectorAll("a[href]").forEach(el => {
-    const href = (el as HTMLAnchorElement).href
-    if (href && href.includes("Summary.aspx")) {
-      // href from JSDOM is already absolute when base is set; handle both cases
-      const url = href.startsWith("http") ? href : BASE_URL + href
-      links.push(url)
-    }
-  })
-  return links
-}
-
-/**
- * Fetch a Summary.aspx page and return the registrant metadata + disclosure URLs.
- */
-export async function fetchDisclosureMeta(
-  pc: PortalClient,
-  summaryUrl: string
-): Promise<DisclosureMeta> {
-  const doc = await getHtml(pc, summaryUrl)
-
-  const text = (id: string) => {
-    const el = doc.getElementById(id)
-    return el?.textContent?.trim() ?? ""
-  }
-
-  const entityName = text("ContentPlaceHolder1_lblRegistrantName")
-  const yearText = text("ContentPlaceHolder1_lblYear")
-  const regTypeRaw = text("ContentPlaceHolder1_lblRegType")
-
-  const year = parseInt(yearText, 10)
-  const regType: "Lobbyist" | "Employer" = regTypeRaw.includes("Entity")
-    ? "Employer"
-    : "Lobbyist"
-
-  const disclosureUrls: string[] = []
-  doc.querySelectorAll("a[href]").forEach(el => {
-    const raw = (el as HTMLAnchorElement).getAttribute("href") ?? ""
-    if (raw.includes("CompleteDisclosure")) {
-      const url = raw.startsWith("http") ? raw : BASE_URL + raw
-      disclosureUrls.push(url)
-    }
-  })
-
-  return {
-    entityName,
-    year: isNaN(year) ? null : year,
-    regType,
-    disclosureUrls
-  }
-}
-
-/**
- * Parse a CompleteDisclosure.aspx page.
- *
- * Handles both modern (≥~2013) and legacy (<~2013) HTML layouts.
- */
-export async function fetchDisclosureDetail(
-  pc: PortalClient,
-  discUrl: string,
-  year: number
-): Promise<DisclosureDetail> {
-  const doc = await getHtml(pc, discUrl)
-  const compensation: RawCompensation[] = []
-  const bills: RawBillActivity[] = []
-
-  // ── Modern format ──────────────────────────────────────────────────────────
-  const compTable = doc.querySelector('[id*="grdvClientPaidToEntity"]')
-  if (compTable) {
-    compTable
-      .querySelectorAll("tr.GridRow, tr.GridAlternatingRow")
-      .forEach(row => {
-        const cells = Array.from(row.querySelectorAll("td")).map(
-          td => td.textContent?.trim() ?? ""
-        )
-        if (cells.length >= 2) {
-          compensation.push({
-            clientName: cells[0],
-            amount: parseAmount(cells[1])
-          })
-        }
-      })
-  }
-
-  // Bill activity tables — one per client per reporting period. Two ID patterns:
-  //   2014–2018: …rptActivityNew_grdvActivitiesNew_0      (no year suffix)
-  //   2019+:     …rptActivityNew2020_grdvActivitiesNew2020_0 (year suffix)
-  doc.querySelectorAll('[id*="grdvActivitiesNew"]').forEach(actTable => {
-    // The client name lives in the nearest preceding span with lblClientName
-    let clientName = ""
-    let node: Element | null = actTable
-    while ((node = node.previousElementSibling ?? node.parentElement)) {
-      const span = node.id?.includes("lblClientName")
-        ? node
-        : node.querySelector?.('[id*="lblClientName"]')
-      if (span) {
-        clientName = span.textContent?.trim() ?? ""
-        break
-      }
-      if (node === node.parentElement) break
-    }
-
-    actTable
-      .querySelectorAll("tr.GridRow, tr.GridAlternatingRow")
-      .forEach(row => {
-        const cells = Array.from(row.querySelectorAll("td")).map(
-          td => td.textContent?.trim() ?? ""
-        )
-        // Columns: House/Senate, Bill Number, Bill title, Position, Amount, Direct business
-        if (cells.length < 4) return
-        const chamber = normalizeChamber(cells[0])
-        const rawBillNumber = cells[1]
-        const billId = constructBillId(chamber, rawBillNumber)
-        bills.push({
-          clientName,
-          chamber,
-          rawBillNumber,
-          billId,
-          activityTitle: cells[2] ?? "",
-          position: cells[3] ?? "",
-          amount: cells.length > 4 ? parseAmount(cells[4]) : null
-        })
-      })
-  })
-
-  if (compTable || bills.length > 0) {
-    return { compensation, bills }
-  }
-
-  // ── Legacy format (<~2013) ─────────────────────────────────────────────────
-  const salaryTable = doc.querySelector('[id*="grdvSalaryPaid"]')
-  if (salaryTable) {
-    let total = 0
-    salaryTable.querySelectorAll("tr").forEach(row => {
-      const cells = Array.from(row.querySelectorAll("td")).map(
-        td => td.textContent?.trim() ?? ""
-      )
-      if (cells.length >= 2 && !cells[0].includes("Total")) {
-        const amt = parseAmount(cells[1])
-        if (amt !== null) total += amt
-      }
-    })
-    if (total > 0) {
-      compensation.push({ clientName: LEGACY_TOTAL_CLIENT, amount: total })
-    }
-  }
-
-  // Legacy bill activity: single grdvActivities table. Three known column layouts:
-  //   2009 4-col:               Date | Bill+Title | Lobbyist | Client
-  //   2010+ individual 5-col:   Activity | Position | DirectBiz | Client | Compensation
-  //   2010+ entity 6-col:       Activity | Lobbyist | Position | DirectBiz | Client | Compensation
-  const actTable = doc.querySelector('[id$="grdvActivities"]')
-  if (actTable) {
-    const allRows = Array.from(actTable.querySelectorAll("tr"))
-    const headerCells = Array.from(
-      allRows[0]?.querySelectorAll("th, td") ?? []
-    ).map(el => el.textContent?.trim() ?? "")
-
-    let billCol = 1
-    let positionCol: number | null = null
-    let clientCol = 3
-
-    if (headerCells[0]?.includes("Activity")) {
-      if (headerCells[1]?.includes("Lobbyist")) {
-        // 6-col entity layout
-        billCol = 0
-        positionCol = 2
-        clientCol = 4
-      } else {
-        // 5-col individual layout
-        billCol = 0
-        positionCol = 1
-        clientCol = 3
-      }
-    }
-
-    const chamberMap: Record<string, LobbyingChamber> = {
-      H: "House Bill",
-      S: "Senate Bill",
-      HD: "House Docket",
-      SD: "Senate Docket"
-    }
-
-    allRows.slice(1).forEach(row => {
-      const cells = Array.from(row.querySelectorAll("td")).map(
-        td => td.textContent?.trim() ?? ""
-      )
-      if (cells.length <= Math.max(billCol, clientCol)) return
-
-      const billCell = cells[billCol]
-      const skipValues = new Set([
-        "Activity or Bill No and Title",
-        "N/A",
-        "None",
-        "",
-        "Total amount"
-      ])
-      if (!billCell || skipValues.has(billCell)) return
-
-      const parts = billCell.split(/\s+/)
-      const billNo = parts[0]
-      const activityTitle = parts.slice(1).join(" ")
-      const match = billNo.match(/^([A-Z]+)(\d+)$/)
-      if (!match) return
-
-      const [, prefix, number] = match
-      const chamber: LobbyingChamber = chamberMap[prefix] ?? "Other"
-      const billId = constructBillId(chamber, number)
-      const position = positionCol !== null ? cells[positionCol] ?? "" : ""
-      const clientName = cells[clientCol] ?? ""
-
-      bills.push({
-        clientName,
-        chamber,
-        rawBillNumber: number,
-        billId,
-        activityTitle,
-        position,
-        amount: null
-      })
-    })
-  }
-
-  return { compensation, bills }
-}
diff --git a/functions/src/lobbying/scrapeLobbying.ts b/functions/src/lobbying/scrapeLobbying.ts
deleted file mode 100644
index 7a6140e8e..000000000
--- a/functions/src/lobbying/scrapeLobbying.ts
+++ /dev/null
@@ -1,274 +0,0 @@
-import { logger } from "firebase-functions"
-import { runWith } from "firebase-functions/v1"
-import { db, Timestamp } from "../firebase"
-import type { Database } from "../types"
-import { normalizeEntityName } from "./normalize"
-import {
-  fetchDisclosureDetail,
-  fetchDisclosureMeta,
-  fetchSummaryLinks,
-  filingId,
-  makePortalClient,
-  registrantId,
-  yearToGeneralCourt
-} from "./portal"
-import {
-  FILINGS_COLLECTION,
-  FIRST_LOBBYING_YEAR,
-  LobbyingFiling,
-  LobbyingRegistrant,
-  REGISTRANTS_COLLECTION,
-  SCRAPER_DOC
-} from "./types"
-
-/**
- * Scraper state stored in Firestore at /scrapers/lobbying.
- *
- * processedDiscUrls: disc URLs already fetched; skip on re-runs.
- * summaryDiscCache:  maps summaryUrl → its known disc URLs so we can skip
- *                    summary page GETs for registrants with no new filings.
- */
-interface ScraperState {
-  processedDiscUrls: string[]
-  summaryDiscCache: Record<string, string[]>
-}
-
-/**
- * Maximum number of new disclosure pages to fetch per function invocation.
- * Each page takes ~1s; this keeps the run well within the 540s timeout.
- * Remaining work is picked up on the next scheduled run.
- */
-const MAX_DISCLOSURES_PER_RUN = 200
-
-/**
- * Scrape lobbying disclosure data for the current and prior calendar year.
- *
- * Runs every 24 hours. New filers arrive semi-annually so daily polling is
- * more than sufficient for steady-state freshness. For initial historical
- * ingestion (2005-present) use the backfillLobbying admin script instead.
- *
- * Progress is checkpointed to Firestore after every disclosure page so the
- * function is fully resumable if it times out or is interrupted.
- */
-export const scrapeLobbying = runWith({ timeoutSeconds: 540, maxInstances: 1 })
-  .pubsub.schedule("every 24 hours")
-  .onRun(async () => {
-    const currentYear = new Date().getFullYear()
-    const years = [currentYear, currentYear - 1]
-
-    const scraperRef = db.doc(SCRAPER_DOC)
-    const scraperDoc = await scraperRef.get()
-    const state: ScraperState = {
-      processedDiscUrls: scraperDoc.data()?.processedDiscUrls ?? [],
-      summaryDiscCache: scraperDoc.data()?.summaryDiscCache ?? {}
-    }
-    const processedSet = new Set<string>(state.processedDiscUrls)
-    const summaryCache: Record<string, string[]> = state.summaryDiscCache
-
-    const client = makePortalClient()
-    let newDiscCount = 0
-
-    for (const year of years) {
-      if (newDiscCount >= MAX_DISCLOSURES_PER_RUN) break
-
-      logger.info(`scrapeLobbying: fetching summary links for ${year}`)
-      let summaryUrls: string[]
-      try {
-        summaryUrls = await fetchSummaryLinks(client, year)
-      } catch (e) {
-        logger.error(
-          `scrapeLobbying: failed to fetch summary links for ${year}`,
-          e
-        )
-        continue
-      }
-      logger.info(
-        `scrapeLobbying: ${summaryUrls.length} registrants for ${year}`
-      )
-
-      for (const summaryUrl of summaryUrls) {
-        if (newDiscCount >= MAX_DISCLOSURES_PER_RUN) break
-
-        // Use cached disc URLs when available to avoid re-fetching summary pages.
-        // For current year we always re-check (new filings arrive mid-year).
-        let discUrls = summaryCache[summaryUrl]
-        if (!discUrls || year === currentYear) {
-          try {
-            const meta = await fetchDisclosureMeta(client, summaryUrl)
-            discUrls = meta.disclosureUrls
-
-            // Write registrant doc (upsert); don't wait for individual writes to
-            // finish — use a bulkWriter for the doc contents but checkpoint the
-            // scraper state separately so interruptions are recoverable.
-            if (meta.entityName && meta.year) {
-              await writeRegistrant(
-                db,
-                meta.entityName,
-                meta.year,
-                meta.regType,
-                discUrls
-              )
-            }
-
-            summaryCache[summaryUrl] = discUrls
-            await scraperRef.set(
-              { summaryDiscCache: summaryCache },
-              { merge: true }
-            )
-          } catch (e) {
-            logger.warn(
-              `scrapeLobbying: failed to fetch summary ${summaryUrl}`,
-              e
-            )
-            continue
-          }
-        }
-
-        const newDiscUrls = discUrls.filter(u => !processedSet.has(u))
-        if (newDiscUrls.length === 0) continue
-
-        for (const discUrl of newDiscUrls) {
-          if (newDiscCount >= MAX_DISCLOSURES_PER_RUN) break
-          try {
-            await processDisclosure(db, client, summaryUrl, discUrl, year)
-            processedSet.add(discUrl)
-            newDiscCount++
-
-            // Checkpoint after every disclosure so restarts lose at most one page
-            await scraperRef.set(
-              { processedDiscUrls: Array.from(processedSet) },
-              { merge: true }
-            )
-          } catch (e) {
-            logger.warn(
-              `scrapeLobbying: failed to process disclosure ${discUrl}`,
-              e
-            )
-          }
-        }
-      }
-    }
-
-    logger.info(`scrapeLobbying: processed ${newDiscCount} new disclosures`)
-  })
-
-// ─── Shared write helpers (also used by backfillLobbying) ────────────────────
-
-/**
- * Write or update a LobbyingRegistrant document. Client list is assembled from
- * the disclosure meta; filing documents are written separately per-bill.
- */
-export async function writeRegistrant(
-  database: Database,
-  entityName: string,
-  year: number,
-  regType: "Lobbyist" | "Employer",
-  disclosureUrls: string[]
-): Promise<void> {
-  const id = registrantId(entityName, year)
-  const ref = database.collection(REGISTRANTS_COLLECTION).doc(id)
-  const partial: Omit<LobbyingRegistrant, "clients" | "fetchedAt"> & {
-    fetchedAt: FirebaseFirestore.Timestamp
-  } = {
-    registrantId: id,
-    entityName,
-    entityNameNorm: normalizeEntityName(entityName),
-    year,
-    generalCourt: yearToGeneralCourt(year),
-    regType,
-    disclosureUrls,
-    fetchedAt: Timestamp.now()
-  }
-  // Merge so repeated runs don't wipe clients accumulated from multiple disclosures
-  await ref.set(partial, { merge: true })
-}
-
-/**
- * Fetch one CompleteDisclosure page and write LobbyingFiling documents.
- * Also updates the registrant's client list.
- */
-export async function processDisclosure(
-  database: Database,
-  client: ReturnType<typeof makePortalClient>,
-  summaryUrl: string,
-  discUrl: string,
-  year: number
-): Promise<void> {
-  const meta = await fetchDisclosureMeta(client, summaryUrl)
-  const detail = await fetchDisclosureDetail(client, discUrl, year)
-
-  const { entityName, regType } = meta
-  const gc = yearToGeneralCourt(year)
-  const entityNameNorm = normalizeEntityName(entityName)
-  const now = Timestamp.now()
-
-  // Update registrant's client list
-  if (entityName && year) {
-    const regRef = database
-      .collection(REGISTRANTS_COLLECTION)
-      .doc(registrantId(entityName, year))
-
-    const clients = detail.compensation.map(c => ({
-      clientName: c.clientName,
-      clientNameNorm: normalizeEntityName(c.clientName),
-      compensation: c.amount
-    }))
-
-    await regRef.set(
-      {
-        registrantId: registrantId(entityName, year),
-        entityName,
-        entityNameNorm,
-        year,
-        generalCourt: gc,
-        regType: regType ?? "Lobbyist",
-        clients,
-        disclosureUrls: [discUrl],
-        fetchedAt: now
-      },
-      { merge: true }
-    )
-  }
-
-  // Write one LobbyingFiling doc per bill row
-  if (detail.bills.length === 0) return
-
-  const writer = database.bulkWriter()
-  for (const bill of detail.bills) {
-    const fid = filingId(
-      entityName,
-      bill.clientName,
-      bill.chamber,
-      bill.billId,
-      gc,
-      bill.position
-    )
-    const doc: LobbyingFiling = {
-      filingId: fid,
-      entityName,
-      entityNameNorm,
-      clientName: bill.clientName,
-      clientNameNorm: normalizeEntityName(bill.clientName),
-      year,
-      generalCourt: gc,
-      chamber: bill.chamber,
-      billId: bill.billId,
-      activityTitle: bill.activityTitle,
-      position: bill.position,
-      amount: bill.amount,
-      fetchedAt: now
-    }
-    writer.set(database.collection(FILINGS_COLLECTION).doc(fid), doc, {
-      merge: false
-    })
-  }
-  await writer.close()
-}
-
-/** All years to scrape, for use by the backfill script. */
-export function allLobbyingYears(): number[] {
-  const current = new Date().getFullYear()
-  const years: number[] = []
-  for (let y = FIRST_LOBBYING_YEAR; y <= current; y++) years.push(y)
-  return years
-}
diff --git a/lobbying-scraper/Dockerfile b/lobbying-scraper/Dockerfile
index 738293459..4b2da65b5 100644
--- a/lobbying-scraper/Dockerfile
+++ b/lobbying-scraper/Dockerfile
@@ -11,4 +11,6 @@ COPY normalize.py portal.py writer.py scrape.py ./
 # Cloud Scheduler invokes the container via HTTP POST to /; handle it minimally.
 ENV PYTHONUNBUFFERED=1
 
-CMD ["python3", "scrape.py", "--mode", "weekly"]
+# ENTRYPOINT is the fixed executable; CMD provides default args that --args overrides.
+ENTRYPOINT ["python3", "scrape.py"]
+CMD ["--mode", "weekly"]
diff --git a/scripts/firebase-admin/backfillLobbying.ts b/scripts/firebase-admin/backfillLobbying.ts
deleted file mode 100644
index a2a66330e..000000000
--- a/scripts/firebase-admin/backfillLobbying.ts
+++ /dev/null
@@ -1,64 +0,0 @@
-/**
- * Backfill lobbying disclosure data from 2005 to the present.
- *
- * Delegates all HTTP fetching and Firestore writes to the Python scraper in
- * lobbying-scraper/. The TypeScript layer handles argument parsing and
- * environment setup only.
- *
- * Usage:
- *   GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
- *     yarn firebase-admin run-script backfillLobbying --env dev
- *
- * Options (passed through to scrape.py):
- *   --year  NUMBER   Only process this year
- *   --limit NUMBER   Max registrants per year (for testing)
- *
- * Requires: pip install -r lobbying-scraper/requirements.txt
- * Or run inside the maple-2025 conda environment.
- */
-
-import { spawn } from "child_process"
-import path from "path"
-import { z } from "zod"
-import { Script } from "./types"
-
-const Args = z
-  .object({
-    year: z.number().int().min(2005).optional(),
-    limit: z.number().int().positive().optional()
-  })
-  .passthrough()
-
-const SCRAPER = path.resolve(__dirname, "../../lobbying-scraper/scrape.py")
-
-export const script: Script = async ({ env, args }) => {
-  const { year, limit } = Args.parse(args)
-
-  if (env === "local") {
-    throw new Error(
-      "backfillLobbying requires --env dev or --env prod " +
-        "(it writes to a real Firestore project; local emulator not supported yet)"
-    )
-  }
-
-  const pyArgs = ["--mode", "backfill"]
-  if (year) pyArgs.push("--year", String(year))
-  if (limit) pyArgs.push("--limit", String(limit))
-
-  console.log(`Running: python3 ${SCRAPER} ${pyArgs.join(" ")}`)
-  console.log(
-    `Firestore project: ${process.env.GCLOUD_PROJECT || "(from ADC)"}`
-  )
-
-  await new Promise<void>((resolve, reject) => {
-    const proc = spawn("python3", [SCRAPER, ...pyArgs], {
-      stdio: ["ignore", "inherit", "inherit"],
-      env: { ...process.env }
-    })
-    proc.on("close", code => {
-      if (code === 0) resolve()
-      else reject(new Error(`scrape.py exited with code ${code}`))
-    })
-    proc.on("error", reject)
-  })
-}