diff --git a/.gitignore b/.gitignore
index 571150641..7301e0ec2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -92,3 +92,10 @@ cert.txt
 # local MCP server config (contains auth tokens)
 .mcp.json
 mcp-server/create-agent-key.ts
+
+# Claude
+CLAUDE.md
+
+# gcloud
+.gcloudignore
+
diff --git a/docs/lobbying-disclosure-ingestion.md b/docs/lobbying-disclosure-ingestion.md
new file mode 100644
index 000000000..51719f342
--- /dev/null
+++ b/docs/lobbying-disclosure-ingestion.md
@@ -0,0 +1,678 @@
+# Lobbying Disclosure Ingestion Pipeline
+
+## Overview
+
+The MA Secretary of State lobbying portal
+([sec.state.ma.us/LobbyistPublicSearch](https://www.sec.state.ma.us/LobbyistPublicSearch/))
+publishes semi-annual disclosure filings for all registered lobbyists and
+lobbying entities. This document describes the plan for scraping that data and
+storing it in Firestore in a way that allows joining to MAPLE bill data.
+
+The portal has three levels of pages:
+
+1. **Search page** → one row per registrant per year
+2. **Summary page** → registrant metadata + links to semi-annual disclosure
+   filings
+3. **CompleteDisclosure page** → per-client compensation table + per-client bill
+   activity tables
+
+Historical data goes back to 2005. MAPLE has bill data only from ~2020 onward,
+so bill joins will only resolve for filings from the 192nd General Court (2021)
+and later. All historical filings are ingested regardless.
+
+---
+
+## Terminology
+
+The portal has two registrant types:
+
+- **Lobbyist** — an individual person who lobbies directly on behalf of clients.
+- **Employer** — a lobbying firm that employs individual lobbyists and is
+  retained by clients. Called "Lobbyist Entity" on the portal.
+
+In both cases, the registrant reports compensation received from each **client**
+(the organization that hired them) and which bills they lobbied for that client.
+
+---
+
+## Firestore Data Model
+
+Two top-level collections, normalized by registrant and by lobbying activity
+record.
+
+### `/lobbyingRegistrants/{registrantId}`
+
+`registrantId` is a SHA-256 hash (first 40 hex chars) of the logical key
+`{entityName}_{year}` (stable, dedup-safe).
+
+One model covers both individual lobbyists and lobbying firms. A separate model
+is not needed because the portal search returns both under the same schema, and
+per-filing detail pages do not expose which individual lobbyists within a firm
+worked on which bill.
+
+```typescript
+interface LobbyingRegistrant {
+  registrantId: string // SHA-256 hash (first 40 hex chars) of "{entityName}_{year}"
+  entityName: string // firm name or individual lobbyist name (raw portal value)
+  entityNameNorm: string // normalized form; see Normalization section
+  year: number
+  generalCourt: number // computed from year
+  regType: "Lobbyist" | "Employer"
+  clients: LobbyingClient[]
+  disclosureUrls: string[] // source portal URLs, for audit trail
+  fetchedAt: Timestamp
+}
+
+interface LobbyingClient {
+  clientName: string
+  clientNameNorm: string // normalized form
+  compensation: number | null
+}
+```
+
+### `/lobbyingFilings/{filingId}`
+
+`filingId` is a SHA-256 hash (first 40 hex chars) of the logical key
+`{entityName}_{clientName}_{chamber}_{activityRef}_{generalCourt}`.
+
+```typescript
+type LobbyingChamber =
+  | "House Bill"
+  | "Senate Bill"
+  | "House Docket"
+  | "Senate Docket"
+  | "Executive" // lobbying of executive branch agencies
+  | "Other" // catch-all for rare legacy codes (FY, CMR, etc.)
+
+interface LobbyingFiling {
+  filingId: string
+  entityName: string // raw portal value
+  entityNameNorm: string // normalized form
+  clientName: string // raw portal value; "_total_salary_" sentinel for pre-2013
+  clientNameNorm: string // normalized form
+  year: number
+  generalCourt: number
+  chamber: LobbyingChamber
+  // For legislative chambers: the prefixed bill number (e.g. "H1234", "HD56").
+  // For Executive and Other: null; the agency name is never stored here.
+  billId: string | null
+  activityTitle: string // bill title (legislative) or meeting description (executive)
+  position: string // "Support" | "Oppose" | "Neutral" | etc.; empty for executive
+  amount: number | null // compensation allocated to this activity
+  fetchedAt: Timestamp
+}
+```
+
+### Constructing `billId` from Raw Portal Data
+
+The portal stores bill numbers as bare integers and records the chamber
+separately. The `billId` field — which maps to `Bill.id` in MAPLE — is
+constructed during ingest by combining chamber prefix and integer:
+
+| `chamber`       | Prefix | Example raw | `billId` |
+| --------------- | ------ | ----------- | -------- |
+| `House Bill`    | `H`    | `1234`      | `H1234`  |
+| `Senate Bill`   | `S`    | `1234`      | `S1234`  |
+| `House Docket`  | `HD`   | `56`        | `HD56`   |
+| `Senate Docket` | `SD`   | `56`        | `SD56`   |
+| `Executive`     | —      | agency name | `null`   |
+| `Other`         | —      | varies      | `null`   |
+
+Note: `H1234` and `S1234` are distinct bills even though they share the same
+integer. The prefix is required to disambiguate. `billId` is `null` for
+non-legislative chambers.
+
+#### Legacy chamber code normalization
+
+The portal uses short-form codes in older filings, normalized during ingest:
+
+| Raw value | Stored as     |
+| --------- | ------------- |
+| `HB`      | `House Bill`  |
+| `SB`      | `Senate Bill` |
+
+Rare codes (`FY`, `C`, `CMR`, `HR`, etc.) are stored as `Other`.
+
+### Joining to Bill Data
+
+**The join only applies to legislative chambers** (`House Bill`, `Senate Bill`,
+`House Docket`, `Senate Docket`) where `billId` is non-null. For `Executive`
+and `Other`, no join should be attempted.
+
+```typescript
+// Only valid when filing.billId !== null
+db.collection(`/generalCourts/${filing.generalCourt}/bills`).doc(filing.billId)
+```
+
+---
+
+## Entity Name Normalization
+
+The portal does not enforce consistent name formatting. The same client or
+registrant may appear as "Acme Corp.", "ACME CORPORATION", "Acme, Inc. d/b/a
+Acme Consulting", etc. across filings and years. Without normalization,
+grouping by entity is unreliable.
+
+Both `entityName` and `clientName` are normalized using the following pipeline,
+applied in order. The raw portal value is always preserved alongside the
+normalized form.
+
+### Normalization pipeline
+
+1. **Uppercase** — convert the entire string to upper case.
+2. **Strip d/b/a suffix** — remove everything from the first occurrence of
+   `D/B/A` or `DBA` (and spacing variants) onward, so the registered name is
+   used rather than a trade name.
+3. **Hyphen → space** — replace `-` with ` ` so `LAN-TEL` and `LAN TEL`
+   collapse to the same key.
+4. **Punctuation → space** — replace `,`, `.`, `'`, `‘`, `’`, `(`, `)` with
+   space. Replacement with space (not empty string) prevents adjacent tokens
+   from concatenating (e.g. `,INC` becomes ` INC`, which is then caught by step
+   5).
+5. **Remove legal entity type words** — whole-word removal of: `LLC`, `LLP`,
+   `INC`, `INCORPORATED`, `CORPORATION`, `CORP`, `LTD`, `LIMITED`, `PC`,
+   `PLLC`.
+6. **Remove "THE"** — whole-word removal anywhere in the string (not just as a
+   leading prefix).
+7. **Ampersand → AND** — replace `&` with `AND`.
+8. **Fix known typo** — replace `ASSICIATES` with `ASSOCIATES` (legacy portal
+   data).
+9. **Remove professional suffix phrases** — whole-phrase removal of: `LAW
+OFFICE OF`, `AND ASSOCIATES`, `& ASSOCIATES`, `AND ASSOC`, `ATTORNEY AT
+LAW`, `ATTORNEY@LAW`, `ATTORNET AT LAW`, `AND PARTNERS`, `PUBLIC POLICY
+GROUP`, `LEGISLATIVE SERVICES`, `POLICY GROUP`, `ASSOCIATES`, `COUNSELLORS
+AT LAW`.
+10. **Collapse whitespace** — replace runs of whitespace with a single space and
+    strip leading/trailing whitespace.
+
+### Usage
+
+`entityNameNorm` and `clientNameNorm` are stored on every document and filing.
+They should be used for grouping, deduplication, and display-level matching.
+Raw names are preserved for provenance and audit.
+
+---
+
+## Deduplication and Amount Aggregation
+
+### Does lobbying the same bill multiple times mean we should sum amounts?
+
+The portal collects two semi-annual disclosure filings per registrant per year
+(one for each 6-month period). In theory, a registrant could report the same
+bill in both H1 and H2 filings with separate compensation amounts that should
+be summed. Analysis of the actual data shows this does not occur: after
+processing, zero rows share the same `(entityName, clientName, year,
+generalCourt, billId, position)` — each (registrant, client, bill, year)
+combination appears exactly once. The semi-annual periods report different
+activity, not the same activity twice.
+
+The same registrant can lobby the same bill across multiple General Courts
+(observed up to 6 times across years). These are stored as separate documents
+per `generalCourt` and should not be summed — each court is a distinct
+legislative session.
+
+### Null-bill row deduplication
+
+The one real duplication artifact in the portal data is **null-bill rows** —
+entries filed when a registrant had no specific bills to report for a client in
+a period. These appear in both the H1 and H2 disclosures as identical rows and
+should be collapsed. During ingest, if the same `(entityName, clientName, year,
+generalCourt, chamber, position)` with a null `billId` is encountered more than
+once, keep the row with the highest `amount` so no spend is lost if the two
+copies carry different values (in practice amounts are usually both zero).
+
+### Ingest strategy
+
+When processing multiple disclosure URLs for the same registrant+year, write
+`lobbyingFilings` documents using the logical key as the document ID. A
+subsequent disclosure URL that produces the same document ID will naturally
+upsert (overwrite) rather than duplicate. For null-bill rows, since `billId` is
+null, include `chamber` in the document ID to avoid false merges between
+executive and legislative null rows.
+
+---
+
+## Scraper Architecture
+
+### Why a standalone Cloud Run container
+
+The MA SoS portal is protected by Imperva WAF, which uses TLS fingerprinting to
+classify HTTP clients at the network layer before examining any headers. Node.js
+produces a TLS fingerprint that Imperva challenges with a JavaScript
+verification page; Python's `requests` library produces a fingerprint that
+Imperva allows through without challenge. This is a runtime-level constraint
+that cannot be addressed by header configuration or cipher reordering alone.
+
+The scraper therefore runs as a standalone **Cloud Run container** written in
+Python, deployed alongside the existing MCP server container. All data modeling,
+Firestore collection/field names, and normalization logic are documented here and
+kept consistent between the Python container and the TypeScript type definitions
+in `functions/src/lobbying/types.ts`.
+
+### Cloud Run container: `lobbying-scraper/`
+
+**Files:** `lobbying-scraper/{scrape,portal,normalize,writer}.py`
+
+- Scheduled weekly by Cloud Scheduler
+- Runs an incremental check: fetches the current and prior year's summary links
+  (one POST), compares disc URLs against the Firestore cursor, and **exits
+  immediately if nothing is new** (fast path, typically seconds)
+- When new or updated disclosures are found, fetches and processes them
+- Persists a cursor in `/scrapers/lobbying`:
+  - `processedDiscUrls: string[]` — disc URLs already written; skipped on
+    re-runs
+  - `summaryDiscCache: {[summaryUrl]: string[]}` — maps summary page URLs to
+    their disc URLs so summary page GETs are skipped for prior-year registrants
+    whose disclosures are all already processed
+- For each new disclosure URL:
+  - Parse registrant + client compensation rows → upsert `lobbyingRegistrants`
+  - Parse bill activity rows → batch-write `lobbyingFilings`
+- 1s delay between requests; exponential backoff on transient failures
+
+### Incremental strategy
+
+In steady state (after the initial backfill), each weekly run:
+
+1. One POST to fetch all summary links for current + prior year
+2. For prior-year registrants with all disc URLs in the cursor: zero GETs
+3. For current-year registrants: one GET per summary page to check for new
+   disclosure periods
+4. For any new disc URLs: one GET per disclosure page
+
+New filings arrive twice a year (semi-annual reporting periods). Between
+periods, the run completes in under a minute.
+
+The backfill script (`--mode backfill`) uses a separate subcollection cursor at
+`/scrapers/lobbyingBackfill/processedUrls/{urlHash}` so it does not interfere
+with the live scraper state.
+
+### Legacy Format (pre-2013)
+
+The portal uses a different HTML layout for filings before ~2013: total salary
+is not broken down by client, and all bill activity is in a single table. These
+are stored with `clientName: "_total_salary_"` so callers can detect and filter
+them. No bill-level compensation amount is available for these years.
+
+---
+
+## New Files
+
+```
+functions/src/lobbying/
+  types.ts     — Runtypes schema definitions for LobbyingRegistrant, LobbyingFiling
+  normalize.ts — Entity name normalization pipeline (also used client-side)
+  index.ts     — Re-exports
+
+lobbying-scraper/
+  scrape.py        — Entry point: --mode weekly (incremental) | --mode backfill
+  portal.py        — HTTP + HTML parsing
+  normalize.py     — Port of normalize.ts
+  writer.py        — Firestore document construction + writes
+  requirements.txt — requests, beautifulsoup4, google-cloud-firestore
+  Dockerfile       — Python 3.12-slim image
+```
+
+The TypeScript lobbying module (`functions/src/lobbying/`) contains only the
+schema types and normalization logic. There is no TypeScript scraper or
+Firebase Function — ingestion is handled entirely by the Cloud Run container.
+This follows the same pattern as the MCP server and avoids the complexity of
+running multiple language runtimes in the same Firebase Functions deployment.
+
+---
+
+## Deploying the Cloud Run Container
+
+Follows the same pattern as the MCP server. The Artifact Registry repo
+(`maple-lobbying`) and Cloud Run job (`maple-lobbying-scraper`) are already
+created in `digital-testimony-dev`.
+
+```bash
+cd lobbying-scraper
+IMAGE=us-central1-docker.pkg.dev/digital-testimony-dev/maple-lobbying/scraper:latest
+docker build -t $IMAGE . && docker push $IMAGE
+
+gcloud run jobs update maple-lobbying-scraper \
+  --image=$IMAGE \
+  --project=digital-testimony-dev \
+  --region=us-central1
+```
+
+For a new project (prod), create the job first:
+
+```bash
+gcloud artifacts repositories create maple-lobbying \
+  --repository-format=docker --location=us-central1 --project=<project>
+
+gcloud run jobs create maple-lobbying-scraper \
+  --image=$IMAGE \
+  --project=<project> \
+  --region=us-central1 \
+  --task-timeout=30m \
+  --max-retries=0
+
+# Schedule weekly (Mondays 6am UTC)
+gcloud scheduler jobs create http maple-lobbying-weekly \
+  --schedule="0 6 * * 1" \
+  --uri="https://us-central1-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/<project>/jobs/maple-lobbying-scraper:run" \
+  --http-method=POST \
+  --oauth-service-account-email=<scheduler-sa>@<project>.iam.gserviceaccount.com \
+  --location=us-central1
+```
+
+## Historical Backfill
+
+Runs `scrape.py --mode backfill` directly. Resumable — the subcollection
+cursor at `/scrapers/lobbyingBackfill/processedUrls` tracks progress.
+Requires `lobbying-scraper/` deps or the `maple-2025` conda environment.
+
+```bash
+cd lobbying-scraper
+
+# Test a single year with no writes
+GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
+  python3 scrape.py --mode backfill --year 2024 --limit 3 --dry-run
+
+# Run a single year for real
+GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
+  python3 scrape.py --mode backfill --year 2024
+
+# Full history (2005-present, resumable)
+GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
+  python3 scrape.py --mode backfill
+```
+
+---
+
+## Firestore Rules
+
+Add read-only public rules alongside the existing `generalCourts` rule:
+
+```
+match /lobbyingRegistrants/{doc} { allow read: if true; }
+match /lobbyingFilings/{doc}     { allow read: if true; }
+```
+
+---
+
+## Firestore Indexes
+
+Add composite indexes for common query patterns:
+
+| Collection        | Fields                                 | Use case                                 |
+| ----------------- | -------------------------------------- | ---------------------------------------- |
+| `lobbyingFilings` | `generalCourt ASC, billId ASC`         | Fetch all legislative filings for a bill |
+| `lobbyingFilings` | `generalCourt ASC, chamber ASC`        | Filter by chamber within a court         |
+| `lobbyingFilings` | `generalCourt ASC, entityNameNorm ASC` | Fetch all filings for a registrant       |
+| `lobbyingFilings` | `generalCourt ASC, clientNameNorm ASC` | Fetch all filings for a client           |
+
+Note: bill-join queries should always filter on `chamber` (or check
+`billId !== null`) to exclude `Executive` and `Other` rows before treating
+`billId` as a MAPLE bill reference.
+
+---
+
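+## Function Export
+
+No Cloud Function export is needed. Ingestion runs entirely in the Cloud Run
+container, so do not add a `scrapeLobbying` export to
+`functions/src/index.ts`. The lobbying module
+(`functions/src/lobbying/index.ts`) only re-exports the schema types and the
+normalization helpers.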
+
+---
+
+## Implementation Status
+
+| File                                  | Status  | Notes                                                    |
+| ------------------------------------- | ------- | -------------------------------------------------------- |
+| `functions/src/lobbying/types.ts`     | ✅ Done | Firestore schema types; imported by future frontend code |
+| `functions/src/lobbying/normalize.ts` | ✅ Done | Normalization pipeline; also ported to `normalize.py`    |
+| `functions/src/lobbying/index.ts`     | ✅ Done | Re-exports types and normalize                           |
+| `firestore.rules`                     | ✅ Done |                                                          |
+| `firestore.indexes.json`              | ✅ Done |                                                          |
+| `lobbying-scraper/normalize.py`       | ✅ Done | Port of normalize.ts                                     |
+| `lobbying-scraper/portal.py`          | ✅ Done | HTTP + HTML parsing                                      |
+| `lobbying-scraper/writer.py`          | ✅ Done | Firestore document construction                          |
+| `lobbying-scraper/scrape.py`          | ✅ Done | Entry point; `--mode weekly` and `--mode backfill`       |
+| `lobbying-scraper/Dockerfile`         | ✅ Done | Python 3.12-slim; deployed to Cloud Run                  |
+
+### Document ID scheme
+
+Both `registrantId` and `filingId` are SHA-256 hashes (first 40 hex chars) of
+their respective logical keys. Hashes are used rather than slugified strings
+because entity names and client names contain arbitrary Unicode and punctuation
+that would require aggressive sanitization to fit Firestore ID constraints. The
+hash is stable across runs for the same logical record.
+
+---
+
+## Future Work (Subsequent PRs)
+
+### Frontend
+
+- **Dedicated lobbying pages**
+
+  - `/lobbyists` index: searchable list of registrants with total compensation,
+    client count, and year filter
+  - `/lobbyists/{registrantId}` profile: full client list, all bills lobbied,
+    compensation over time
+  - `/clients/{clientNameNorm}` profile: registrants hired, bills lobbied,
+    total spend per year
+
+- **Bill page integration** (`/bills/{court}/{billId}`)
+
+  - "Lobbying activity" section listing registrants + clients that lobbied this
+    bill, with position (Support / Oppose / Neutral) and compensation where
+    available
+  - Link to registrant profile pages
+
+- **Organization profile page integration**
+  - If an organization's normalized name matches a `clientNameNorm` in
+    `lobbyingFilings`, surface a "Lobbying history" panel showing which bills
+    they lobbied and which registrants they hired
+
+### MCP Tools
+
+Expose lobbying data via the MAPLE MCP server so that AI agents and Claude can
+answer questions like "who lobbied bill H1234?" or "what did Acme Corp lobby
+for in 2024?".
+
+- **`get_lobbying_filings_for_bill`** — given `generalCourt` + `billId`, return
+  all `lobbyingFilings` for that bill with registrant, client, position, and
+  amount
+- **`get_lobbying_registrant`** — given `registrantId`, return the registrant
+  document with client list and disclosure URLs
+- **`search_lobbying_by_client`** — given a client name (raw or normalized),
+  return matching filings across all courts
+- **`get_lobbying_summary_for_bill`** — aggregate view: unique registrant count,
+  unique client count, total compensation (where non-null), position breakdown
+
+---
+
+## Incremental Test Plan
+
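+Testing proceeds from the inside out: unit logic first, then live portal
+fetches against the real site, then a small Firestore write, then a full
+backfill year, then steady-state operation of the scheduled scraper.
+
+> NOTE(review): Steps 2–4 call a TypeScript `portal` module and Steps 5, 7,
+> and 8 call a `backfillLobbying` admin script, neither of which is listed
+> under New Files. Confirm they exist, or run the equivalent
+> `lobbying-scraper/scrape.py` commands from the Historical Backfill section.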
+
+### Step 1 — Unit test: normalization
+
+Run the normalization pipeline against known inputs and verify the outputs match
+the reference implementation.
+
+```bash
+# In a Node REPL or ts-node session:
+conda run -n maple-2025 ts-node -P tsconfig.script.json -e "
+const { normalizeEntityName } = require('./functions/src/lobbying/normalize')
+console.log(normalizeEntityName('Acme Corp., Inc. d/b/a Acme Consulting'))
+// Expected: 'ACME'
+console.log(normalizeEntityName('LAN-TEL COMMUNICATIONS, INC.'))
+// Expected: 'LAN TEL COMMUNICATIONS'
+console.log(normalizeEntityName('Law Office of Jane Smith, LLC'))
+// Expected: 'JANE SMITH'
+"
+```
+
+### Step 2 — Unit test: chamber normalization and billId construction
+
+```bash
+conda run -n maple-2025 ts-node -P tsconfig.script.json -e "
+const { normalizeChamber, constructBillId } = require('./functions/src/lobbying/portal')
+console.log(normalizeChamber('HB'))           // House Bill
+console.log(normalizeChamber('SB'))           // Senate Bill
+console.log(normalizeChamber('Executive'))    // Executive
+console.log(normalizeChamber('FY2024'))       // Other
+console.log(constructBillId('House Bill', '1234'))   // H1234
+console.log(constructBillId('Senate Bill', '567'))   // S567
+console.log(constructBillId('House Docket', '89'))   // HD89
+console.log(constructBillId('Executive', 'EOEEA'))   // null
+"
+```
+
+### Step 3 — Live portal fetch: summary links
+
+Verify the portal is reachable and returns results for the current year. Use
+`--limit 1` to minimize requests.
+
+```bash
+conda run -n maple-2025 ts-node -P tsconfig.script.json -e "
+const { makePortalClient, fetchSummaryLinks } = require('./functions/src/lobbying/portal')
+const client = makePortalClient()
+fetchSummaryLinks(client, 2024).then(urls => {
+  console.log('Summary links for 2024:', urls.length)
+  console.log('First URL:', urls[0])
+}).catch(console.error)
+"
+```
+
+Expected: ~400–600 URLs, each containing `Summary.aspx`.
+
+### Step 4 — Live portal fetch: summary meta + one disclosure
+
+Pick the first summary URL from Step 3 and fetch its meta and first disclosure.
+
+```bash
+conda run -n maple-2025 ts-node -P tsconfig.script.json -e "
+const { makePortalClient, fetchSummaryLinks, fetchDisclosureMeta, fetchDisclosureDetail } = require('./functions/src/lobbying/portal')
+async function main() {
+  const client = makePortalClient()
+  const [summaryUrl] = await fetchSummaryLinks(client, 2024)
+  const meta = await fetchDisclosureMeta(client, summaryUrl)
+  console.log('Meta:', JSON.stringify(meta, null, 2))
+  if (meta.disclosureUrls[0]) {
+    const detail = await fetchDisclosureDetail(client, meta.disclosureUrls[0], 2024)
+    console.log('Compensation rows:', detail.compensation.length)
+    console.log('Bill rows:', detail.bills.length)
+    console.log('First bill:', detail.bills[0])
+  }
+}
+main().catch(console.error)
+"
+```
+
+Verify: `meta.entityName` is non-empty, `meta.regType` is `"Lobbyist"` or
+`"Employer"`, bill rows have `billId` set correctly for legislative chambers.
+
+### Step 5 — Backfill: single year, small limit against dev Firestore
+
+Write a small batch to the dev Firestore emulator or dev project.
+
+```bash
+# Against local emulator:
+conda run -n maple-2025 yarn firebase-admin run-script backfillLobbying \
+  --env local -- --year 2024 --limit 3
+
+# Against dev project (writes real Firestore):
+GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
+  conda run -n maple-2025 yarn firebase-admin run-script backfillLobbying \
+  --env dev -- --year 2024 --limit 3
+```
+
+Verify in Firestore console or emulator UI:
+
+- `lobbyingRegistrants` has 3 documents with `entityName`, `entityNameNorm`,
+  `regType`, `clients`, `generalCourt`
+- `lobbyingFilings` has documents with `billId` non-null for legislative rows
+  and null for Executive rows
+- `/scrapers/lobbyingBackfill/processedUrls` has entries with `url` and
+  `processedAt` fields
+- Re-running the same command skips already-processed URLs (output shows 0 new
+  disclosures)
+
+### Step 6 — Spot-check: bill join
+
+Pick a `lobbyingFiling` document with a non-null `billId` and a `generalCourt`
+≥ 192. Verify the bill exists in MAPLE:
+
+```
+/generalCourts/{filing.generalCourt}/bills/{filing.billId}
+```
+
+If the bill is found, the join key is correct. If not found, check: (a) whether
+MAPLE has data for that court, (b) whether the bill number format matches
+(prefix + integer, no leading zeros).
+
+### Step 7 — Backfill: full current year
+
+Once Step 5 passes, run without `--limit` for the current year:
+
+```bash
+GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
+  conda run -n maple-2025 yarn firebase-admin run-script backfillLobbying \
+  --env dev -- --year 2024
+```
+
+Monitor progress via console output. Expected: ~500–600 registrants, ~1,000
+disclosure pages, several thousand filing documents written.
+
+### Step 8 — Backfill: full history (2005–present)
+
+Run without `--year` to process all years. Can be interrupted and resumed:
+
+```bash
+GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
+  conda run -n maple-2025 yarn firebase-admin run-script backfillLobbying \
+  --env dev
+```
+
+Expected runtime: several hours at 1s/request. The subcollection cursor at
+`/scrapers/lobbyingBackfill/processedUrls` allows safe interruption and
+resumption.
+
+### Step 9 — Deploy and verify the Cloud Run job
+
+Build and push the image, then update the job in the dev project:
+
+```bash
+cd lobbying-scraper
+IMAGE=us-central1-docker.pkg.dev/digital-testimony-dev/maple-lobbying/scraper:latest
+docker build -t $IMAGE . && docker push $IMAGE
+
+gcloud run jobs update maple-lobbying-scraper \
+  --image=$IMAGE \
+  --project=digital-testimony-dev \
+  --region=us-central1
+```
+
+Trigger a manual run:
+
+```bash
+gcloud run jobs execute maple-lobbying-scraper \
+  --project=digital-testimony-dev \
+  --region=us-central1
+```
+
+Verify: Cloud Run job logs show the expected number of new disclosures (should
+be near zero if backfill completed, since current+prior year are already
+processed).
+
+---
+
+## Design Decisions
+
+| Decision                    | Choice                                                                       | Rationale                                                                                                                                                                |
+| --------------------------- | ---------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| Collection placement        | Top-level `/lobbyingRegistrants`, `/lobbyingFilings`                         | Lobbying data spans multiple General Courts and is not scoped to a single court like bills/members                                                                       |
+| Single registrant model     | One type, `regType: "Lobbyist" \| "Employer"`                                | Individual lobbyists and firms share the same portal schema; per-bill individual attribution is not available                                                            |
+| `billId` construction       | `{chamberPrefix}{billNumber}` at ingest time                                 | Raw portal data stores chamber and integer separately; the composite is what matches MAPLE's `Bill.id`                                                                   |
+| `billId` null for Executive | `null` instead of agency name                                                | Prevents accidental bill lookups; makes join guard explicit at the type level                                                                                            |
+| Normalized name fields      | Store both raw and `*Norm` fields                                            | Raw names preserved for provenance; normalized names used for grouping and matching                                                                                      |
+| HTML parser                 | `beautifulsoup4`                                                             | The scraper runs in Python (Node.js is challenged by the portal's WAF); `beautifulsoup4` is listed in `lobbying-scraper/requirements.txt`                                |
+| Live scraper cursor         | Array in `/scrapers/lobbying` doc                                            | ~1,000 URLs/year fits well within the 1 MB Firestore doc limit; simple and atomic with other scraper state                                                               |
+| Backfill cursor             | Firestore subcollection `/scrapers/lobbyingBackfill/processedUrls/{urlHash}` | Full 2005-present history (~50,000 URLs) would exceed the 1 MB doc limit; subcollection scales without bound and is durable, inspectable, and resumable from any machine |
+| Incremental strategy        | Skip already-processed disclosure URLs; write docs by logical key (upsert)   | Survives function restarts and re-runs without re-fetching already-scraped pages; natural upsert prevents duplicates without an explicit dedup pass                      |
+| Legacy format (pre-2013)    | Store with `clientName: "_total_salary_"` sentinel                           | Preserves data completeness; callers can filter on this value                                                                                                            |
+| Historical data             | Admin backfill script (2005 → present)                                       | Full history is ingested once; the weekly Cloud Run job maintains current+prior year going forward                                                                       |
diff --git a/firestore.indexes.json b/firestore.indexes.json
index 83cb3fa6d..c267a6868 100644
--- a/firestore.indexes.json
+++ b/firestore.indexes.json
@@ -788,25 +788,46 @@
       "collectionGroup": "ballotQuestions",
       "queryScope": "COLLECTION",
       "fields": [
-        { "fieldPath": "electionYear", "order": "ASCENDING" },
-        { "fieldPath": "ballotStatus", "order": "ASCENDING" }
+        {
+          "fieldPath": "electionYear",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "ballotStatus",
+          "order": "ASCENDING"
+        }
       ]
     },
     {
       "collectionGroup": "publishedTestimony",
       "queryScope": "COLLECTION_GROUP",
       "fields": [
-        { "fieldPath": "ballotQuestionId", "order": "ASCENDING" },
-        { "fieldPath": "publishedAt", "order": "DESCENDING" }
+        {
+          "fieldPath": "ballotQuestionId",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "publishedAt",
+          "order": "DESCENDING"
+        }
       ]
     },
     {
       "collectionGroup": "publishedTestimony",
       "queryScope": "COLLECTION",
       "fields": [
-        { "fieldPath": "billId", "order": "ASCENDING" },
-        { "fieldPath": "court", "order": "ASCENDING" },
-        { "fieldPath": "ballotQuestionId", "order": "ASCENDING" }
+        {
+          "fieldPath": "billId",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "court",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "ballotQuestionId",
+          "order": "ASCENDING"
+        }
       ]
     },
     {
@@ -898,6 +919,62 @@
           }
         }
       ]
+    },
+    {
+      "collectionGroup": "lobbyingFilings",
+      "queryScope": "COLLECTION",
+      "fields": [
+        {
+          "fieldPath": "generalCourt",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "billId",
+          "order": "ASCENDING"
+        }
+      ]
+    },
+    {
+      "collectionGroup": "lobbyingFilings",
+      "queryScope": "COLLECTION",
+      "fields": [
+        {
+          "fieldPath": "generalCourt",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "chamber",
+          "order": "ASCENDING"
+        }
+      ]
+    },
+    {
+      "collectionGroup": "lobbyingFilings",
+      "queryScope": "COLLECTION",
+      "fields": [
+        {
+          "fieldPath": "generalCourt",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "entityNameNorm",
+          "order": "ASCENDING"
+        }
+      ]
+    },
+    {
+      "collectionGroup": "lobbyingFilings",
+      "queryScope": "COLLECTION",
+      "fields": [
+        {
+          "fieldPath": "generalCourt",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "clientNameNorm",
+          "order": "ASCENDING"
+        }
+      ]
     }
   ],
   "fieldOverrides": [
diff --git a/firestore.rules b/firestore.rules
index a95586279..42db67276 100644
--- a/firestore.rules
+++ b/firestore.rules
@@ -103,6 +103,14 @@ service cloud.firestore {
       allow read: if true;
       allow write: if false;
     }
+    match /lobbyingRegistrants/{id} {
+      allow read: if true;
+      allow write: if false;
+    }
+    match /lobbyingFilings/{id} {
+      allow read: if true;
+      allow write: if false;
+    }
     match /transcriptions/{tid} {
       // public, read-only
       allow read: if true
diff --git a/functions/src/lobbying/index.ts b/functions/src/lobbying/index.ts
new file mode 100644
index 000000000..6d039ae51
--- /dev/null
+++ b/functions/src/lobbying/index.ts
@@ -0,0 +1,2 @@
+export * from "./types"
+export { normalizeEntityName } from "./normalize"
diff --git a/functions/src/lobbying/normalize.ts b/functions/src/lobbying/normalize.ts
new file mode 100644
index 000000000..a7beb338f
--- /dev/null
+++ b/functions/src/lobbying/normalize.ts
@@ -0,0 +1,72 @@
+/**
+ * Entity name normalization pipeline.
+ *
+ * The SoS portal does not enforce consistent name formatting. The same client or
+ * registrant may appear as "Acme Corp.", "ACME CORPORATION", "Acme, Inc. d/b/a
+ * Acme Consulting", etc. across filings and years.
+ *
+ * normalizeEntityName collapses such variants. Its numbered steps must run in
+ * the order listed; changing the order produces different (incorrect) output.
+ */
+
+// Step 2: strip d/b/a trade-name suffix before any other transforms so the
+// trade name doesn't bleed into the canonical form.
+const DBA_RE = /\s+D\s*\/+B\s*\/+A?\s+.*|\s+DBA\s+.*/i
+
+// Step 5: remove legal entity type words with whole-word matching so
+// "INCORPORATED" and "CORP" are caught in addition to "LLC"/"INC".
+const LEGAL_ENTITY_RE =
+  /\b(LLC|LLP|INC|INCORPORATED|CORPORATION|CORP|LTD|LIMITED|PC|PLLC)\b/g
+
+// Step 6: remove "THE" as a whole word anywhere (not just as a leading prefix).
+const THE_RE = /\bTHE\b/g
+
+// Step 9: professional suffix phrases to remove wholesale.
+const MISC_PHRASES = [
+  "LAW OFFICE OF",
+  "AND ASSOCIATES",
+  "& ASSOCIATES",
+  "AND ASSOC",
+  "ATTORNEY AT LAW",
+  "ATTORNEY@LAW",
+  "ATTORNET AT LAW", // known portal typo
+  "AND PARTNERS",
+  "PUBLIC POLICY GROUP",
+  "LEGISLATIVE SERVICES",
+  "POLICY GROUP",
+  "ASSOCIATES",
+  "COUNSELLORS AT LAW"
+]
+
+/**
+ * Reduce a registrant or client name to its canonical comparison form.
+ *
+ * The numbered steps are order-dependent. lobbying-scraper/normalize.py is a
+ * direct port of this function, so the two must produce identical output.
+ *
+ * @param raw - Name as it appears on the portal; may be null or undefined.
+ * @returns The normalized upper-case name, or "" when `raw` is empty/missing.
+ */
+export function normalizeEntityName(raw: string | null | undefined): string {
+  if (!raw) return ""
+
+  let x = raw.toUpperCase() // Step 1: uppercase
+
+  x = x.replace(DBA_RE, "") // Step 2: strip d/b/a suffix
+
+  x = x.replace(/-/g, " ") // Step 3: hyphen → space
+
+  // Step 4: punctuation → space (not empty string, so ",INC" → " INC" → caught
+  // by step 5's whole-word removal).
+  for (const ch of [",", ".", "'", "‘", "’", "(", ")"]) {
+    x = x.split(ch).join(" ")
+  }
+
+  x = x.replace(LEGAL_ENTITY_RE, " ") // Step 5: remove legal entity type words
+
+  x = x.replace(THE_RE, " ") // Step 6: remove THE anywhere
+
+  x = x.replace(/&/g, "AND") // Step 7: ampersand → AND
+
+  // Step 8: fix known portal typo. A global regex is required: replace() with
+  // a string pattern rewrites only the first occurrence, whereas the Python
+  // port's str.replace rewrites every occurrence.
+  x = x.replace(/ASSICIATES/g, "ASSOCIATES")
+
+  // Step 9: remove professional suffix phrases. Phrases containing "&" can no
+  // longer match here because step 7 has already rewritten every ampersand.
+  for (const phrase of MISC_PHRASES) {
+    x = x.split(phrase).join(" ")
+  }
+
+  x = x.replace(/\s+/g, " ").trim() // Step 10: collapse whitespace
+
+  return x
+}
diff --git a/functions/src/lobbying/types.ts b/functions/src/lobbying/types.ts
new file mode 100644
index 000000000..83eaab761
--- /dev/null
+++ b/functions/src/lobbying/types.ts
@@ -0,0 +1,101 @@
+import {
+  Array,
+  InstanceOf,
+  Literal,
+  Number,
+  Null,
+  Record,
+  Static,
+  String,
+  Union
+} from "runtypes"
+import { Timestamp } from "../firebase"
+
+export type LobbyingChamber = Static<typeof LobbyingChamber>
+export const LobbyingChamber = Union(
+  Literal("House Bill"),
+  Literal("Senate Bill"),
+  Literal("House Docket"),
+  Literal("Senate Docket"),
+  Literal("Executive"),
+  Literal("Other")
+)
+
+export type LobbyingClient = Static<typeof LobbyingClient>
+export const LobbyingClient = Record({
+  clientName: String,
+  clientNameNorm: String,
+  compensation: Null.Or(Number)
+})
+
+export type LobbyingRegistrant = Static<typeof LobbyingRegistrant>
+export const LobbyingRegistrant = Record({
+  registrantId: String,
+  entityName: String,
+  entityNameNorm: String,
+  year: Number,
+  generalCourt: Number,
+  regType: Union(Literal("Lobbyist"), Literal("Employer")),
+  clients: Array(LobbyingClient),
+  disclosureUrls: Array(String),
+  fetchedAt: InstanceOf(Timestamp)
+})
+
+export type LobbyingFiling = Static<typeof LobbyingFiling>
+export const LobbyingFiling = Record({
+  filingId: String,
+  entityName: String,
+  entityNameNorm: String,
+  clientName: String,
+  clientNameNorm: String,
+  year: Number,
+  generalCourt: Number,
+  chamber: LobbyingChamber,
+  // Non-null only for legislative chambers (House Bill, Senate Bill, House Docket,
+  // Senate Docket). For Executive and Other, no bill join should be attempted.
+  billId: Null.Or(String),
+  activityTitle: String,
+  position: String,
+  amount: Null.Or(Number),
+  fetchedAt: InstanceOf(Timestamp)
+})
+
+/** Firestore path for lobbying registrant documents */
+export const REGISTRANTS_COLLECTION = "lobbyingRegistrants"
+
+/** Firestore path for lobbying filing documents */
+export const FILINGS_COLLECTION = "lobbyingFilings"
+
+/** Firestore path for the live scraper cursor document */
+export const SCRAPER_DOC = "/scrapers/lobbying"
+
+/** Backfill cursor document path, and the name of its processed-URL subcollection */
+export const BACKFILL_DOC = "/scrapers/lobbyingBackfill"
+export const BACKFILL_URLS_COLLECTION = "processedUrls"
+
+/** Earliest year with portal data */
+export const FIRST_LOBBYING_YEAR = 2005
+
+/**
+ * Sentinel clientName used for pre-2013 legacy filings where compensation is
+ * reported as a single total rather than broken down per client.
+ */
+export const LEGACY_TOTAL_CLIENT = "_total_salary_"
+
+/**
+ * Chamber prefix map for constructing billId values that match MAPLE's Bill.id.
+ * Typed as a plain index signature so portal.ts can look up any LobbyingChamber
+ * without a "Property X does not exist" error; unmapped chambers are undefined.
+ */
+export const CHAMBER_PREFIXES: { [chamber: string]: string | undefined } = {
+  "House Bill": "H",
+  "Senate Bill": "S",
+  "House Docket": "HD",
+  "Senate Docket": "SD"
+}
+
+/** Canonical chamber values for legacy short-form codes found in older filings */
+export const LEGACY_CHAMBER_MAP: { [raw: string]: LobbyingChamber } = {
+  HB: "House Bill",
+  SB: "Senate Bill"
+}
diff --git a/lobbying-scraper/.dockerignore b/lobbying-scraper/.dockerignore
new file mode 100644
index 000000000..9460c99c4
--- /dev/null
+++ b/lobbying-scraper/.dockerignore
@@ -0,0 +1,4 @@
+__pycache__/
+*.pyc
+*.pyo
+.env
diff --git a/lobbying-scraper/Dockerfile b/lobbying-scraper/Dockerfile
new file mode 100644
index 000000000..4b2da65b5
--- /dev/null
+++ b/lobbying-scraper/Dockerfile
@@ -0,0 +1,16 @@
+FROM python:3.12-slim
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY normalize.py portal.py writer.py scrape.py ./
+
+# Cloud Run sets PORT; we don't use it (this is a job, not a server).
+# Cloud Scheduler invokes the container via HTTP POST to /; handle it minimally.
+ENV PYTHONUNBUFFERED=1
+
+# ENTRYPOINT is the fixed executable; CMD provides default args that --args overrides.
+ENTRYPOINT ["python3", "scrape.py"]
+CMD ["--mode", "weekly"]
diff --git a/lobbying-scraper/__pycache__/normalize.cpython-37.pyc b/lobbying-scraper/__pycache__/normalize.cpython-37.pyc
new file mode 100644
index 000000000..47c3ba707
Binary files /dev/null and b/lobbying-scraper/__pycache__/normalize.cpython-37.pyc differ
diff --git a/lobbying-scraper/__pycache__/portal.cpython-37.pyc b/lobbying-scraper/__pycache__/portal.cpython-37.pyc
new file mode 100644
index 000000000..413885e3d
Binary files /dev/null and b/lobbying-scraper/__pycache__/portal.cpython-37.pyc differ
diff --git a/lobbying-scraper/normalize.py b/lobbying-scraper/normalize.py
new file mode 100644
index 000000000..6e6f7418e
--- /dev/null
+++ b/lobbying-scraper/normalize.py
@@ -0,0 +1,50 @@
+"""Entity name normalization pipeline.
+
+Direct port of functions/src/lobbying/normalize.ts. Steps must be applied in
+this exact order — changing the order produces different (incorrect) output.
+"""
+
+from __future__ import annotations
+
+import re
+
+_DBA_RE = re.compile(r"\s+D\s*/+B\s*/+A?\s+.*|\s+DBA\s+.*", re.IGNORECASE)
+_LEGAL_RE = re.compile(
+    r"\b(LLC|LLP|INC|INCORPORATED|CORPORATION|CORP|LTD|LIMITED|PC|PLLC)\b"
+)
+_THE_RE = re.compile(r"\bTHE\b")
+_WS_RE = re.compile(r"\s+")
+
+_MISC_PHRASES = [
+    "LAW OFFICE OF",
+    "AND ASSOCIATES",
+    "& ASSOCIATES",
+    "AND ASSOC",
+    "ATTORNEY AT LAW",
+    "ATTORNEY@LAW",
+    "ATTORNET AT LAW",  # known portal typo
+    "AND PARTNERS",
+    "PUBLIC POLICY GROUP",
+    "LEGISLATIVE SERVICES",
+    "POLICY GROUP",
+    "ASSOCIATES",
+    "COUNSELLORS AT LAW",
+]
+
+
+def normalize_entity_name(raw: str | None) -> str:
+    if not raw:
+        return ""
+    x = raw.upper()                          # 1. uppercase
+    x = _DBA_RE.sub("", x)                  # 2. strip d/b/a suffix
+    x = x.replace("-", " ")                 # 3. hyphen → space
+    for ch in (",", ".", "'", "‘", "’", "(", ")"):
+        x = x.replace(ch, " ")             # 4. punctuation → space
+    x = _LEGAL_RE.sub(" ", x)              # 5. remove legal entity words
+    x = _THE_RE.sub(" ", x)               # 6. remove THE anywhere
+    x = x.replace("&", "AND")             # 7. ampersand → AND
+    x = x.replace("ASSICIATES", "ASSOCIATES")  # 8. fix known typo
+    for phrase in _MISC_PHRASES:           # 9. remove professional suffix phrases
+        x = x.replace(phrase, " ")
+    x = _WS_RE.sub(" ", x).strip()        # 10. collapse whitespace
+    return x
diff --git a/lobbying-scraper/portal.py b/lobbying-scraper/portal.py
new file mode 100644
index 000000000..257721991
--- /dev/null
+++ b/lobbying-scraper/portal.py
@@ -0,0 +1,376 @@
+"""HTTP client and HTML parser for the MA SoS lobbying portal.
+
+Portal: https://www.sec.state.ma.us/LobbyistPublicSearch/
+
+Page flow:
+  1. Search POST  → summary links table
+  2. Summary.aspx → registrant name/year/type + CompleteDisclosure links
+  3. CompleteDisclosure.aspx → per-client compensation + per-client bill activity
+
+Two disclosure HTML formats:
+  Modern (>=~2013): grdvClientPaidToEntity + grdvActivitiesNew{year}_{n} tables.
+  Legacy (<~2013):  grdvSalaryPaid (total only) + grdvActivities (all bills).
+"""
+
+from __future__ import annotations
+
+import hashlib
+import re
+import time
+from dataclasses import dataclass, field
+from typing import Optional
+
+import requests
+from bs4 import BeautifulSoup, Tag
+
+# ── Constants ─────────────────────────────────────────────────────────────────
+
+BASE_URL = "https://www.sec.state.ma.us/LobbyistPublicSearch/"
+SEARCH_URL = BASE_URL + "Default.aspx"
+
+_UA = (
+    "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) "
+    "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"
+)
+_REQUEST_DELAY = 1.0
+_MAX_RETRIES = 5
+
+# Lobby disclosure data begins in 2005; GC 183 started Jan 2003.
+FIRST_YEAR = 2005
+FIRST_GC = 183
+FIRST_GC_START_YEAR = 2003
+
+# clientName sentinel for pre-2013 filings where compensation is a single total
+LEGACY_TOTAL_CLIENT = "_total_salary_"
+
+# Maps canonical chamber names to the bill-ID prefix used in MAPLE's Bill.id
+CHAMBER_PREFIXES: dict[str, str] = {
+    "House Bill": "H",
+    "Senate Bill": "S",
+    "House Docket": "HD",
+    "Senate Docket": "SD",
+}
+
+# Legacy short-form chamber codes found in older filings
+LEGACY_CHAMBER_MAP: dict[str, str] = {
+    "HB": "House Bill",
+    "SB": "Senate Bill",
+}
+
+# ── Data types ────────────────────────────────────────────────────────────────
+
+
+@dataclass
+class Compensation:
+    client_name: str
+    amount: Optional[float]
+
+
+@dataclass
+class BillActivity:
+    client_name: str
+    chamber: str          # canonical LobbyingChamber value
+    raw_bill_number: str
+    bill_id: Optional[str]  # e.g. "H1234"; null for Executive/Other
+    activity_title: str
+    position: str
+    amount: Optional[float]
+
+
+@dataclass
+class DisclosureMeta:
+    entity_name: str
+    year: Optional[int]
+    reg_type: str         # "Lobbyist" | "Employer"
+    disclosure_urls: list[str] = field(default_factory=list)
+
+
+@dataclass
+class DisclosureDetail:
+    compensation: list[Compensation] = field(default_factory=list)
+    bills: list[BillActivity] = field(default_factory=list)
+
+
+# ── Derived-value helpers ─────────────────────────────────────────────────────
+
+
+def year_to_general_court(year: int) -> int:
+    return FIRST_GC + (year - FIRST_GC_START_YEAR) // 2
+
+
+def normalize_chamber(raw: str) -> str:
+    t = raw.strip()
+    if t in LEGACY_CHAMBER_MAP:
+        return LEGACY_CHAMBER_MAP[t]
+    known = {"House Bill", "Senate Bill", "House Docket", "Senate Docket", "Executive"}
+    return t if t in known else "Other"
+
+
+def construct_bill_id(chamber: str, raw_bill_number: str) -> Optional[str]:
+    """Construct the MAPLE-compatible billId from chamber + raw integer.
+
+    Returns None for Executive and Other chambers where no bill join is possible.
+    H1234 and S1234 are distinct bills even though they share the same integer —
+    the prefix is required to disambiguate.
+    """
+    prefix = CHAMBER_PREFIXES.get(chamber)
+    if not prefix:
+        return None
+    try:
+        return f"{prefix}{int(raw_bill_number)}"
+    except (ValueError, TypeError):
+        return None
+
+
+def registrant_id(entity_name: str, year: int) -> str:
+    key = f"{year}|{entity_name}"
+    return hashlib.sha256(key.encode()).hexdigest()[:40]
+
+
+def filing_id(
+    entity_name: str,
+    client_name: str,
+    chamber: str,
+    bill_id: Optional[str],
+    general_court: int,
+    position: str,
+) -> str:
+    key = "|".join([entity_name, client_name, chamber, bill_id or "__null__",
+                    str(general_court), position])
+    return hashlib.sha256(key.encode()).hexdigest()[:40]
+
+
+# ── HTTP session ──────────────────────────────────────────────────────────────
+
+
+def make_session() -> requests.Session:
+    s = requests.Session()
+    s.headers.update({
+        "User-Agent": _UA,
+        "Accept": "*/*",
+        "Accept-Encoding": "gzip, deflate, br",
+        "Connection": "keep-alive",
+    })
+    return s
+
+
+def _get(session: requests.Session, url: str) -> BeautifulSoup:
+    for attempt in range(_MAX_RETRIES):
+        time.sleep(_REQUEST_DELAY * (2 ** attempt) if attempt else _REQUEST_DELAY)
+        try:
+            r = session.get(url, timeout=60)
+            r.raise_for_status()
+            return BeautifulSoup(r.text, "html.parser")
+        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e:
+            if attempt == _MAX_RETRIES - 1:
+                raise
+            print(f"  GET retry {attempt + 1}: {e}")
+
+
+def _post(session: requests.Session, url: str, data: dict) -> BeautifulSoup:
+    for attempt in range(_MAX_RETRIES):
+        time.sleep(_REQUEST_DELAY * (2 ** attempt) if attempt else _REQUEST_DELAY)
+        try:
+            r = session.post(url, data=data, timeout=180)
+            r.raise_for_status()
+            return BeautifulSoup(r.text, "html.parser")
+        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e:
+            if attempt == _MAX_RETRIES - 1:
+                raise
+            print(f"  POST retry {attempt + 1}: {e}")
+
+
+# ── Portal scraping ───────────────────────────────────────────────────────────
+
+
+def _viewstate(soup: BeautifulSoup) -> dict:
+    return {
+        inp["name"]: inp.get("value", "")
+        for inp in soup.find_all("input", type="hidden")
+        if inp.get("name")
+    }
+
+
+def fetch_summary_links(session: requests.Session, year: int) -> list[str]:
+    """Return all Summary.aspx URLs for a given year via a single search POST."""
+    soup = _get(session, SEARCH_URL)
+    data = {
+        **_viewstate(soup),
+        "__EVENTTARGET": "",
+        "__EVENTARGUMENT": "",
+        "ctl00$ContentPlaceHolder1$Search": "rdbSearchByType",
+        "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$ddlYear": str(year),
+        "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$txtN_ame": "",
+        "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$lddSearchType$DropDown": "3",
+        "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$drpType": "L",
+        "ctl00$ContentPlaceHolder1$drpPageSize": "20000",
+        "ctl00$ContentPlaceHolder1$btnSearch": "Search",
+    }
+    results = _post(session, SEARCH_URL, data)
+    table = results.find("table", id=lambda x: x and "grdvSearchResultByTypeAndCategory" in x)
+    if not table:
+        return []
+    return [
+        BASE_URL + a["href"] if not a["href"].startswith("http") else a["href"]
+        for a in table.find_all("a", href=True)
+        if "Summary.aspx" in a["href"]
+    ]
+
+
+def fetch_disclosure_meta(session: requests.Session, summary_url: str) -> DisclosureMeta:
+    soup = _get(session, summary_url)
+
+    def text(el_id: str) -> str:
+        el = soup.find(id=el_id)
+        return el.get_text(strip=True) if el else ""
+
+    entity_name = text("ContentPlaceHolder1_lblRegistrantName")
+    year_text = text("ContentPlaceHolder1_lblYear")
+    reg_type_raw = text("ContentPlaceHolder1_lblRegType")
+
+    try:
+        year = int(year_text)
+    except ValueError:
+        year = None
+
+    reg_type = "Employer" if "Entity" in reg_type_raw else "Lobbyist"
+
+    disc_urls = [
+        BASE_URL + a["href"] if not a["href"].startswith("http") else a["href"]
+        for a in soup.find_all("a", href=True)
+        if "CompleteDisclosure" in a["href"]
+    ]
+
+    return DisclosureMeta(
+        entity_name=entity_name,
+        year=year,
+        reg_type=reg_type,
+        disclosure_urls=disc_urls,
+    )
+
+
+def _parse_amount(text: str) -> Optional[float]:
+    cleaned = text.replace("$", "").replace(",", "").strip()
+    try:
+        return float(cleaned)
+    except ValueError:
+        return None
+
+
+def _grid_rows(table: Tag) -> list[Tag]:
+    return table.find_all("tr", class_=lambda c: c and "Grid" in c and "Header" not in c)
+
+
+def fetch_disclosure_detail(
+    session: requests.Session, disc_url: str, year: int
+) -> DisclosureDetail:
+    soup = _get(session, disc_url)
+    compensation: list[Compensation] = []
+    bills: list[BillActivity] = []
+    gc = year_to_general_court(year)
+
+    # ── Modern format (>=~2013) ───────────────────────────────────────────────
+    comp_table = soup.find("table", id=lambda x: x and "grdvClientPaidToEntity" in (x or ""))
+    if comp_table:
+        for row in _grid_rows(comp_table):
+            cells = [td.get_text(strip=True) for td in row.find_all("td")]
+            if len(cells) >= 2:
+                compensation.append(Compensation(
+                    client_name=cells[0],
+                    amount=_parse_amount(cells[1]),
+                ))
+
+    act_tables = soup.find_all(
+        "table",
+        id=lambda x: x and re.search(r"grdvActivitiesNew(\d{4})?_\d+", x or ""),
+    )
+    for act_table in act_tables:
+        # Walk backwards to find the nearest lblClientName span
+        client_name = ""
+        node = act_table
+        while node:
+            node = node.find_previous(["span", "div", "td"])
+            if not node:
+                break
+            if node.get("id") and "lblClientName" in node["id"]:
+                client_name = node.get_text(strip=True)
+                break
+
+        for row in _grid_rows(act_table):
+            cells = [td.get_text(strip=True) for td in row.find_all("td")]
+            if len(cells) < 4:
+                continue
+            chamber = normalize_chamber(cells[0])
+            raw_num = cells[1]
+            bill_id = construct_bill_id(chamber, raw_num)
+            bills.append(BillActivity(
+                client_name=client_name,
+                chamber=chamber,
+                raw_bill_number=raw_num,
+                bill_id=bill_id,
+                activity_title=cells[2] if len(cells) > 2 else "",
+                position=cells[3] if len(cells) > 3 else "",
+                amount=_parse_amount(cells[4]) if len(cells) > 4 else None,
+            ))
+
+    if comp_table or bills:
+        return DisclosureDetail(compensation=compensation, bills=bills)
+
+    # ── Legacy format (<~2013) ────────────────────────────────────────────────
+    salary_table = soup.find("table", id=lambda x: x and "grdvSalaryPaid" in (x or ""))
+    if salary_table:
+        total = 0.0
+        for row in salary_table.find_all("tr"):
+            cells = [td.get_text(strip=True) for td in row.find_all("td")]
+            if len(cells) >= 2 and "Total" not in cells[0]:
+                amt = _parse_amount(cells[1])
+                if amt:
+                    total += amt
+        if total:
+            compensation.append(Compensation(client_name=LEGACY_TOTAL_CLIENT, amount=total))
+
+    act_table = soup.find("table", id=lambda x: x and x.endswith("grdvActivities"))
+    if act_table:
+        all_rows = act_table.find_all("tr")
+        headers = [th.get_text(strip=True)
+                   for th in (all_rows[0].find_all(["th", "td"]) if all_rows else [])]
+
+        if headers and "Activity" in headers[0]:
+            # 6-col entity layout has Lobbyist as second header
+            if len(headers) >= 2 and "Lobbyist" in headers[1]:
+                bill_col, pos_col, client_col = 0, 2, 4
+            else:
+                bill_col, pos_col, client_col = 0, 1, 3
+        else:
+            bill_col, pos_col, client_col = 1, None, 3
+
+        chamber_map = {"H": "House Bill", "S": "Senate Bill",
+                       "HD": "House Docket", "SD": "Senate Docket"}
+        skip = {"Activity or Bill No and Title", "N/A", "None", "", "Total amount"}
+
+        for row in all_rows[1:]:
+            cells = [td.get_text(strip=True) for td in row.find_all("td")]
+            if len(cells) <= max(bill_col, client_col):
+                continue
+            bill_cell = cells[bill_col]
+            if not bill_cell or bill_cell in skip:
+                continue
+            parts = bill_cell.split(None, 1)
+            bill_no = parts[0]
+            m = re.match(r"^([A-Z]+)(\d+)$", bill_no)
+            if not m:
+                continue
+            prefix, number = m.group(1), m.group(2)
+            chamber = chamber_map.get(prefix, "Other")
+            bill_id = construct_bill_id(chamber, number)
+            bills.append(BillActivity(
+                client_name=cells[client_col] if len(cells) > client_col else "",
+                chamber=chamber,
+                raw_bill_number=number,
+                bill_id=bill_id,
+                activity_title=parts[1] if len(parts) > 1 else "",
+                position=cells[pos_col] if pos_col is not None and len(cells) > pos_col else "",
+                amount=None,
+            ))
+
+    return DisclosureDetail(compensation=compensation, bills=bills)
diff --git a/lobbying-scraper/requirements.txt b/lobbying-scraper/requirements.txt
new file mode 100644
index 000000000..5e7b4bcc7
--- /dev/null
+++ b/lobbying-scraper/requirements.txt
@@ -0,0 +1,3 @@
+requests>=2.28
+beautifulsoup4>=4.12
+google-cloud-firestore>=2.14
diff --git a/lobbying-scraper/scrape.py b/lobbying-scraper/scrape.py
new file mode 100644
index 000000000..fb985e05f
--- /dev/null
+++ b/lobbying-scraper/scrape.py
@@ -0,0 +1,269 @@
+"""Lobbying disclosure scraper — Cloud Run entry point.
+
+Runs on a weekly Cloud Scheduler trigger. Checks for new or amended disclosures
+and exits immediately if none are found (fast path). When new disclosures exist,
+fetches and writes them to Firestore.
+
+Also serves as the library used by the TypeScript backfill admin script via
+subprocess.
+
+Environment variables:
+  GOOGLE_CLOUD_PROJECT  — GCP project ID (set automatically in Cloud Run)
+  FIRESTORE_EMULATOR_HOST — set to use the local emulator (e.g. localhost:8080)
+
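+CLI flags (for local / backfill use):
+  --mode MODE     "weekly" (default) for the incremental check, or "backfill"
+                  for the full history with the resumable subcollection cursor
+  --year YEAR     Only process this year (default: current + prior in weekly
+                  mode, FIRST_YEAR through the current year in backfill mode)
+  --limit N       Max registrants per year (for testing)
+  --dry-run       Fetch and parse but do not write to Firestore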
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import sys
+from datetime import datetime, timezone
+
+from google.cloud import firestore
+
+from portal import (
+    FIRST_YEAR,
+    fetch_disclosure_detail,
+    fetch_disclosure_meta,
+    fetch_summary_links,
+    make_session,
+)
+from writer import (
+    BACKFILL_DOC,
+    BACKFILL_URLS_COLLECTION,
+    SCRAPER_DOC,
+    write_filings,
+    write_registrant,
+)
+
+
+# ── Cursor helpers ────────────────────────────────────────────────────────────
+
+
+def _load_live_cursor(db: firestore.Client) -> tuple[set[str], dict[str, list[str]]]:
+    """Return (processedDiscUrls, summaryDiscCache) from the live scraper doc."""
+    doc = db.document(SCRAPER_DOC).get()
+    data = doc.to_dict() or {}
+    return (
+        set(data.get("processedDiscUrls", [])),
+        data.get("summaryDiscCache", {}),
+    )
+
+
+def _save_live_cursor(
+    db: firestore.Client,
+    processed: set[str],
+    cache: dict[str, list[str]],
+) -> None:
+    db.document(SCRAPER_DOC).set(
+        {"processedDiscUrls": list(processed), "summaryDiscCache": cache},
+        merge=True,
+    )
+
+
+def _is_backfill_processed(db: firestore.Client, disc_url: str) -> bool:
+    h = hashlib.sha256(disc_url.encode()).hexdigest()[:40]
+    return db.document(BACKFILL_DOC).collection(BACKFILL_URLS_COLLECTION).document(h).get().exists
+
+
+def _mark_backfill_processed(db: firestore.Client, disc_url: str) -> None:
+    h = hashlib.sha256(disc_url.encode()).hexdigest()[:40]
+    db.document(BACKFILL_DOC).collection(BACKFILL_URLS_COLLECTION).document(h).set(
+        {"url": disc_url, "processedAt": datetime.now(tz=timezone.utc).isoformat()}
+    )
+
+
+# ── Core processing ───────────────────────────────────────────────────────────
+
+
+def process_disclosure(
+    db: firestore.Client | None,
+    session,
+    summary_url: str,
+    disc_url: str,
+    year: int,
+    dry_run: bool = False,
+) -> tuple[int, int]:
+    """Fetch one disclosure page and write registrant + filing documents.
+
+    Returns (compensation_rows, filing_rows).
+    """
+    meta = fetch_disclosure_meta(session, summary_url)
+    detail = fetch_disclosure_detail(session, disc_url, year)
+
+    if dry_run or db is None:
+        return len(detail.compensation), len(detail.bills)
+
+    write_registrant(db, meta, detail, disc_url)
+    n_filings = write_filings(db, meta, detail)
+    return len(detail.compensation), n_filings
+
+
+# ── Weekly incremental run ────────────────────────────────────────────────────
+
+
+def run_weekly(
+    db: "firestore.Client | None",
+    years: list[int],
+    limit: int | None = None,
+    dry_run: bool = False,
+) -> int:
+    """Incremental weekly check. Returns number of new disclosures processed."""
+    current_year = datetime.now(tz=timezone.utc).year
+    processed, cache = _load_live_cursor(db) if db is not None else (set(), {})
+
+    session = make_session()
+    new_count = 0
+
+    for year in years:
+        print(f"\n── {year} ──")
+        try:
+            summary_urls = fetch_summary_links(session, year)
+        except Exception as e:
+            print(f"  failed to fetch summary links: {e}", file=sys.stderr)
+            continue
+
+        if limit:
+            summary_urls = summary_urls[:limit]
+
+        print(f"  {len(summary_urls)} registrants on portal")
+
+        for summary_url in summary_urls:
+            # Use cached disc URLs for prior years; always re-check current year
+            disc_urls = cache.get(summary_url)
+            if disc_urls is None or year == current_year:
+                try:
+                    meta = fetch_disclosure_meta(session, summary_url)
+                    disc_urls = meta.disclosure_urls
+                    cache[summary_url] = disc_urls
+                    if not dry_run:
+                        _save_live_cursor(db, processed, cache)
+                except Exception as e:
+                    print(f"  failed to fetch summary {summary_url}: {e}", file=sys.stderr)
+                    continue
+
+            new_disc_urls = [u for u in disc_urls if u not in processed]
+            if not new_disc_urls:
+                continue
+
+            for disc_url in new_disc_urls:
+                try:
+                    comp_n, filing_n = process_disclosure(
+                        db, session, summary_url, disc_url, year, dry_run=dry_run
+                    )
+                    processed.add(disc_url)
+                    new_count += 1
+                    print(f"  processed: {comp_n} clients, {filing_n} filings")
+                    if not dry_run:
+                        _save_live_cursor(db, processed, cache)
+                except Exception as e:
+                    print(f"  failed to process {disc_url}: {e}", file=sys.stderr)
+
+    return new_count
+
+
+# ── Historical backfill ───────────────────────────────────────────────────────
+
+
+def run_backfill(
+    db: "firestore.Client | None",
+    years: list[int],
+    limit: int | None = None,
+    dry_run: bool = False,
+) -> int:
+    """Full historical backfill using the subcollection cursor. Resumable."""
+    session = make_session()
+    total_new = 0
+
+    for year in years:
+        print(f"\n── {year} ──")
+        try:
+            summary_urls = fetch_summary_links(session, year)
+        except Exception as e:
+            print(f"  failed to fetch summary links: {e}", file=sys.stderr)
+            continue
+
+        if limit:
+            summary_urls = summary_urls[:limit]
+
+        print(f"  {len(summary_urls)} registrants on portal")
+        year_new = 0
+
+        for i, summary_url in enumerate(summary_urls):
+            try:
+                meta = fetch_disclosure_meta(session, summary_url)
+            except Exception as e:
+                print(f"  [{i+1}/{len(summary_urls)}] failed to fetch summary: {e}", file=sys.stderr)
+                continue
+
+            for disc_url in meta.disclosure_urls:
+                if db is not None and not dry_run and _is_backfill_processed(db, disc_url):
+                    continue
+                try:
+                    comp_n, filing_n = process_disclosure(
+                        db, session, summary_url, disc_url, year, dry_run=dry_run
+                    )
+                    if not dry_run:
+                        _mark_backfill_processed(db, disc_url)
+                    total_new += 1
+                    year_new += 1
+                except Exception as e:
+                    print(f"  failed to process {disc_url}: {e}", file=sys.stderr)
+
+            if (i + 1) % 50 == 0 or i + 1 == len(summary_urls):
+                print(f"  [{i+1}/{len(summary_urls)}] {year_new} new disclosures so far")
+
+        print(f"  {year} complete: {year_new} new disclosures")
+
+    return total_new
+
+
+# ── Entry point ───────────────────────────────────────────────────────────────
+
+
+def main() -> None:
+    p = argparse.ArgumentParser()
+    p.add_argument("--year", type=int, default=None)
+    p.add_argument("--limit", type=int, default=None)
+    p.add_argument("--dry-run", action="store_true")
+    p.add_argument(
+        "--mode",
+        choices=["weekly", "backfill"],
+        default="weekly",
+        help="weekly: incremental check; backfill: full history with subcollection cursor",
+    )
+    args = p.parse_args()
+
+    current_year = datetime.now(tz=timezone.utc).year
+
+    if args.year:
+        years = [args.year]
+    elif args.mode == "weekly":
+        years = [current_year, current_year - 1]
+    else:
+        years = list(range(FIRST_YEAR, current_year + 1))
+
+    db = firestore.Client() if not args.dry_run else None
+
+    if args.mode == "weekly":
+        n = run_weekly(db, years, limit=args.limit, dry_run=args.dry_run)
+        if n == 0:
+            print("\nNo new disclosures found.")
+        else:
+            print(f"\nDone: {n} new disclosures written.")
+    else:
+        n = run_backfill(db, years, limit=args.limit, dry_run=args.dry_run)
+        print(f"\nBackfill complete: {n} new disclosures written.")
+
+    # Emit structured result for callers (e.g. TypeScript backfill script)
+    print(json.dumps({"newDisclosures": n}), file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lobbying-scraper/writer.py b/lobbying-scraper/writer.py
new file mode 100644
index 000000000..a6804f401
--- /dev/null
+++ b/lobbying-scraper/writer.py
@@ -0,0 +1,126 @@
+"""Firestore document construction and write helpers.
+
+Mirrors the data model in functions/src/lobbying/types.ts. All collection
+names and field names must stay in sync with that file.
+"""
+
+from __future__ import annotations
+
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING
+
+from normalize import normalize_entity_name
+from portal import (
+    BillActivity,
+    Compensation,
+    DisclosureDetail,
+    DisclosureMeta,
+    filing_id,
+    registrant_id,
+    year_to_general_court,
+)
+
+if TYPE_CHECKING:
+    from google.cloud import firestore
+
+REGISTRANTS_COLLECTION = "lobbyingRegistrants"  # collection written by write_registrant
+FILINGS_COLLECTION = "lobbyingFilings"  # collection written by write_filings
+SCRAPER_DOC = "/scrapers/lobbying"  # NOTE(review): not referenced in this module; presumably weekly-run state, confirm
+BACKFILL_DOC = "/scrapers/lobbyingBackfill"  # NOTE(review): not referenced in this module; presumably backfill state, confirm
+BACKFILL_URLS_COLLECTION = "processedUrls"  # NOTE(review): not referenced here; presumably the backfill cursor subcollection
+
+
+def _now() -> datetime:
+    return datetime.now(timezone.utc)  # timezone-aware current time in UTC
+
+
+def write_registrant(
+    db: firestore.Client,
+    meta: DisclosureMeta,
+    detail: DisclosureDetail,
+    disc_url: str,
+) -> None:
+    """Upsert the LobbyingRegistrant document for one disclosure.
+
+    Does nothing when meta lacks an entity name or year. The write merges
+    into any existing document and unions disc_url into disclosureUrls.
+    """
+    if not meta.entity_name or meta.year is None:
+        return
+    # Runtime import: the module-level one is TYPE_CHECKING-only (NameError otherwise).
+    from google.cloud import firestore
+
+    doc_id = registrant_id(meta.entity_name, meta.year)
+    data = {
+        "registrantId": doc_id,
+        "entityName": meta.entity_name,
+        "entityNameNorm": normalize_entity_name(meta.entity_name),
+        "year": meta.year,
+        "generalCourt": year_to_general_court(meta.year),
+        "regType": meta.reg_type,
+        "clients": [
+            {"clientName": c.client_name,
+             "clientNameNorm": normalize_entity_name(c.client_name),
+             "compensation": c.amount}
+            for c in detail.compensation
+        ],
+        "disclosureUrls": firestore.ArrayUnion([disc_url]),
+        "fetchedAt": _now(),
+    }
+    db.collection(REGISTRANTS_COLLECTION).document(doc_id).set(data, merge=True)
+
+
+def write_filings(
+    db: firestore.Client,
+    meta: DisclosureMeta,
+    detail: DisclosureDetail,
+) -> int:
+    """Write one LobbyingFiling document per bill activity in a disclosure.
+
+    Documents are queued on Firestore write batches that are committed in
+    chunks of 400. Returns how many documents were queued for writing, or 0
+    when meta lacks an entity name or year or detail has no bills.
+    """
+    if not (meta.entity_name and meta.year is not None and detail.bills):
+        return 0
+
+    chunk_size = 400  # Firestore batch limit is 500 writes
+    general_court = year_to_general_court(meta.year)
+    registrant = meta.entity_name
+    registrant_norm = normalize_entity_name(registrant)
+    fetched_at = _now()
+
+    pending = db.batch()
+    written = 0
+    for written, bill in enumerate(detail.bills, start=1):
+        doc_id = filing_id(
+            registrant, bill.client_name, bill.chamber,
+            bill.bill_id, general_court, bill.position,
+        )
+        pending.set(
+            db.collection(FILINGS_COLLECTION).document(doc_id),
+            {
+                "filingId": doc_id,
+                "entityName": registrant,
+                "entityNameNorm": registrant_norm,
+                "clientName": bill.client_name,
+                "clientNameNorm": normalize_entity_name(bill.client_name),
+                "year": meta.year,
+                "generalCourt": general_court,
+                "chamber": bill.chamber,
+                "billId": bill.bill_id,
+                "activityTitle": bill.activity_title,
+                "position": bill.position,
+                "amount": bill.amount,
+                "fetchedAt": fetched_at,
+            },
+        )
+        if written % chunk_size == 0:
+            pending.commit()
+            pending = db.batch()
+
+    # Flush whatever remains in the final, partially filled batch.
+    if written % chunk_size != 0:
+        pending.commit()
+
+    return written