From 0348b2933073b8e8940640dede8b4119a9dee130 Mon Sep 17 00:00:00 2001 From: Nathan Date: Thu, 4 Jun 2026 07:01:07 -0400 Subject: [PATCH 1/4] initial plan --- .gitignore | 7 + docs/lobbying-disclosure-ingestion.md | 363 ++++++++++++++++++++++++++ 2 files changed, 370 insertions(+) create mode 100644 docs/lobbying-disclosure-ingestion.md diff --git a/.gitignore b/.gitignore index 571150641..7301e0ec2 100644 --- a/.gitignore +++ b/.gitignore @@ -92,3 +92,10 @@ cert.txt # local MCP server config (contains auth tokens) .mcp.json mcp-server/create-agent-key.ts + +# Claude +CLAUDE.md + +#gcloud +.gcloudignore + diff --git a/docs/lobbying-disclosure-ingestion.md b/docs/lobbying-disclosure-ingestion.md new file mode 100644 index 000000000..646cc4415 --- /dev/null +++ b/docs/lobbying-disclosure-ingestion.md @@ -0,0 +1,363 @@ +# Lobbying Disclosure Ingestion Pipeline + +## Overview + +The MA Secretary of State lobbying portal +([sec.state.ma.us/LobbyistPublicSearch](https://www.sec.state.ma.us/LobbyistPublicSearch/)) +publishes semi-annual disclosure filings for all registered lobbyists and +lobbying entities. This document describes the plan for scraping that data and +storing it in Firestore in a way that allows joining to MAPLE bill data. + +The portal has three levels of pages: + +1. **Search page** → one row per registrant per year +2. **Summary page** → registrant metadata + links to semi-annual disclosure + filings +3. **CompleteDisclosure page** → per-client compensation table + per-client bill + activity tables + +Historical data goes back to 2005. MAPLE has bill data only from ~2020 onward, +so bill joins will only resolve for filings from the 192nd General Court (2021) +and later. All historical filings are ingested regardless. + +--- + +## Terminology + +The portal has two registrant types: + +- **Lobbyist** — an individual person who lobbies directly on behalf of clients. 
+- **Employer** — a lobbying firm that employs individual lobbyists and is + retained by clients. Called "Lobbyist Entity" on the portal. + +In both cases, the registrant reports compensation received from each **client** +(the organization that hired them) and which bills they lobbied for that client. + +--- + +## Firestore Data Model + +Two top-level collections, normalized by registrant and by lobbying activity +record. + +### `/lobbyingRegistrants/{registrantId}` + +`registrantId` is a slugified `{entityName}_{year}` (stable, dedup-safe). + +One model covers both individual lobbyists and lobbying firms. A separate model +is not needed because the portal search returns both under the same schema, and +per-filing detail pages do not expose which individual lobbyists within a firm +worked on which bill. + +```typescript +interface LobbyingRegistrant { + registrantId: string // "{entityName}_{year}" slugified + entityName: string // firm name or individual lobbyist name (raw portal value) + entityNameNorm: string // normalized form; see Normalization section + year: number + generalCourt: number // computed from year + regType: "Lobbyist" | "Employer" + clients: LobbyingClient[] + disclosureUrls: string[] // source portal URLs, for audit trail + fetchedAt: Timestamp +} + +interface LobbyingClient { + clientName: string + clientNameNorm: string // normalized form + compensation: number | null +} +``` + +### `/lobbyingFilings/{filingId}` + +`filingId` is a slugified +`{entityName}_{clientName}_{chamber}_{activityRef}_{generalCourt}`. + +```typescript +type LobbyingChamber = + | "House Bill" + | "Senate Bill" + | "House Docket" + | "Senate Docket" + | "Executive" // lobbying of executive branch agencies + | "Other" // catch-all for rare legacy codes (FY, CMR, etc.) 
+ +interface LobbyingFiling { + filingId: string + entityName: string // raw portal value + entityNameNorm: string // normalized form + clientName: string // raw portal value; "_total_salary_" sentinel for pre-2013 + clientNameNorm: string // normalized form + year: number + generalCourt: number + chamber: LobbyingChamber + // For legislative chambers: the bill number string (e.g. "H1234", "HD56"). + // For Executive: the agency name. Not a bill reference. + billId: string | null + activityTitle: string // bill title (legislative) or meeting description (executive) + position: string // "Support" | "Oppose" | "Neutral" | etc.; empty for executive + amount: number | null // compensation allocated to this activity + fetchedAt: Timestamp +} +``` + +### Constructing `billId` from Raw Portal Data + +The portal stores bill numbers as bare integers and records the chamber +separately. The `billId` field — which maps to `Bill.id` in MAPLE — is +constructed during ingest by combining chamber prefix and integer: + +| `chamber` | Prefix | Example raw | `billId` | +| --------------- | ------ | ----------- | -------- | +| `House Bill` | `H` | `1234` | `H1234` | +| `Senate Bill` | `S` | `1234` | `S1234` | +| `House Docket` | `HD` | `56` | `HD56` | +| `Senate Docket` | `SD` | `56` | `SD56` | +| `Executive` | — | agency name | `null` | +| `Other` | — | varies | `null` | + +Note: `H1234` and `S1234` are distinct bills even though they share the same +integer. The prefix is required to disambiguate. `billId` is `null` for +non-legislative chambers. + +#### Legacy chamber code normalization + +The portal uses short-form codes in older filings, normalized during ingest: + +| Raw value | Stored as | +| --------- | ------------- | +| `HB` | `House Bill` | +| `SB` | `Senate Bill` | + +Rare codes (`FY`, `C`, `CMR`, `HR`, etc.) are stored as `Other`. 
+ +### Joining to Bill Data + +**The join only applies to legislative chambers** (`House Bill`, `Senate Bill`, +`House Docket`, `Senate Docket`) where `billId` is non-null. For `Executive` +and `Other`, no join should be attempted. + +```typescript +// Only valid when filing.billId !== null +db.collection(`/generalCourts/${filing.generalCourt}/bills`).doc(filing.billId) +``` + +--- + +## Entity Name Normalization + +The portal does not enforce consistent name formatting. The same client or +registrant may appear as "Acme Corp.", "ACME CORPORATION", "Acme, Inc. d/b/a +Acme Consulting", etc. across filings and years. Without normalization, +grouping by entity is unreliable. + +Both `entityName` and `clientName` are normalized using the following pipeline, +applied in order. The raw portal value is always preserved alongside the +normalized form. + +### Normalization pipeline + +1. **Uppercase** — convert the entire string to upper case. +2. **Strip d/b/a suffix** — remove everything from the first occurrence of + `D/B/A`, `D/B/`, `DBA` (and spacing variants) onward, so the registered + name is used rather than a trade name. +3. **Hyphen → space** — replace `-` with ` ` so `LAN-TEL` and `LAN TEL` + collapse to the same key. +4. **Punctuation → space** — replace `,`, `.`, `'`, `‘`, `’`, `(`, `)` with + space. Replacement with space (not empty string) prevents adjacent tokens + from concatenating (e.g. `,INC` becomes ` INC`, which is then caught by step + 5). +5. **Remove legal entity type words** — whole-word removal of: `LLC`, `LLP`, + `INC`, `INCORPORATED`, `CORPORATION`, `CORP`, `LTD`, `LIMITED`, `PC`, + `PLLC`. +6. **Remove "THE"** — whole-word removal anywhere in the string (not just as a + leading prefix). +7. **Ampersand → AND** — replace `&` with `AND`. +8. **Fix known typo** — replace `ASSICIATES` with `ASSOCIATES` (legacy portal + data). +9. 
**Remove professional suffix phrases** — whole-phrase removal of: `LAW +OFFICE OF`, `AND ASSOCIATES`, `& ASSOCIATES`, `AND ASSOC`, `ATTORNEY AT +LAW`, `ATTORNEY@LAW`, `ATTORNET AT LAW`, `AND PARTNERS`, `PUBLIC POLICY +GROUP`, `LEGISLATIVE SERVICES`, `POLICY GROUP`, `ASSOCIATES`, `COUNSELLORS +AT LAW`. +10. **Collapse whitespace** — replace runs of whitespace with a single space and + strip leading/trailing whitespace. + +### Usage + +`entityNameNorm` and `clientNameNorm` are stored on every document and filing. +They should be used for grouping, deduplication, and display-level matching. +Raw names are preserved for provenance and audit. + +--- + +## Deduplication and Amount Aggregation + +### Does lobbying the same bill multiple times mean we should sum amounts? + +The portal collects two semi-annual disclosure filings per registrant per year +(one for each 6-month period). In theory, a registrant could report the same +bill in both H1 and H2 filings with separate compensation amounts that should +be summed. Analysis of the actual data shows this does not occur: after +processing, zero rows share the same `(entityName, clientName, year, +generalCourt, billId, position)` — each (registrant, client, bill, year) +combination appears exactly once. The semi-annual periods report different +activity, not the same activity twice. + +The same registrant can lobby the same bill across multiple General Courts +(observed up to 6 times across years). These are stored as separate documents +per `generalCourt` and should not be summed — each court is a distinct +legislative session. + +### Null-bill row deduplication + +The one real duplication artifact in the portal data is **null-bill rows** — +entries filed when a registrant had no specific bills to report for a client in +a period. These appear in both the H1 and H2 disclosures as identical rows and +should be collapsed. 
During ingest, if the same `(entityName, clientName, year, +generalCourt, chamber, position)` with a null `billId` is encountered more than +once, keep the row with the highest `amount` so no spend is lost if the two +copies carry different values (in practice amounts are usually both zero). + +### Ingest strategy + +When processing multiple disclosure URLs for the same registrant+year, write +`lobbyingFilings` documents using the logical key as the document ID. A +subsequent disclosure URL that produces the same document ID will naturally +upsert (overwrite) rather than duplicate. For null-bill rows, since `billId` is +null, include `chamber` in the document ID to avoid false merges between +executive and legislative null rows. + +--- + +## Scraper Architecture + +The lobbying portal serves HTML pages that must be scraped; it is not a REST API. It does not fit the +`createScraper` factory (which assumes list-IDs → fetch-per-ID against the MA +Legislature API). Instead, we use a custom scheduled function following the +`scrapeEvents` pattern. 
+ +### Cloud Function: `scrapeLobbying` + +**File:** `functions/src/lobbying/scrapeLobbying.ts` + +- Schedule: `every 24 hours` +- Scrapes the current year and prior year (new filings arrive semi-annually) +- Persists a cursor in `/scrapers/lobbying`: + - `lastFetchedAt: Timestamp` + - `processedDiscUrls: string[]` — already-fetched disclosure URLs (skipped on + re-runs) +- For each new disclosure URL: + - Parse registrant + client compensation rows → upsert `lobbyingRegistrants` + doc + - Parse bill activity rows → batch-write `lobbyingFilings` docs +- Uses `axios` (existing dependency) with an iPad `User-Agent` header to match + portal expectations +- Uses `jsdom` for HTML table parsing (already a dependency; used by events scraper) +- 1s delay between requests; exponential backoff on failure (matching existing + scraper retry pattern) +- Function timeout: 540s + +### Incremental Strategy + +Processed disclosure URLs are stored in `/scrapers/lobbying.processedDiscUrls`. +At ~2 disclosure URLs per registrant × ~500 registrants per year, the +current+prior year window stays well within Firestore document limits. +Historical years earlier than the prior year are stable (filings are frozen after the year +closes) and are handled by the backfill script only. + +The backfill script keeps its own cursor in a separate Firestore subcollection +(`/scrapers/lobbyingBackfill/processedUrls`) so it does not interfere with +the live scraper. + +### Legacy Format (pre-2013) + +The portal uses a different HTML layout for filings before ~2013: total salary +is not broken down by client, and all bill activity is in a single table. These +are stored with `clientName: "_total_salary_"` so callers can detect and filter +them. No bill-level compensation amount is available for these years. 
+ +--- + +## New Files + +``` +functions/src/lobbying/ + types.ts — Runtypes definitions for LobbyingRegistrant, LobbyingFiling + scrapeLobbying.ts — Scheduled Cloud Function + shared parsing/normalization logic + index.ts — Re-exports +``` + +--- + +## Firebase Admin Script + +**File:** `scripts/firebase-admin/backfillLobbying.ts` + +Ingests all historical filings from 2005 to the present. This is the primary +path for all data before the current and prior year. Accepts `--year` and +`--limit` CLI args for targeted re-runs or testing. Calls the same parsing +logic exported from `functions/src/lobbying/scrapeLobbying.ts` and writes +directly to Firestore via the firebase-admin SDK. + +```bash +GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ + yarn firebase-admin run-script backfillLobbying --env dev +``` + +--- + +## Firestore Rules + +Add read-only public rules alongside the existing `generalCourts` rule: + +``` +match /lobbyingRegistrants/{doc} { allow read: if true; } +match /lobbyingFilings/{doc} { allow read: if true; } +``` + +--- + +## Firestore Indexes + +Add composite indexes for common query patterns: + +| Collection | Fields | Use case | +| ----------------- | -------------------------------------- | ---------------------------------------- | +| `lobbyingFilings` | `generalCourt ASC, billId ASC` | Fetch all legislative filings for a bill | +| `lobbyingFilings` | `generalCourt ASC, chamber ASC` | Filter by chamber within a court | +| `lobbyingFilings` | `generalCourt ASC, entityNameNorm ASC` | Fetch all filings for a registrant | +| `lobbyingFilings` | `generalCourt ASC, clientNameNorm ASC` | Fetch all filings for a client | + +Note: bill-join queries should always filter on `chamber` (or check +`billId !== null`) to exclude `Executive` and `Other` rows before treating +`billId` as a MAPLE bill reference. 
+ +--- + +## Function Export + +Add to `functions/src/index.ts`: + +```typescript +export { scrapeLobbying } from "./lobbying" +``` + +--- + +## Design Decisions + +| Decision | Choice | Rationale | +| --------------------------- | ---------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| Collection placement | Top-level `/lobbyingRegistrants`, `/lobbyingFilings` | Lobbying data spans multiple General Courts and is not scoped to a single court like bills/members | +| Single registrant model | One type, `regType: "Lobbyist" \| "Employer"` | Individual lobbyists and firms share the same portal schema; per-bill individual attribution is not available | +| `billId` construction | `{chamberPrefix}{billNumber}` at ingest time | Raw portal data stores chamber and integer separately; the composite is what matches MAPLE's `Bill.id` | +| `billId` null for Executive | `null` instead of agency name | Prevents accidental bill lookups; makes join guard explicit at the type level | +| Normalized name fields | Store both raw and `*Norm` fields | Raw names preserved for provenance; normalized names used for grouping and matching | +| HTML parser | `jsdom` | Already in `functions/package.json` (used by events scraper); no need to add cheerio | +| Live scraper cursor | Array in `/scrapers/lobbying` doc | ~1,000 URLs/year fits well within the 1 MB Firestore doc limit; simple and atomic with other scraper state | +| Backfill cursor | Firestore subcollection `/scrapers/lobbyingBackfill/processedUrls/{urlHash}` | Full 2005-present history (~50,000 URLs) would exceed the 1 MB doc limit; subcollection scales without bound and is durable, inspectable, and resumable from any machine | +| Incremental strategy | Skip already-processed disclosure URLs; write docs by logical key (upsert) | Survives 
function restarts and re-runs without re-fetching already-scraped pages; natural upsert prevents duplicates without an explicit dedup pass | +| Legacy format (pre-2013) | Store with `clientName: "_total_salary_"` sentinel | Preserves data completeness; callers can filter on this value | +| Historical data | Admin backfill script (2005 → present) | Full history is ingested once; Cloud Function maintains current+prior year going forward | From 774568e3371a20e09abf58a3a87ff8d588052b61 Mon Sep 17 00:00:00 2001 From: Nathan Date: Thu, 4 Jun 2026 08:38:21 -0400 Subject: [PATCH 2/4] feat: add lobbying disclosure ingestion pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Scrapes the MA Secretary of State lobbying portal (sec.state.ma.us/LobbyistPublicSearch) and writes structured data to Firestore for joining with MAPLE bill data. New collections: - /lobbyingRegistrants — one doc per (registrant, year), regType Lobbyist|Employer - /lobbyingFilings — one doc per (registrant, client, bill, court), with billId null for Executive/Other chambers so the join guard is type-level Key design points: - billId is constructed as {chamberPrefix}{integer} (e.g. H1234, SD56) to match Bill.id in the existing bills collection; raw integer + chamber stored separately - Entity name normalization pipeline ported from reference implementation (10 steps: d/b/a stripping, legal entity words, punctuation, THE, ampersand, typo fix, etc.) 
- Both raw and *Norm name fields stored for provenance and grouping - Live Cloud Function scrapes current+prior year on a 24h schedule with a summaryDiscCache to avoid re-fetching summary pages in steady state - Backfill admin script handles full 2005-present history with a Firestore subcollection cursor (/scrapers/lobbyingBackfill/processedUrls) that scales to ~50k URLs and is safely resumable Files: - functions/src/lobbying/{types,normalize,portal,scrapeLobbying,index}.ts - scripts/firebase-admin/backfillLobbying.ts - firestore.rules + firestore.indexes.json updated - docs/lobbying-disclosure-ingestion.md: full plan, test plan, future work Co-Authored-By: Claude Sonnet 4.6 --- docs/lobbying-disclosure-ingestion.md | 240 ++++++++++ firestore.indexes.json | 91 +++- firestore.rules | 8 + functions/src/index.ts | 2 + functions/src/lobbying/index.ts | 12 + functions/src/lobbying/normalize.ts | 73 +++ functions/src/lobbying/portal.ts | 491 +++++++++++++++++++++ functions/src/lobbying/scrapeLobbying.ts | 274 ++++++++++++ functions/src/lobbying/types.ts | 101 +++++ scripts/firebase-admin/backfillLobbying.ts | 156 +++++++ 10 files changed, 1441 insertions(+), 7 deletions(-) create mode 100644 functions/src/lobbying/index.ts create mode 100644 functions/src/lobbying/normalize.ts create mode 100644 functions/src/lobbying/portal.ts create mode 100644 functions/src/lobbying/scrapeLobbying.ts create mode 100644 functions/src/lobbying/types.ts create mode 100644 scripts/firebase-admin/backfillLobbying.ts diff --git a/docs/lobbying-disclosure-ingestion.md b/docs/lobbying-disclosure-ingestion.md index 646cc4415..ad67fe397 100644 --- a/docs/lobbying-disclosure-ingestion.md +++ b/docs/lobbying-disclosure-ingestion.md @@ -346,6 +346,246 @@ export { scrapeLobbying } from "./lobbying" --- +## Implementation Status + +| File | Status | +| -------------------------------------------- | ------- | +| `functions/src/lobbying/types.ts` | ✅ Done | +| `functions/src/lobbying/normalize.ts` | ✅ 
Done | +| `functions/src/lobbying/portal.ts` | ✅ Done | +| `functions/src/lobbying/scrapeLobbying.ts` | ✅ Done | +| `functions/src/lobbying/index.ts` | ✅ Done | +| `scripts/firebase-admin/backfillLobbying.ts` | ✅ Done | +| `functions/src/index.ts` (export) | ✅ Done | +| `firestore.rules` | ✅ Done | +| `firestore.indexes.json` | ✅ Done | + +### Document ID scheme + +Both `registrantId` and `filingId` are SHA-256 hashes (first 40 hex chars) of +their respective logical keys. Hashes are used rather than slugified strings +because entity names and client names contain arbitrary Unicode and punctuation +that would require aggressive sanitization to fit Firestore ID constraints. The +hash is stable across runs for the same logical record. + +--- + +## Future Work (Subsequent PRs) + +### Frontend + +- **Dedicated lobbying pages** + + - `/lobbyists` index: searchable list of registrants with total compensation, + client count, and year filter + - `/lobbyists/{registrantId}` profile: full client list, all bills lobbied, + compensation over time + - `/clients/{clientNameNorm}` profile: registrants hired, bills lobbied, + total spend per year + +- **Bill page integration** (`/bills/{court}/{billId}`) + + - "Lobbying activity" section listing registrants + clients that lobbied this + bill, with position (Support / Oppose / Neutral) and compensation where + available + - Link to registrant profile pages + +- **Organization profile page integration** + - If an organization's normalized name matches a `clientNameNorm` in + `lobbyingFilings`, surface a "Lobbying history" panel showing which bills + they lobbied and which registrants they hired + +### MCP Tools + +Expose lobbying data via the MAPLE MCP server so that AI agents and Claude can +answer questions like "who lobbied bill H1234?" or "what did Acme Corp lobby +for in 2024?". 
+ +- **`get_lobbying_filings_for_bill`** — given `generalCourt` + `billId`, return + all `lobbyingFilings` for that bill with registrant, client, position, and + amount +- **`get_lobbying_registrant`** — given `registrantId`, return the registrant + document with client list and disclosure URLs +- **`search_lobbying_by_client`** — given a client name (raw or normalized), + return matching filings across all courts +- **`get_lobbying_summary_for_bill`** — aggregate view: unique registrant count, + unique client count, total compensation (where non-null), position breakdown + +--- + +## Incremental Test Plan + +Testing proceeds from the inside out: unit logic first, then live portal +fetches against the real site, then a small Firestore write, then a full +backfill year, then steady-state function operation. + +### Step 1 — Unit test: normalization + +Run the normalization pipeline against known inputs and verify the outputs match +the reference implementation. + +```bash +# In a Node REPL or ts-node session: +conda run -n maple-2025 ts-node -P tsconfig.script.json -e " +const { normalizeEntityName } = require('./functions/src/lobbying/normalize') +console.log(normalizeEntityName('Acme Corp., Inc. 
d/b/a Acme Consulting')) +// Expected: 'ACME' +console.log(normalizeEntityName('LAN-TEL COMMUNICATIONS, INC.')) +// Expected: 'LAN TEL COMMUNICATIONS' +console.log(normalizeEntityName('Law Office of Jane Smith, LLC')) +// Expected: 'JANE SMITH' +" +``` + +### Step 2 — Unit test: chamber normalization and billId construction + +```bash +conda run -n maple-2025 ts-node -P tsconfig.script.json -e " +const { normalizeChamber, constructBillId } = require('./functions/src/lobbying/portal') +console.log(normalizeChamber('HB')) // House Bill +console.log(normalizeChamber('SB')) // Senate Bill +console.log(normalizeChamber('Executive')) // Executive +console.log(normalizeChamber('FY2024')) // Other +console.log(constructBillId('House Bill', '1234')) // H1234 +console.log(constructBillId('Senate Bill', '567')) // S567 +console.log(constructBillId('House Docket', '89')) // HD89 +console.log(constructBillId('Executive', 'EOEEA')) // null +" +``` + +### Step 3 — Live portal fetch: summary links + +Verify the portal is reachable and returns results for the current year. Use +`--limit 1` to minimize requests. + +```bash +conda run -n maple-2025 ts-node -P tsconfig.script.json -e " +const { makePortalClient, fetchSummaryLinks } = require('./functions/src/lobbying/portal') +const client = makePortalClient() +fetchSummaryLinks(client, 2024).then(urls => { + console.log('Summary links for 2024:', urls.length) + console.log('First URL:', urls[0]) +}).catch(console.error) +" +``` + +Expected: ~400–600 URLs, each containing `Summary.aspx`. + +### Step 4 — Live portal fetch: summary meta + one disclosure + +Pick the first summary URL from Step 3 and fetch its meta and first disclosure. 
+ +```bash +conda run -n maple-2025 ts-node -P tsconfig.script.json -e " +const { makePortalClient, fetchSummaryLinks, fetchDisclosureMeta, fetchDisclosureDetail } = require('./functions/src/lobbying/portal') +async function main() { + const client = makePortalClient() + const [summaryUrl] = await fetchSummaryLinks(client, 2024) + const meta = await fetchDisclosureMeta(client, summaryUrl) + console.log('Meta:', JSON.stringify(meta, null, 2)) + if (meta.disclosureUrls[0]) { + const detail = await fetchDisclosureDetail(client, meta.disclosureUrls[0], 2024) + console.log('Compensation rows:', detail.compensation.length) + console.log('Bill rows:', detail.bills.length) + console.log('First bill:', detail.bills[0]) + } +} +main().catch(console.error) +" +``` + +Verify: `meta.entityName` is non-empty, `meta.regType` is `"Lobbyist"` or +`"Employer"`, bill rows have `billId` set correctly for legislative chambers. + +### Step 5 — Backfill: single year, small limit against dev Firestore + +Write a small batch to the dev Firestore emulator or dev project. 
+ +```bash +# Against local emulator: +conda run -n maple-2025 yarn firebase-admin run-script backfillLobbying \ + --env local -- --year 2024 --limit 3 + +# Against dev project (writes real Firestore): +GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ + conda run -n maple-2025 yarn firebase-admin run-script backfillLobbying \ + --env dev -- --year 2024 --limit 3 +``` + +Verify in Firestore console or emulator UI: + +- `lobbyingRegistrants` has 3 documents with `entityName`, `entityNameNorm`, + `regType`, `clients`, `generalCourt` +- `lobbyingFilings` has documents with `billId` non-null for legislative rows + and null for Executive rows +- `/scrapers/lobbyingBackfill/processedUrls` has entries with `url` and + `processedAt` fields +- Re-running the same command skips already-processed URLs (output shows 0 new + disclosures) + +### Step 6 — Spot-check: bill join + +Pick a `lobbyingFiling` document with a non-null `billId` and a `generalCourt` +≥ 192. Verify the bill exists in MAPLE: + +``` +/generalCourts/{filing.generalCourt}/bills/{filing.billId} +``` + +If the bill is found, the join key is correct. If not found, check: (a) whether +MAPLE has data for that court, (b) whether the bill number format matches +(prefix + integer, no leading zeros). + +### Step 7 — Backfill: full current year + +Once Step 5 passes, run without `--limit` for the current year: + +```bash +GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ + conda run -n maple-2025 yarn firebase-admin run-script backfillLobbying \ + --env dev -- --year 2024 +``` + +Monitor progress via console output. Expected: ~500–600 registrants, ~1,000 +disclosure pages, several thousand filing documents written. + +### Step 8 — Backfill: full history (2005–present) + +Run without `--year` to process all years. 
Can be interrupted and resumed: + +```bash +GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ + conda run -n maple-2025 yarn firebase-admin run-script backfillLobbying \ + --env dev +``` + +Expected runtime: several hours at 1s/request. The subcollection cursor at +`/scrapers/lobbyingBackfill/processedUrls` allows safe interruption and +resumption. + +### Step 9 — Deploy and verify Cloud Function + +Deploy the function to the dev project: + +```bash +conda run -n maple-2025 firebase deploy \ + --only functions:maple:scrapeLobbying \ + --project digital-testimony-dev +``` + +Trigger a manual run via the Firebase console or: + +```bash +conda run -n maple-2025 yarn firebase-admin run-script runScrapers \ + --env local --targets scrapeLobbying +``` + +Verify: Cloud Function logs show the expected number of new disclosures (should +be near zero if backfill completed, since current+prior year are already +processed). + +--- + ## Design Decisions | Decision | Choice | Rationale | diff --git a/firestore.indexes.json b/firestore.indexes.json index 83cb3fa6d..c267a6868 100644 --- a/firestore.indexes.json +++ b/firestore.indexes.json @@ -788,25 +788,46 @@ "collectionGroup": "ballotQuestions", "queryScope": "COLLECTION", "fields": [ - { "fieldPath": "electionYear", "order": "ASCENDING" }, - { "fieldPath": "ballotStatus", "order": "ASCENDING" } + { + "fieldPath": "electionYear", + "order": "ASCENDING" + }, + { + "fieldPath": "ballotStatus", + "order": "ASCENDING" + } ] }, { "collectionGroup": "publishedTestimony", "queryScope": "COLLECTION_GROUP", "fields": [ - { "fieldPath": "ballotQuestionId", "order": "ASCENDING" }, - { "fieldPath": "publishedAt", "order": "DESCENDING" } + { + "fieldPath": "ballotQuestionId", + "order": "ASCENDING" + }, + { + "fieldPath": "publishedAt", + "order": "DESCENDING" + } ] }, { "collectionGroup": "publishedTestimony", "queryScope": "COLLECTION", "fields": [ - { "fieldPath": "billId", "order": "ASCENDING" }, - { 
"fieldPath": "court", "order": "ASCENDING" }, - { "fieldPath": "ballotQuestionId", "order": "ASCENDING" } + { + "fieldPath": "billId", + "order": "ASCENDING" + }, + { + "fieldPath": "court", + "order": "ASCENDING" + }, + { + "fieldPath": "ballotQuestionId", + "order": "ASCENDING" + } ] }, { @@ -898,6 +919,62 @@ } } ] + }, + { + "collectionGroup": "lobbyingFilings", + "queryScope": "COLLECTION", + "fields": [ + { + "fieldPath": "generalCourt", + "order": "ASCENDING" + }, + { + "fieldPath": "billId", + "order": "ASCENDING" + } + ] + }, + { + "collectionGroup": "lobbyingFilings", + "queryScope": "COLLECTION", + "fields": [ + { + "fieldPath": "generalCourt", + "order": "ASCENDING" + }, + { + "fieldPath": "chamber", + "order": "ASCENDING" + } + ] + }, + { + "collectionGroup": "lobbyingFilings", + "queryScope": "COLLECTION", + "fields": [ + { + "fieldPath": "generalCourt", + "order": "ASCENDING" + }, + { + "fieldPath": "entityNameNorm", + "order": "ASCENDING" + } + ] + }, + { + "collectionGroup": "lobbyingFilings", + "queryScope": "COLLECTION", + "fields": [ + { + "fieldPath": "generalCourt", + "order": "ASCENDING" + }, + { + "fieldPath": "clientNameNorm", + "order": "ASCENDING" + } + ] } ], "fieldOverrides": [ diff --git a/firestore.rules b/firestore.rules index a95586279..42db67276 100644 --- a/firestore.rules +++ b/firestore.rules @@ -103,6 +103,14 @@ service cloud.firestore { allow read: if true; allow write: if false; } + match /lobbyingRegistrants/{id} { + allow read: if true; + allow write: if false; + } + match /lobbyingFilings/{id} { + allow read: if true; + allow write: if false; + } match /transcriptions/{tid} { // public, read-only allow read: if true diff --git a/functions/src/index.ts b/functions/src/index.ts index 641255bf4..6c52b78c1 100644 --- a/functions/src/index.ts +++ b/functions/src/index.ts @@ -60,6 +60,8 @@ export { export { transcription } from "./webhooks" +export { scrapeLobbying } from "./lobbying" + export * from "./triggerPubsubFunction" // 
Export the health check last so it is loaded last. diff --git a/functions/src/lobbying/index.ts b/functions/src/lobbying/index.ts new file mode 100644 index 000000000..5e594cb34 --- /dev/null +++ b/functions/src/lobbying/index.ts @@ -0,0 +1,12 @@ +export { scrapeLobbying } from "./scrapeLobbying" +export * from "./types" +export { normalizeEntityName } from "./normalize" +export { + constructBillId, + fetchDisclosureDetail, + fetchDisclosureMeta, + fetchSummaryLinks, + makePortalClient, + normalizeChamber, + yearToGeneralCourt +} from "./portal" diff --git a/functions/src/lobbying/normalize.ts b/functions/src/lobbying/normalize.ts new file mode 100644 index 000000000..8d3d0a0ba --- /dev/null +++ b/functions/src/lobbying/normalize.ts @@ -0,0 +1,73 @@ +/** + * Entity name normalization pipeline. + * + * The SoS portal does not enforce consistent name formatting. The same client or + * registrant may appear as "Acme Corp.", "ACME CORPORATION", "Acme, Inc. d/b/a + * Acme Consulting", etc. across filings and years. + * + * This pipeline is a direct port of the reference implementation used in the + * companion data analysis project. The steps must be applied in the exact order + * listed here; changing the order produces different (incorrect) output. + */ + +// Step 2: strip d/b/a trade-name suffix before any other transforms so the +// trade name doesn't bleed into the canonical form. +const DBA_RE = /\s+D\s*\/+B\s*\/+A?\s+.*|\s+DBA\s+.*/i + +// Step 5: remove legal entity type words with whole-word matching so +// "INCORPORATED" and "CORP" are caught in addition to "LLC"/"INC". +const LEGAL_ENTITY_RE = + /\b(LLC|LLP|INC|INCORPORATED|CORPORATION|CORP|LTD|LIMITED|PC|PLLC)\b/g + +// Step 6: remove "THE" as a whole word anywhere (not just as a leading prefix). +const THE_RE = /\bTHE\b/g + +// Step 9: professional suffix phrases to remove wholesale. 
+const MISC_PHRASES = [
+  "LAW OFFICE OF",
+  "AND ASSOCIATES",
+  "& ASSOCIATES", // never matches: step 7 rewrites "&" to "AND" before step 9
+  "AND ASSOC",
+  "ATTORNEY AT LAW",
+  "ATTORNEY@LAW",
+  "ATTORNET AT LAW", // known portal typo
+  "AND PARTNERS",
+  "PUBLIC POLICY GROUP",
+  "LEGISLATIVE SERVICES",
+  "POLICY GROUP",
+  "ASSOCIATES",
+  "COUNSELLORS AT LAW"
+]
+
+export function normalizeEntityName(raw: string | null | undefined): string {
+  if (!raw) return "" // null, undefined and "" all normalize to ""
+
+  let x = raw.toUpperCase() // Step 1: uppercase
+
+  x = x.replace(DBA_RE, "") // Step 2: strip d/b/a suffix
+
+  x = x.replace(/-/g, " ") // Step 3: hyphen → space
+
+  // Step 4: punctuation → space (not empty string, so ",INC" → " INC" → caught
+  // by step 5's whole-word removal).
+  for (const ch of [",", ".", "'", "‘", "’", "(", ")"]) {
+    x = x.split(ch).join(" ")
+  }
+
+  x = x.replace(LEGAL_ENTITY_RE, " ") // Step 5: remove legal entity type words
+
+  x = x.replace(THE_RE, " ") // Step 6: remove THE anywhere
+
+  x = x.replace(/&/g, "AND") // Step 7: ampersand → AND
+
+  x = x.replace(/ASSICIATES/g, "ASSOCIATES") // Step 8: fix known portal typo (every occurrence)
+
+  // Step 9: remove professional suffix phrases (substring match, list order)
+  for (const phrase of MISC_PHRASES) {
+    x = x.split(phrase).join(" ")
+  }
+
+  x = x.replace(/\s+/g, " ").trim() // Step 10: collapse whitespace
+
+  return x
+}
diff --git a/functions/src/lobbying/portal.ts b/functions/src/lobbying/portal.ts
new file mode 100644
index 000000000..e441522b8
--- /dev/null
+++ b/functions/src/lobbying/portal.ts
@@ -0,0 +1,491 @@
+/**
+ * HTTP client and HTML parser for the MA Secretary of State lobbying portal.
+ *
+ * Portal: https://www.sec.state.ma.us/LobbyistPublicSearch/
+ *
+ * Page flow:
+ *   1. Search POST → grdvSearchResultByTypeAndCategory table
+ *      One row per registrant; each row has a Summary.aspx link.
+ *   2. Summary.aspx → registrant name/year/type + CompleteDisclosure links
+ *   3.
CompleteDisclosure.aspx → per-client compensation + per-client bill activity + * + * Two disclosure HTML formats exist: + * Modern (≥~2013): per-client compensation in grdvClientPaidToEntity; + * per-client bill tables as grdvActivitiesNew{year}_{n}. + * Legacy (<~2013): total salary in grdvSalaryPaid (no client breakdown); + * all bill activity in a single grdvActivities table. + */ + +import axios, { AxiosInstance } from "axios" +import { JSDOM } from "jsdom" +import { sha256 } from "js-sha256" +import { + CHAMBER_PREFIXES, + LEGACY_CHAMBER_MAP, + LEGACY_TOTAL_CLIENT, + LobbyingChamber +} from "./types" + +// ─── Constants ────────────────────────────────────────────────────────────── + +const BASE_URL = "https://www.sec.state.ma.us/LobbyistPublicSearch/" +const SEARCH_URL = BASE_URL + "Default.aspx" +const REQUEST_DELAY_MS = 1000 +const MAX_RETRIES = 5 + +const IPAD_UA = + "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) " + + "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148" + +const FIRST_GC = 183 +const FIRST_GC_START_YEAR = 2003 + +// ─── Public types ─────────────────────────────────────────────────────────── + +export interface RawCompensation { + clientName: string + amount: number | null +} + +export interface RawBillActivity { + clientName: string + chamber: LobbyingChamber + rawBillNumber: string + billId: string | null // pre-computed from chamber + rawBillNumber + activityTitle: string + position: string + amount: number | null +} + +export interface DisclosureMeta { + entityName: string + year: number | null + /** Portal reg_type mapped to our vocabulary */ + regType: "Lobbyist" | "Employer" + disclosureUrls: string[] +} + +export interface DisclosureDetail { + compensation: RawCompensation[] + bills: RawBillActivity[] +} + +// ─── HTTP helpers ──────────────────────────────────────────────────────────── + +export function makePortalClient(): AxiosInstance { + return axios.create({ + headers: { "User-Agent": IPAD_UA }, + timeout: 60_000 + }) +} 
+ +function sleep(ms: number): Promise { + return new Promise(resolve => setTimeout(resolve, ms)) +} + +async function getHtml( + client: AxiosInstance, + url: string, + retries = MAX_RETRIES +): Promise { + for (let attempt = 0; attempt < retries; attempt++) { + await sleep( + attempt === 0 ? REQUEST_DELAY_MS : REQUEST_DELAY_MS * 2 ** attempt + ) + try { + const res = await client.get(url, { + responseType: "text", + headers: { Accept: "text/html" } + }) + return new JSDOM(res.data).window.document + } catch (e) { + if (attempt === retries - 1) throw e + if (axios.isAxiosError(e)) continue + throw e + } + } + throw new Error("unreachable") +} + +async function postHtml( + client: AxiosInstance, + url: string, + data: Record, + retries = MAX_RETRIES +): Promise { + const body = new URLSearchParams(data).toString() + for (let attempt = 0; attempt < retries; attempt++) { + await sleep( + attempt === 0 ? REQUEST_DELAY_MS : REQUEST_DELAY_MS * 2 ** attempt + ) + try { + const res = await client.post(url, body, { + responseType: "text", + headers: { + "Content-Type": "application/x-www-form-urlencoded", + Accept: "text/html" + }, + timeout: 180_000 + }) + return new JSDOM(res.data).window.document + } catch (e) { + if (attempt === retries - 1) throw e + if (axios.isAxiosError(e)) continue + throw e + } + } + throw new Error("unreachable") +} + +// ─── Year / General Court helpers ──────────────────────────────────────────── + +export function yearToGeneralCourt(year: number): number { + return FIRST_GC + Math.floor((year - FIRST_GC_START_YEAR) / 2) +} + +// ─── Chamber normalization ──────────────────────────────────────────────────── + +/** Normalize raw portal chamber string to a canonical LobbyingChamber value. 
*/ +export function normalizeChamber(raw: string): LobbyingChamber { + const trimmed = raw.trim() + if (LEGACY_CHAMBER_MAP[trimmed]) return LEGACY_CHAMBER_MAP[trimmed] + const known: LobbyingChamber[] = [ + "House Bill", + "Senate Bill", + "House Docket", + "Senate Docket", + "Executive" + ] + if (known.includes(trimmed as LobbyingChamber)) + return trimmed as LobbyingChamber + return "Other" +} + +/** + * Construct the MAPLE-compatible billId from the portal's chamber + raw integer. + * + * The portal stores bill numbers as bare integers; the chamber prefix is what + * distinguishes H1234 from S1234. Returns null for Executive and Other chambers + * where no bill join is possible. + */ +export function constructBillId( + chamber: LobbyingChamber, + rawBillNumber: string +): string | null { + const prefix = CHAMBER_PREFIXES[chamber] + if (!prefix) return null + const n = parseInt(rawBillNumber, 10) + if (isNaN(n)) return null + return `${prefix}${n}` +} + +// ─── Document ID generation ─────────────────────────────────────────────────── + +/** Stable Firestore document ID for a registrant (entity + year). */ +export function registrantId(entityName: string, year: number): string { + return sha256(`${year}|${entityName}`).slice(0, 40) +} + +/** + * Stable Firestore document ID for a filing. + * + * Uses a hash of the logical deduplication key. For null-bill rows (billId is + * null) the chamber is included in the key to avoid merging executive null rows + * with legislative null rows. + */ +export function filingId( + entityName: string, + clientName: string, + chamber: LobbyingChamber, + billId: string | null, + generalCourt: number, + position: string +): string { + const key = [ + entityName, + clientName, + chamber, + billId ?? 
"__null__", + generalCourt, + position + ].join("|") + return sha256(key).slice(0, 40) +} + +// ─── Amount parsing ─────────────────────────────────────────────────────────── + +function parseAmount(text: string): number | null { + const cleaned = text.replace(/[$,]/g, "").trim() + const n = parseFloat(cleaned) + return isNaN(n) ? null : n +} + +// ─── Portal scraping functions ──────────────────────────────────────────────── + +/** Extract ASP.NET WebForms ViewState hidden inputs from a page. */ +function extractViewState(doc: Document): Record { + const fields: Record = {} + doc.querySelectorAll('input[type="hidden"]').forEach(el => { + const input = el as HTMLInputElement + if (input.name) fields[input.name] = input.value ?? "" + }) + return fields +} + +/** + * Fetch all Summary.aspx URLs for a given year. + * Sends a single search POST with page size 20000 to get all registrants at once. + */ +export async function fetchSummaryLinks( + client: AxiosInstance, + year: number +): Promise { + const searchPage = await getHtml(client, SEARCH_URL) + const vs = extractViewState(searchPage) + + const postData: Record = { + ...vs, + __EVENTTARGET: "", + __EVENTARGUMENT: "", + ctl00$ContentPlaceHolder1$Search: "rdbSearchByType", + ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$ddlYear: String(year), + ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$txtN_ame: "", + ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$lddSearchType$DropDown: + "3", + ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$drpType: "L", + ctl00$ContentPlaceHolder1$drpPageSize: "20000", + ctl00$ContentPlaceHolder1$btnSearch: "Search" + } + + const resultsPage = await postHtml(client, SEARCH_URL, postData) + + const table = resultsPage.querySelector( + '[id*="grdvSearchResultByTypeAndCategory"]' + ) + if (!table) return [] + + const links: string[] = [] + table.querySelectorAll("a[href]").forEach(el => { + const href = (el as HTMLAnchorElement).href + if (href && href.includes("Summary.aspx")) { + // 
href from JSDOM is already absolute when base is set; handle both cases + const url = href.startsWith("http") ? href : BASE_URL + href + links.push(url) + } + }) + return links +} + +/** + * Fetch a Summary.aspx page and return the registrant metadata + disclosure URLs. + */ +export async function fetchDisclosureMeta( + client: AxiosInstance, + summaryUrl: string +): Promise { + const doc = await getHtml(client, summaryUrl) + + const text = (id: string) => { + const el = doc.getElementById(id) + return el?.textContent?.trim() ?? "" + } + + const entityName = text("ContentPlaceHolder1_lblRegistrantName") + const yearText = text("ContentPlaceHolder1_lblYear") + const regTypeRaw = text("ContentPlaceHolder1_lblRegType") + + const year = parseInt(yearText, 10) + const regType: "Lobbyist" | "Employer" = regTypeRaw.includes("Entity") + ? "Employer" + : "Lobbyist" + + const disclosureUrls: string[] = [] + doc.querySelectorAll("a[href]").forEach(el => { + const raw = (el as HTMLAnchorElement).getAttribute("href") ?? "" + if (raw.includes("CompleteDisclosure")) { + const url = raw.startsWith("http") ? raw : BASE_URL + raw + disclosureUrls.push(url) + } + }) + + return { + entityName, + year: isNaN(year) ? null : year, + regType, + disclosureUrls + } +} + +/** + * Parse a CompleteDisclosure.aspx page. + * + * Handles both modern (≥~2013) and legacy (<~2013) HTML layouts. + */ +export async function fetchDisclosureDetail( + client: AxiosInstance, + discUrl: string, + year: number +): Promise { + const doc = await getHtml(client, discUrl) + const compensation: RawCompensation[] = [] + const bills: RawBillActivity[] = [] + + // ── Modern format ────────────────────────────────────────────────────────── + const compTable = doc.querySelector('[id*="grdvClientPaidToEntity"]') + if (compTable) { + compTable + .querySelectorAll("tr.GridRow, tr.GridAlternatingRow") + .forEach(row => { + const cells = Array.from(row.querySelectorAll("td")).map( + td => td.textContent?.trim() ?? 
"" + ) + if (cells.length >= 2) { + compensation.push({ + clientName: cells[0], + amount: parseAmount(cells[1]) + }) + } + }) + } + + // Bill activity tables — one per client per reporting period. Two ID patterns: + // 2014–2018: …rptActivityNew_grdvActivitiesNew_0 (no year suffix) + // 2019+: …rptActivityNew2020_grdvActivitiesNew2020_0 (year suffix) + doc.querySelectorAll('[id*="grdvActivitiesNew"]').forEach(actTable => { + // The client name lives in the nearest preceding span with lblClientName + let clientName = "" + let node: Element | null = actTable + while ((node = node.previousElementSibling ?? node.parentElement)) { + const span = node.id?.includes("lblClientName") + ? node + : node.querySelector?.('[id*="lblClientName"]') + if (span) { + clientName = span.textContent?.trim() ?? "" + break + } + if (node === node.parentElement) break + } + + actTable + .querySelectorAll("tr.GridRow, tr.GridAlternatingRow") + .forEach(row => { + const cells = Array.from(row.querySelectorAll("td")).map( + td => td.textContent?.trim() ?? "" + ) + // Columns: House/Senate, Bill Number, Bill title, Position, Amount, Direct business + if (cells.length < 4) return + const chamber = normalizeChamber(cells[0]) + const rawBillNumber = cells[1] + const billId = constructBillId(chamber, rawBillNumber) + bills.push({ + clientName, + chamber, + rawBillNumber, + billId, + activityTitle: cells[2] ?? "", + position: cells[3] ?? "", + amount: cells.length > 4 ? parseAmount(cells[4]) : null + }) + }) + }) + + if (compTable || bills.length > 0) { + return { compensation, bills } + } + + // ── Legacy format (<~2013) ───────────────────────────────────────────────── + const salaryTable = doc.querySelector('[id*="grdvSalaryPaid"]') + if (salaryTable) { + let total = 0 + salaryTable.querySelectorAll("tr").forEach(row => { + const cells = Array.from(row.querySelectorAll("td")).map( + td => td.textContent?.trim() ?? 
"" + ) + if (cells.length >= 2 && !cells[0].includes("Total")) { + const amt = parseAmount(cells[1]) + if (amt !== null) total += amt + } + }) + if (total > 0) { + compensation.push({ clientName: LEGACY_TOTAL_CLIENT, amount: total }) + } + } + + // Legacy bill activity: single grdvActivities table. Three known column layouts: + // 2009 4-col: Date | Bill+Title | Lobbyist | Client + // 2010+ individual 5-col: Activity | Position | DirectBiz | Client | Compensation + // 2010+ entity 6-col: Activity | Lobbyist | Position | DirectBiz | Client | Compensation + const actTable = doc.querySelector('[id$="grdvActivities"]') + if (actTable) { + const allRows = Array.from(actTable.querySelectorAll("tr")) + const headerCells = Array.from( + allRows[0]?.querySelectorAll("th, td") ?? [] + ).map(el => el.textContent?.trim() ?? "") + + let billCol = 1 + let positionCol: number | null = null + let clientCol = 3 + + if (headerCells[0]?.includes("Activity")) { + if (headerCells[1]?.includes("Lobbyist")) { + // 6-col entity layout + billCol = 0 + positionCol = 2 + clientCol = 4 + } else { + // 5-col individual layout + billCol = 0 + positionCol = 1 + clientCol = 3 + } + } + + const chamberMap: Record = { + H: "House Bill", + S: "Senate Bill", + HD: "House Docket", + SD: "Senate Docket" + } + + allRows.slice(1).forEach(row => { + const cells = Array.from(row.querySelectorAll("td")).map( + td => td.textContent?.trim() ?? "" + ) + if (cells.length <= Math.max(billCol, clientCol)) return + + const billCell = cells[billCol] + const skipValues = new Set([ + "Activity or Bill No and Title", + "N/A", + "None", + "", + "Total amount" + ]) + if (!billCell || skipValues.has(billCell)) return + + const parts = billCell.split(/\s+/) + const billNo = parts[0] + const activityTitle = parts.slice(1).join(" ") + const match = billNo.match(/^([A-Z]+)(\d+)$/) + if (!match) return + + const [, prefix, number] = match + const chamber: LobbyingChamber = chamberMap[prefix] ?? 
"Other" + const billId = constructBillId(chamber, number) + const position = positionCol !== null ? cells[positionCol] ?? "" : "" + const clientName = cells[clientCol] ?? "" + + bills.push({ + clientName, + chamber, + rawBillNumber: number, + billId, + activityTitle, + position, + amount: null + }) + }) + } + + return { compensation, bills } +} diff --git a/functions/src/lobbying/scrapeLobbying.ts b/functions/src/lobbying/scrapeLobbying.ts new file mode 100644 index 000000000..7a6140e8e --- /dev/null +++ b/functions/src/lobbying/scrapeLobbying.ts @@ -0,0 +1,274 @@ +import { logger } from "firebase-functions" +import { runWith } from "firebase-functions/v1" +import { db, Timestamp } from "../firebase" +import type { Database } from "../types" +import { normalizeEntityName } from "./normalize" +import { + fetchDisclosureDetail, + fetchDisclosureMeta, + fetchSummaryLinks, + filingId, + makePortalClient, + registrantId, + yearToGeneralCourt +} from "./portal" +import { + FILINGS_COLLECTION, + FIRST_LOBBYING_YEAR, + LobbyingFiling, + LobbyingRegistrant, + REGISTRANTS_COLLECTION, + SCRAPER_DOC +} from "./types" + +/** + * Scraper state stored in Firestore at /scrapers/lobbying. + * + * processedDiscUrls: disc URLs already fetched; skip on re-runs. + * summaryDiscCache: maps summaryUrl → its known disc URLs so we can skip + * summary page GETs for registrants with no new filings. + */ +interface ScraperState { + processedDiscUrls: string[] + summaryDiscCache: Record +} + +/** + * Maximum number of new disclosure pages to fetch per function invocation. + * Each page takes ~1s; this keeps the run well within the 540s timeout. + * Remaining work is picked up on the next scheduled run. + */ +const MAX_DISCLOSURES_PER_RUN = 200 + +/** + * Scrape lobbying disclosure data for the current and prior calendar year. + * + * Runs every 24 hours. New filers arrive semi-annually so daily polling is + * more than sufficient for steady-state freshness. 
For initial historical
+ * ingestion (2005-present) use the backfillLobbying admin script instead.
+ *
+ * Progress is checkpointed to Firestore after every disclosure page so the
+ * function is fully resumable if it times out or is interrupted.
+ */
+export const scrapeLobbying = runWith({ timeoutSeconds: 540, maxInstances: 1 })
+  .pubsub.schedule("every 24 hours")
+  .onRun(async () => {
+    const currentYear = new Date().getFullYear()
+    const years = [currentYear, currentYear - 1]
+
+    const scraperRef = db.doc(SCRAPER_DOC)
+    const scraperDoc = await scraperRef.get()
+    const state: ScraperState = {
+      processedDiscUrls: scraperDoc.data()?.processedDiscUrls ?? [],
+      summaryDiscCache: scraperDoc.data()?.summaryDiscCache ?? {}
+    }
+    const processedSet = new Set(state.processedDiscUrls)
+    const summaryCache: Record = state.summaryDiscCache
+
+    const client = makePortalClient()
+    let newDiscCount = 0
+
+    for (const year of years) {
+      if (newDiscCount >= MAX_DISCLOSURES_PER_RUN) break
+
+      logger.info(`scrapeLobbying: fetching summary links for ${year}`)
+      let summaryUrls: string[]
+      try {
+        summaryUrls = await fetchSummaryLinks(client, year)
+      } catch (e) {
+        logger.error(
+          `scrapeLobbying: failed to fetch summary links for ${year}`,
+          e
+        )
+        continue
+      }
+      logger.info(
+        `scrapeLobbying: ${summaryUrls.length} registrants for ${year}`
+      )
+
+      for (const summaryUrl of summaryUrls) {
+        if (newDiscCount >= MAX_DISCLOSURES_PER_RUN) break
+
+        // Use cached disc URLs when available to avoid re-fetching summary pages.
+        // For current year we always re-check (new filings arrive mid-year).
+        let discUrls = summaryCache[summaryUrl]
+        if (!discUrls || year === currentYear) {
+          try {
+            const meta = await fetchDisclosureMeta(client, summaryUrl)
+            discUrls = meta.disclosureUrls
+
+            // Upsert the registrant doc. The write is awaited inline (no
+            // bulkWriter is used here); the summaryDiscCache checkpoint is then
+            // written separately so an interrupted run can be resumed.
+ if (meta.entityName && meta.year) { + await writeRegistrant( + db, + meta.entityName, + meta.year, + meta.regType, + discUrls + ) + } + + summaryCache[summaryUrl] = discUrls + await scraperRef.set( + { summaryDiscCache: summaryCache }, + { merge: true } + ) + } catch (e) { + logger.warn( + `scrapeLobbying: failed to fetch summary ${summaryUrl}`, + e + ) + continue + } + } + + const newDiscUrls = discUrls.filter(u => !processedSet.has(u)) + if (newDiscUrls.length === 0) continue + + for (const discUrl of newDiscUrls) { + if (newDiscCount >= MAX_DISCLOSURES_PER_RUN) break + try { + await processDisclosure(db, client, summaryUrl, discUrl, year) + processedSet.add(discUrl) + newDiscCount++ + + // Checkpoint after every disclosure so restarts lose at most one page + await scraperRef.set( + { processedDiscUrls: Array.from(processedSet) }, + { merge: true } + ) + } catch (e) { + logger.warn( + `scrapeLobbying: failed to process disclosure ${discUrl}`, + e + ) + } + } + } + } + + logger.info(`scrapeLobbying: processed ${newDiscCount} new disclosures`) + }) + +// ─── Shared write helpers (also used by backfillLobbying) ──────────────────── + +/** + * Write or update a LobbyingRegistrant document. Client list is assembled from + * the disclosure meta; filing documents are written separately per-bill. 
+ */ +export async function writeRegistrant( + database: Database, + entityName: string, + year: number, + regType: "Lobbyist" | "Employer", + disclosureUrls: string[] +): Promise { + const id = registrantId(entityName, year) + const ref = database.collection(REGISTRANTS_COLLECTION).doc(id) + const partial: Omit & { + fetchedAt: FirebaseFirestore.Timestamp + } = { + registrantId: id, + entityName, + entityNameNorm: normalizeEntityName(entityName), + year, + generalCourt: yearToGeneralCourt(year), + regType, + disclosureUrls, + fetchedAt: Timestamp.now() + } + // Merge so repeated runs don't wipe clients accumulated from multiple disclosures + await ref.set(partial, { merge: true }) +} + +/** + * Fetch one CompleteDisclosure page and write LobbyingFiling documents. + * Also updates the registrant's client list. + */ +export async function processDisclosure( + database: Database, + client: ReturnType, + summaryUrl: string, + discUrl: string, + year: number +): Promise { + const meta = await fetchDisclosureMeta(client, summaryUrl) + const detail = await fetchDisclosureDetail(client, discUrl, year) + + const { entityName, regType } = meta + const gc = yearToGeneralCourt(year) + const entityNameNorm = normalizeEntityName(entityName) + const now = Timestamp.now() + + // Update registrant's client list + if (entityName && year) { + const regRef = database + .collection(REGISTRANTS_COLLECTION) + .doc(registrantId(entityName, year)) + + const clients = detail.compensation.map(c => ({ + clientName: c.clientName, + clientNameNorm: normalizeEntityName(c.clientName), + compensation: c.amount + })) + + await regRef.set( + { + registrantId: registrantId(entityName, year), + entityName, + entityNameNorm, + year, + generalCourt: gc, + regType: regType ?? 
"Lobbyist",
+        clients, // NOTE(review): merge replaces the whole array, so earlier disclosures' clients are dropped — confirm intended
+        disclosureUrls: meta.disclosureUrls, // full list from the summary page; [discUrl] alone would clobber it on merge
+        fetchedAt: now
+      },
+      { merge: true }
+    )
+  }
+
+  // Write one LobbyingFiling doc per bill row
+  if (detail.bills.length === 0) return
+
+  const writer = database.bulkWriter()
+  for (const bill of detail.bills) {
+    const fid = filingId(
+      entityName,
+      bill.clientName,
+      bill.chamber,
+      bill.billId,
+      gc,
+      bill.position
+    )
+    const doc: LobbyingFiling = {
+      filingId: fid,
+      entityName,
+      entityNameNorm,
+      clientName: bill.clientName,
+      clientNameNorm: normalizeEntityName(bill.clientName),
+      year,
+      generalCourt: gc,
+      chamber: bill.chamber,
+      billId: bill.billId,
+      activityTitle: bill.activityTitle,
+      position: bill.position,
+      amount: bill.amount,
+      fetchedAt: now
+    }
+    writer.set(database.collection(FILINGS_COLLECTION).doc(fid), doc, {
+      merge: false
+    })
+  }
+  await writer.close()
+}
+
+/** All years to scrape (FIRST_LOBBYING_YEAR through the current year, inclusive), for use by the backfill script. */
+export function allLobbyingYears(): number[] {
+  const current = new Date().getFullYear()
+  const years: number[] = []
+  for (let y = FIRST_LOBBYING_YEAR; y <= current; y++) years.push(y)
+  return years
+}
diff --git a/functions/src/lobbying/types.ts b/functions/src/lobbying/types.ts
new file mode 100644
index 000000000..83eaab761
--- /dev/null
+++ b/functions/src/lobbying/types.ts
@@ -0,0 +1,101 @@
+import {
+  Array,
+  InstanceOf,
+  Literal,
+  Number,
+  Null,
+  Record,
+  Static,
+  String,
+  Union
+} from "runtypes"
+import { Timestamp } from "../firebase"
+
+export type LobbyingChamber = Static<typeof LobbyingChamber>
+export const LobbyingChamber = Union(
+  Literal("House Bill"),
+  Literal("Senate Bill"),
+  Literal("House Docket"),
+  Literal("Senate Docket"),
+  Literal("Executive"),
+  Literal("Other")
+)
+
+export type LobbyingClient = Static<typeof LobbyingClient>
+export const LobbyingClient = Record({
+  clientName: String,
+  clientNameNorm: String,
+  compensation: Null.Or(Number)
+})
+
+export type LobbyingRegistrant = Static<typeof LobbyingRegistrant>
+export const LobbyingRegistrant = Record({
+  registrantId: String,
+  entityName:
String, + entityNameNorm: String, + year: Number, + generalCourt: Number, + regType: Union(Literal("Lobbyist"), Literal("Employer")), + clients: Array(LobbyingClient), + disclosureUrls: Array(String), + fetchedAt: InstanceOf(Timestamp) +}) + +export type LobbyingFiling = Static +export const LobbyingFiling = Record({ + filingId: String, + entityName: String, + entityNameNorm: String, + clientName: String, + clientNameNorm: String, + year: Number, + generalCourt: Number, + chamber: LobbyingChamber, + // Non-null only for legislative chambers (House Bill, Senate Bill, House Docket, + // Senate Docket). For Executive and Other, no bill join should be attempted. + billId: Null.Or(String), + activityTitle: String, + position: String, + amount: Null.Or(Number), + fetchedAt: InstanceOf(Timestamp) +}) + +/** Firestore path for lobbying registrant documents */ +export const REGISTRANTS_COLLECTION = "lobbyingRegistrants" + +/** Firestore path for lobbying filing documents */ +export const FILINGS_COLLECTION = "lobbyingFilings" + +/** Firestore path for the live scraper cursor document */ +export const SCRAPER_DOC = "/scrapers/lobbying" + +/** Firestore path for the backfill cursor subcollection */ +export const BACKFILL_DOC = "/scrapers/lobbyingBackfill" +export const BACKFILL_URLS_COLLECTION = "processedUrls" + +/** Earliest year with portal data */ +export const FIRST_LOBBYING_YEAR = 2005 + +/** + * Sentinel clientName used for pre-2013 legacy filings where compensation is + * reported as a single total rather than broken down per client. + */ +export const LEGACY_TOTAL_CLIENT = "_total_salary_" + +/** + * Chamber prefix map for constructing billId values that match MAPLE's Bill.id. + * Typed as a plain index signature so portal.ts can look up any LobbyingChamber + * without triggering "Property X does not exist" on the Partial. 
+ */ +export const CHAMBER_PREFIXES: { [chamber: string]: string | undefined } = { + "House Bill": "H", + "Senate Bill": "S", + "House Docket": "HD", + "Senate Docket": "SD" +} + +/** Canonical chamber values for legacy short-form codes found in older filings */ +export const LEGACY_CHAMBER_MAP: { [raw: string]: LobbyingChamber } = { + HB: "House Bill", + SB: "Senate Bill" +} diff --git a/scripts/firebase-admin/backfillLobbying.ts b/scripts/firebase-admin/backfillLobbying.ts new file mode 100644 index 000000000..f7914dd84 --- /dev/null +++ b/scripts/firebase-admin/backfillLobbying.ts @@ -0,0 +1,156 @@ +/** + * Backfill lobbying disclosure data from 2005 to the present. + * + * This script is the primary ingestion path for all historical data. The live + * Cloud Function (scrapeLobbying) only handles the current and prior year in + * steady state. Run this once to populate the full history, and re-run with + * --year to refresh specific years. + * + * Usage: + * GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ + * yarn firebase-admin run-script backfillLobbying --env dev + * + * Options: + * --year NUMBER Only process this year (useful for testing or re-runs) + * --limit NUMBER Max registrants to process per year (for testing) + * + * Cursor storage: + * Processed disclosure URLs are stored as documents in the Firestore + * subcollection /scrapers/lobbyingBackfill/processedUrls/{urlHash}. + * This scales to the full historical URL set (~50,000+) without hitting the + * 1MB Firestore document size limit. Restart the script at any time; it will + * resume from where it left off. 
+ */ + +import { createHash } from "crypto" +import { z } from "zod" +import { + allLobbyingYears, + processDisclosure, + writeRegistrant +} from "../../functions/src/lobbying/scrapeLobbying" +import { + fetchDisclosureMeta, + fetchSummaryLinks, + makePortalClient +} from "../../functions/src/lobbying/portal" +import { + BACKFILL_DOC, + BACKFILL_URLS_COLLECTION, + FIRST_LOBBYING_YEAR +} from "../../functions/src/lobbying/types" +import { Script } from "./types" + +const Args = z + .object({ + year: z.number().int().min(FIRST_LOBBYING_YEAR).optional(), + limit: z.number().int().positive().optional() + }) + .passthrough() + +export const script: Script = async ({ db, args }) => { + const { year: onlyYear, limit } = Args.parse(args) + + const years = onlyYear ? [onlyYear] : allLobbyingYears() + console.log( + `backfillLobbying: processing years ${years[0]}–${years[years.length - 1]}` + ) + + // Load already-processed disc URLs from the subcollection cursor. + const backfillRef = db.doc(BACKFILL_DOC) + const processedSnap = await backfillRef + .collection(BACKFILL_URLS_COLLECTION) + .select() // fetch only doc IDs (the URL hash), no field data needed + .get() + const processedHashes = new Set(processedSnap.docs.map(d => d.id)) + console.log( + `backfillLobbying: ${processedHashes.size} disc URLs already processed` + ) + + const client = makePortalClient() + let totalNew = 0 + + for (const year of years) { + console.log(`\n── ${year} ──`) + + let summaryUrls: string[] + try { + summaryUrls = await fetchSummaryLinks(client, year) + } catch (e) { + console.error(` Failed to fetch summary links for ${year}:`, e) + continue + } + + if (limit) summaryUrls = summaryUrls.slice(0, limit) + console.log(` ${summaryUrls.length} registrants on portal`) + + let yearNew = 0 + + for (let i = 0; i < summaryUrls.length; i++) { + const summaryUrl = summaryUrls[i] + let meta: Awaited> + + try { + meta = await fetchDisclosureMeta(client, summaryUrl) + } catch (e) { + console.warn( + ` [${i 
+ 1}/${ + summaryUrls.length + }] Failed to fetch summary: ${summaryUrl}`, + e + ) + continue + } + + if (meta.entityName && meta.year) { + try { + await writeRegistrant( + db, + meta.entityName, + meta.year, + meta.regType, + meta.disclosureUrls + ) + } catch (e) { + console.warn(` Failed to write registrant ${meta.entityName}:`, e) + } + } + + for (const discUrl of meta.disclosureUrls) { + const urlHash = createHash("sha256") + .update(discUrl) + .digest("hex") + .slice(0, 40) + if (processedHashes.has(urlHash)) continue + + try { + await processDisclosure(db, client, summaryUrl, discUrl, year) + + // Mark as processed in the subcollection cursor + await backfillRef + .collection(BACKFILL_URLS_COLLECTION) + .doc(urlHash) + .set({ url: discUrl, processedAt: new Date().toISOString() }) + + processedHashes.add(urlHash) + totalNew++ + yearNew++ + } catch (e) { + console.warn(` Failed to process disclosure ${discUrl}:`, e) + } + } + + if ((i + 1) % 50 === 0 || i + 1 === summaryUrls.length) { + console.log( + ` [${i + 1}/${ + summaryUrls.length + }] ${yearNew} new disclosures this year` + ) + } + } + + console.log(` ${year} complete: ${yearNew} new disclosures`) + } + + console.log(`\nbackfillLobbying complete: ${totalNew} new disclosures total`) +} From 0074294861aece13daea9c90be3af634afd407fa Mon Sep 17 00:00:00 2001 From: Nathan Date: Fri, 5 Jun 2026 16:34:07 -0400 Subject: [PATCH 3/4] feat: add Python Cloud Run scraper for lobbying disclosures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MA SoS portal is protected by Imperva WAF, which uses TLS fingerprinting to classify HTTP clients before examining headers. Python's requests library produces a fingerprint that Imperva allows through; Node.js does not. A standalone Cloud Run container (Python 3.12) is therefore used for the scheduled ingestion instead of a Cloud Function. 
lobbying-scraper/ — Cloud Run container (3 pip deps: requests, beautifulsoup4, google-cloud-firestore): - scrape.py: entry point with --mode weekly (incremental, fast exit if nothing new) and --mode backfill (full 2005-present history, resumable subcollection cursor). Weekly mode caches summary URL→disc URL mappings so prior-year registrants with no new filings require zero additional HTTP requests. - portal.py: HTTP session management + HTML parsing for all three portal page levels (search POST, summary GET, disclosure GET). Handles both modern (>=2013) and legacy (<2013) disclosure formats. - normalize.py: port of functions/src/lobbying/normalize.ts — 10-step entity name normalization pipeline, must match the TypeScript version exactly. - writer.py: Firestore document construction and batch writes. Schema matches types.ts (lobbyingRegistrants, lobbyingFilings collections). scripts/firebase-admin/backfillLobbying.ts — simplified to spawn scrape.py as a subprocess; all HTTP and Firestore logic moved to Python. functions/src/lobbying/http/ — thin Python HTTP helper kept for reference; not used in the current architecture. Note: server-side IP reputation behavior with Imperva untested. Build and run the container on Cloud Run with --dry-run to validate before full deploy. 
Co-Authored-By: Claude Sonnet 4.6 --- docs/lobbying-disclosure-ingestion.md | 167 +++++--- functions/src/lobbying/http/.gitignore | 3 + functions/src/lobbying/http/fetch.py | 81 ++++ functions/src/lobbying/http/requirements.txt | 1 + functions/src/lobbying/normalize.ts | 3 +- functions/src/lobbying/portal.ts | 96 ++++- lobbying-scraper/.dockerignore | 4 + lobbying-scraper/Dockerfile | 14 + .../__pycache__/normalize.cpython-37.pyc | Bin 0 -> 1412 bytes .../__pycache__/portal.cpython-37.pyc | Bin 0 -> 11941 bytes lobbying-scraper/normalize.py | 50 +++ lobbying-scraper/portal.py | 376 ++++++++++++++++++ lobbying-scraper/requirements.txt | 3 + lobbying-scraper/scrape.py | 269 +++++++++++++ lobbying-scraper/writer.py | 126 ++++++ scripts/firebase-admin/backfillLobbying.ts | 168 ++------ 16 files changed, 1157 insertions(+), 204 deletions(-) create mode 100644 functions/src/lobbying/http/.gitignore create mode 100644 functions/src/lobbying/http/fetch.py create mode 100644 functions/src/lobbying/http/requirements.txt create mode 100644 lobbying-scraper/.dockerignore create mode 100644 lobbying-scraper/Dockerfile create mode 100644 lobbying-scraper/__pycache__/normalize.cpython-37.pyc create mode 100644 lobbying-scraper/__pycache__/portal.cpython-37.pyc create mode 100644 lobbying-scraper/normalize.py create mode 100644 lobbying-scraper/portal.py create mode 100644 lobbying-scraper/requirements.txt create mode 100644 lobbying-scraper/scrape.py create mode 100644 lobbying-scraper/writer.py diff --git a/docs/lobbying-disclosure-ingestion.md b/docs/lobbying-disclosure-ingestion.md index ad67fe397..264c77c52 100644 --- a/docs/lobbying-disclosure-ingestion.md +++ b/docs/lobbying-disclosure-ingestion.md @@ -233,43 +233,57 @@ executive and legislative null rows. ## Scraper Architecture -The lobbying portal is an HTML scraper, not a REST API. It does not fit the -`createScraper` factory (which assumes list-IDs → fetch-per-ID against the MA -Legislature API). 
Instead, we use a custom scheduled function following the -`scrapeEvents` pattern. - -### Cloud Function: `scrapeLobbying` - -**File:** `functions/src/lobbying/scrapeLobbying.ts` - -- Schedule: `every 24 hours` -- Scrapes the current year and prior year (new filers arrive semi-annually) +### Why a standalone Cloud Run container + +The MA SoS portal is protected by Imperva WAF, which uses TLS fingerprinting to +classify HTTP clients at the network layer before examining any headers. Node.js +produces a TLS fingerprint that Imperva challenges with a JavaScript +verification page; Python's `requests` library produces a fingerprint that +Imperva allows through without challenge. This is a runtime-level constraint +that cannot be addressed by header configuration or cipher reordering alone. + +The scraper therefore runs as a standalone **Cloud Run container** written in +Python, deployed alongside the existing MCP server container. All data modeling, +Firestore collection/field names, and normalization logic are documented here and +kept consistent between the Python container and the TypeScript type definitions +in `functions/src/lobbying/types.ts`. 
+ +### Cloud Run container: `lobbying-scraper/` + +**Files:** `lobbying-scraper/{scrape,portal,normalize,writer}.py` + +- Scheduled weekly by Cloud Scheduler +- Runs an incremental check: fetches the current and prior year's summary links + (one POST), compares disc URLs against the Firestore cursor, and **exits + immediately if nothing is new** (fast path, typically seconds) +- When new or updated disclosures are found, fetches and processes them - Persists a cursor in `/scrapers/lobbying`: - - `lastFetchedAt: Timestamp` - - `processedDiscUrls: string[]` — already-fetched disclosure URLs (skipped on - re-runs) + - `processedDiscUrls: string[]` — disc URLs already written; skipped on + re-runs + - `summaryDiscCache: {[summaryUrl]: string[]}` — maps summary page URLs to + their disc URLs so summary page GETs are skipped for prior-year registrants + whose disclosures are all already processed - For each new disclosure URL: - Parse registrant + client compensation rows → upsert `lobbyingRegistrants` - doc - - Parse bill activity rows → batch-write `lobbyingFilings` docs -- Uses `axios` (existing dependency) with an iPad `User-Agent` header to match - portal expectations -- Uses `jsdom` for HTML table parsing (already a dependency; used by events scraper) -- 1s delay between requests; exponential backoff on failure (matching existing - scraper retry pattern) -- Function timeout: 540s - -### Incremental Strategy - -Processed disclosure URLs are stored in `/scrapers/lobbying.processedDiscUrls`. -At ~2 disclosure URLs per registrant × ~500 registrants per year, the -current+prior year window stays well within Firestore document limits. -Historical years beyond current-1 are stable (filings are frozen after year -closes) and are handled by the backfill script only. - -The backfill script uses a separate Firestore document -(`/scrapers/lobbyingBackfill`) for its own cursor so it does not interfere with -the live scraper. 
+ - Parse bill activity rows → batch-write `lobbyingFilings` +- 1s delay between requests; exponential backoff on transient failures + +### Incremental strategy + +In steady state (after the initial backfill), each weekly run: + +1. One POST to fetch all summary links for current + prior year +2. For prior-year registrants with all disc URLs in the cursor: zero GETs +3. For current-year registrants: one GET per summary page to check for new + disclosure periods +4. For any new disc URLs: one GET per disclosure page + +New filings arrive twice a year (semi-annual reporting periods). Between +periods, the run completes in under a minute. + +The backfill script (`--mode backfill`) uses a separate subcollection cursor at +`/scrapers/lobbyingBackfill/processedUrls/{urlHash}` so it does not interfere +with the live scraper state. ### Legacy Format (pre-2013) @@ -284,26 +298,64 @@ them. No bill-level compensation amount is available for these years. ``` functions/src/lobbying/ - types.ts — Runtypes definitions for LobbyingRegistrant, LobbyingFiling - scrapeLobbying.ts — Scheduled Cloud Function + shared parsing/normalization logic - index.ts — Re-exports + types.ts — Runtypes definitions for LobbyingRegistrant, LobbyingFiling + normalize.ts — Entity name normalization pipeline + portal.ts — Reference implementation (HTTP layer not used in production) + scrapeLobbying.ts — Reference implementation (superseded by Cloud Run container) + index.ts — Re-exports + +lobbying-scraper/ + scrape.py — Entry point: --mode weekly (incremental) | --mode backfill + portal.py — HTTP + HTML parsing + normalize.py — Port of normalize.ts + writer.py — Firestore document construction + writes + requirements.txt — requests, beautifulsoup4, google-cloud-firestore + Dockerfile — Python 3.12-slim image ``` --- -## Firebase Admin Script +## Deploying the Cloud Run Container + +Follows the same pattern as the MCP server. Requires the +`maple-lobbying-scraper` Artifact Registry repository to exist. 
+ +```bash +cd lobbying-scraper +IMAGE=us-central1-docker.pkg.dev/digital-testimony-dev/maple-lobbying/scraper:latest +docker build -t $IMAGE . && docker push $IMAGE + +gcloud run jobs create maple-lobbying-scraper \ + --image=$IMAGE \ + --project=digital-testimony-dev \ + --region=us-central1 \ + --service-account=SERVICE_ACCOUNT_NAME@digital-testimony-dev.iam.gserviceaccount.com + +# Schedule weekly via Cloud Scheduler +gcloud scheduler jobs create http maple-lobbying-weekly \ + --schedule="0 6 * * 1" \ + --uri="https://us-central1-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/digital-testimony-dev/jobs/maple-lobbying-scraper:run" \ + --http-method=POST \ + --oauth-service-account-email=SERVICE_ACCOUNT_NAME@digital-testimony-dev.iam.gserviceaccount.com \ + --location=us-central1 +``` -**File:** `scripts/firebase-admin/backfillLobbying.ts` +## Historical Backfill (Admin Script) -Ingests all historical filings from 2005 to the present. This is the primary -path for all data before the current and prior year. Accepts `--year` and -`--limit` CLI args for targeted re-runs or testing. Calls the same parsing -logic exported from `functions/src/lobbying/scrapeLobbying.ts` and writes -directly to Firestore via the firebase-admin SDK. +Ingests all historical filings from 2005 to the present. Delegates to +`scrape.py --mode backfill` via subprocess. Resumable — the subcollection +cursor at `/scrapers/lobbyingBackfill/processedUrls` tracks what has been +processed. Run directly on the machine (requires `lobbying-scraper/` deps +installed or the `maple-2025` conda environment). 
```bash GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ yarn firebase-admin run-script backfillLobbying --env dev + +# Or call scrape.py directly for more control: +cd lobbying-scraper +python3 scrape.py --mode backfill --year 2024 --limit 3 --dry-run +python3 scrape.py --mode backfill --year 2024 ``` --- @@ -348,17 +400,22 @@ export { scrapeLobbying } from "./lobbying" ## Implementation Status -| File | Status | -| -------------------------------------------- | ------- | -| `functions/src/lobbying/types.ts` | ✅ Done | -| `functions/src/lobbying/normalize.ts` | ✅ Done | -| `functions/src/lobbying/portal.ts` | ✅ Done | -| `functions/src/lobbying/scrapeLobbying.ts` | ✅ Done | -| `functions/src/lobbying/index.ts` | ✅ Done | -| `scripts/firebase-admin/backfillLobbying.ts` | ✅ Done | -| `functions/src/index.ts` (export) | ✅ Done | -| `firestore.rules` | ✅ Done | -| `firestore.indexes.json` | ✅ Done | +| File | Status | Notes | +| -------------------------------------------- | ------- | ---------------------------------------------------------- | +| `functions/src/lobbying/types.ts` | ✅ Done | TypeScript type definitions; source of truth for schema | +| `functions/src/lobbying/normalize.ts` | ✅ Done | Normalization pipeline (also ported to `normalize.py`) | +| `functions/src/lobbying/portal.ts` | ✅ Done | Kept for reference; HTTP layer not used (see architecture) | +| `functions/src/lobbying/scrapeLobbying.ts` | ✅ Done | Not deployed; superseded by Cloud Run container | +| `functions/src/lobbying/index.ts` | ✅ Done | | +| `functions/src/index.ts` (export) | ✅ Done | | +| `firestore.rules` | ✅ Done | | +| `firestore.indexes.json` | ✅ Done | | +| `lobbying-scraper/normalize.py` | ✅ Done | Port of normalize.ts | +| `lobbying-scraper/portal.py` | ✅ Done | HTTP + HTML parsing | +| `lobbying-scraper/writer.py` | ✅ Done | Firestore document construction | +| `lobbying-scraper/scrape.py` | ✅ Done | Entry point; `--mode weekly` and `--mode 
backfill` | +| `lobbying-scraper/Dockerfile` | ✅ Done | Python 3.12 slim | +| `scripts/firebase-admin/backfillLobbying.ts` | ✅ Done | Calls `scrape.py --mode backfill` as subprocess | ### Document ID scheme diff --git a/functions/src/lobbying/http/.gitignore b/functions/src/lobbying/http/.gitignore new file mode 100644 index 000000000..d0ee3b17c --- /dev/null +++ b/functions/src/lobbying/http/.gitignore @@ -0,0 +1,3 @@ +venv/ +__pycache__/ +*.pyc diff --git a/functions/src/lobbying/http/fetch.py b/functions/src/lobbying/http/fetch.py new file mode 100644 index 000000000..4e6c2c4ec --- /dev/null +++ b/functions/src/lobbying/http/fetch.py @@ -0,0 +1,81 @@ +"""Minimal HTTP fetch helper for the lobbying portal. + +Handles the portal's session cookie requirements that standard Node.js HTTP +clients cannot satisfy due to TLS-layer constraints. + +Usage: + python3 fetch.py --url URL [--method GET|POST] [--jar PATH] + +POST body is read from stdin as application/x-www-form-urlencoded. +Cookies are persisted to/from the JSON file at --jar so the session survives +across multiple subprocess invocations. +HTML response is written to stdout. Errors go to stderr with exit code 1. 
+""" + +import argparse +import json +import sys +from pathlib import Path + +import requests + +_UA = ( + "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) " + "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148" +) + + +def main() -> None: + p = argparse.ArgumentParser() + p.add_argument("--url", required=True) + p.add_argument("--method", default="GET", choices=["GET", "POST"]) + p.add_argument("--jar", default=None, help="Path to JSON cookie-jar file") + args = p.parse_args() + + session = requests.Session() + session.headers.update( + { + "User-Agent": _UA, + "Accept": "*/*", + "Accept-Encoding": "gzip, deflate, br", + "Connection": "keep-alive", + } + ) + + if args.jar: + jar = Path(args.jar) + if jar.exists(): + try: + session.cookies.update(json.loads(jar.read_text())) + except Exception as e: + print(f"warning: could not read cookie jar: {e}", file=sys.stderr) + + try: + if args.method == "POST": + body = sys.stdin.buffer.read() + resp = session.post( + args.url, + data=body, + headers={"Content-Type": "application/x-www-form-urlencoded"}, + timeout=180, + ) + else: + resp = session.get(args.url, timeout=60) + + resp.raise_for_status() + + if args.jar: + Path(args.jar).write_text(json.dumps(dict(session.cookies))) + + sys.stdout.buffer.write(resp.content) + + except requests.exceptions.HTTPError as e: + print(f"HTTP error {e.response.status_code}: {args.url}", file=sys.stderr) + sys.exit(1) + except requests.exceptions.RequestException as e: + print(f"request failed: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/functions/src/lobbying/http/requirements.txt b/functions/src/lobbying/http/requirements.txt new file mode 100644 index 000000000..b18d51347 --- /dev/null +++ b/functions/src/lobbying/http/requirements.txt @@ -0,0 +1 @@ +requests>=2.28 diff --git a/functions/src/lobbying/normalize.ts b/functions/src/lobbying/normalize.ts index 8d3d0a0ba..a7beb338f 100644 --- a/functions/src/lobbying/normalize.ts +++ 
b/functions/src/lobbying/normalize.ts @@ -5,8 +5,7 @@ * registrant may appear as "Acme Corp.", "ACME CORPORATION", "Acme, Inc. d/b/a * Acme Consulting", etc. across filings and years. * - * This pipeline is a direct port of the reference implementation used in the - * companion data analysis project. The steps must be applied in the exact order + * The steps must be applied in the exact order * listed here; changing the order produces different (incorrect) output. */ diff --git a/functions/src/lobbying/portal.ts b/functions/src/lobbying/portal.ts index e441522b8..64d65831b 100644 --- a/functions/src/lobbying/portal.ts +++ b/functions/src/lobbying/portal.ts @@ -19,6 +19,7 @@ import axios, { AxiosInstance } from "axios" import { JSDOM } from "jsdom" import { sha256 } from "js-sha256" +import { CookieJar } from "tough-cookie" import { CHAMBER_PREFIXES, LEGACY_CHAMBER_MAP, @@ -72,19 +73,68 @@ export interface DisclosureDetail { // ─── HTTP helpers ──────────────────────────────────────────────────────────── -export function makePortalClient(): AxiosInstance { - return axios.create({ - headers: { "User-Agent": IPAD_UA }, - timeout: 60_000 +/** + * Create an axios instance pre-configured for the MA SoS portal. + * + * Includes a cookie jar via interceptors so ASP.NET session state (ViewState, + * anti-forgery tokens) is preserved across the GET → POST page flow without + * requiring the axios-cookiejar-support package. + */ +export interface PortalClient { + jar: CookieJar + client: AxiosInstance +} + +/** + * Create a portal client pre-configured for the MA SoS portal. + * + * Uses maxRedirects: 0 so our manual redirect loop (inside getHtml / postHtml) + * can extract Set-Cookie headers at each hop before following. This is necessary + * because the portal is protected by Incapsula, which issues a 302 challenge on + * first contact and requires the session cookies to be sent on the retried request. 
+ * Axios's built-in redirect following happens before response interceptors fire, + * so the cookies from the challenge response are never captured automatically. + */ +export function makePortalClient(): PortalClient { + const jar = new CookieJar() + const client = axios.create({ + headers: { + "User-Agent": IPAD_UA, + Accept: "*/*", + "Accept-Encoding": "gzip, deflate, br", + Connection: "keep-alive" + }, + timeout: 60_000, + maxRedirects: 10, // let axios handle ordinary redirects; only Incapsula challenges need manual handling + validateStatus: s => s < 500 // surface 4xx so we can log them }) + return { jar, client } } function sleep(ms: number): Promise { return new Promise(resolve => setTimeout(resolve, ms)) } +function cookieHeader(jar: CookieJar, url: string): string { + return jar + .getCookiesSync(url) + .map(c => c.cookieString()) + .join("; ") +} + +function saveCookies( + jar: CookieJar, + url: string, + headers: Record +): void { + const raw = headers["set-cookie"] + if (!raw) return + const list = Array.isArray(raw) ? raw : [raw] + for (const c of list) jar.setCookieSync(c, url) +} + async function getHtml( - client: AxiosInstance, + pc: PortalClient, url: string, retries = MAX_RETRIES ): Promise { @@ -93,10 +143,16 @@ async function getHtml( attempt === 0 ? REQUEST_DELAY_MS : REQUEST_DELAY_MS * 2 ** attempt ) try { - const res = await client.get(url, { + const res = await pc.client.get(url, { responseType: "text", - headers: { Accept: "text/html" } + headers: { Cookie: cookieHeader(pc.jar, url) } }) + saveCookies( + pc.jar, + url, + res.headers as Record + ) + if (res.status >= 400) throw new Error(`HTTP ${res.status} for ${url}`) return new JSDOM(res.data).window.document } catch (e) { if (attempt === retries - 1) throw e @@ -108,7 +164,7 @@ async function getHtml( } async function postHtml( - client: AxiosInstance, + pc: PortalClient, url: string, data: Record, retries = MAX_RETRIES @@ -119,14 +175,20 @@ async function postHtml( attempt === 0 ? 
REQUEST_DELAY_MS : REQUEST_DELAY_MS * 2 ** attempt ) try { - const res = await client.post(url, body, { + const res = await pc.client.post(url, body, { responseType: "text", headers: { "Content-Type": "application/x-www-form-urlencoded", - Accept: "text/html" + Cookie: cookieHeader(pc.jar, url) }, timeout: 180_000 }) + saveCookies( + pc.jar, + url, + res.headers as Record + ) + if (res.status >= 400) throw new Error(`HTTP ${res.status} for ${url}`) return new JSDOM(res.data).window.document } catch (e) { if (attempt === retries - 1) throw e @@ -237,10 +299,10 @@ function extractViewState(doc: Document): Record { * Sends a single search POST with page size 20000 to get all registrants at once. */ export async function fetchSummaryLinks( - client: AxiosInstance, + pc: PortalClient, year: number ): Promise { - const searchPage = await getHtml(client, SEARCH_URL) + const searchPage = await getHtml(pc, SEARCH_URL) const vs = extractViewState(searchPage) const postData: Record = { @@ -257,7 +319,7 @@ export async function fetchSummaryLinks( ctl00$ContentPlaceHolder1$btnSearch: "Search" } - const resultsPage = await postHtml(client, SEARCH_URL, postData) + const resultsPage = await postHtml(pc, SEARCH_URL, postData) const table = resultsPage.querySelector( '[id*="grdvSearchResultByTypeAndCategory"]' @@ -280,10 +342,10 @@ export async function fetchSummaryLinks( * Fetch a Summary.aspx page and return the registrant metadata + disclosure URLs. */ export async function fetchDisclosureMeta( - client: AxiosInstance, + pc: PortalClient, summaryUrl: string ): Promise { - const doc = await getHtml(client, summaryUrl) + const doc = await getHtml(pc, summaryUrl) const text = (id: string) => { const el = doc.getElementById(id) @@ -322,11 +384,11 @@ export async function fetchDisclosureMeta( * Handles both modern (≥~2013) and legacy (<~2013) HTML layouts. 
*/ export async function fetchDisclosureDetail( - client: AxiosInstance, + pc: PortalClient, discUrl: string, year: number ): Promise { - const doc = await getHtml(client, discUrl) + const doc = await getHtml(pc, discUrl) const compensation: RawCompensation[] = [] const bills: RawBillActivity[] = [] diff --git a/lobbying-scraper/.dockerignore b/lobbying-scraper/.dockerignore new file mode 100644 index 000000000..9460c99c4 --- /dev/null +++ b/lobbying-scraper/.dockerignore @@ -0,0 +1,4 @@ +__pycache__/ +*.pyc +*.pyo +.env diff --git a/lobbying-scraper/Dockerfile b/lobbying-scraper/Dockerfile new file mode 100644 index 000000000..738293459 --- /dev/null +++ b/lobbying-scraper/Dockerfile @@ -0,0 +1,14 @@ +FROM python:3.12-slim + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY normalize.py portal.py writer.py scrape.py ./ + +# Cloud Run sets PORT; we don't use it (this is a job, not a server). +# Cloud Scheduler invokes the container via HTTP POST to /; handle it minimally. 
+ENV PYTHONUNBUFFERED=1 + +CMD ["python3", "scrape.py", "--mode", "weekly"] diff --git a/lobbying-scraper/__pycache__/normalize.cpython-37.pyc b/lobbying-scraper/__pycache__/normalize.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47c3ba707ebdca4ef16776e15507d9f385437cab GIT binary patch literal 1412 zcmZux-EP}96c#1_Dt4N-N!xWp!EHbnrK|k{!&XIB5^bt&SWbfq83b8mVimHaQqr0j z3ebx^!Y;O97tZA#W3Qph4j8ak@YN2LIL*aK@bKd~JRc7ae|Fn#S%C87pU>a?UK51B z`CwJr0A2_iLh=&;BG>{!L=4bOv_-_R6iBv2rA38QsPv|KDcZGiRJ}xYy+k!YjS|%X zHA#asiA>f=i?qo)xkEO{U2>1yC!42jTP9oN0eSd`NFI?-ey-SSM6v|sarPH1u7|0g zp5xG+Q5;5W<_7-UP5mguu^-dG4{1l1dp@IHisOi-I6A>6S?KX8NjG6$H;5*ab3Z)o zuH0!SO*+_0X`J9$mZW$>u^Y#MPYL!zIi30mre`h?MvPE~KfM11ds8<&1rnT2>9sm$ z5y?E7VB(*gP)5TPZ~LJaF&>M8qb!ZHv?Kq^bE=?X%?-mSEkjTG^DmB*XT9TOr~7QT zlJc97?HQU5-F#i`_poYN1I++ZYu@nF=~ zUt@KQfn(lUDPHi=$~^vP2I8SQ8u#^)HNQVR+%*ji4+kJ$=DsUxjHVe2J;ZYl zr2?%AP2*R30U#HW20<3|3*l0@Kq*>?1m);bya2h7h)5)k&mNIVPxx`;Lgd=~TKl$h zB%~Gakghz^>PK99S4r)LsZJVKOlhrT0v}+KuK3@c0aJbDy=d@Qa)?W5_6uQ&o-Ywr z>Pz%>fjUd{RVG2`AKw3Z-T$^k+e@S@(NkN3qq2FnjH{Dme5TSYs#zRUR@5E1RL)3; zBxRGL&S)IC9xa*`4X;_*wJQ%5#ZWT4>$l~fMWY>?C3gc-k(3vfI zQ@a*VnVZl*1@;)U%kR6>XhypsO}f3v%VzK*lJ0?NIlDh-&K?bd=v^}P4UQ+%sH`ID65V zS&3ZFx@qLL2~rh`YXm`~6lBqd)Q?4g0xi&oqJKe)2F0L2ffNm(zO+D#JQ#iG?>n=* zyogMiq}j!sIp;fd5KEWOjc56O0?6xbR}bE+I@P=9<)h#*-FmL zReH@{z>{X5)5lVKBs0&_<_?xI`&o~$GAlv&)G4~0`zRQx6ye6^z?7*iIJMdU}ESviU0}w+% z4hqO&b_9?k0&<8Q#sAndg&k)vf2x@;u@m^8M7h9Dp=^lqX?EtB%FePOKn??P6wfdq z=h%5bj)*oV*l7Vj!7e;YvRBwe)Es56ve%x;>?`bbl#Zcvg8de|gr0A(%YYmgkT(Il z@=RlIv9|$x8KpPb2xf4VjiP)4ZH}@rHjZBJuxo&v6uk^K0m!@TtAG?R?t5$!eXg@2 z%BR>3HpSjYd790jd|LE4%8syEeCH3qer#FAWbrdh}?> zcS=KkU6~@$!nOVd5 z$?tt;_>D@%=F5iXRu_FEu;)AnEia&DTrp((^(E1gJ7w1oxQ!K5?TRzJjBgAF%XP;% zZH(3`b+r$dA-IQUNuf$dGnJC=gz-UxMY+$dpIhPKV`dYwl-c3 zFc7{$6(jMuuHzS-M<3IwTGq$aR*VaO2m?+!WxKR&oV?r_PxPCyJs^$-G)@LJP|B!P zz2(9SqZ%h|%$}Cu7lty=Ni!7WsK>5W(TUdT7Yj<1W_DngJlpppecp9EhBm1ib*i^L 
z)aR}`b|Y}-8{SNJLb&W4KEM`XtL|ICgx=izdCfzIO$H=>ZWmH#yEe$x&Rt3f-UV50d5zJkvfQc%b9!64TfJk_v+C3q4TdAC*LFdpNZREb4rV~ z0w{pxI>YB*9Y25Z^>VBW|MRWqI&$tmCMD_lUsCQLkTZKNfji3=*t8;fB9dn!WnwH+ zX2uFyGc!?Z_>OTE(`@$4I8|_bT*$^1W3|$v6Et(}s&=uLxuxJkaO)WZ$=xVFIx%wn z>iD!ZH9h{$ySK+@wjCF!Rwk~R%FNZMe{%fV$mkub)A#zwR4HCdoT_*U-F^reM>1t- zcLh3Lc2uT7qpEmocduq(AjH7FLe zC}oK@mK9|zt5RbP&p|$GSq~bvN4-(%v#fP2Wm$hN@x7RrNWQ!_yikM4RvmvB7;03Y zhWz1)$rrP+`+(bQ~FjQR3T5Fhat6(gRbX=5zKDS7HL(%lHSSwibz)g)WH`5+eJ;~2q z)O9tj)!-hKRdP#w5JN;7_2VxiyCr@Oqhbn=Pi9dEKb)XjW99;C)-dC!4h$qo=Zui}l=gH`0lBu=3RA zmD8g62}%l-2*x-?xvjY4Br0y>Y0FN(%n1^yAWA{uM=77+PSD`h5{yOhqqmA@luXDx zGJzCIfjkK5*(dFhuvVq`9OebyQREK`BR&H%pg2q<-oYE2Ap+^@M+!}VdB z`aXS>tRusiIDg@li=y*PoI?+2jliv9u0nJS=b=-@v}=v>0?mHe@E2^fBD2@W{?(bH z@sqEAE{3FvI(O#XCHfq99yDC;Fe4C#9~8b@ZjkLO+7=5sCxsiSWhdaLDVO=c_8QJO z=QSRsXQAXoc0I|WL_GWp{7~v(Io?cMR>6Qmp-ah(tl_W7c^Sg{2H+U+2wt&hy}$&d z9Rw4QoQEJZ0~>(cYD*C;p{yvYV2c%X6^A$kor@Ac z*o9YLixN(?1p78hFE~rgEjxa&eh!hk=qztuQf|$1w_Mi3z(NB_G8lW5K9h_WLHlXU z(`3*@xeY3T8&oTnm_UwxFyEvU9y&|sZ=lu}i%I1P7i6zA^Tcbwsy{U2fX8c9C{D3zREi&|WLmjcH4&dk z35|!9-^U6A^5KQ1d#0_*GN2%-UTmtThEFwjl$&n-jDg+O1NWXW=6Ex+=s5MWP(TkI zvUYF6Av`+*%m-#-q*QY1L9;J@pB*Q{W9DWCilFMiYr*7n+Q;Jwo5}W!!8DEy7s!ti zjXK;4CsM7OBiQC;e={<&6?@UK{CK6GVKFhIL-8fR=Jd}fd=hX@cq*t-vfVaPVGC?j zd$bMo;1P)VAu>yU8ep$_ERRWdzYC5h@+i3eNb9d_``2VL5?Lth3R`B zX)GCPkT)=)EZrg*Lyu3i+tMBBLFP6df`L4#B0r1yJ|`Xonv#KAacYgA*|QK-yrI}j ze@=97W)0)o_$>5M0GGL`Trvv1kq(!*?7$koK62Zd9-p0lcYG$&Xbfyb9;h4j0{s5X zacs_GIR}8i-`_WL0X=uEiKf# zWY>NU*;hbzU65Vvl6|R7b{NWSvg@13u79F$OZFdO+AzT8@ZwWQHqf2FPmr7uhfLNJ zo=;PU8A@g;xk(Sv5{r^e z`b)QHbVBt8`sLTtkD(TJkme&)5Yvmemwxo_+G}L4lD7JkbQkvd&i5!Mng-OGtz0w~k4>a1#B8}YBNPkHCaRliS z%#XP#L+4CbzV%JZ`!aKx*pAGaD}RkIiJdB;D01_V_0V-5#UZcDUae$%{*|G%#*vP} z8qk^xE@KX~HY6!_Zc_7Zm05_v;NxW5ZTccy>h8dQ)%l_%Clp>qX`Ls8eO8FVQ5s{z zUcW&IJXFV4`eTK(UWm{skPO@eDw%u82}CpaanI7x<6Y6>RFcMKq&owg7vaKmhTd3NAp^2(is$I+7Pz8IX&qM6VEov*kGy#S^LIA z*EVd!$5sNX(FdfoAh>&oFJSIwA7^uMnDFW{nNE@XN+eI3`h|1&nTfcnkT!FcHU7bP 
zaTYEHbfSs)gD9eO^E$vyqZD}O&K-lk3b36`d3MQ}sCn3z&L4}%XuiIEi$*(yIm+F@ z;jSG|^)WaSzsl@+orY*0-+l^= z*C}K>1AnA>bo*v=K{cLIq>GSkS5!&9NnC}fW|0Y{T87NdMe?$gqIsA?o^ju4$44YE z#_ULyjlxnW!}%BTLDs_3W=Tn1_F#H5`^w*A;QzuSHIL6bL21EiN0BWNMIKrc&vtu^ zrnsy-Ntz%jAu-8I$dpN+TxdaaKzqPlglh&TTQ zGhZd`|2ZoZ_(xfz4lD`eQ923wNJC;;xhfQ3@c7y3A{DDlX&aP+1MRuC2mR8 zLAbl*>Y8m=S865hz#fFVn+kPMe}Sc|3d^*TtcPVkmA)z6S6PnqUaa+6Dd%bFLB$4I z5;ZU4EtDfzhy14~1ry35OAUap17Ir7#_eSv^`L$KU+}cut*qQU8dGdtNUehK)SAH1 zkmm6%RHo(4dDGnkm?9LJxqq8xq#(^h+f-r>)Hn`qYRib9raC7`zZo$# z`$w^-Z#*LsQo=08|26@Lu%f@2j^k%UwrnS$YEhI-$T1CUrit1jbShTU zu|kMLqFr0g{-045%P{11Kodb^m_S(==@})jCggrCcFtg`KStXZ0@pyNn4+*!I;J1V z0LiiN&q(W1sE*`!Rlyra7I>2*D=(+wz*HQRf|OqDNT~=&L2VZ_#3+sliTF}JLtzT; z)M8VKY|q3@mORIRXg-H2(_Ccul$>UadNoZ zrWg*s?qXD-4>zoj)~VqU>+45G7_~yGVM}RG;6*~JBPHNn4&Sg_4u~!_#OG+Y#9P5T ziB4+K)UI)tnTZMESGUO+$)!!J;Xa0dZzAykndSkjj9wPU*~*%Pcec<10{P;&&jLe; z4M5&f$7j|CfwS9<%6=OkX!~YSs>5LG!ce5_=yw|od0)I5}+BDhuiT5+Y#niUKk8tgt_}Ybguy_Jv${m_~Mqd z^dro@N6h_)nESxCbKkXyuk%9>eimfoxno_u@$6fDK`!hK`(VuHLmjK@4YO1Z^WAl8 z=z9;XQ+u(6v0NLzdy(KvfbZS_-zDIm3+(J7?4S>BUpxUmfuWz>k{vH3r$gO2^@Kf7HGV4W ziD%OqU@wW8450sK;Q;XlY7CYvgDa{^SrUA-E8KE^4uupb~JoE9Nbfe$0-b z4jd)we$0-3q_%dq_Ou488Fp-4{&*+PUdFABlXTDInbz8i+aqUi!z9Osa00OpTz3d_ z$uNmxkb?WNz}fJ2I16@!yTif9%A)*KA0AMOWaFx(&R!@N{JFY5NM z^n^$1`3jV|GnhyWP{1py@k5v9on8jesU*JkZaODADEF)@vhHnF3#FgP5CM~8xm%(=b zJ!pXv`tONbtZ^aD>z1^h?+O=+LOX4(Iq9GUenu~8&U%*NWGB{|ySKh~A@+^VUdYjR zD>b^Kt43ek;h8wn+e)Vji>*6J+VQ%6#vls9&&ZR-hj53m{@PxI(y{WOjDH6WvB3xz zK?yeRSi%Gu=xE}E)I9yd7XIS&y6(XfTJ2d%FoGmvOZe1I%G37KojKLgUM4)G&c;N6 zW`Saa)>mmaQ;W@mFWg```4Rj0Vyp0$^$|PWwpk;EJ2*fPo<0`19m)^lt77L8-4H#h z+f`gma~}$Gs4qim+S7eT&(B3xKnWh__USqFHZpr6G6$`?i9S378iv6Z3W zDA>&@u)=dlng=^GF>sJdmj-aGDo#RY>266=Ee?-F8lAK@vw|*0Z0P>~;cQ!}xz|M+ z{DZHZ{BY!~dG~Yy+TnEJ7=HzGFYM;`XdUM$A+MH`Q7LTENcEg5E+Ex$VTp+wKEez) zb0Ur=BgW=AHi(aCoa6pYl0y@X?ty5|P` z5AnH3$F056LZp{DV%3Giv6J-&G?Al}h)76RhE1hhGBXsPu|(L*RAF43dI=Ypd^3Z) zD>2wN)7`w#5~fzAJAtGt5K+NBAzTsSq%X{D``%eRoG$L7nW|k0%oL)j!gl8q)G$To 
z*ti|xnMuUoMJCg}zD2;E_KCZN8&uGoi*DUa$2~B&NQaXX_-1?ssalKV3MU85OvG1} zHt~G4XPYer7l9@JN9ZW2@kw+_rC3g{JOJM?f&XrmPN%a%P2@xnAayyu(szY4ia(7c0W8N?$MV29@T&pKss`$chthTgAg|*#LZUy>-_xJ! cPv^CKsz2Ya@5}f0^ydchDDCSv@~OP`zs>ea9{>OV literal 0 HcmV?d00001 diff --git a/lobbying-scraper/normalize.py b/lobbying-scraper/normalize.py new file mode 100644 index 000000000..6e6f7418e --- /dev/null +++ b/lobbying-scraper/normalize.py @@ -0,0 +1,50 @@ +"""Entity name normalization pipeline. + +Direct port of functions/src/lobbying/normalize.ts. Steps must be applied in +this exact order — changing the order produces different (incorrect) output. +""" + +from __future__ import annotations + +import re + +_DBA_RE = re.compile(r"\s+D\s*/+B\s*/+A?\s+.*|\s+DBA\s+.*", re.IGNORECASE) +_LEGAL_RE = re.compile( + r"\b(LLC|LLP|INC|INCORPORATED|CORPORATION|CORP|LTD|LIMITED|PC|PLLC)\b" +) +_THE_RE = re.compile(r"\bTHE\b") +_WS_RE = re.compile(r"\s+") + +_MISC_PHRASES = [ + "LAW OFFICE OF", + "AND ASSOCIATES", + "& ASSOCIATES", + "AND ASSOC", + "ATTORNEY AT LAW", + "ATTORNEY@LAW", + "ATTORNET AT LAW", # known portal typo + "AND PARTNERS", + "PUBLIC POLICY GROUP", + "LEGISLATIVE SERVICES", + "POLICY GROUP", + "ASSOCIATES", + "COUNSELLORS AT LAW", +] + + +def normalize_entity_name(raw: str | None) -> str: + if not raw: + return "" + x = raw.upper() # 1. uppercase + x = _DBA_RE.sub("", x) # 2. strip d/b/a suffix + x = x.replace("-", " ") # 3. hyphen → space + for ch in (",", ".", "'", "‘", "’", "(", ")"): + x = x.replace(ch, " ") # 4. punctuation → space + x = _LEGAL_RE.sub(" ", x) # 5. remove legal entity words + x = _THE_RE.sub(" ", x) # 6. remove THE anywhere + x = x.replace("&", "AND") # 7. ampersand → AND + x = x.replace("ASSICIATES", "ASSOCIATES") # 8. fix known typo + for phrase in _MISC_PHRASES: # 9. remove professional suffix phrases + x = x.replace(phrase, " ") + x = _WS_RE.sub(" ", x).strip() # 10. 
collapse whitespace + return x diff --git a/lobbying-scraper/portal.py b/lobbying-scraper/portal.py new file mode 100644 index 000000000..257721991 --- /dev/null +++ b/lobbying-scraper/portal.py @@ -0,0 +1,376 @@ +"""HTTP client and HTML parser for the MA SoS lobbying portal. + +Portal: https://www.sec.state.ma.us/LobbyistPublicSearch/ + +Page flow: + 1. Search POST → summary links table + 2. Summary.aspx → registrant name/year/type + CompleteDisclosure links + 3. CompleteDisclosure.aspx → per-client compensation + per-client bill activity + +Two disclosure HTML formats: + Modern (>=~2013): grdvClientPaidToEntity + grdvActivitiesNew{year}_{n} tables. + Legacy (<~2013): grdvSalaryPaid (total only) + grdvActivities (all bills). +""" + +from __future__ import annotations + +import hashlib +import re +import time +from dataclasses import dataclass, field +from typing import Optional + +import requests +from bs4 import BeautifulSoup, Tag + +# ── Constants ───────────────────────────────────────────────────────────────── + +BASE_URL = "https://www.sec.state.ma.us/LobbyistPublicSearch/" +SEARCH_URL = BASE_URL + "Default.aspx" + +_UA = ( + "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) " + "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148" +) +_REQUEST_DELAY = 1.0 +_MAX_RETRIES = 5 + +# Lobby disclosure data begins in 2005; GC 183 started Jan 2003. 
+FIRST_YEAR = 2005 +FIRST_GC = 183 +FIRST_GC_START_YEAR = 2003 + +# clientName sentinel for pre-2013 filings where compensation is a single total +LEGACY_TOTAL_CLIENT = "_total_salary_" + +# Maps canonical chamber names to the bill-ID prefix used in MAPLE's Bill.id +CHAMBER_PREFIXES: dict[str, str] = { + "House Bill": "H", + "Senate Bill": "S", + "House Docket": "HD", + "Senate Docket": "SD", +} + +# Legacy short-form chamber codes found in older filings +LEGACY_CHAMBER_MAP: dict[str, str] = { + "HB": "House Bill", + "SB": "Senate Bill", +} + +# ── Data types ──────────────────────────────────────────────────────────────── + + +@dataclass +class Compensation: + client_name: str + amount: Optional[float] + + +@dataclass +class BillActivity: + client_name: str + chamber: str # canonical LobbyingChamber value + raw_bill_number: str + bill_id: Optional[str] # e.g. "H1234"; null for Executive/Other + activity_title: str + position: str + amount: Optional[float] + + +@dataclass +class DisclosureMeta: + entity_name: str + year: Optional[int] + reg_type: str # "Lobbyist" | "Employer" + disclosure_urls: list[str] = field(default_factory=list) + + +@dataclass +class DisclosureDetail: + compensation: list[Compensation] = field(default_factory=list) + bills: list[BillActivity] = field(default_factory=list) + + +# ── Derived-value helpers ───────────────────────────────────────────────────── + + +def year_to_general_court(year: int) -> int: + return FIRST_GC + (year - FIRST_GC_START_YEAR) // 2 + + +def normalize_chamber(raw: str) -> str: + t = raw.strip() + if t in LEGACY_CHAMBER_MAP: + return LEGACY_CHAMBER_MAP[t] + known = {"House Bill", "Senate Bill", "House Docket", "Senate Docket", "Executive"} + return t if t in known else "Other" + + +def construct_bill_id(chamber: str, raw_bill_number: str) -> Optional[str]: + """Construct the MAPLE-compatible billId from chamber + raw integer. + + Returns None for Executive and Other chambers where no bill join is possible. 
+ H1234 and S1234 are distinct bills even though they share the same integer — + the prefix is required to disambiguate. + """ + prefix = CHAMBER_PREFIXES.get(chamber) + if not prefix: + return None + try: + return f"{prefix}{int(raw_bill_number)}" + except (ValueError, TypeError): + return None + + +def registrant_id(entity_name: str, year: int) -> str: + key = f"{year}|{entity_name}" + return hashlib.sha256(key.encode()).hexdigest()[:40] + + +def filing_id( + entity_name: str, + client_name: str, + chamber: str, + bill_id: Optional[str], + general_court: int, + position: str, +) -> str: + key = "|".join([entity_name, client_name, chamber, bill_id or "__null__", + str(general_court), position]) + return hashlib.sha256(key.encode()).hexdigest()[:40] + + +# ── HTTP session ────────────────────────────────────────────────────────────── + + +def make_session() -> requests.Session: + s = requests.Session() + s.headers.update({ + "User-Agent": _UA, + "Accept": "*/*", + "Accept-Encoding": "gzip, deflate, br", + "Connection": "keep-alive", + }) + return s + + +def _get(session: requests.Session, url: str) -> BeautifulSoup: + for attempt in range(_MAX_RETRIES): + time.sleep(_REQUEST_DELAY * (2 ** attempt) if attempt else _REQUEST_DELAY) + try: + r = session.get(url, timeout=60) + r.raise_for_status() + return BeautifulSoup(r.text, "html.parser") + except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e: + if attempt == _MAX_RETRIES - 1: + raise + print(f" GET retry {attempt + 1}: {e}") + + +def _post(session: requests.Session, url: str, data: dict) -> BeautifulSoup: + for attempt in range(_MAX_RETRIES): + time.sleep(_REQUEST_DELAY * (2 ** attempt) if attempt else _REQUEST_DELAY) + try: + r = session.post(url, data=data, timeout=180) + r.raise_for_status() + return BeautifulSoup(r.text, "html.parser") + except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e: + if attempt == _MAX_RETRIES - 1: + raise + print(f" POST retry 
{attempt + 1}: {e}") + + +# ── Portal scraping ─────────────────────────────────────────────────────────── + + +def _viewstate(soup: BeautifulSoup) -> dict: + return { + inp["name"]: inp.get("value", "") + for inp in soup.find_all("input", type="hidden") + if inp.get("name") + } + + +def fetch_summary_links(session: requests.Session, year: int) -> list[str]: + """Return all Summary.aspx URLs for a given year via a single search POST.""" + soup = _get(session, SEARCH_URL) + data = { + **_viewstate(soup), + "__EVENTTARGET": "", + "__EVENTARGUMENT": "", + "ctl00$ContentPlaceHolder1$Search": "rdbSearchByType", + "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$ddlYear": str(year), + "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$txtN_ame": "", + "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$lddSearchType$DropDown": "3", + "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$drpType": "L", + "ctl00$ContentPlaceHolder1$drpPageSize": "20000", + "ctl00$ContentPlaceHolder1$btnSearch": "Search", + } + results = _post(session, SEARCH_URL, data) + table = results.find("table", id=lambda x: x and "grdvSearchResultByTypeAndCategory" in x) + if not table: + return [] + return [ + BASE_URL + a["href"] if not a["href"].startswith("http") else a["href"] + for a in table.find_all("a", href=True) + if "Summary.aspx" in a["href"] + ] + + +def fetch_disclosure_meta(session: requests.Session, summary_url: str) -> DisclosureMeta: + soup = _get(session, summary_url) + + def text(el_id: str) -> str: + el = soup.find(id=el_id) + return el.get_text(strip=True) if el else "" + + entity_name = text("ContentPlaceHolder1_lblRegistrantName") + year_text = text("ContentPlaceHolder1_lblYear") + reg_type_raw = text("ContentPlaceHolder1_lblRegType") + + try: + year = int(year_text) + except ValueError: + year = None + + reg_type = "Employer" if "Entity" in reg_type_raw else "Lobbyist" + + disc_urls = [ + BASE_URL + a["href"] if not a["href"].startswith("http") else a["href"] + for a in 
soup.find_all("a", href=True) + if "CompleteDisclosure" in a["href"] + ] + + return DisclosureMeta( + entity_name=entity_name, + year=year, + reg_type=reg_type, + disclosure_urls=disc_urls, + ) + + +def _parse_amount(text: str) -> Optional[float]: + cleaned = text.replace("$", "").replace(",", "").strip() + try: + return float(cleaned) + except ValueError: + return None + + +def _grid_rows(table: Tag) -> list[Tag]: + return table.find_all("tr", class_=lambda c: c and "Grid" in c and "Header" not in c) + + +def fetch_disclosure_detail( + session: requests.Session, disc_url: str, year: int +) -> DisclosureDetail: + soup = _get(session, disc_url) + compensation: list[Compensation] = [] + bills: list[BillActivity] = [] + gc = year_to_general_court(year) + + # ── Modern format (>=~2013) ─────────────────────────────────────────────── + comp_table = soup.find("table", id=lambda x: x and "grdvClientPaidToEntity" in (x or "")) + if comp_table: + for row in _grid_rows(comp_table): + cells = [td.get_text(strip=True) for td in row.find_all("td")] + if len(cells) >= 2: + compensation.append(Compensation( + client_name=cells[0], + amount=_parse_amount(cells[1]), + )) + + act_tables = soup.find_all( + "table", + id=lambda x: x and re.search(r"grdvActivitiesNew(\d{4})?_\d+", x or ""), + ) + for act_table in act_tables: + # Walk backwards to find the nearest lblClientName span + client_name = "" + node = act_table + while node: + node = node.find_previous(["span", "div", "td"]) + if not node: + break + if node.get("id") and "lblClientName" in node["id"]: + client_name = node.get_text(strip=True) + break + + for row in _grid_rows(act_table): + cells = [td.get_text(strip=True) for td in row.find_all("td")] + if len(cells) < 4: + continue + chamber = normalize_chamber(cells[0]) + raw_num = cells[1] + bill_id = construct_bill_id(chamber, raw_num) + bills.append(BillActivity( + client_name=client_name, + chamber=chamber, + raw_bill_number=raw_num, + bill_id=bill_id, + 
activity_title=cells[2] if len(cells) > 2 else "", + position=cells[3] if len(cells) > 3 else "", + amount=_parse_amount(cells[4]) if len(cells) > 4 else None, + )) + + if comp_table or bills: + return DisclosureDetail(compensation=compensation, bills=bills) + + # ── Legacy format (<~2013) ──────────────────────────────────────────────── + salary_table = soup.find("table", id=lambda x: x and "grdvSalaryPaid" in (x or "")) + if salary_table: + total = 0.0 + for row in salary_table.find_all("tr"): + cells = [td.get_text(strip=True) for td in row.find_all("td")] + if len(cells) >= 2 and "Total" not in cells[0]: + amt = _parse_amount(cells[1]) + if amt: + total += amt + if total: + compensation.append(Compensation(client_name=LEGACY_TOTAL_CLIENT, amount=total)) + + act_table = soup.find("table", id=lambda x: x and x.endswith("grdvActivities")) + if act_table: + all_rows = act_table.find_all("tr") + headers = [th.get_text(strip=True) + for th in (all_rows[0].find_all(["th", "td"]) if all_rows else [])] + + if headers and "Activity" in headers[0]: + # 6-col entity layout has Lobbyist as second header + if len(headers) >= 2 and "Lobbyist" in headers[1]: + bill_col, pos_col, client_col = 0, 2, 4 + else: + bill_col, pos_col, client_col = 0, 1, 3 + else: + bill_col, pos_col, client_col = 1, None, 3 + + chamber_map = {"H": "House Bill", "S": "Senate Bill", + "HD": "House Docket", "SD": "Senate Docket"} + skip = {"Activity or Bill No and Title", "N/A", "None", "", "Total amount"} + + for row in all_rows[1:]: + cells = [td.get_text(strip=True) for td in row.find_all("td")] + if len(cells) <= max(bill_col, client_col): + continue + bill_cell = cells[bill_col] + if not bill_cell or bill_cell in skip: + continue + parts = bill_cell.split(None, 1) + bill_no = parts[0] + m = re.match(r"^([A-Z]+)(\d+)$", bill_no) + if not m: + continue + prefix, number = m.group(1), m.group(2) + chamber = chamber_map.get(prefix, "Other") + bill_id = construct_bill_id(chamber, number) + 
bills.append(BillActivity( + client_name=cells[client_col] if len(cells) > client_col else "", + chamber=chamber, + raw_bill_number=number, + bill_id=bill_id, + activity_title=parts[1] if len(parts) > 1 else "", + position=cells[pos_col] if pos_col is not None and len(cells) > pos_col else "", + amount=None, + )) + + return DisclosureDetail(compensation=compensation, bills=bills) diff --git a/lobbying-scraper/requirements.txt b/lobbying-scraper/requirements.txt new file mode 100644 index 000000000..5e7b4bcc7 --- /dev/null +++ b/lobbying-scraper/requirements.txt @@ -0,0 +1,3 @@ +requests>=2.28 +beautifulsoup4>=4.12 +google-cloud-firestore>=2.14 diff --git a/lobbying-scraper/scrape.py b/lobbying-scraper/scrape.py new file mode 100644 index 000000000..fb985e05f --- /dev/null +++ b/lobbying-scraper/scrape.py @@ -0,0 +1,269 @@ +"""Lobbying disclosure scraper — Cloud Run entry point. + +Runs on a weekly Cloud Scheduler trigger. Checks for new or amended disclosures +and exits immediately if none are found (fast path). When new disclosures exist, +fetches and writes them to Firestore. + +Also serves as the library used by the TypeScript backfill admin script via +subprocess. + +Environment variables: + GOOGLE_CLOUD_PROJECT — GCP project ID (set automatically in Cloud Run) + FIRESTORE_EMULATOR_HOST — set to use the local emulator (e.g. 
localhost:8080) + +CLI flags (for local / backfill use): + --year YEAR Only process this year (default: current + prior) + --limit N Max registrants per year (for testing) + --dry-run Fetch and parse but do not write to Firestore +""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import sys +from datetime import datetime, timezone + +from google.cloud import firestore + +from portal import ( + FIRST_YEAR, + fetch_disclosure_detail, + fetch_disclosure_meta, + fetch_summary_links, + make_session, +) +from writer import ( + BACKFILL_DOC, + BACKFILL_URLS_COLLECTION, + SCRAPER_DOC, + write_filings, + write_registrant, +) + + +# ── Cursor helpers ──────────────────────────────────────────────────────────── + + +def _load_live_cursor(db: firestore.Client) -> tuple[set[str], dict[str, list[str]]]: + """Return (processedDiscUrls, summaryDiscCache) from the live scraper doc.""" + doc = db.document(SCRAPER_DOC).get() + data = doc.to_dict() or {} + return ( + set(data.get("processedDiscUrls", [])), + data.get("summaryDiscCache", {}), + ) + + +def _save_live_cursor( + db: firestore.Client, + processed: set[str], + cache: dict[str, list[str]], +) -> None: + db.document(SCRAPER_DOC).set( + {"processedDiscUrls": list(processed), "summaryDiscCache": cache}, + merge=True, + ) + + +def _is_backfill_processed(db: firestore.Client, disc_url: str) -> bool: + h = hashlib.sha256(disc_url.encode()).hexdigest()[:40] + return db.document(BACKFILL_DOC).collection(BACKFILL_URLS_COLLECTION).document(h).get().exists + + +def _mark_backfill_processed(db: firestore.Client, disc_url: str) -> None: + h = hashlib.sha256(disc_url.encode()).hexdigest()[:40] + db.document(BACKFILL_DOC).collection(BACKFILL_URLS_COLLECTION).document(h).set( + {"url": disc_url, "processedAt": datetime.now(tz=timezone.utc).isoformat()} + ) + + +# ── Core processing ─────────────────────────────────────────────────────────── + + +def process_disclosure( + db: firestore.Client | None, 
+ session, + summary_url: str, + disc_url: str, + year: int, + dry_run: bool = False, +) -> tuple[int, int]: + """Fetch one disclosure page and write registrant + filing documents. + + Returns (compensation_rows, filing_rows). + """ + meta = fetch_disclosure_meta(session, summary_url) + detail = fetch_disclosure_detail(session, disc_url, year) + + if dry_run or db is None: + return len(detail.compensation), len(detail.bills) + + write_registrant(db, meta, detail, disc_url) + n_filings = write_filings(db, meta, detail) + return len(detail.compensation), n_filings + + +# ── Weekly incremental run ──────────────────────────────────────────────────── + + +def run_weekly( + db: "firestore.Client | None", + years: list[int], + limit: int | None = None, + dry_run: bool = False, +) -> int: + """Incremental weekly check. Returns number of new disclosures processed.""" + current_year = datetime.now(tz=timezone.utc).year + processed, cache = _load_live_cursor(db) if db is not None else (set(), {}) + + session = make_session() + new_count = 0 + + for year in years: + print(f"\n── {year} ──") + try: + summary_urls = fetch_summary_links(session, year) + except Exception as e: + print(f" failed to fetch summary links: {e}", file=sys.stderr) + continue + + if limit: + summary_urls = summary_urls[:limit] + + print(f" {len(summary_urls)} registrants on portal") + + for summary_url in summary_urls: + # Use cached disc URLs for prior years; always re-check current year + disc_urls = cache.get(summary_url) + if disc_urls is None or year == current_year: + try: + meta = fetch_disclosure_meta(session, summary_url) + disc_urls = meta.disclosure_urls + cache[summary_url] = disc_urls + if not dry_run: + _save_live_cursor(db, processed, cache) + except Exception as e: + print(f" failed to fetch summary {summary_url}: {e}", file=sys.stderr) + continue + + new_disc_urls = [u for u in disc_urls if u not in processed] + if not new_disc_urls: + continue + + for disc_url in new_disc_urls: + try: + 
comp_n, filing_n = process_disclosure( + db, session, summary_url, disc_url, year, dry_run=dry_run + ) + processed.add(disc_url) + new_count += 1 + print(f" processed: {comp_n} clients, {filing_n} filings") + if not dry_run: + _save_live_cursor(db, processed, cache) + except Exception as e: + print(f" failed to process {disc_url}: {e}", file=sys.stderr) + + return new_count + + +# ── Historical backfill ─────────────────────────────────────────────────────── + + +def run_backfill( + db: "firestore.Client | None", + years: list[int], + limit: int | None = None, + dry_run: bool = False, +) -> int: + """Full historical backfill using the subcollection cursor. Resumable.""" + session = make_session() + total_new = 0 + + for year in years: + print(f"\n── {year} ──") + try: + summary_urls = fetch_summary_links(session, year) + except Exception as e: + print(f" failed to fetch summary links: {e}", file=sys.stderr) + continue + + if limit: + summary_urls = summary_urls[:limit] + + print(f" {len(summary_urls)} registrants on portal") + year_new = 0 + + for i, summary_url in enumerate(summary_urls): + try: + meta = fetch_disclosure_meta(session, summary_url) + except Exception as e: + print(f" [{i+1}/{len(summary_urls)}] failed to fetch summary: {e}", file=sys.stderr) + continue + + for disc_url in meta.disclosure_urls: + if db is not None and not dry_run and _is_backfill_processed(db, disc_url): + continue + try: + comp_n, filing_n = process_disclosure( + db, session, summary_url, disc_url, year, dry_run=dry_run + ) + if not dry_run: + _mark_backfill_processed(db, disc_url) + total_new += 1 + year_new += 1 + except Exception as e: + print(f" failed to process {disc_url}: {e}", file=sys.stderr) + + if (i + 1) % 50 == 0 or i + 1 == len(summary_urls): + print(f" [{i+1}/{len(summary_urls)}] {year_new} new disclosures so far") + + print(f" {year} complete: {year_new} new disclosures") + + return total_new + + +# ── Entry point 
─────────────────────────────────────────────────────────────── + + +def main() -> None: + p = argparse.ArgumentParser() + p.add_argument("--year", type=int, default=None) + p.add_argument("--limit", type=int, default=None) + p.add_argument("--dry-run", action="store_true") + p.add_argument( + "--mode", + choices=["weekly", "backfill"], + default="weekly", + help="weekly: incremental check; backfill: full history with subcollection cursor", + ) + args = p.parse_args() + + current_year = datetime.now(tz=timezone.utc).year + + if args.year: + years = [args.year] + elif args.mode == "weekly": + years = [current_year, current_year - 1] + else: + years = list(range(FIRST_YEAR, current_year + 1)) + + db = firestore.Client() if not args.dry_run else None + + if args.mode == "weekly": + n = run_weekly(db, years, limit=args.limit, dry_run=args.dry_run) + if n == 0: + print("\nNo new disclosures found.") + else: + print(f"\nDone: {n} new disclosures written.") + else: + n = run_backfill(db, years, limit=args.limit, dry_run=args.dry_run) + print(f"\nBackfill complete: {n} new disclosures written.") + + # Emit structured result for callers (e.g. TypeScript backfill script) + print(json.dumps({"newDisclosures": n}), file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/lobbying-scraper/writer.py b/lobbying-scraper/writer.py new file mode 100644 index 000000000..a6804f401 --- /dev/null +++ b/lobbying-scraper/writer.py @@ -0,0 +1,126 @@ +"""Firestore document construction and write helpers. + +Mirrors the data model in functions/src/lobbying/types.ts. All collection +names and field names must stay in sync with that file. 
+""" + +from __future__ import annotations + +from datetime import datetime, timezone +from typing import TYPE_CHECKING + +from normalize import normalize_entity_name +from portal import ( + BillActivity, + Compensation, + DisclosureDetail, + DisclosureMeta, + filing_id, + registrant_id, + year_to_general_court, +) + +if TYPE_CHECKING: + from google.cloud import firestore + +REGISTRANTS_COLLECTION = "lobbyingRegistrants" +FILINGS_COLLECTION = "lobbyingFilings" +SCRAPER_DOC = "/scrapers/lobbying" +BACKFILL_DOC = "/scrapers/lobbyingBackfill" +BACKFILL_URLS_COLLECTION = "processedUrls" + + +def _now() -> datetime: + return datetime.now(tz=timezone.utc) + + +def write_registrant( + db: firestore.Client, + meta: DisclosureMeta, + detail: DisclosureDetail, + disc_url: str, +) -> None: + """Upsert a LobbyingRegistrant document.""" + if not meta.entity_name or meta.year is None: + return + + doc_id = registrant_id(meta.entity_name, meta.year) + ref = db.collection(REGISTRANTS_COLLECTION).document(doc_id) + + clients = [ + { + "clientName": c.client_name, + "clientNameNorm": normalize_entity_name(c.client_name), + "compensation": c.amount, + } + for c in detail.compensation + ] + + data = { + "registrantId": doc_id, + "entityName": meta.entity_name, + "entityNameNorm": normalize_entity_name(meta.entity_name), + "year": meta.year, + "generalCourt": year_to_general_court(meta.year), + "regType": meta.reg_type, + "clients": clients, + "disclosureUrls": firestore.ArrayUnion([disc_url]), + "fetchedAt": _now(), + } + ref.set(data, merge=True) + + +def write_filings( + db: firestore.Client, + meta: DisclosureMeta, + detail: DisclosureDetail, +) -> int: + """Batch-write LobbyingFiling documents. 
Returns the number written.""" + if not meta.entity_name or meta.year is None or not detail.bills: + return 0 + + gc = year_to_general_court(meta.year) + entity_name = meta.entity_name + entity_norm = normalize_entity_name(entity_name) + now = _now() + + batch = db.batch() + count = 0 + + for bill in detail.bills: + fid = filing_id( + entity_name, + bill.client_name, + bill.chamber, + bill.bill_id, + gc, + bill.position, + ) + ref = db.collection(FILINGS_COLLECTION).document(fid) + doc = { + "filingId": fid, + "entityName": entity_name, + "entityNameNorm": entity_norm, + "clientName": bill.client_name, + "clientNameNorm": normalize_entity_name(bill.client_name), + "year": meta.year, + "generalCourt": gc, + "chamber": bill.chamber, + "billId": bill.bill_id, + "activityTitle": bill.activity_title, + "position": bill.position, + "amount": bill.amount, + "fetchedAt": now, + } + batch.set(ref, doc) + count += 1 + + # Firestore batch limit is 500 writes + if count % 400 == 0: + batch.commit() + batch = db.batch() + + if count % 400 != 0: + batch.commit() + + return count diff --git a/scripts/firebase-admin/backfillLobbying.ts b/scripts/firebase-admin/backfillLobbying.ts index f7914dd84..a2a66330e 100644 --- a/scripts/firebase-admin/backfillLobbying.ts +++ b/scripts/firebase-admin/backfillLobbying.ts @@ -1,156 +1,64 @@ /** * Backfill lobbying disclosure data from 2005 to the present. * - * This script is the primary ingestion path for all historical data. The live - * Cloud Function (scrapeLobbying) only handles the current and prior year in - * steady state. Run this once to populate the full history, and re-run with - * --year to refresh specific years. + * Delegates all HTTP fetching and Firestore writes to the Python scraper in + * lobbying-scraper/. The TypeScript layer handles argument parsing and + * environment setup only. 
* * Usage: * GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ * yarn firebase-admin run-script backfillLobbying --env dev * - * Options: - * --year NUMBER Only process this year (useful for testing or re-runs) - * --limit NUMBER Max registrants to process per year (for testing) + * Options (passed through to scrape.py): + * --year NUMBER Only process this year + * --limit NUMBER Max registrants per year (for testing) * - * Cursor storage: - * Processed disclosure URLs are stored as documents in the Firestore - * subcollection /scrapers/lobbyingBackfill/processedUrls/{urlHash}. - * This scales to the full historical URL set (~50,000+) without hitting the - * 1MB Firestore document size limit. Restart the script at any time; it will - * resume from where it left off. + * Requires: pip install -r lobbying-scraper/requirements.txt + * Or run inside the maple-2025 conda environment. */ -import { createHash } from "crypto" +import { spawn } from "child_process" +import path from "path" import { z } from "zod" -import { - allLobbyingYears, - processDisclosure, - writeRegistrant -} from "../../functions/src/lobbying/scrapeLobbying" -import { - fetchDisclosureMeta, - fetchSummaryLinks, - makePortalClient -} from "../../functions/src/lobbying/portal" -import { - BACKFILL_DOC, - BACKFILL_URLS_COLLECTION, - FIRST_LOBBYING_YEAR -} from "../../functions/src/lobbying/types" import { Script } from "./types" const Args = z .object({ - year: z.number().int().min(FIRST_LOBBYING_YEAR).optional(), + year: z.number().int().min(2005).optional(), limit: z.number().int().positive().optional() }) .passthrough() -export const script: Script = async ({ db, args }) => { - const { year: onlyYear, limit } = Args.parse(args) +const SCRAPER = path.resolve(__dirname, "../../lobbying-scraper/scrape.py") - const years = onlyYear ? 
[onlyYear] : allLobbyingYears() - console.log( - `backfillLobbying: processing years ${years[0]}–${years[years.length - 1]}` - ) - - // Load already-processed disc URLs from the subcollection cursor. - const backfillRef = db.doc(BACKFILL_DOC) - const processedSnap = await backfillRef - .collection(BACKFILL_URLS_COLLECTION) - .select() // fetch only doc IDs (the URL hash), no field data needed - .get() - const processedHashes = new Set(processedSnap.docs.map(d => d.id)) - console.log( - `backfillLobbying: ${processedHashes.size} disc URLs already processed` - ) - - const client = makePortalClient() - let totalNew = 0 - - for (const year of years) { - console.log(`\n── ${year} ──`) - - let summaryUrls: string[] - try { - summaryUrls = await fetchSummaryLinks(client, year) - } catch (e) { - console.error(` Failed to fetch summary links for ${year}:`, e) - continue - } - - if (limit) summaryUrls = summaryUrls.slice(0, limit) - console.log(` ${summaryUrls.length} registrants on portal`) - - let yearNew = 0 - - for (let i = 0; i < summaryUrls.length; i++) { - const summaryUrl = summaryUrls[i] - let meta: Awaited> +export const script: Script = async ({ env, args }) => { + const { year, limit } = Args.parse(args) - try { - meta = await fetchDisclosureMeta(client, summaryUrl) - } catch (e) { - console.warn( - ` [${i + 1}/${ - summaryUrls.length - }] Failed to fetch summary: ${summaryUrl}`, - e - ) - continue - } - - if (meta.entityName && meta.year) { - try { - await writeRegistrant( - db, - meta.entityName, - meta.year, - meta.regType, - meta.disclosureUrls - ) - } catch (e) { - console.warn(` Failed to write registrant ${meta.entityName}:`, e) - } - } - - for (const discUrl of meta.disclosureUrls) { - const urlHash = createHash("sha256") - .update(discUrl) - .digest("hex") - .slice(0, 40) - if (processedHashes.has(urlHash)) continue - - try { - await processDisclosure(db, client, summaryUrl, discUrl, year) - - // Mark as processed in the subcollection cursor - await 
backfillRef - .collection(BACKFILL_URLS_COLLECTION) - .doc(urlHash) - .set({ url: discUrl, processedAt: new Date().toISOString() }) - - processedHashes.add(urlHash) - totalNew++ - yearNew++ - } catch (e) { - console.warn(` Failed to process disclosure ${discUrl}:`, e) - } - } + if (env === "local") { + throw new Error( + "backfillLobbying requires --env dev or --env prod " + + "(it writes to a real Firestore project; local emulator not supported yet)" + ) + } - if ((i + 1) % 50 === 0 || i + 1 === summaryUrls.length) { - console.log( - ` [${i + 1}/${ - summaryUrls.length - }] ${yearNew} new disclosures this year` - ) - } - } + const pyArgs = ["--mode", "backfill"] + if (year) pyArgs.push("--year", String(year)) + if (limit) pyArgs.push("--limit", String(limit)) - console.log(` ${year} complete: ${yearNew} new disclosures`) - } + console.log(`Running: python3 ${SCRAPER} ${pyArgs.join(" ")}`) + console.log( + `Firestore project: ${process.env.GCLOUD_PROJECT || "(from ADC)"}` + ) - console.log(`\nbackfillLobbying complete: ${totalNew} new disclosures total`) + await new Promise((resolve, reject) => { + const proc = spawn("python3", [SCRAPER, ...pyArgs], { + stdio: ["ignore", "inherit", "inherit"], + env: { ...process.env } + }) + proc.on("close", code => { + if (code === 0) resolve() + else reject(new Error(`scrape.py exited with code ${code}`)) + }) + proc.on("error", reject) + }) } From 2bcb783f2e2e71631b524f49f5b194eb2d551e63 Mon Sep 17 00:00:00 2001 From: Nathan Date: Mon, 8 Jun 2026 22:05:04 -0400 Subject: [PATCH 4/4] refactor: remove dead TypeScript scraper code from lobbying module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per code review feedback: the TypeScript Firebase Function and backfill script added no value — the portal's TLS fingerprinting requirements mean Node.js cannot reach it, so the TS HTTP layer was non-functional and the backfill script was just a thin subprocess wrapper with no benefit over 
calling scrape.py directly. Removed: - functions/src/lobbying/scrapeLobbying.ts (broken Cloud Function) - functions/src/lobbying/portal.ts (non-functional TS HTTP layer) - functions/src/lobbying/http/ (unused Python fetch helper) - scripts/firebase-admin/backfillLobbying.ts (shell wrapper, no value) - scrapeLobbying export from functions/src/index.ts Kept: - functions/src/lobbying/types.ts — Firestore schema; imported by frontend - functions/src/lobbying/normalize.ts — normalization pipeline - lobbying-scraper/ — the working Cloud Run container (unchanged) The historical backfill is now run directly: python3 lobbying-scraper/scrape.py --mode backfill Co-Authored-By: Claude Sonnet 4.6 --- docs/lobbying-disclosure-ingestion.md | 108 ++-- functions/src/index.ts | 2 - functions/src/lobbying/http/.gitignore | 3 - functions/src/lobbying/http/fetch.py | 81 --- functions/src/lobbying/http/requirements.txt | 1 - functions/src/lobbying/index.ts | 10 - functions/src/lobbying/portal.ts | 553 ------------------- functions/src/lobbying/scrapeLobbying.ts | 274 --------- lobbying-scraper/Dockerfile | 4 +- scripts/firebase-admin/backfillLobbying.ts | 64 --- 10 files changed, 66 insertions(+), 1034 deletions(-) delete mode 100644 functions/src/lobbying/http/.gitignore delete mode 100644 functions/src/lobbying/http/fetch.py delete mode 100644 functions/src/lobbying/http/requirements.txt delete mode 100644 functions/src/lobbying/portal.ts delete mode 100644 functions/src/lobbying/scrapeLobbying.ts delete mode 100644 scripts/firebase-admin/backfillLobbying.ts diff --git a/docs/lobbying-disclosure-ingestion.md b/docs/lobbying-disclosure-ingestion.md index 264c77c52..51719f342 100644 --- a/docs/lobbying-disclosure-ingestion.md +++ b/docs/lobbying-disclosure-ingestion.md @@ -298,64 +298,86 @@ them. No bill-level compensation amount is available for these years. 
``` functions/src/lobbying/ - types.ts — Runtypes definitions for LobbyingRegistrant, LobbyingFiling - normalize.ts — Entity name normalization pipeline - portal.ts — Reference implementation (HTTP layer not used in production) - scrapeLobbying.ts — Reference implementation (superseded by Cloud Run container) - index.ts — Re-exports + types.ts — Runtypes schema definitions for LobbyingRegistrant, LobbyingFiling + normalize.ts — Entity name normalization pipeline (also used client-side) + index.ts — Re-exports lobbying-scraper/ - scrape.py — Entry point: --mode weekly (incremental) | --mode backfill - portal.py — HTTP + HTML parsing - normalize.py — Port of normalize.ts - writer.py — Firestore document construction + writes - requirements.txt — requests, beautifulsoup4, google-cloud-firestore - Dockerfile — Python 3.12-slim image + scrape.py — Entry point: --mode weekly (incremental) | --mode backfill + portal.py — HTTP + HTML parsing + normalize.py — Port of normalize.ts + writer.py — Firestore document construction + writes + requirements.txt — requests, beautifulsoup4, google-cloud-firestore + Dockerfile — Python 3.12-slim image ``` +The TypeScript lobbying module (`functions/src/lobbying/`) contains only the +schema types and normalization logic. There is no TypeScript scraper or +Firebase Function — ingestion is handled entirely by the Cloud Run container. +This follows the same pattern as the MCP server and avoids the complexity of +running multiple language runtimes in the same Firebase Functions deployment. + --- ## Deploying the Cloud Run Container -Follows the same pattern as the MCP server. Requires the -`maple-lobbying-scraper` Artifact Registry repository to exist. +Follows the same pattern as the MCP server. The Artifact Registry repo +(`maple-lobbying`) and Cloud Run job (`maple-lobbying-scraper`) are already +created in `digital-testimony-dev`. 
```bash cd lobbying-scraper IMAGE=us-central1-docker.pkg.dev/digital-testimony-dev/maple-lobbying/scraper:latest docker build -t $IMAGE . && docker push $IMAGE -gcloud run jobs create maple-lobbying-scraper \ +gcloud run jobs update maple-lobbying-scraper \ --image=$IMAGE \ --project=digital-testimony-dev \ + --region=us-central1 +``` + +For a new project (prod), create the job first: + +```bash +gcloud artifacts repositories create maple-lobbying \ + --repository-format=docker --location=us-central1 --project= + +gcloud run jobs create maple-lobbying-scraper \ + --image=$IMAGE \ + --project= \ --region=us-central1 \ - --service-account=@digital-testimony-dev.iam.gserviceaccount.com + --task-timeout=30m \ + --max-retries=0 -# Schedule weekly via Cloud Scheduler +# Schedule weekly (Mondays 6am UTC) gcloud scheduler jobs create http maple-lobbying-weekly \ --schedule="0 6 * * 1" \ - --uri="https://us-central1-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/digital-testimony-dev/jobs/maple-lobbying-scraper:run" \ + --uri="https://us-central1-run.googleapis.com/apis/run.googleapis.com/v1/namespaces//jobs/maple-lobbying-scraper:run" \ --http-method=POST \ - --oauth-service-account-email=@digital-testimony-dev.iam.gserviceaccount.com \ + --oauth-service-account-email=@.iam.gserviceaccount.com \ --location=us-central1 ``` -## Historical Backfill (Admin Script) +## Historical Backfill -Ingests all historical filings from 2005 to the present. Delegates to -`scrape.py --mode backfill` via subprocess. Resumable — the subcollection -cursor at `/scrapers/lobbyingBackfill/processedUrls` tracks what has been -processed. Run directly on the machine (requires `lobbying-scraper/` deps -installed or the `maple-2025` conda environment). +Runs `scrape.py --mode backfill` directly. Resumable — the subcollection +cursor at `/scrapers/lobbyingBackfill/processedUrls` tracks progress. +Requires `lobbying-scraper/` deps or the `maple-2025` conda environment. 
```bash +cd lobbying-scraper + +# Test a single year with no writes GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ - yarn firebase-admin run-script backfillLobbying --env dev + python3 scrape.py --mode backfill --year 2024 --limit 3 --dry-run -# Or call scrape.py directly for more control: -cd lobbying-scraper -python3 scrape.py --mode backfill --year 2024 --limit 3 --dry-run -python3 scrape.py --mode backfill --year 2024 +# Run a single year for real +GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ + python3 scrape.py --mode backfill --year 2024 + +# Full history (2005-present, resumable) +GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ + python3 scrape.py --mode backfill ``` --- @@ -400,22 +422,18 @@ export { scrapeLobbying } from "./lobbying" ## Implementation Status -| File | Status | Notes | -| -------------------------------------------- | ------- | ---------------------------------------------------------- | -| `functions/src/lobbying/types.ts` | ✅ Done | TypeScript type definitions; source of truth for schema | -| `functions/src/lobbying/normalize.ts` | ✅ Done | Normalization pipeline (also ported to `normalize.py`) | -| `functions/src/lobbying/portal.ts` | ✅ Done | Kept for reference; HTTP layer not used (see architecture) | -| `functions/src/lobbying/scrapeLobbying.ts` | ✅ Done | Not deployed; superseded by Cloud Run container | -| `functions/src/lobbying/index.ts` | ✅ Done | | -| `functions/src/index.ts` (export) | ✅ Done | | -| `firestore.rules` | ✅ Done | | -| `firestore.indexes.json` | ✅ Done | | -| `lobbying-scraper/normalize.py` | ✅ Done | Port of normalize.ts | -| `lobbying-scraper/portal.py` | ✅ Done | HTTP + HTML parsing | -| `lobbying-scraper/writer.py` | ✅ Done | Firestore document construction | -| `lobbying-scraper/scrape.py` | ✅ Done | Entry point; `--mode weekly` and `--mode backfill` | -| `lobbying-scraper/Dockerfile` | ✅ 
Done | Python 3.12 slim | -| `scripts/firebase-admin/backfillLobbying.ts` | ✅ Done | Calls `scrape.py --mode backfill` as subprocess | +| File | Status | Notes | +| ------------------------------------- | ------- | -------------------------------------------------------- | +| `functions/src/lobbying/types.ts` | ✅ Done | Firestore schema types; imported by future frontend code | +| `functions/src/lobbying/normalize.ts` | ✅ Done | Normalization pipeline; also ported to `normalize.py` | +| `functions/src/lobbying/index.ts` | ✅ Done | Re-exports types and normalize | +| `firestore.rules` | ✅ Done | | +| `firestore.indexes.json` | ✅ Done | | +| `lobbying-scraper/normalize.py` | ✅ Done | Port of normalize.ts | +| `lobbying-scraper/portal.py` | ✅ Done | HTTP + HTML parsing | +| `lobbying-scraper/writer.py` | ✅ Done | Firestore document construction | +| `lobbying-scraper/scrape.py` | ✅ Done | Entry point; `--mode weekly` and `--mode backfill` | +| `lobbying-scraper/Dockerfile` | ✅ Done | Python 3.12-slim; deployed to Cloud Run | ### Document ID scheme diff --git a/functions/src/index.ts b/functions/src/index.ts index 6c52b78c1..641255bf4 100644 --- a/functions/src/index.ts +++ b/functions/src/index.ts @@ -60,8 +60,6 @@ export { export { transcription } from "./webhooks" -export { scrapeLobbying } from "./lobbying" - export * from "./triggerPubsubFunction" // Export the health check last so it is loaded last. diff --git a/functions/src/lobbying/http/.gitignore b/functions/src/lobbying/http/.gitignore deleted file mode 100644 index d0ee3b17c..000000000 --- a/functions/src/lobbying/http/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -venv/ -__pycache__/ -*.pyc diff --git a/functions/src/lobbying/http/fetch.py b/functions/src/lobbying/http/fetch.py deleted file mode 100644 index 4e6c2c4ec..000000000 --- a/functions/src/lobbying/http/fetch.py +++ /dev/null @@ -1,81 +0,0 @@ -"""Minimal HTTP fetch helper for the lobbying portal. 
- -Handles the portal's session cookie requirements that standard Node.js HTTP -clients cannot satisfy due to TLS-layer constraints. - -Usage: - python3 fetch.py --url URL [--method GET|POST] [--jar PATH] - -POST body is read from stdin as application/x-www-form-urlencoded. -Cookies are persisted to/from the JSON file at --jar so the session survives -across multiple subprocess invocations. -HTML response is written to stdout. Errors go to stderr with exit code 1. -""" - -import argparse -import json -import sys -from pathlib import Path - -import requests - -_UA = ( - "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) " - "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148" -) - - -def main() -> None: - p = argparse.ArgumentParser() - p.add_argument("--url", required=True) - p.add_argument("--method", default="GET", choices=["GET", "POST"]) - p.add_argument("--jar", default=None, help="Path to JSON cookie-jar file") - args = p.parse_args() - - session = requests.Session() - session.headers.update( - { - "User-Agent": _UA, - "Accept": "*/*", - "Accept-Encoding": "gzip, deflate, br", - "Connection": "keep-alive", - } - ) - - if args.jar: - jar = Path(args.jar) - if jar.exists(): - try: - session.cookies.update(json.loads(jar.read_text())) - except Exception as e: - print(f"warning: could not read cookie jar: {e}", file=sys.stderr) - - try: - if args.method == "POST": - body = sys.stdin.buffer.read() - resp = session.post( - args.url, - data=body, - headers={"Content-Type": "application/x-www-form-urlencoded"}, - timeout=180, - ) - else: - resp = session.get(args.url, timeout=60) - - resp.raise_for_status() - - if args.jar: - Path(args.jar).write_text(json.dumps(dict(session.cookies))) - - sys.stdout.buffer.write(resp.content) - - except requests.exceptions.HTTPError as e: - print(f"HTTP error {e.response.status_code}: {args.url}", file=sys.stderr) - sys.exit(1) - except requests.exceptions.RequestException as e: - print(f"request failed: {e}", file=sys.stderr) - 
sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/functions/src/lobbying/http/requirements.txt b/functions/src/lobbying/http/requirements.txt deleted file mode 100644 index b18d51347..000000000 --- a/functions/src/lobbying/http/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -requests>=2.28 diff --git a/functions/src/lobbying/index.ts b/functions/src/lobbying/index.ts index 5e594cb34..6d039ae51 100644 --- a/functions/src/lobbying/index.ts +++ b/functions/src/lobbying/index.ts @@ -1,12 +1,2 @@ -export { scrapeLobbying } from "./scrapeLobbying" export * from "./types" export { normalizeEntityName } from "./normalize" -export { - constructBillId, - fetchDisclosureDetail, - fetchDisclosureMeta, - fetchSummaryLinks, - makePortalClient, - normalizeChamber, - yearToGeneralCourt -} from "./portal" diff --git a/functions/src/lobbying/portal.ts b/functions/src/lobbying/portal.ts deleted file mode 100644 index 64d65831b..000000000 --- a/functions/src/lobbying/portal.ts +++ /dev/null @@ -1,553 +0,0 @@ -/** - * HTTP client and HTML parser for the MA Secretary of State lobbying portal. - * - * Portal: https://www.sec.state.ma.us/LobbyistPublicSearch/ - * - * Page flow: - * 1. Search POST → grdvSearchResultByTypeAndCategory table - * One row per registrant; each row has a Summary.aspx link. - * 2. Summary.aspx → registrant name/year/type + CompleteDisclosure links - * 3. CompleteDisclosure.aspx → per-client compensation + per-client bill activity - * - * Two disclosure HTML formats exist: - * Modern (≥~2013): per-client compensation in grdvClientPaidToEntity; - * per-client bill tables as grdvActivitiesNew{year}_{n}. - * Legacy (<~2013): total salary in grdvSalaryPaid (no client breakdown); - * all bill activity in a single grdvActivities table. 
- */ - -import axios, { AxiosInstance } from "axios" -import { JSDOM } from "jsdom" -import { sha256 } from "js-sha256" -import { CookieJar } from "tough-cookie" -import { - CHAMBER_PREFIXES, - LEGACY_CHAMBER_MAP, - LEGACY_TOTAL_CLIENT, - LobbyingChamber -} from "./types" - -// ─── Constants ────────────────────────────────────────────────────────────── - -const BASE_URL = "https://www.sec.state.ma.us/LobbyistPublicSearch/" -const SEARCH_URL = BASE_URL + "Default.aspx" -const REQUEST_DELAY_MS = 1000 -const MAX_RETRIES = 5 - -const IPAD_UA = - "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) " + - "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148" - -const FIRST_GC = 183 -const FIRST_GC_START_YEAR = 2003 - -// ─── Public types ─────────────────────────────────────────────────────────── - -export interface RawCompensation { - clientName: string - amount: number | null -} - -export interface RawBillActivity { - clientName: string - chamber: LobbyingChamber - rawBillNumber: string - billId: string | null // pre-computed from chamber + rawBillNumber - activityTitle: string - position: string - amount: number | null -} - -export interface DisclosureMeta { - entityName: string - year: number | null - /** Portal reg_type mapped to our vocabulary */ - regType: "Lobbyist" | "Employer" - disclosureUrls: string[] -} - -export interface DisclosureDetail { - compensation: RawCompensation[] - bills: RawBillActivity[] -} - -// ─── HTTP helpers ──────────────────────────────────────────────────────────── - -/** - * Create an axios instance pre-configured for the MA SoS portal. - * - * Includes a cookie jar via interceptors so ASP.NET session state (ViewState, - * anti-forgery tokens) is preserved across the GET → POST page flow without - * requiring the axios-cookiejar-support package. - */ -export interface PortalClient { - jar: CookieJar - client: AxiosInstance -} - -/** - * Create a portal client pre-configured for the MA SoS portal. 
- * - * Uses maxRedirects: 0 so our manual redirect loop (inside getHtml / postHtml) - * can extract Set-Cookie headers at each hop before following. This is necessary - * because the portal is protected by Incapsula, which issues a 302 challenge on - * first contact and requires the session cookies to be sent on the retried request. - * Axios's built-in redirect following happens before response interceptors fire, - * so the cookies from the challenge response are never captured automatically. - */ -export function makePortalClient(): PortalClient { - const jar = new CookieJar() - const client = axios.create({ - headers: { - "User-Agent": IPAD_UA, - Accept: "*/*", - "Accept-Encoding": "gzip, deflate, br", - Connection: "keep-alive" - }, - timeout: 60_000, - maxRedirects: 10, // let axios handle ordinary redirects; only Incapsula challenges need manual handling - validateStatus: s => s < 500 // surface 4xx so we can log them - }) - return { jar, client } -} - -function sleep(ms: number): Promise { - return new Promise(resolve => setTimeout(resolve, ms)) -} - -function cookieHeader(jar: CookieJar, url: string): string { - return jar - .getCookiesSync(url) - .map(c => c.cookieString()) - .join("; ") -} - -function saveCookies( - jar: CookieJar, - url: string, - headers: Record -): void { - const raw = headers["set-cookie"] - if (!raw) return - const list = Array.isArray(raw) ? raw : [raw] - for (const c of list) jar.setCookieSync(c, url) -} - -async function getHtml( - pc: PortalClient, - url: string, - retries = MAX_RETRIES -): Promise { - for (let attempt = 0; attempt < retries; attempt++) { - await sleep( - attempt === 0 ? 
REQUEST_DELAY_MS : REQUEST_DELAY_MS * 2 ** attempt - ) - try { - const res = await pc.client.get(url, { - responseType: "text", - headers: { Cookie: cookieHeader(pc.jar, url) } - }) - saveCookies( - pc.jar, - url, - res.headers as Record - ) - if (res.status >= 400) throw new Error(`HTTP ${res.status} for ${url}`) - return new JSDOM(res.data).window.document - } catch (e) { - if (attempt === retries - 1) throw e - if (axios.isAxiosError(e)) continue - throw e - } - } - throw new Error("unreachable") -} - -async function postHtml( - pc: PortalClient, - url: string, - data: Record, - retries = MAX_RETRIES -): Promise { - const body = new URLSearchParams(data).toString() - for (let attempt = 0; attempt < retries; attempt++) { - await sleep( - attempt === 0 ? REQUEST_DELAY_MS : REQUEST_DELAY_MS * 2 ** attempt - ) - try { - const res = await pc.client.post(url, body, { - responseType: "text", - headers: { - "Content-Type": "application/x-www-form-urlencoded", - Cookie: cookieHeader(pc.jar, url) - }, - timeout: 180_000 - }) - saveCookies( - pc.jar, - url, - res.headers as Record - ) - if (res.status >= 400) throw new Error(`HTTP ${res.status} for ${url}`) - return new JSDOM(res.data).window.document - } catch (e) { - if (attempt === retries - 1) throw e - if (axios.isAxiosError(e)) continue - throw e - } - } - throw new Error("unreachable") -} - -// ─── Year / General Court helpers ──────────────────────────────────────────── - -export function yearToGeneralCourt(year: number): number { - return FIRST_GC + Math.floor((year - FIRST_GC_START_YEAR) / 2) -} - -// ─── Chamber normalization ──────────────────────────────────────────────────── - -/** Normalize raw portal chamber string to a canonical LobbyingChamber value. 
*/ -export function normalizeChamber(raw: string): LobbyingChamber { - const trimmed = raw.trim() - if (LEGACY_CHAMBER_MAP[trimmed]) return LEGACY_CHAMBER_MAP[trimmed] - const known: LobbyingChamber[] = [ - "House Bill", - "Senate Bill", - "House Docket", - "Senate Docket", - "Executive" - ] - if (known.includes(trimmed as LobbyingChamber)) - return trimmed as LobbyingChamber - return "Other" -} - -/** - * Construct the MAPLE-compatible billId from the portal's chamber + raw integer. - * - * The portal stores bill numbers as bare integers; the chamber prefix is what - * distinguishes H1234 from S1234. Returns null for Executive and Other chambers - * where no bill join is possible. - */ -export function constructBillId( - chamber: LobbyingChamber, - rawBillNumber: string -): string | null { - const prefix = CHAMBER_PREFIXES[chamber] - if (!prefix) return null - const n = parseInt(rawBillNumber, 10) - if (isNaN(n)) return null - return `${prefix}${n}` -} - -// ─── Document ID generation ─────────────────────────────────────────────────── - -/** Stable Firestore document ID for a registrant (entity + year). */ -export function registrantId(entityName: string, year: number): string { - return sha256(`${year}|${entityName}`).slice(0, 40) -} - -/** - * Stable Firestore document ID for a filing. - * - * Uses a hash of the logical deduplication key. For null-bill rows (billId is - * null) the chamber is included in the key to avoid merging executive null rows - * with legislative null rows. - */ -export function filingId( - entityName: string, - clientName: string, - chamber: LobbyingChamber, - billId: string | null, - generalCourt: number, - position: string -): string { - const key = [ - entityName, - clientName, - chamber, - billId ?? 
"__null__", - generalCourt, - position - ].join("|") - return sha256(key).slice(0, 40) -} - -// ─── Amount parsing ─────────────────────────────────────────────────────────── - -function parseAmount(text: string): number | null { - const cleaned = text.replace(/[$,]/g, "").trim() - const n = parseFloat(cleaned) - return isNaN(n) ? null : n -} - -// ─── Portal scraping functions ──────────────────────────────────────────────── - -/** Extract ASP.NET WebForms ViewState hidden inputs from a page. */ -function extractViewState(doc: Document): Record { - const fields: Record = {} - doc.querySelectorAll('input[type="hidden"]').forEach(el => { - const input = el as HTMLInputElement - if (input.name) fields[input.name] = input.value ?? "" - }) - return fields -} - -/** - * Fetch all Summary.aspx URLs for a given year. - * Sends a single search POST with page size 20000 to get all registrants at once. - */ -export async function fetchSummaryLinks( - pc: PortalClient, - year: number -): Promise { - const searchPage = await getHtml(pc, SEARCH_URL) - const vs = extractViewState(searchPage) - - const postData: Record = { - ...vs, - __EVENTTARGET: "", - __EVENTARGUMENT: "", - ctl00$ContentPlaceHolder1$Search: "rdbSearchByType", - ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$ddlYear: String(year), - ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$txtN_ame: "", - ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$lddSearchType$DropDown: - "3", - ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$drpType: "L", - ctl00$ContentPlaceHolder1$drpPageSize: "20000", - ctl00$ContentPlaceHolder1$btnSearch: "Search" - } - - const resultsPage = await postHtml(pc, SEARCH_URL, postData) - - const table = resultsPage.querySelector( - '[id*="grdvSearchResultByTypeAndCategory"]' - ) - if (!table) return [] - - const links: string[] = [] - table.querySelectorAll("a[href]").forEach(el => { - const href = (el as HTMLAnchorElement).href - if (href && href.includes("Summary.aspx")) { - // href from 
JSDOM is already absolute when base is set; handle both cases - const url = href.startsWith("http") ? href : BASE_URL + href - links.push(url) - } - }) - return links -} - -/** - * Fetch a Summary.aspx page and return the registrant metadata + disclosure URLs. - */ -export async function fetchDisclosureMeta( - pc: PortalClient, - summaryUrl: string -): Promise { - const doc = await getHtml(pc, summaryUrl) - - const text = (id: string) => { - const el = doc.getElementById(id) - return el?.textContent?.trim() ?? "" - } - - const entityName = text("ContentPlaceHolder1_lblRegistrantName") - const yearText = text("ContentPlaceHolder1_lblYear") - const regTypeRaw = text("ContentPlaceHolder1_lblRegType") - - const year = parseInt(yearText, 10) - const regType: "Lobbyist" | "Employer" = regTypeRaw.includes("Entity") - ? "Employer" - : "Lobbyist" - - const disclosureUrls: string[] = [] - doc.querySelectorAll("a[href]").forEach(el => { - const raw = (el as HTMLAnchorElement).getAttribute("href") ?? "" - if (raw.includes("CompleteDisclosure")) { - const url = raw.startsWith("http") ? raw : BASE_URL + raw - disclosureUrls.push(url) - } - }) - - return { - entityName, - year: isNaN(year) ? null : year, - regType, - disclosureUrls - } -} - -/** - * Parse a CompleteDisclosure.aspx page. - * - * Handles both modern (≥~2013) and legacy (<~2013) HTML layouts. - */ -export async function fetchDisclosureDetail( - pc: PortalClient, - discUrl: string, - year: number -): Promise { - const doc = await getHtml(pc, discUrl) - const compensation: RawCompensation[] = [] - const bills: RawBillActivity[] = [] - - // ── Modern format ────────────────────────────────────────────────────────── - const compTable = doc.querySelector('[id*="grdvClientPaidToEntity"]') - if (compTable) { - compTable - .querySelectorAll("tr.GridRow, tr.GridAlternatingRow") - .forEach(row => { - const cells = Array.from(row.querySelectorAll("td")).map( - td => td.textContent?.trim() ?? 
"" - ) - if (cells.length >= 2) { - compensation.push({ - clientName: cells[0], - amount: parseAmount(cells[1]) - }) - } - }) - } - - // Bill activity tables — one per client per reporting period. Two ID patterns: - // 2014–2018: …rptActivityNew_grdvActivitiesNew_0 (no year suffix) - // 2019+: …rptActivityNew2020_grdvActivitiesNew2020_0 (year suffix) - doc.querySelectorAll('[id*="grdvActivitiesNew"]').forEach(actTable => { - // The client name lives in the nearest preceding span with lblClientName - let clientName = "" - let node: Element | null = actTable - while ((node = node.previousElementSibling ?? node.parentElement)) { - const span = node.id?.includes("lblClientName") - ? node - : node.querySelector?.('[id*="lblClientName"]') - if (span) { - clientName = span.textContent?.trim() ?? "" - break - } - if (node === node.parentElement) break - } - - actTable - .querySelectorAll("tr.GridRow, tr.GridAlternatingRow") - .forEach(row => { - const cells = Array.from(row.querySelectorAll("td")).map( - td => td.textContent?.trim() ?? "" - ) - // Columns: House/Senate, Bill Number, Bill title, Position, Amount, Direct business - if (cells.length < 4) return - const chamber = normalizeChamber(cells[0]) - const rawBillNumber = cells[1] - const billId = constructBillId(chamber, rawBillNumber) - bills.push({ - clientName, - chamber, - rawBillNumber, - billId, - activityTitle: cells[2] ?? "", - position: cells[3] ?? "", - amount: cells.length > 4 ? parseAmount(cells[4]) : null - }) - }) - }) - - if (compTable || bills.length > 0) { - return { compensation, bills } - } - - // ── Legacy format (<~2013) ───────────────────────────────────────────────── - const salaryTable = doc.querySelector('[id*="grdvSalaryPaid"]') - if (salaryTable) { - let total = 0 - salaryTable.querySelectorAll("tr").forEach(row => { - const cells = Array.from(row.querySelectorAll("td")).map( - td => td.textContent?.trim() ?? 
"" - ) - if (cells.length >= 2 && !cells[0].includes("Total")) { - const amt = parseAmount(cells[1]) - if (amt !== null) total += amt - } - }) - if (total > 0) { - compensation.push({ clientName: LEGACY_TOTAL_CLIENT, amount: total }) - } - } - - // Legacy bill activity: single grdvActivities table. Three known column layouts: - // 2009 4-col: Date | Bill+Title | Lobbyist | Client - // 2010+ individual 5-col: Activity | Position | DirectBiz | Client | Compensation - // 2010+ entity 6-col: Activity | Lobbyist | Position | DirectBiz | Client | Compensation - const actTable = doc.querySelector('[id$="grdvActivities"]') - if (actTable) { - const allRows = Array.from(actTable.querySelectorAll("tr")) - const headerCells = Array.from( - allRows[0]?.querySelectorAll("th, td") ?? [] - ).map(el => el.textContent?.trim() ?? "") - - let billCol = 1 - let positionCol: number | null = null - let clientCol = 3 - - if (headerCells[0]?.includes("Activity")) { - if (headerCells[1]?.includes("Lobbyist")) { - // 6-col entity layout - billCol = 0 - positionCol = 2 - clientCol = 4 - } else { - // 5-col individual layout - billCol = 0 - positionCol = 1 - clientCol = 3 - } - } - - const chamberMap: Record = { - H: "House Bill", - S: "Senate Bill", - HD: "House Docket", - SD: "Senate Docket" - } - - allRows.slice(1).forEach(row => { - const cells = Array.from(row.querySelectorAll("td")).map( - td => td.textContent?.trim() ?? "" - ) - if (cells.length <= Math.max(billCol, clientCol)) return - - const billCell = cells[billCol] - const skipValues = new Set([ - "Activity or Bill No and Title", - "N/A", - "None", - "", - "Total amount" - ]) - if (!billCell || skipValues.has(billCell)) return - - const parts = billCell.split(/\s+/) - const billNo = parts[0] - const activityTitle = parts.slice(1).join(" ") - const match = billNo.match(/^([A-Z]+)(\d+)$/) - if (!match) return - - const [, prefix, number] = match - const chamber: LobbyingChamber = chamberMap[prefix] ?? 
"Other" - const billId = constructBillId(chamber, number) - const position = positionCol !== null ? cells[positionCol] ?? "" : "" - const clientName = cells[clientCol] ?? "" - - bills.push({ - clientName, - chamber, - rawBillNumber: number, - billId, - activityTitle, - position, - amount: null - }) - }) - } - - return { compensation, bills } -} diff --git a/functions/src/lobbying/scrapeLobbying.ts b/functions/src/lobbying/scrapeLobbying.ts deleted file mode 100644 index 7a6140e8e..000000000 --- a/functions/src/lobbying/scrapeLobbying.ts +++ /dev/null @@ -1,274 +0,0 @@ -import { logger } from "firebase-functions" -import { runWith } from "firebase-functions/v1" -import { db, Timestamp } from "../firebase" -import type { Database } from "../types" -import { normalizeEntityName } from "./normalize" -import { - fetchDisclosureDetail, - fetchDisclosureMeta, - fetchSummaryLinks, - filingId, - makePortalClient, - registrantId, - yearToGeneralCourt -} from "./portal" -import { - FILINGS_COLLECTION, - FIRST_LOBBYING_YEAR, - LobbyingFiling, - LobbyingRegistrant, - REGISTRANTS_COLLECTION, - SCRAPER_DOC -} from "./types" - -/** - * Scraper state stored in Firestore at /scrapers/lobbying. - * - * processedDiscUrls: disc URLs already fetched; skip on re-runs. - * summaryDiscCache: maps summaryUrl → its known disc URLs so we can skip - * summary page GETs for registrants with no new filings. - */ -interface ScraperState { - processedDiscUrls: string[] - summaryDiscCache: Record -} - -/** - * Maximum number of new disclosure pages to fetch per function invocation. - * Each page takes ~1s; this keeps the run well within the 540s timeout. - * Remaining work is picked up on the next scheduled run. - */ -const MAX_DISCLOSURES_PER_RUN = 200 - -/** - * Scrape lobbying disclosure data for the current and prior calendar year. - * - * Runs every 24 hours. New filers arrive semi-annually so daily polling is - * more than sufficient for steady-state freshness. 
For initial historical - * ingestion (2005-present) use the backfillLobbying admin script instead. - * - * Progress is checkpointed to Firestore after every disclosure page so the - * function is fully resumable if it times out or is interrupted. - */ -export const scrapeLobbying = runWith({ timeoutSeconds: 540, maxInstances: 1 }) - .pubsub.schedule("every 24 hours") - .onRun(async () => { - const currentYear = new Date().getFullYear() - const years = [currentYear, currentYear - 1] - - const scraperRef = db.doc(SCRAPER_DOC) - const scraperDoc = await scraperRef.get() - const state: ScraperState = { - processedDiscUrls: scraperDoc.data()?.processedDiscUrls ?? [], - summaryDiscCache: scraperDoc.data()?.summaryDiscCache ?? {} - } - const processedSet = new Set(state.processedDiscUrls) - const summaryCache: Record = state.summaryDiscCache - - const client = makePortalClient() - let newDiscCount = 0 - - for (const year of years) { - if (newDiscCount >= MAX_DISCLOSURES_PER_RUN) break - - logger.info(`scrapeLobbying: fetching summary links for ${year}`) - let summaryUrls: string[] - try { - summaryUrls = await fetchSummaryLinks(client, year) - } catch (e) { - logger.error( - `scrapeLobbying: failed to fetch summary links for ${year}`, - e - ) - continue - } - logger.info( - `scrapeLobbying: ${summaryUrls.length} registrants for ${year}` - ) - - for (const summaryUrl of summaryUrls) { - if (newDiscCount >= MAX_DISCLOSURES_PER_RUN) break - - // Use cached disc URLs when available to avoid re-fetching summary pages. - // For current year we always re-check (new filings arrive mid-year). - let discUrls = summaryCache[summaryUrl] - if (!discUrls || year === currentYear) { - try { - const meta = await fetchDisclosureMeta(client, summaryUrl) - discUrls = meta.disclosureUrls - - // Write registrant doc (upsert); don't wait for individual writes to - // finish — use a bulkWriter for the doc contents but checkpoint the - // scraper state separately so interruptions are recoverable. 
- if (meta.entityName && meta.year) { - await writeRegistrant( - db, - meta.entityName, - meta.year, - meta.regType, - discUrls - ) - } - - summaryCache[summaryUrl] = discUrls - await scraperRef.set( - { summaryDiscCache: summaryCache }, - { merge: true } - ) - } catch (e) { - logger.warn( - `scrapeLobbying: failed to fetch summary ${summaryUrl}`, - e - ) - continue - } - } - - const newDiscUrls = discUrls.filter(u => !processedSet.has(u)) - if (newDiscUrls.length === 0) continue - - for (const discUrl of newDiscUrls) { - if (newDiscCount >= MAX_DISCLOSURES_PER_RUN) break - try { - await processDisclosure(db, client, summaryUrl, discUrl, year) - processedSet.add(discUrl) - newDiscCount++ - - // Checkpoint after every disclosure so restarts lose at most one page - await scraperRef.set( - { processedDiscUrls: Array.from(processedSet) }, - { merge: true } - ) - } catch (e) { - logger.warn( - `scrapeLobbying: failed to process disclosure ${discUrl}`, - e - ) - } - } - } - } - - logger.info(`scrapeLobbying: processed ${newDiscCount} new disclosures`) - }) - -// ─── Shared write helpers (also used by backfillLobbying) ──────────────────── - -/** - * Write or update a LobbyingRegistrant document. Client list is assembled from - * the disclosure meta; filing documents are written separately per-bill. 
- */ -export async function writeRegistrant( - database: Database, - entityName: string, - year: number, - regType: "Lobbyist" | "Employer", - disclosureUrls: string[] -): Promise { - const id = registrantId(entityName, year) - const ref = database.collection(REGISTRANTS_COLLECTION).doc(id) - const partial: Omit & { - fetchedAt: FirebaseFirestore.Timestamp - } = { - registrantId: id, - entityName, - entityNameNorm: normalizeEntityName(entityName), - year, - generalCourt: yearToGeneralCourt(year), - regType, - disclosureUrls, - fetchedAt: Timestamp.now() - } - // Merge so repeated runs don't wipe clients accumulated from multiple disclosures - await ref.set(partial, { merge: true }) -} - -/** - * Fetch one CompleteDisclosure page and write LobbyingFiling documents. - * Also updates the registrant's client list. - */ -export async function processDisclosure( - database: Database, - client: ReturnType, - summaryUrl: string, - discUrl: string, - year: number -): Promise { - const meta = await fetchDisclosureMeta(client, summaryUrl) - const detail = await fetchDisclosureDetail(client, discUrl, year) - - const { entityName, regType } = meta - const gc = yearToGeneralCourt(year) - const entityNameNorm = normalizeEntityName(entityName) - const now = Timestamp.now() - - // Update registrant's client list - if (entityName && year) { - const regRef = database - .collection(REGISTRANTS_COLLECTION) - .doc(registrantId(entityName, year)) - - const clients = detail.compensation.map(c => ({ - clientName: c.clientName, - clientNameNorm: normalizeEntityName(c.clientName), - compensation: c.amount - })) - - await regRef.set( - { - registrantId: registrantId(entityName, year), - entityName, - entityNameNorm, - year, - generalCourt: gc, - regType: regType ?? 
"Lobbyist", - clients, - disclosureUrls: [discUrl], - fetchedAt: now - }, - { merge: true } - ) - } - - // Write one LobbyingFiling doc per bill row - if (detail.bills.length === 0) return - - const writer = database.bulkWriter() - for (const bill of detail.bills) { - const fid = filingId( - entityName, - bill.clientName, - bill.chamber, - bill.billId, - gc, - bill.position - ) - const doc: LobbyingFiling = { - filingId: fid, - entityName, - entityNameNorm, - clientName: bill.clientName, - clientNameNorm: normalizeEntityName(bill.clientName), - year, - generalCourt: gc, - chamber: bill.chamber, - billId: bill.billId, - activityTitle: bill.activityTitle, - position: bill.position, - amount: bill.amount, - fetchedAt: now - } - writer.set(database.collection(FILINGS_COLLECTION).doc(fid), doc, { - merge: false - }) - } - await writer.close() -} - -/** All years to scrape, for use by the backfill script. */ -export function allLobbyingYears(): number[] { - const current = new Date().getFullYear() - const years: number[] = [] - for (let y = FIRST_LOBBYING_YEAR; y <= current; y++) years.push(y) - return years -} diff --git a/lobbying-scraper/Dockerfile b/lobbying-scraper/Dockerfile index 738293459..4b2da65b5 100644 --- a/lobbying-scraper/Dockerfile +++ b/lobbying-scraper/Dockerfile @@ -11,4 +11,6 @@ COPY normalize.py portal.py writer.py scrape.py ./ # Cloud Scheduler invokes the container via HTTP POST to /; handle it minimally. ENV PYTHONUNBUFFERED=1 -CMD ["python3", "scrape.py", "--mode", "weekly"] +# ENTRYPOINT is the fixed executable; CMD provides default args that --args overrides. +ENTRYPOINT ["python3", "scrape.py"] +CMD ["--mode", "weekly"] diff --git a/scripts/firebase-admin/backfillLobbying.ts b/scripts/firebase-admin/backfillLobbying.ts deleted file mode 100644 index a2a66330e..000000000 --- a/scripts/firebase-admin/backfillLobbying.ts +++ /dev/null @@ -1,64 +0,0 @@ -/** - * Backfill lobbying disclosure data from 2005 to the present. 
- * - * Delegates all HTTP fetching and Firestore writes to the Python scraper in - * lobbying-scraper/. The TypeScript layer handles argument parsing and - * environment setup only. - * - * Usage: - * GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ - * yarn firebase-admin run-script backfillLobbying --env dev - * - * Options (passed through to scrape.py): - * --year NUMBER Only process this year - * --limit NUMBER Max registrants per year (for testing) - * - * Requires: pip install -r lobbying-scraper/requirements.txt - * Or run inside the maple-2025 conda environment. - */ - -import { spawn } from "child_process" -import path from "path" -import { z } from "zod" -import { Script } from "./types" - -const Args = z - .object({ - year: z.number().int().min(2005).optional(), - limit: z.number().int().positive().optional() - }) - .passthrough() - -const SCRAPER = path.resolve(__dirname, "../../lobbying-scraper/scrape.py") - -export const script: Script = async ({ env, args }) => { - const { year, limit } = Args.parse(args) - - if (env === "local") { - throw new Error( - "backfillLobbying requires --env dev or --env prod " + - "(it writes to a real Firestore project; local emulator not supported yet)" - ) - } - - const pyArgs = ["--mode", "backfill"] - if (year) pyArgs.push("--year", String(year)) - if (limit) pyArgs.push("--limit", String(limit)) - - console.log(`Running: python3 ${SCRAPER} ${pyArgs.join(" ")}`) - console.log( - `Firestore project: ${process.env.GCLOUD_PROJECT || "(from ADC)"}` - ) - - await new Promise((resolve, reject) => { - const proc = spawn("python3", [SCRAPER, ...pyArgs], { - stdio: ["ignore", "inherit", "inherit"], - env: { ...process.env } - }) - proc.on("close", code => { - if (code === 0) resolve() - else reject(new Error(`scrape.py exited with code ${code}`)) - }) - proc.on("error", reject) - }) -}