diff --git a/.gitignore b/.gitignore index 571150641..7301e0ec2 100644 --- a/.gitignore +++ b/.gitignore @@ -92,3 +92,10 @@ cert.txt # local MCP server config (contains auth tokens) .mcp.json mcp-server/create-agent-key.ts + +# Claude +CLAUDE.md + +#gcloud +.gcloudignore + diff --git a/docs/lobbying-disclosure-ingestion.md b/docs/lobbying-disclosure-ingestion.md new file mode 100644 index 000000000..51719f342 --- /dev/null +++ b/docs/lobbying-disclosure-ingestion.md @@ -0,0 +1,678 @@ +# Lobbying Disclosure Ingestion Pipeline + +## Overview + +The MA Secretary of State lobbying portal +([sec.state.ma.us/LobbyistPublicSearch](https://www.sec.state.ma.us/LobbyistPublicSearch/)) +publishes semi-annual disclosure filings for all registered lobbyists and +lobbying entities. This document describes the plan for scraping that data and +storing it in Firestore in a way that allows joining to MAPLE bill data. + +The portal has three levels of pages: + +1. **Search page** → one row per registrant per year +2. **Summary page** → registrant metadata + links to semi-annual disclosure + filings +3. **CompleteDisclosure page** → per-client compensation table + per-client bill + activity tables + +Historical data goes back to 2005. MAPLE has bill data only from ~2020 onward, +so bill joins will only resolve for filings from the 192nd General Court (2021) +and later. All historical filings are ingested regardless. + +--- + +## Terminology + +The portal has two registrant types: + +- **Lobbyist** — an individual person who lobbies directly on behalf of clients. +- **Employer** — a lobbying firm that employs individual lobbyists and is + retained by clients. Called "Lobbyist Entity" on the portal. + +In both cases, the registrant reports compensation received from each **client** +(the organization that hired them) and which bills they lobbied for that client. + +--- + +## Firestore Data Model + +Two top-level collections, normalized by registrant and by lobbying activity +record. 
+ +### `/lobbyingRegistrants/{registrantId}` + +`registrantId` is the first 40 hex characters of the SHA-256 hash of the logical key `{entityName}_{year}` (stable, dedup-safe). + +One model covers both individual lobbyists and lobbying firms. A separate model +is not needed because the portal search returns both under the same schema, and +per-filing detail pages do not expose which individual lobbyists within a firm +worked on which bill. + +```typescript +interface LobbyingRegistrant { + registrantId: string // SHA-256 hash (first 40 hex chars) of "{entityName}_{year}" + entityName: string // firm name or individual lobbyist name (raw portal value) + entityNameNorm: string // normalized form; see Normalization section + year: number + generalCourt: number // computed from year + regType: "Lobbyist" | "Employer" + clients: LobbyingClient[] + disclosureUrls: string[] // source portal URLs, for audit trail + fetchedAt: Timestamp +} + +interface LobbyingClient { + clientName: string + clientNameNorm: string // normalized form + compensation: number | null +} +``` + +### `/lobbyingFilings/{filingId}` + +`filingId` is the first 40 hex characters of the SHA-256 hash of the logical key +`{entityName}_{clientName}_{chamber}_{activityRef}_{generalCourt}`. + +```typescript +type LobbyingChamber = + | "House Bill" + | "Senate Bill" + | "House Docket" + | "Senate Docket" + | "Executive" // lobbying of executive branch agencies + | "Other" // catch-all for rare legacy codes (FY, CMR, etc.) + +interface LobbyingFiling { + filingId: string + entityName: string // raw portal value + entityNameNorm: string // normalized form + clientName: string // raw portal value; "_total_salary_" sentinel for pre-2013 + clientNameNorm: string // normalized form + year: number + generalCourt: number + chamber: LobbyingChamber + // For legislative chambers: the bill number string (e.g. "H1234", "HD56"). + // For Executive and Other: null (the agency name is never stored here). 
+ billId: string | null + activityTitle: string // bill title (legislative) or meeting description (executive) + position: string // "Support" | "Oppose" | "Neutral" | etc.; empty for executive + amount: number | null // compensation allocated to this activity + fetchedAt: Timestamp +} +``` + +### Constructing `billId` from Raw Portal Data + +The portal stores bill numbers as bare integers and records the chamber +separately. The `billId` field — which maps to `Bill.id` in MAPLE — is +constructed during ingest by combining chamber prefix and integer: + +| `chamber` | Prefix | Example raw | `billId` | +| --------------- | ------ | ----------- | -------- | +| `House Bill` | `H` | `1234` | `H1234` | +| `Senate Bill` | `S` | `1234` | `S1234` | +| `House Docket` | `HD` | `56` | `HD56` | +| `Senate Docket` | `SD` | `56` | `SD56` | +| `Executive` | — | agency name | `null` | +| `Other` | — | varies | `null` | + +Note: `H1234` and `S1234` are distinct bills even though they share the same +integer. The prefix is required to disambiguate. `billId` is `null` for +non-legislative chambers. + +#### Legacy chamber code normalization + +The portal uses short-form codes in older filings, normalized during ingest: + +| Raw value | Stored as | +| --------- | ------------- | +| `HB` | `House Bill` | +| `SB` | `Senate Bill` | + +Rare codes (`FY`, `C`, `CMR`, `HR`, etc.) are stored as `Other`. + +### Joining to Bill Data + +**The join only applies to legislative chambers** (`House Bill`, `Senate Bill`, +`House Docket`, `Senate Docket`) where `billId` is non-null. For `Executive` +and `Other`, no join should be attempted. + +```typescript +// Only valid when filing.billId !== null +db.collection(`/generalCourts/${filing.generalCourt}/bills`).doc(filing.billId) +``` + +--- + +## Entity Name Normalization + +The portal does not enforce consistent name formatting. The same client or +registrant may appear as "Acme Corp.", "ACME CORPORATION", "Acme, Inc. d/b/a +Acme Consulting", etc. 
across filings and years. Without normalization, +grouping by entity is unreliable. + +Both `entityName` and `clientName` are normalized using the following pipeline, +applied in order. The raw portal value is always preserved alongside the +normalized form. + +### Normalization pipeline + +1. **Uppercase** — convert the entire string to upper case. +2. **Strip d/b/a suffix** — remove everything from the first occurrence of + `D/B/A` or `DBA` (and spacing variants) onward, so the registered + name is used rather than a trade name. +3. **Hyphen → space** — replace `-` with ` ` so `LAN-TEL` and `LAN TEL` + collapse to the same key. +4. **Punctuation → space** — replace `,`, `.`, `'`, `‘`, `’`, `(`, `)` with + space. Replacement with space (not empty string) prevents adjacent tokens + from concatenating (e.g. `,INC` becomes ` INC`, which is then caught by step + 5). +5. **Remove legal entity type words** — whole-word removal of: `LLC`, `LLP`, + `INC`, `INCORPORATED`, `CORPORATION`, `CORP`, `LTD`, `LIMITED`, `PC`, + `PLLC`. +6. **Remove "THE"** — whole-word removal anywhere in the string (not just as a + leading prefix). +7. **Ampersand → AND** — replace `&` with `AND`. +8. **Fix known typo** — replace `ASSICIATES` with `ASSOCIATES` (legacy portal + data). +9. **Remove professional suffix phrases** — whole-phrase removal of: `LAW +OFFICE OF`, `AND ASSOCIATES`, `& ASSOCIATES`, `AND ASSOC`, `ATTORNEY AT +LAW`, `ATTORNEY@LAW`, `ATTORNET AT LAW`, `AND PARTNERS`, `PUBLIC POLICY +GROUP`, `LEGISLATIVE SERVICES`, `POLICY GROUP`, `ASSOCIATES`, `COUNSELLORS +AT LAW`. +10. **Collapse whitespace** — replace runs of whitespace with a single space and + strip leading/trailing whitespace. + +### Usage + +`entityNameNorm` and `clientNameNorm` are stored on every document and filing. +They should be used for grouping, deduplication, and display-level matching. +Raw names are preserved for provenance and audit. 
+ +--- + +## Deduplication and Amount Aggregation + +### Does lobbying the same bill multiple times mean we should sum amounts? + +The portal collects two semi-annual disclosure filings per registrant per year +(one for each 6-month period). In theory, a registrant could report the same +bill in both H1 and H2 filings with separate compensation amounts that should +be summed. Analysis of the actual data shows this does not occur: after +processing, zero rows share the same `(entityName, clientName, year, +generalCourt, billId, position)` — each (registrant, client, bill, year) +combination appears exactly once. The semi-annual periods report different +activity, not the same activity twice. + +The same registrant can lobby the same bill across multiple General Courts +(observed up to 6 times across years). These are stored as separate documents +per `generalCourt` and should not be summed — each court is a distinct +legislative session. + +### Null-bill row deduplication + +The one real duplication artifact in the portal data is **null-bill rows** — +entries filed when a registrant had no specific bills to report for a client in +a period. These appear in both the H1 and H2 disclosures as identical rows and +should be collapsed. During ingest, if the same `(entityName, clientName, year, +generalCourt, chamber, position)` with a null `billId` is encountered more than +once, keep the row with the highest `amount` so no spend is lost if the two +copies carry different values (in practice amounts are usually both zero). + +### Ingest strategy + +When processing multiple disclosure URLs for the same registrant+year, write +`lobbyingFilings` documents using the logical key as the document ID. A +subsequent disclosure URL that produces the same document ID will naturally +upsert (overwrite) rather than duplicate. For null-bill rows, since `billId` is +null, include `chamber` in the document ID to avoid false merges between +executive and legislative null rows. 
+ +--- + +## Scraper Architecture + +### Why a standalone Cloud Run container + +The MA SoS portal is protected by Imperva WAF, which uses TLS fingerprinting to +classify HTTP clients at the network layer before examining any headers. Node.js +produces a TLS fingerprint that Imperva challenges with a JavaScript +verification page; Python's `requests` library produces a fingerprint that +Imperva allows through without challenge. This is a runtime-level constraint +that cannot be addressed by header configuration or cipher reordering alone. + +The scraper therefore runs as a standalone **Cloud Run container** written in +Python, deployed alongside the existing MCP server container. All data modeling, +Firestore collection/field names, and normalization logic are documented here and +kept consistent between the Python container and the TypeScript type definitions +in `functions/src/lobbying/types.ts`. + +### Cloud Run container: `lobbying-scraper/` + +**Files:** `lobbying-scraper/{scrape,portal,normalize,writer}.py` + +- Scheduled weekly by Cloud Scheduler +- Runs an incremental check: fetches the current and prior year's summary links + (one POST), compares disc URLs against the Firestore cursor, and **exits + immediately if nothing is new** (fast path, typically seconds) +- When new or updated disclosures are found, fetches and processes them +- Persists a cursor in `/scrapers/lobbying`: + - `processedDiscUrls: string[]` — disc URLs already written; skipped on + re-runs + - `summaryDiscCache: {[summaryUrl]: string[]}` — maps summary page URLs to + their disc URLs so summary page GETs are skipped for prior-year registrants + whose disclosures are all already processed +- For each new disclosure URL: + - Parse registrant + client compensation rows → upsert `lobbyingRegistrants` + - Parse bill activity rows → batch-write `lobbyingFilings` +- 1s delay between requests; exponential backoff on transient failures + +### Incremental strategy + +In steady state (after the 
initial backfill), each weekly run: + +1. One POST to fetch all summary links for current + prior year +2. For prior-year registrants with all disc URLs in the cursor: zero GETs +3. For current-year registrants: one GET per summary page to check for new + disclosure periods +4. For any new disc URLs: one GET per disclosure page + +New filings arrive twice a year (semi-annual reporting periods). Between +periods, the run completes in under a minute. + +The backfill script (`--mode backfill`) uses a separate subcollection cursor at +`/scrapers/lobbyingBackfill/processedUrls/{urlHash}` so it does not interfere +with the live scraper state. + +### Legacy Format (pre-2013) + +The portal uses a different HTML layout for filings before ~2013: total salary +is not broken down by client, and all bill activity is in a single table. These +are stored with `clientName: "_total_salary_"` so callers can detect and filter +them. No bill-level compensation amount is available for these years. + +--- + +## New Files + +``` +functions/src/lobbying/ + types.ts — Runtypes schema definitions for LobbyingRegistrant, LobbyingFiling + normalize.ts — Entity name normalization pipeline (also used client-side) + index.ts — Re-exports + +lobbying-scraper/ + scrape.py — Entry point: --mode weekly (incremental) | --mode backfill + portal.py — HTTP + HTML parsing + normalize.py — Port of normalize.ts + writer.py — Firestore document construction + writes + requirements.txt — requests, beautifulsoup4, google-cloud-firestore + Dockerfile — Python 3.12-slim image +``` + +The TypeScript lobbying module (`functions/src/lobbying/`) contains only the +schema types and normalization logic. There is no TypeScript scraper or +Firebase Function — ingestion is handled entirely by the Cloud Run container. +This follows the same pattern as the MCP server and avoids the complexity of +running multiple language runtimes in the same Firebase Functions deployment. 
+ +--- + +## Deploying the Cloud Run Container + +Follows the same pattern as the MCP server. The Artifact Registry repo +(`maple-lobbying`) and Cloud Run job (`maple-lobbying-scraper`) are already +created in `digital-testimony-dev`. + +```bash +cd lobbying-scraper +IMAGE=us-central1-docker.pkg.dev/digital-testimony-dev/maple-lobbying/scraper:latest +docker build -t $IMAGE . && docker push $IMAGE + +gcloud run jobs update maple-lobbying-scraper \ + --image=$IMAGE \ + --project=digital-testimony-dev \ + --region=us-central1 +``` + +For a new project (prod), create the job first: + +```bash +gcloud artifacts repositories create maple-lobbying \ + --repository-format=docker --location=us-central1 --project=<PROJECT_ID> + +gcloud run jobs create maple-lobbying-scraper \ + --image=$IMAGE \ + --project=<PROJECT_ID> \ + --region=us-central1 \ + --task-timeout=30m \ + --max-retries=0 + +# Schedule weekly (Mondays 6am UTC) +gcloud scheduler jobs create http maple-lobbying-weekly \ + --schedule="0 6 * * 1" \ + --uri="https://us-central1-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/<PROJECT_ID>/jobs/maple-lobbying-scraper:run" \ + --http-method=POST \ + --oauth-service-account-email=<SERVICE_ACCOUNT>@<PROJECT_ID>.iam.gserviceaccount.com \ + --location=us-central1 +``` + +## Historical Backfill + +Runs `scrape.py --mode backfill` directly. Resumable — the subcollection +cursor at `/scrapers/lobbyingBackfill/processedUrls` tracks progress. +Requires `lobbying-scraper/` deps or the `maple-2025` conda environment. 
+ +```bash +cd lobbying-scraper + +# Test a single year with no writes +GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ + python3 scrape.py --mode backfill --year 2024 --limit 3 --dry-run + +# Run a single year for real +GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ + python3 scrape.py --mode backfill --year 2024 + +# Full history (2005-present, resumable) +GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ + python3 scrape.py --mode backfill +``` + +--- + +## Firestore Rules + +Add read-only public rules alongside the existing `generalCourts` rule: + +``` +match /lobbyingRegistrants/{doc} { allow read: if true; } +match /lobbyingFilings/{doc} { allow read: if true; } +``` + +--- + +## Firestore Indexes + +Add composite indexes for common query patterns: + +| Collection | Fields | Use case | +| ----------------- | -------------------------------------- | ---------------------------------------- | +| `lobbyingFilings` | `generalCourt ASC, billId ASC` | Fetch all legislative filings for a bill | +| `lobbyingFilings` | `generalCourt ASC, chamber ASC` | Filter by chamber within a court | +| `lobbyingFilings` | `generalCourt ASC, entityNameNorm ASC` | Fetch all filings for a registrant | +| `lobbyingFilings` | `generalCourt ASC, clientNameNorm ASC` | Fetch all filings for a client | + +Note: bill-join queries should always filter on `chamber` (or check +`billId !== null`) to exclude `Executive` and `Other` rows before treating +`billId` as a MAPLE bill reference. 
+ +--- + +## Function Export + +Superseded: ingestion runs entirely in the Cloud Run container, so `functions/src/lobbying` exports only the schema types and `normalizeEntityName`; there is no `scrapeLobbying` function and this export must not be added. The addition originally planned for `functions/src/index.ts` was: + +```typescript +export { scrapeLobbying } from "./lobbying" +``` + +--- + +## Implementation Status + +| File | Status | Notes | +| ------------------------------------- | ------- | -------------------------------------------------------- | +| `functions/src/lobbying/types.ts` | ✅ Done | Firestore schema types; imported by future frontend code | +| `functions/src/lobbying/normalize.ts` | ✅ Done | Normalization pipeline; also ported to `normalize.py` | +| `functions/src/lobbying/index.ts` | ✅ Done | Re-exports types and normalize | +| `firestore.rules` | ✅ Done | | +| `firestore.indexes.json` | ✅ Done | | +| `lobbying-scraper/normalize.py` | ✅ Done | Port of normalize.ts | +| `lobbying-scraper/portal.py` | ✅ Done | HTTP + HTML parsing | +| `lobbying-scraper/writer.py` | ✅ Done | Firestore document construction | +| `lobbying-scraper/scrape.py` | ✅ Done | Entry point; `--mode weekly` and `--mode backfill` | +| `lobbying-scraper/Dockerfile` | ✅ Done | Python 3.12-slim; deployed to Cloud Run | + +### Document ID scheme + +Both `registrantId` and `filingId` are SHA-256 hashes (first 40 hex chars) of +their respective logical keys. Hashes are used rather than slugified strings +because entity names and client names contain arbitrary Unicode and punctuation +that would require aggressive sanitization to fit Firestore ID constraints. The +hash is stable across runs for the same logical record. 
+ +--- + +## Future Work (Subsequent PRs) + +### Frontend + +- **Dedicated lobbying pages** + + - `/lobbyists` index: searchable list of registrants with total compensation, + client count, and year filter + - `/lobbyists/{registrantId}` profile: full client list, all bills lobbied, + compensation over time + - `/clients/{clientNameNorm}` profile: registrants hired, bills lobbied, + total spend per year + +- **Bill page integration** (`/bills/{court}/{billId}`) + + - "Lobbying activity" section listing registrants + clients that lobbied this + bill, with position (Support / Oppose / Neutral) and compensation where + available + - Link to registrant profile pages + +- **Organization profile page integration** + - If an organization's normalized name matches a `clientNameNorm` in + `lobbyingFilings`, surface a "Lobbying history" panel showing which bills + they lobbied and which registrants they hired + +### MCP Tools + +Expose lobbying data via the MAPLE MCP server so that AI agents and Claude can +answer questions like "who lobbied bill H1234?" or "what did Acme Corp lobby +for in 2024?". + +- **`get_lobbying_filings_for_bill`** — given `generalCourt` + `billId`, return + all `lobbyingFilings` for that bill with registrant, client, position, and + amount +- **`get_lobbying_registrant`** — given `registrantId`, return the registrant + document with client list and disclosure URLs +- **`search_lobbying_by_client`** — given a client name (raw or normalized), + return matching filings across all courts +- **`get_lobbying_summary_for_bill`** — aggregate view: unique registrant count, + unique client count, total compensation (where non-null), position breakdown + +--- + +## Incremental Test Plan + +Testing proceeds from the inside out: unit logic first, then live portal +fetches against the real site, then a small Firestore write, then a full +backfill year, then steady-state function operation. 
+ +### Step 1 — Unit test: normalization + +Run the normalization pipeline against known inputs and verify the outputs match +the reference implementation. + +```bash +# In a Node REPL or ts-node session: +conda run -n maple-2025 ts-node -P tsconfig.script.json -e " +const { normalizeEntityName } = require('./functions/src/lobbying/normalize') +console.log(normalizeEntityName('Acme Corp., Inc. d/b/a Acme Consulting')) +// Expected: 'ACME' +console.log(normalizeEntityName('LAN-TEL COMMUNICATIONS, INC.')) +// Expected: 'LAN TEL COMMUNICATIONS' +console.log(normalizeEntityName('Law Office of Jane Smith, LLC')) +// Expected: 'JANE SMITH' +" +``` + +### Step 2 — Unit test: chamber normalization and billId construction + +```bash +conda run -n maple-2025 ts-node -P tsconfig.script.json -e " +const { normalizeChamber, constructBillId } = require('./functions/src/lobbying/portal') +console.log(normalizeChamber('HB')) // House Bill +console.log(normalizeChamber('SB')) // Senate Bill +console.log(normalizeChamber('Executive')) // Executive +console.log(normalizeChamber('FY2024')) // Other +console.log(constructBillId('House Bill', '1234')) // H1234 +console.log(constructBillId('Senate Bill', '567')) // S567 +console.log(constructBillId('House Docket', '89')) // HD89 +console.log(constructBillId('Executive', 'EOEEA')) // null +" +``` + +### Step 3 — Live portal fetch: summary links + +Verify the portal is reachable and returns results for the current year. Use +`--limit 1` to minimize requests. + +```bash +conda run -n maple-2025 ts-node -P tsconfig.script.json -e " +const { makePortalClient, fetchSummaryLinks } = require('./functions/src/lobbying/portal') +const client = makePortalClient() +fetchSummaryLinks(client, 2024).then(urls => { + console.log('Summary links for 2024:', urls.length) + console.log('First URL:', urls[0]) +}).catch(console.error) +" +``` + +Expected: ~400–600 URLs, each containing `Summary.aspx`. 
+ +### Step 4 — Live portal fetch: summary meta + one disclosure + +Pick the first summary URL from Step 3 and fetch its meta and first disclosure. + +```bash +conda run -n maple-2025 ts-node -P tsconfig.script.json -e " +const { makePortalClient, fetchSummaryLinks, fetchDisclosureMeta, fetchDisclosureDetail } = require('./functions/src/lobbying/portal') +async function main() { + const client = makePortalClient() + const [summaryUrl] = await fetchSummaryLinks(client, 2024) + const meta = await fetchDisclosureMeta(client, summaryUrl) + console.log('Meta:', JSON.stringify(meta, null, 2)) + if (meta.disclosureUrls[0]) { + const detail = await fetchDisclosureDetail(client, meta.disclosureUrls[0], 2024) + console.log('Compensation rows:', detail.compensation.length) + console.log('Bill rows:', detail.bills.length) + console.log('First bill:', detail.bills[0]) + } +} +main().catch(console.error) +" +``` + +Verify: `meta.entityName` is non-empty, `meta.regType` is `"Lobbyist"` or +`"Employer"`, bill rows have `billId` set correctly for legislative chambers. + +### Step 5 — Backfill: single year, small limit against dev Firestore + +Write a small batch to the dev Firestore emulator or dev project. 
+ +```bash +# Against local emulator: +conda run -n maple-2025 yarn firebase-admin run-script backfillLobbying \ + --env local -- --year 2024 --limit 3 + +# Against dev project (writes real Firestore): +GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ + conda run -n maple-2025 yarn firebase-admin run-script backfillLobbying \ + --env dev -- --year 2024 --limit 3 +``` + +Verify in Firestore console or emulator UI: + +- `lobbyingRegistrants` has 3 documents with `entityName`, `entityNameNorm`, + `regType`, `clients`, `generalCourt` +- `lobbyingFilings` has documents with `billId` non-null for legislative rows + and null for Executive rows +- `/scrapers/lobbyingBackfill/processedUrls` has entries with `url` and + `processedAt` fields +- Re-running the same command skips already-processed URLs (output shows 0 new + disclosures) + +### Step 6 — Spot-check: bill join + +Pick a `lobbyingFiling` document with a non-null `billId` and a `generalCourt` +≥ 192. Verify the bill exists in MAPLE: + +``` +/generalCourts/{filing.generalCourt}/bills/{filing.billId} +``` + +If the bill is found, the join key is correct. If not found, check: (a) whether +MAPLE has data for that court, (b) whether the bill number format matches +(prefix + integer, no leading zeros). + +### Step 7 — Backfill: full current year + +Once Step 5 passes, run without `--limit` for the current year: + +```bash +GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ + conda run -n maple-2025 yarn firebase-admin run-script backfillLobbying \ + --env dev -- --year 2024 +``` + +Monitor progress via console output. Expected: ~500–600 registrants, ~1,000 +disclosure pages, several thousand filing documents written. + +### Step 8 — Backfill: full history (2005–present) + +Run without `--year` to process all years. 
Can be interrupted and resumed: + +```bash +GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ + conda run -n maple-2025 yarn firebase-admin run-script backfillLobbying \ + --env dev +``` + +Expected runtime: several hours at 1s/request. The subcollection cursor at +`/scrapers/lobbyingBackfill/processedUrls` allows safe interruption and +resumption. + +### Step 9 — Deploy and verify Cloud Function + +Deploy the function to the dev project: + +```bash +conda run -n maple-2025 firebase deploy \ + --only functions:maple:scrapeLobbying \ + --project digital-testimony-dev +``` + +Trigger a manual run via the Firebase console or: + +```bash +conda run -n maple-2025 yarn firebase-admin run-script runScrapers \ + --env local --targets scrapeLobbying +``` + +Verify: Cloud Function logs show the expected number of new disclosures (should +be near zero if backfill completed, since current+prior year are already +processed). + +--- + +## Design Decisions + +| Decision | Choice | Rationale | +| --------------------------- | ---------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| Collection placement | Top-level `/lobbyingRegistrants`, `/lobbyingFilings` | Lobbying data spans multiple General Courts and is not scoped to a single court like bills/members | +| Single registrant model | One type, `regType: "Lobbyist" \| "Employer"` | Individual lobbyists and firms share the same portal schema; per-bill individual attribution is not available | +| `billId` construction | `{chamberPrefix}{billNumber}` at ingest time | Raw portal data stores chamber and integer separately; the composite is what matches MAPLE's `Bill.id` | +| `billId` null for Executive | `null` instead of agency name | Prevents accidental bill lookups; makes join guard explicit at the type 
level | +| Normalized name fields | Store both raw and `*Norm` fields | Raw names preserved for provenance; normalized names used for grouping and matching | +| HTML parser | `jsdom` | Already in `functions/package.json` (used by events scraper); no need to add cheerio | +| Live scraper cursor | Array in `/scrapers/lobbying` doc | ~1,000 URLs/year fits well within the 1 MB Firestore doc limit; simple and atomic with other scraper state | +| Backfill cursor | Firestore subcollection `/scrapers/lobbyingBackfill/processedUrls/{urlHash}` | Full 2005-present history (~50,000 URLs) would exceed the 1 MB doc limit; subcollection scales without bound and is durable, inspectable, and resumable from any machine | +| Incremental strategy | Skip already-processed disclosure URLs; write docs by logical key (upsert) | Survives function restarts and re-runs without re-fetching already-scraped pages; natural upsert prevents duplicates without an explicit dedup pass | +| Legacy format (pre-2013) | Store with `clientName: "_total_salary_"` sentinel | Preserves data completeness; callers can filter on this value | +| Historical data | Admin backfill script (2005 → present) | Full history is ingested once; Cloud Function maintains current+prior year going forward | diff --git a/firestore.indexes.json b/firestore.indexes.json index 83cb3fa6d..c267a6868 100644 --- a/firestore.indexes.json +++ b/firestore.indexes.json @@ -788,25 +788,46 @@ "collectionGroup": "ballotQuestions", "queryScope": "COLLECTION", "fields": [ - { "fieldPath": "electionYear", "order": "ASCENDING" }, - { "fieldPath": "ballotStatus", "order": "ASCENDING" } + { + "fieldPath": "electionYear", + "order": "ASCENDING" + }, + { + "fieldPath": "ballotStatus", + "order": "ASCENDING" + } ] }, { "collectionGroup": "publishedTestimony", "queryScope": "COLLECTION_GROUP", "fields": [ - { "fieldPath": "ballotQuestionId", "order": "ASCENDING" }, - { "fieldPath": "publishedAt", "order": "DESCENDING" } + { + "fieldPath": 
"ballotQuestionId", + "order": "ASCENDING" + }, + { + "fieldPath": "publishedAt", + "order": "DESCENDING" + } ] }, { "collectionGroup": "publishedTestimony", "queryScope": "COLLECTION", "fields": [ - { "fieldPath": "billId", "order": "ASCENDING" }, - { "fieldPath": "court", "order": "ASCENDING" }, - { "fieldPath": "ballotQuestionId", "order": "ASCENDING" } + { + "fieldPath": "billId", + "order": "ASCENDING" + }, + { + "fieldPath": "court", + "order": "ASCENDING" + }, + { + "fieldPath": "ballotQuestionId", + "order": "ASCENDING" + } ] }, { @@ -898,6 +919,62 @@ } } ] + }, + { + "collectionGroup": "lobbyingFilings", + "queryScope": "COLLECTION", + "fields": [ + { + "fieldPath": "generalCourt", + "order": "ASCENDING" + }, + { + "fieldPath": "billId", + "order": "ASCENDING" + } + ] + }, + { + "collectionGroup": "lobbyingFilings", + "queryScope": "COLLECTION", + "fields": [ + { + "fieldPath": "generalCourt", + "order": "ASCENDING" + }, + { + "fieldPath": "chamber", + "order": "ASCENDING" + } + ] + }, + { + "collectionGroup": "lobbyingFilings", + "queryScope": "COLLECTION", + "fields": [ + { + "fieldPath": "generalCourt", + "order": "ASCENDING" + }, + { + "fieldPath": "entityNameNorm", + "order": "ASCENDING" + } + ] + }, + { + "collectionGroup": "lobbyingFilings", + "queryScope": "COLLECTION", + "fields": [ + { + "fieldPath": "generalCourt", + "order": "ASCENDING" + }, + { + "fieldPath": "clientNameNorm", + "order": "ASCENDING" + } + ] } ], "fieldOverrides": [ diff --git a/firestore.rules b/firestore.rules index a95586279..42db67276 100644 --- a/firestore.rules +++ b/firestore.rules @@ -103,6 +103,14 @@ service cloud.firestore { allow read: if true; allow write: if false; } + match /lobbyingRegistrants/{id} { + allow read: if true; + allow write: if false; + } + match /lobbyingFilings/{id} { + allow read: if true; + allow write: if false; + } match /transcriptions/{tid} { // public, read-only allow read: if true diff --git a/functions/src/lobbying/index.ts 
b/functions/src/lobbying/index.ts new file mode 100644 index 000000000..6d039ae51 --- /dev/null +++ b/functions/src/lobbying/index.ts @@ -0,0 +1,2 @@ +export * from "./types" +export { normalizeEntityName } from "./normalize" diff --git a/functions/src/lobbying/normalize.ts b/functions/src/lobbying/normalize.ts new file mode 100644 index 000000000..a7beb338f --- /dev/null +++ b/functions/src/lobbying/normalize.ts @@ -0,0 +1,72 @@ +/** + * Entity name normalization pipeline. + * + * The SoS portal does not enforce consistent name formatting. The same client or + * registrant may appear as "Acme Corp.", "ACME CORPORATION", "Acme, Inc. d/b/a + * Acme Consulting", etc. across filings and years. + * + * The steps must be applied in the exact order + * listed here; changing the order produces different (incorrect) output. + */ + +// Step 2: strip d/b/a trade-name suffix before any other transforms so the +// trade name doesn't bleed into the canonical form. +const DBA_RE = /\s+D\s*\/+B\s*\/+A?\s+.*|\s+DBA\s+.*/i + +// Step 5: remove legal entity type words with whole-word matching so +// "INCORPORATED" and "CORP" are caught in addition to "LLC"/"INC". +const LEGAL_ENTITY_RE = + /\b(LLC|LLP|INC|INCORPORATED|CORPORATION|CORP|LTD|LIMITED|PC|PLLC)\b/g + +// Step 6: remove "THE" as a whole word anywhere (not just as a leading prefix). +const THE_RE = /\bTHE\b/g + +// Step 9: professional suffix phrases to remove wholesale. 
const MISC_PHRASES = [
  "LAW OFFICE OF",
  "AND ASSOCIATES",
  // NOTE: this entry can never match, because step 7 rewrites every "&" to
  // "AND" before step 9 runs. It is kept so the list stays identical to the
  // Python port in lobbying-scraper/normalize.py.
  "& ASSOCIATES",
  "AND ASSOC",
  "ATTORNEY AT LAW",
  "ATTORNEY@LAW",
  "ATTORNET AT LAW", // known portal typo
  "AND PARTNERS",
  "PUBLIC POLICY GROUP",
  "LEGISLATIVE SERVICES",
  "POLICY GROUP",
  "ASSOCIATES",
  "COUNSELLORS AT LAW"
]

/**
 * Reduce a raw portal entity/client name to its canonical comparison form.
 *
 * The ten steps run in a fixed order; later steps rely on the output of
 * earlier ones (e.g. punctuation becomes a space so ",INC" is seen as the
 * whole word "INC" by the legal-entity regex).
 *
 * @param raw - name exactly as scraped; null/undefined/empty yields "".
 * @returns upper-cased, punctuation-free, whitespace-collapsed name.
 */
export function normalizeEntityName(raw: string | null | undefined): string {
  if (!raw) return ""

  let x = raw.toUpperCase() // Step 1: uppercase

  x = x.replace(DBA_RE, "") // Step 2: strip d/b/a suffix

  x = x.replace(/-/g, " ") // Step 3: hyphen → space

  // Step 4: punctuation → space (not empty string, so ",INC" → " INC" → caught
  // by step 5's whole-word removal).
  for (const ch of [",", ".", "'", "‘", "’", "(", ")"]) {
    x = x.split(ch).join(" ")
  }

  x = x.replace(LEGAL_ENTITY_RE, " ") // Step 5: remove legal entity type words

  x = x.replace(THE_RE, " ") // Step 6: remove THE anywhere

  x = x.replace(/&/g, "AND") // Step 7: ampersand → AND

  // Step 8: fix known portal typo. A global regex is required here:
  // String.prototype.replace with a *string* pattern only rewrites the first
  // occurrence, whereas the Python port (str.replace) rewrites all of them.
  x = x.replace(/ASSICIATES/g, "ASSOCIATES")

  // Step 9: remove professional suffix phrases
  for (const phrase of MISC_PHRASES) {
    x = x.split(phrase).join(" ")
  }

  x = x.replace(/\s+/g, " ").trim() // Step 10: collapse whitespace

  return x
}
/**
 * One registrant (individual lobbyist or lobbying firm) for one year.
 * `Static<typeof …>` derives the compile-time type from the runtime validator,
 * so the two can never drift apart.
 */
export type LobbyingRegistrant = Static<typeof LobbyingRegistrant>
export const LobbyingRegistrant = Record({
  registrantId: String,
  entityName: String,
  entityNameNorm: String,
  year: Number,
  generalCourt: Number,
  regType: Union(Literal("Lobbyist"), Literal("Employer")),
  clients: Array(LobbyingClient),
  disclosureUrls: Array(String),
  fetchedAt: InstanceOf(Timestamp)
})

/** One lobbying-activity row reported by a registrant on behalf of a client. */
export type LobbyingFiling = Static<typeof LobbyingFiling>
export const LobbyingFiling = Record({
  filingId: String,
  entityName: String,
  entityNameNorm: String,
  clientName: String,
  clientNameNorm: String,
  year: Number,
  generalCourt: Number,
  chamber: LobbyingChamber,
  // Non-null only for legislative chambers (House Bill, Senate Bill, House Docket,
  // Senate Docket). For Executive and Other, no bill join should be attempted.
  billId: Null.Or(String),
  activityTitle: String,
  position: String,
  amount: Null.Or(Number),
  fetchedAt: InstanceOf(Timestamp)
})
+ * Typed as a plain index signature so portal.ts can look up any LobbyingChamber + * without triggering "Property X does not exist" on the Partial. + */ +export const CHAMBER_PREFIXES: { [chamber: string]: string | undefined } = { + "House Bill": "H", + "Senate Bill": "S", + "House Docket": "HD", + "Senate Docket": "SD" +} + +/** Canonical chamber values for legacy short-form codes found in older filings */ +export const LEGACY_CHAMBER_MAP: { [raw: string]: LobbyingChamber } = { + HB: "House Bill", + SB: "Senate Bill" +} diff --git a/lobbying-scraper/.dockerignore b/lobbying-scraper/.dockerignore new file mode 100644 index 000000000..9460c99c4 --- /dev/null +++ b/lobbying-scraper/.dockerignore @@ -0,0 +1,4 @@ +__pycache__/ +*.pyc +*.pyo +.env diff --git a/lobbying-scraper/Dockerfile b/lobbying-scraper/Dockerfile new file mode 100644 index 000000000..4b2da65b5 --- /dev/null +++ b/lobbying-scraper/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.12-slim + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY normalize.py portal.py writer.py scrape.py ./ + +# Cloud Run sets PORT; we don't use it (this is a job, not a server). +# Cloud Scheduler invokes the container via HTTP POST to /; handle it minimally. +ENV PYTHONUNBUFFERED=1 + +# ENTRYPOINT is the fixed executable; CMD provides default args that --args overrides. 
_DBA_RE = re.compile(r"\s+D\s*/+B\s*/+A?\s+.*|\s+DBA\s+.*", re.IGNORECASE)
_LEGAL_RE = re.compile(
    r"\b(LLC|LLP|INC|INCORPORATED|CORPORATION|CORP|LTD|LIMITED|PC|PLLC)\b"
)
_THE_RE = re.compile(r"\bTHE\b")
_WS_RE = re.compile(r"\s+")

_MISC_PHRASES = [
    "LAW OFFICE OF",
    "AND ASSOCIATES",
    "& ASSOCIATES",
    "AND ASSOC",
    "ATTORNEY AT LAW",
    "ATTORNEY@LAW",
    "ATTORNET AT LAW",  # known portal typo
    "AND PARTNERS",
    "PUBLIC POLICY GROUP",
    "LEGISLATIVE SERVICES",
    "POLICY GROUP",
    "ASSOCIATES",
    "COUNSELLORS AT LAW",
]

# Steps 3 and 4 both turn a character into a single space and run back to
# back, so one translation table covers the hyphen and the punctuation set.
_SEPARATORS_TO_SPACE = str.maketrans(dict.fromkeys("-,.'‘’()", " "))


def normalize_entity_name(raw: str | None) -> str:
    """Return the canonical comparison form of a portal entity/client name.

    The pipeline order matters: punctuation is turned into spaces *before* the
    whole-word legal-entity removal so that ``",INC"`` is recognised as ``INC``,
    and ``&`` becomes ``AND`` *before* the suffix phrases are stripped.

    ``None`` and the empty string both normalize to ``""``.
    """
    if not raw:
        return ""

    name = _DBA_RE.sub("", raw.upper())               # 1-2: uppercase, drop d/b/a tail
    name = name.translate(_SEPARATORS_TO_SPACE)        # 3-4: hyphen/punctuation → space
    name = _THE_RE.sub(" ", _LEGAL_RE.sub(" ", name))  # 5-6: legal words, then THE
    name = name.replace("&", "AND")                    # 7: ampersand → AND
    name = name.replace("ASSICIATES", "ASSOCIATES")    # 8: fix known typo

    for phrase in _MISC_PHRASES:                       # 9: professional suffix phrases
        name = name.replace(phrase, " ")

    return _WS_RE.sub(" ", name).strip()               # 10: collapse whitespace
# Maps canonical chamber names to the bill-ID prefix used in MAPLE's Bill.id
CHAMBER_PREFIXES: dict[str, str] = {
    "House Bill": "H",
    "Senate Bill": "S",
    "House Docket": "HD",
    "Senate Docket": "SD",
}


def construct_bill_id(chamber: str, raw_bill_number: str) -> Optional[str]:
    """Construct the MAPLE-compatible billId from chamber + raw bill number.

    H1234 and S1234 are distinct bills even though they share the same
    integer, so the chamber prefix is always part of the ID.

    Args:
        chamber: canonical chamber name (a key of ``CHAMBER_PREFIXES`` for
            legislative chambers).
        raw_bill_number: the number cell as scraped. Surrounding whitespace and
            leading zeros are tolerated (``" 0056 "`` → ``56``).

    Returns:
        ``"<prefix><number>"`` (e.g. ``"H1234"``), or ``None`` when the chamber
        has no prefix (Executive / Other) or the cell is not a plain run of
        decimal digits.
    """
    prefix = CHAMBER_PREFIXES.get(chamber)
    if not prefix:
        return None

    # Validate explicitly rather than relying on int(): int() also accepts
    # signs and digit-group underscores, which would mint bogus IDs such as
    # "H-12" or turn "1_000" into "H1000".
    digits = str(raw_bill_number).strip()
    if not digits.isdecimal():
        return None
    return f"{prefix}{int(digits)}"
def fetch_summary_links(session: requests.Session, year: int) -> list[str]:
    """Return all Summary.aspx URLs for a given year via a single search POST.

    The search page is fetched first so its hidden form inputs can be echoed
    back in the POST. The page size is set to 20000 so the whole result set
    arrives in one response.

    Args:
        session: HTTP session used for both the GET and the POST.
        year: registration year to search for.

    Returns:
        Absolute URLs of every ``Summary.aspx`` link in the results grid, in
        page order; an empty list when the results grid is absent.
    """
    # Local import: urljoin is not among this module's top-level imports.
    from urllib.parse import urljoin

    search_page = _get(session, SEARCH_URL)
    form = {
        **_viewstate(search_page),
        "__EVENTTARGET": "",
        "__EVENTARGUMENT": "",
        "ctl00$ContentPlaceHolder1$Search": "rdbSearchByType",
        "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$ddlYear": str(year),
        "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$txtN_ame": "",
        "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$lddSearchType$DropDown": "3",
        "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$drpType": "L",
        "ctl00$ContentPlaceHolder1$drpPageSize": "20000",
        "ctl00$ContentPlaceHolder1$btnSearch": "Search",
    }
    results = _post(session, SEARCH_URL, form)

    table = results.find(
        "table", id=lambda x: x and "grdvSearchResultByTypeAndCategory" in x
    )
    if not table:
        return []

    # urljoin leaves absolute hrefs untouched and resolves plain, root-relative
    # ("/…") and parent-relative ("../…") hrefs correctly, which simple string
    # concatenation onto BASE_URL does not.
    return [
        urljoin(BASE_URL, a["href"])
        for a in table.find_all("a", href=True)
        if "Summary.aspx" in a["href"]
    ]
def _client_name_for(act_table: Tag) -> str:
    """Return the text of the nearest preceding ``lblClientName`` element, or ""."""
    node = act_table.find_previous(["span", "div", "td"])
    while node:
        node_id = node.get("id")
        if node_id and "lblClientName" in node_id:
            return node.get_text(strip=True)
        node = node.find_previous(["span", "div", "td"])
    return ""


def _parse_modern_disclosure(soup: BeautifulSoup) -> Optional[DisclosureDetail]:
    """Parse the modern layout; return None when the page does not use it."""
    compensation: list[Compensation] = []
    bills: list[BillActivity] = []

    comp_table = soup.find("table", id=lambda x: x and "grdvClientPaidToEntity" in x)
    if comp_table:
        for row in _grid_rows(comp_table):
            cells = [td.get_text(strip=True) for td in row.find_all("td")]
            if len(cells) >= 2:
                compensation.append(Compensation(
                    client_name=cells[0],
                    amount=_parse_amount(cells[1]),
                ))

    act_tables = soup.find_all(
        "table",
        id=lambda x: x and re.search(r"grdvActivitiesNew(\d{4})?_\d+", x),
    )
    for act_table in act_tables:
        # Each activity table belongs to the client label that precedes it.
        client_name = _client_name_for(act_table)

        for row in _grid_rows(act_table):
            cells = [td.get_text(strip=True) for td in row.find_all("td")]
            if len(cells) < 4:
                continue
            chamber = normalize_chamber(cells[0])
            raw_num = cells[1]
            bills.append(BillActivity(
                client_name=client_name,
                chamber=chamber,
                raw_bill_number=raw_num,
                bill_id=construct_bill_id(chamber, raw_num),
                activity_title=cells[2],
                position=cells[3],
                # The amount column is optional.
                amount=_parse_amount(cells[4]) if len(cells) > 4 else None,
            ))

    if comp_table or bills:
        return DisclosureDetail(compensation=compensation, bills=bills)
    return None


def _parse_legacy_disclosure(soup: BeautifulSoup) -> DisclosureDetail:
    """Parse the legacy layout: one salary total plus a single activities table."""
    compensation: list[Compensation] = []
    bills: list[BillActivity] = []

    salary_table = soup.find("table", id=lambda x: x and "grdvSalaryPaid" in x)
    if salary_table:
        total = 0.0
        for row in salary_table.find_all("tr"):
            cells = [td.get_text(strip=True) for td in row.find_all("td")]
            # Skip the table's own "Total" row so it is not double counted.
            if len(cells) >= 2 and "Total" not in cells[0]:
                amt = _parse_amount(cells[1])
                if amt:
                    total += amt
        if total:
            compensation.append(Compensation(client_name=LEGACY_TOTAL_CLIENT, amount=total))

    act_table = soup.find("table", id=lambda x: x and x.endswith("grdvActivities"))
    if act_table:
        all_rows = act_table.find_all("tr")
        headers = [th.get_text(strip=True)
                   for th in (all_rows[0].find_all(["th", "td"]) if all_rows else [])]

        # Column positions depend on which header layout the table uses.
        if headers and "Activity" in headers[0]:
            # 6-col entity layout has Lobbyist as second header
            if len(headers) >= 2 and "Lobbyist" in headers[1]:
                bill_col, pos_col, client_col = 0, 2, 4
            else:
                bill_col, pos_col, client_col = 0, 1, 3
        else:
            bill_col, pos_col, client_col = 1, None, 3

        chamber_map = {"H": "House Bill", "S": "Senate Bill",
                       "HD": "House Docket", "SD": "Senate Docket"}
        skip = {"Activity or Bill No and Title", "N/A", "None", "", "Total amount"}

        for row in all_rows[1:]:
            cells = [td.get_text(strip=True) for td in row.find_all("td")]
            # Guarantees bill_col, client_col (and pos_col, which is always
            # smaller than client_col) are valid indexes below.
            if len(cells) <= max(bill_col, client_col):
                continue
            bill_cell = cells[bill_col]
            if not bill_cell or bill_cell in skip:
                continue
            # First whitespace-separated token is the bill number, rest is the title.
            parts = bill_cell.split(None, 1)
            m = re.match(r"^([A-Z]+)(\d+)$", parts[0])
            if not m:
                continue
            prefix, number = m.group(1), m.group(2)
            chamber = chamber_map.get(prefix, "Other")
            bills.append(BillActivity(
                client_name=cells[client_col],
                chamber=chamber,
                raw_bill_number=number,
                bill_id=construct_bill_id(chamber, number),
                activity_title=parts[1] if len(parts) > 1 else "",
                position=cells[pos_col] if pos_col is not None else "",
                amount=None,
            ))

    return DisclosureDetail(compensation=compensation, bills=bills)


def fetch_disclosure_detail(
    session: requests.Session, disc_url: str, year: int
) -> DisclosureDetail:
    """Fetch one CompleteDisclosure page and parse compensation and bill activity.

    The modern layout (per-client compensation table plus per-client activity
    tables) is tried first; if the page has neither, the legacy layout (single
    salary total plus one activities table) is parsed instead.

    Args:
        session: HTTP session used for the GET.
        disc_url: absolute URL of the CompleteDisclosure page.
        year: kept for interface compatibility; parsing does not depend on it.

    Returns:
        A ``DisclosureDetail``; both lists are empty when nothing is recognised.
    """
    soup = _get(session, disc_url)
    modern = _parse_modern_disclosure(soup)
    if modern is not None:
        return modern
    return _parse_legacy_disclosure(soup)
def process_disclosure(
    db: firestore.Client | None,
    session,
    summary_url: str,
    disc_url: str,
    year: int,
    dry_run: bool = False,
    meta=None,
) -> tuple[int, int]:
    """Fetch one disclosure page and write registrant + filing documents.

    Args:
        db: Firestore client, or ``None`` to skip all writes.
        session: HTTP session passed through to the portal fetchers.
        summary_url: the registrant's Summary.aspx URL; only fetched when
            ``meta`` is not supplied and a write is going to happen.
        disc_url: the CompleteDisclosure URL to parse.
        year: search year, passed through to ``fetch_disclosure_detail``.
        dry_run: when true, parse only and write nothing.
        meta: optional result of ``fetch_disclosure_meta`` for ``summary_url``.
            Callers that have already fetched the summary page can pass it to
            avoid a second, rate-limited request for the same page.

    Returns:
        ``(compensation_rows, filing_rows)``. In dry-run mode ``filing_rows``
        is the number of parsed bill rows; otherwise it is the number written.
    """
    detail = fetch_disclosure_detail(session, disc_url, year)

    # Nothing below is needed when no write will happen, so return before
    # spending a request on the summary page.
    if dry_run or db is None:
        return len(detail.compensation), len(detail.bills)

    if meta is None:
        meta = fetch_disclosure_meta(session, summary_url)

    write_registrant(db, meta, detail, disc_url)
    n_filings = write_filings(db, meta, detail)
    return len(detail.compensation), n_filings
def run_backfill(
    db: "firestore.Client | None",
    years: list[int],
    limit: int | None = None,
    dry_run: bool = False,
) -> int:
    """Walk every requested year and process each disclosure not yet recorded.

    Progress is tracked per disclosure URL through ``_is_backfill_processed`` /
    ``_mark_backfill_processed``, so an interrupted run can simply be started
    again. Failures are reported on stderr and do not stop the run.

    Returns the number of disclosures processed during this invocation.
    """
    http = make_session()
    grand_total = 0
    # The processed-URL lookup is only consulted when writes are enabled.
    may_skip = db is not None and not dry_run

    for year in years:
        print(f"\n── {year} ──")
        try:
            registrant_pages = fetch_summary_links(http, year)
        except Exception as e:
            print(f" failed to fetch summary links: {e}", file=sys.stderr)
            continue

        if limit:
            registrant_pages = registrant_pages[:limit]

        page_count = len(registrant_pages)
        print(f" {page_count} registrants on portal")
        found_this_year = 0

        for position, summary_url in enumerate(registrant_pages, start=1):
            try:
                meta = fetch_disclosure_meta(http, summary_url)
            except Exception as e:
                print(f" [{position}/{page_count}] failed to fetch summary: {e}", file=sys.stderr)
                continue

            for disc_url in meta.disclosure_urls:
                if may_skip and _is_backfill_processed(db, disc_url):
                    continue
                try:
                    process_disclosure(
                        db, http, summary_url, disc_url, year, dry_run=dry_run
                    )
                    if not dry_run:
                        _mark_backfill_processed(db, disc_url)
                    grand_total += 1
                    found_this_year += 1
                except Exception as e:
                    print(f" failed to process {disc_url}: {e}", file=sys.stderr)

            # Heartbeat every 50 registrants and once more on the last one.
            if position % 50 == 0 or position == page_count:
                print(f" [{position}/{page_count}] {found_this_year} new disclosures so far")

        print(f" {year} complete: {found_this_year} new disclosures")

    return grand_total
def main() -> None:
    """Parse CLI flags, pick the year range, run the chosen mode, report the count.

    Year selection: ``--year`` wins; otherwise weekly mode covers the current
    and previous year, and backfill mode covers ``FIRST_YEAR`` through the
    current year. No Firestore client is created for ``--dry-run``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--year", type=int, default=None)
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument(
        "--mode",
        choices=["weekly", "backfill"],
        default="weekly",
        help="weekly: incremental check; backfill: full history with subcollection cursor",
    )
    opts = parser.parse_args()

    this_year = datetime.now(tz=timezone.utc).year
    weekly = opts.mode == "weekly"

    if opts.year:
        years = [opts.year]
    elif weekly:
        years = [this_year, this_year - 1]
    else:
        years = list(range(FIRST_YEAR, this_year + 1))

    db = None if opts.dry_run else firestore.Client()

    if weekly:
        n = run_weekly(db, years, limit=opts.limit, dry_run=opts.dry_run)
        if n == 0:
            print("\nNo new disclosures found.")
        else:
            print(f"\nDone: {n} new disclosures written.")
    else:
        n = run_backfill(db, years, limit=opts.limit, dry_run=opts.dry_run)
        print(f"\nBackfill complete: {n} new disclosures written.")

    # Emit structured result for callers (e.g. TypeScript backfill script)
    print(json.dumps({"newDisclosures": n}), file=sys.stderr)
def write_registrant(
    db: firestore.Client,
    meta: DisclosureMeta,
    detail: DisclosureDetail,
    disc_url: str,
) -> None:
    """Upsert a LobbyingRegistrant document.

    Does nothing when the registrant name or year is missing, because both are
    needed to derive the document ID.

    Args:
        db: Firestore client.
        meta: registrant name / year / type scraped from the summary page.
        detail: parsed disclosure; its compensation rows become ``clients``.
        disc_url: source disclosure URL, appended to ``disclosureUrls``.
    """
    # Runtime import: the module-level import of ``firestore`` is guarded by
    # TYPE_CHECKING, so without this line ``firestore.ArrayUnion`` below raises
    # NameError on every write.
    from google.cloud import firestore

    if not meta.entity_name or meta.year is None:
        return

    doc_id = registrant_id(meta.entity_name, meta.year)
    ref = db.collection(REGISTRANTS_COLLECTION).document(doc_id)

    clients = [
        {
            "clientName": c.client_name,
            "clientNameNorm": normalize_entity_name(c.client_name),
            "compensation": c.amount,
        }
        for c in detail.compensation
    ]

    data = {
        "registrantId": doc_id,
        "entityName": meta.entity_name,
        "entityNameNorm": normalize_entity_name(meta.entity_name),
        "year": meta.year,
        "generalCourt": year_to_general_court(meta.year),
        "regType": meta.reg_type,
        # NOTE(review): ``clients`` carries only this disclosure's rows; with
        # several disclosures per registrant-year the last one written wins.
        # Confirm that is intended.
        "clients": clients,
        # ArrayUnion appends without duplicating, so the audit trail grows
        # across the registrant's disclosures.
        "disclosureUrls": firestore.ArrayUnion([disc_url]),
        "fetchedAt": _now(),
    }
    ref.set(data, merge=True)


def write_filings(
    db: firestore.Client,
    meta: DisclosureMeta,
    detail: DisclosureDetail,
) -> int:
    """Batch-write LobbyingFiling documents. Returns the number written.

    Returns 0 without touching Firestore when the registrant name or year is
    missing or the disclosure has no bill rows.
    """
    if not meta.entity_name or meta.year is None or not detail.bills:
        return 0

    gc = year_to_general_court(meta.year)
    entity_name = meta.entity_name
    entity_norm = normalize_entity_name(entity_name)
    now = _now()

    batch = db.batch()
    count = 0

    for bill in detail.bills:
        # Rows without a bill ID (Executive / Other chambers, or an unparseable
        # number) would otherwise all share one ID per client and position and
        # silently overwrite each other. Fall back to the raw activity
        # reference so each distinct activity gets its own document. IDs of
        # rows that do have a bill ID are unchanged.
        activity_key = bill.bill_id or f"{bill.raw_bill_number}|{bill.activity_title}"
        fid = filing_id(
            entity_name,
            bill.client_name,
            bill.chamber,
            activity_key,
            gc,
            bill.position,
        )
        ref = db.collection(FILINGS_COLLECTION).document(fid)
        doc = {
            "filingId": fid,
            "entityName": entity_name,
            "entityNameNorm": entity_norm,
            "clientName": bill.client_name,
            "clientNameNorm": normalize_entity_name(bill.client_name),
            "year": meta.year,
            "generalCourt": gc,
            "chamber": bill.chamber,
            "billId": bill.bill_id,
            "activityTitle": bill.activity_title,
            "position": bill.position,
            "amount": bill.amount,
            "fetchedAt": now,
        }
        batch.set(ref, doc)
        count += 1

        # Firestore batch limit is 500 writes
        if count % 400 == 0:
            batch.commit()
            batch = db.batch()

    # Flush the final partial batch (skipped when the last chunk was exactly full).
    if count % 400 != 0:
        batch.commit()

    return count