From 4b793e3e514664fff81d420a3f8d2ceae380d2c8 Mon Sep 17 00:00:00 2001 From: Sam DeMarrais Date: Tue, 26 May 2026 17:13:26 -0400 Subject: [PATCH 01/10] New attempt at hearing backend --- .../src/bills/updateBillReferences.test.ts | 2 + functions/src/events/AssemblyAIHandler.ts | 433 +++++++++++++++++ functions/src/events/EventScraper.ts | 221 +++++++++ functions/src/events/HearingScraper.ts | 194 ++++++++ functions/src/events/index.ts | 2 + functions/src/events/scrapeEvents.ts | 438 +----------------- functions/src/events/types.ts | 14 +- functions/src/hearings/search.ts | 4 +- functions/src/index.ts | 1 + functions/src/webhooks/transcription.ts | 19 +- .../backfillHearingTranscription.ts | 73 +-- .../backfillHearingVideoFormat.ts | 102 ++++ 12 files changed, 1009 insertions(+), 494 deletions(-) create mode 100644 functions/src/events/AssemblyAIHandler.ts create mode 100644 functions/src/events/EventScraper.ts create mode 100644 functions/src/events/HearingScraper.ts create mode 100644 scripts/firebase-admin/backfillHearingVideoFormat.ts diff --git a/functions/src/bills/updateBillReferences.test.ts b/functions/src/bills/updateBillReferences.test.ts index 04e7bb762..7e16873d5 100644 --- a/functions/src/bills/updateBillReferences.test.ts +++ b/functions/src/bills/updateBillReferences.test.ts @@ -13,6 +13,8 @@ function createHearing( type: "hearing", startsAt, fetchedAt: Timestamp.fromMillis(Date.now()), + videos: [], + transcriptionIds: [], content: { EventId: 1, EventDate: "2026-02-01T10:00:00", diff --git a/functions/src/events/AssemblyAIHandler.ts b/functions/src/events/AssemblyAIHandler.ts new file mode 100644 index 000000000..a31648b37 --- /dev/null +++ b/functions/src/events/AssemblyAIHandler.ts @@ -0,0 +1,433 @@ +import { AssemblyAI, Transcript, TranscriptParagraph, TranscriptUtterance, TranscriptWord } from "assemblyai" +import { db, storage } from "../firebase" +import { randomBytes } from "node:crypto" +import { sha256 } from "js-sha256" +import ffmpeg from "fluent-ffmpeg" +import fs from "fs" + +abstract class AssemblyAIHandlerBase { + abstract submitTranscription({ + EventId, + videoUrl, + bucketName + }: { + EventId: number + videoUrl: string + bucketName?: string + }): Promise; + + async submitTranscriptions({ + EventId, + videoUrls, + bucketName + }: { + EventId: number + videoUrls: string[] + bucketName?: string + }): Promise { + const transcriptionIds = await Promise.all(videoUrls.map(item => { + return this.submitTranscription({videoUrl: item, EventId, bucketName}) + })); + + return transcriptionIds + } + + abstract getTranscript(transcript_id: string): Promise; + abstract fetchParagraphs(transcript_id: string): Promise; +} + + +export class AssemblyAIHandler extends AssemblyAIHandlerBase { + assembly: AssemblyAI; + + constructor({ + apiKey + }: { + apiKey: string + }) { + super(); + this.assembly = new AssemblyAI({ + apiKey + }); + } + + async submitTranscription({ + EventId, + videoUrl, + bucketName + }: { + EventId: number + videoUrl: string + bucketName?: string + }): Promise { + const newToken = randomBytes(16).toString("hex") + const audioUrl = await extractAudioFromVideo( + EventId, + videoUrl, + bucketName + ) + + const transcript = await this.assembly.transcripts.submit({ + audio: + // test with: "https://assemblyaiusercontent.com/playground/aKUqpEtmYmI.flac", + audioUrl, + webhook_url: + // make sure process.env.FUNCTIONS_API_BASE equals + // https://us-central1-digital-testimony-prod.cloudfunctions.net + // on prod. test with: + // "https://ngrokid.ngrok-free.app/demo-dtp/us-central1/transcription", + `${process.env.FUNCTIONS_API_BASE}/transcription`, + speaker_labels: true, + webhook_auth_header_name: "x-maple-webhook", + webhook_auth_header_value: newToken + }) + + await db + .collection("events") + .doc(`hearing-${String(EventId)}`) + .collection("private") + .doc(transcript.id) + .set({ + videoAssemblyWebhookToken: sha256(newToken) + }) + + return transcript.id + } + + async getTranscript(transcript_id: string): Promise { + return (await this.assembly.transcripts.get(transcript_id)); + } + + async fetchParagraphs(transcript_id: string): Promise { + return (await this.assembly.transcripts.paragraphs(transcript_id)).paragraphs; + } +} + +export class AssemblyAIHandlerDummy extends AssemblyAIHandlerBase { + async submitTranscription({ + EventId, + videoUrl, + bucketName + }: { + EventId: number + videoUrl: string + bucketName?: string + }): Promise { + const token = randomBytes(16).toString("hex") + const transcriptionId = `mock_${Math.random().toString(36).slice(2)}` + + setTimeout(async () => { + await fetch("http://localhost:5001/demo-dtp/us-central1/transcription", { + method: "POST", + headers: { + "Content-Type": "application/json", + "x-maple-webhook": token + }, + body: JSON.stringify(await this.getTranscript(transcriptionId)) + }) + }, 10000) + + await db + .collection("events") + .doc(`hearing-${String(EventId)}`) + .collection("private") + .doc(transcriptionId) + .set({ + videoAssemblyWebhookToken: sha256(token) + }) + + return transcriptionId; + } + + async getTranscript(transcriptId: string): Promise { + return getTranscript(transcriptId).transcript; + } + + async fetchParagraphs(transcriptId: string): Promise { + return getTranscript(transcriptId).paragraphs; + } +} + + +const extractAudioFromVideo = async ( + EventId: number, + videoUrl: string, + bucketName?: string +): Promise => { + const tmpFilePath = `/tmp/hearing-${EventId}-${Date.now()}.m4a` + + // Stream directly from URL and copy audio codec + await new Promise((resolve, reject) => { + ffmpeg(videoUrl) + .noVideo() + .audioCodec("copy") + .format("mp4") + .on("start", commandLine => { + console.log(`Spawned FFmpeg with command: ${commandLine}`) + }) + .on("end", () => { + console.log("FFmpeg processing finished successfully") + resolve() + }) + .on("error", err => { + console.error("FFmpeg error:", err) + reject(err) + }) + .save(tmpFilePath) + }) + + // Upload the audio file + const bucket = bucketName ? storage.bucket(bucketName) : storage.bucket() + const audioFileName = `hearing-${EventId}-${Date.now()}.m4a` + const file = bucket.file(audioFileName) + + const fileContent = await fs.promises.readFile(tmpFilePath) + await file.save(fileContent, { + metadata: { + contentType: "audio/mp4" + } + }) + + // Clean up temporary file + await fs.promises.unlink(tmpFilePath) + + const [url] = await file.getSignedUrl({ + action: "read", + expires: Date.now() + 24 * 60 * 60 * 1000 + }) + + // Delete old files + const [files] = await bucket.getFiles({ + prefix: "hearing-", + maxResults: 1000 + }) + const oneDayAgo = Date.now() - 24 * 60 * 60 * 1000 + const oldFiles = files.filter(file => { + const timestamp = parseInt(file.name.split("-").pop()?.split(".")[0] || "0") + return timestamp < oneDayAgo + }) + await Promise.all(oldFiles.map(file => file.delete())) + + // Return the new audio url + return url +} + +export const submitTranscription = async ({ + EventId, + maybeVideoUrl, + bucketName +}: { + EventId: number + maybeVideoUrl: string + bucketName?: string +}) => { + const assembly = new AssemblyAI({ + apiKey: process.env.ASSEMBLY_API_KEY ? process.env.ASSEMBLY_API_KEY : "" + }) + + const newToken = randomBytes(16).toString("hex") + const audioUrl = await extractAudioFromVideo( + EventId, + maybeVideoUrl, + bucketName + ) + + const transcript = await assembly.transcripts.submit({ + audio: + // test with: "https://assemblyaiusercontent.com/playground/aKUqpEtmYmI.flac", + audioUrl, + webhook_url: + // make sure process.env.FUNCTIONS_API_BASE equals + // https://us-central1-digital-testimony-prod.cloudfunctions.net + // on prod. test with: + // "https://ngrokid.ngrok-free.app/demo-dtp/us-central1/transcription", + `${process.env.FUNCTIONS_API_BASE}/transcription`, + speaker_labels: true, + webhook_auth_header_name: "x-maple-webhook", + webhook_auth_header_value: newToken + }) + + await db + .collection("events") + .doc(`hearing-${String(EventId)}`) + .collection("private") + .doc("webhookAuth") + .set({ + videoAssemblyWebhookToken: sha256(newToken) + }) + + return transcript.id +} + +const WORD_BANK = [ + "lorem", + "ipsum", + "dolor", + "sit", + "amet", + "consectetur", + "adipiscing", + "elit", + "sed", + "do", + "eiusmod", + "tempor", + "incididunt", + "ut", + "labore", + "et", + "dolore", + "magna", + "aliqua" +]; + +const SPEAKERS = ["A", "B", "C"]; + +function randomInt(min: number, max: number) { + return Math.floor(Math.random() * (max - min + 1)) + min; +} + +function randomFloat(min: number, max: number, precision = 2) { + return Number((Math.random() * (max - min) + min).toFixed(precision)); +} + +function mean(values: number[]) { + return values.reduce((a, b) => a + b, 0) / values.length; +} + +function loremSentence(length: number) { + return Array.from({ length }, () => { + return WORD_BANK[randomInt(0, WORD_BANK.length - 1)]; + }); +} + +function loremParagraph(length: number) { + return Array.from({ length }, () => + loremSentence(randomInt(3, 10)) + ); +} + +/** + * paragraphs -> sentences -> words + */ +function loremTranscriptStructure() { + return Array.from( + { length: randomInt(10, 20) }, + () => loremParagraph(randomInt(3, 8)) + ); +} + +export function getTranscript( + transcript_id: string +): { + transcript: Transcript; + paragraphs: TranscriptParagraph[]; +} { + const structure = loremTranscriptStructure(); + + const utterances: TranscriptUtterance[] = []; + const paragraphs: TranscriptParagraph[] = []; + const allWords: TranscriptWord[] = []; + + let currentTime = 0; + + for (const paragraph of structure) { + const speaker = SPEAKERS[randomInt(0, SPEAKERS.length - 1)]; + + const paragraphWords: TranscriptWord[] = []; + + for (const sentence of paragraph) { + const sentenceWords: TranscriptWord[] = []; + + for (const token of sentence) { + const confidence = randomFloat(0.5, 0.99); + + const word: TranscriptWord = { + confidence, + start: Number(currentTime.toFixed(2)), + end: Number((currentTime + 1).toFixed(2)), + speaker, + text: token + }; + + sentenceWords.push(word); + paragraphWords.push(word); + allWords.push(word); + + currentTime += 1; + } + + const utterance: TranscriptUtterance = { + confidence: Number( + mean(sentenceWords.map(w => w.confidence)).toFixed(2) + ), + start: sentenceWords[0].start, + end: sentenceWords[sentenceWords.length - 1].end, + speaker, + text: sentenceWords.map(w => w.text).join(" "), + words: sentenceWords + }; + + utterances.push(utterance); + + // pause between sentences + currentTime += randomFloat(0.2, 1.2); + } + + // paragraph object + const transcriptParagraph: TranscriptParagraph = { + confidence: Number( + mean(paragraphWords.map(w => w.confidence)).toFixed(2) + ), + start: paragraphWords[0].start, + end: paragraphWords[paragraphWords.length - 1].end, + text: paragraphWords.map(w => w.text).join(" "), + words: paragraphWords + }; + + paragraphs.push(transcriptParagraph); + + // longer pause between paragraphs + currentTime += randomFloat(1, 3); + } + + const transcript: Transcript = { + acoustic_model: "no", + audio_url: "https://example.com/definitely-a-video", + auto_highlights: false, + id: transcript_id, + language_confidence: 0.95, + language_confidence_threshold: 0.03, + language_model: "no", + speech_model: null, + redact_pii: true, + status: "completed", + summarization: false, + webhook_auth: true, + webhook_auth_header_name: "x-maple-webhook", + + text: utterances.map(u => u.text).join(". "), + confidence: Number( + mean(allWords.map(w => w.confidence)).toFixed(2) + ), + + utterances, + words: allWords + }; + + return { + transcript, + paragraphs + }; +} + +export const assemblyAI: AssemblyAIHandler | AssemblyAIHandlerDummy = (() => { + const apiKey = process.env.ASSEMBLY_API_KEY; + if (!apiKey || apiKey === "test-api-key") { + console.log("AssemblyAI is faked for this emulator"); + return new AssemblyAIHandlerDummy(); + } else { + console.log("AssemblyAI is real for this emulator"); + return new AssemblyAIHandler({ apiKey }); + } +})(); diff --git a/functions/src/events/EventScraper.ts b/functions/src/events/EventScraper.ts new file mode 100644 index 000000000..ebb4d8bfa --- /dev/null +++ b/functions/src/events/EventScraper.ts @@ -0,0 +1,221 @@ +import { RuntimeOptions, runWith } from "firebase-functions/v1" +import { DateTime } from "luxon" +import { logFetchError } from "../common" +import { db, Timestamp } from "../firebase" +import * as api from "../malegislature" +import { + BaseEvent, + BaseEventContent, + Session, + SessionContent, + SpecialEvent, + SpecialEventContent +} from "./types" +import { currentGeneralCourt } from "../shared" + +export abstract class EventScraper { + private schedule + private timeout + private memory + private pastEventCutoff + + constructor( + schedule: string, + timeout: number, + { + memory = "256MB", + pastEventCutoff = { days: 8 } + }: { + memory?: RuntimeOptions["memory"] + pastEventCutoff?: Duration + } = {} + ) { + this.schedule = schedule + this.timeout = timeout + this.memory = memory + this.pastEventCutoff = pastEventCutoff + } + + get function() { + return runWith({ + timeoutSeconds: this.timeout, + secrets: ["ASSEMBLY_API_KEY"], + memory: this.memory, + maxInstances: 1 + }) + .pubsub.schedule(this.schedule) + .onRun(() => this.run()) + } + + abstract listEvents(): Promise + abstract getEvent(item: ListItem): Promise + + private async run() { + const list = await this.listEvents().catch(logFetchError("event list")) + + if (!list) return + + const writer = db.bulkWriter() + const upcomingOrRecentCutoff = DateTime.now().minus(this.pastEventCutoff) + + for (let item of list) { + const id = (item as any)?.EventId, + event = await this.getEvent(item).catch(logFetchError("event", id)) + + if (!event) continue + if (event.startsAt.toMillis() < upcomingOrRecentCutoff.toMillis()) break + + writer.set(db.doc(`/events/${event.id}`), event, { merge: true }) + + console.log("event in run()", event) + } + + await writer.close() + } + + /** Parse the event start time in the time zone of the API. */ + getEventStart(content: { EventDate: string; StartTime: string }) { + const { year, month, day } = DateTime.fromISO(content.EventDate, { + zone: api.timeZone + }) + const { hour, minute, second, millisecond } = DateTime.fromISO( + content.StartTime, + { zone: api.timeZone } + ) + const startsAt = DateTime.fromObject( + { year, month, day, hour, minute, second, millisecond }, + { zone: api.timeZone } + ) + return startsAt + } + + /** Return timestamps shared between event types. */ + timestamps(content: BaseEventContent) { + const startsAt = this.getEventStart(content) + return { + fetchedAt: Timestamp.now(), + startsAt: Timestamp.fromMillis(startsAt.toMillis()) + } + } +} + +export class SpecialEventsScraper extends EventScraper< + SpecialEventContent, + SpecialEvent +> { + constructor() { + super("every 60 minutes", 540) + } + + async listEvents() { + const events = await api.getSpecialEvents() + return events.filter(SpecialEventContent.guard) + } + + getEvent(content: SpecialEventContent) { + const event: SpecialEvent = { + id: `specialEvent-${content.EventId}`, + type: "specialEvent", + content, + ...this.timestamps(content) + } + return Promise.resolve(event) + } +} + +export class SessionScraper extends EventScraper { + private court = currentGeneralCourt + + constructor() { + super("every 60 minutes", 120) + } + + async listEvents() { + const events = await api.getSessions(this.court) + return events.filter(SessionContent.guard) + } + + getEvent(content: SessionContent) { + const event: Session = { + id: `session-${this.court}-${content.EventId}`, + type: "session", + content, + ...this.timestamps(content) + } + return Promise.resolve(event) + } +} + +export abstract class EventPostProcessor { + private schedule + private timeout + private eventType + private memory + private pastEventBeginProcessing + private pastEventCutoff + + constructor( + schedule: string, + timeout: number, + eventType: string, + { + memory = "256MB", + pastEventBeginProcessing = {}, + pastEventCutoff = { days: 8 } + }: { + memory?: RuntimeOptions["memory"] + pastEventBeginProcessing?: Duration + pastEventCutoff?: Duration + } = {} + ) { + this.schedule = schedule + this.timeout = timeout + this.eventType = eventType + this.memory = memory + this.pastEventBeginProcessing = pastEventBeginProcessing + this.pastEventCutoff = pastEventCutoff + } + + get function() { + return runWith({ + timeoutSeconds: this.timeout, + secrets: ["ASSEMBLY_API_KEY"], + memory: this.memory, + maxInstances: 1 + }) + .pubsub.schedule(this.schedule) + .onRun(() => this.run()) + } + + abstract updateIf(data: FirebaseFirestore.DocumentData): null | ListItem + abstract getUpdate(item: ListItem): any + + private async run() { + const writer = db.bulkWriter() + + const now = DateTime.now() + const begin = now.minus(this.pastEventBeginProcessing).toJSDate() + const cutoff = now.minus(this.pastEventCutoff).toJSDate() + + const snapshot = await db.collection("events") + .where("type", "==", this.eventType) + .where("startsAt", "<=", begin) + .where("startsAt", ">=", cutoff) + .get() + + if (snapshot.empty) return + + for (const doc of snapshot.docs) { + const data = doc.data() + if (!data) continue + const item = this.updateIf(data) + if (!item) continue + + writer.update(doc.ref, this.getUpdate(item)) + + console.log("event in run()", data) + } + + await writer.close() + } +} diff --git a/functions/src/events/HearingScraper.ts b/functions/src/events/HearingScraper.ts new file mode 100644 index 000000000..00c766f82 --- /dev/null +++ b/functions/src/events/HearingScraper.ts @@ -0,0 +1,194 @@ +import { JSDOM } from "jsdom" +import { db, Timestamp } from "../firebase" +import * as api from "../malegislature" +import { + Hearing, + HearingContent, + HearingListItem, + Video +} from "./types" +import { isValidVideoUrl } from "./helpers" +import { Committee } from "../committees/types" +import { EventPostProcessor, EventScraper } from "./EventScraper" +import { assemblyAI } from "./AssemblyAIHandler" + +const loadCommitteeChairNames = async ( + generalCourtNumber: number, + committeeCode: string +) => { + try { + const committeeSnap = await db + .collection(`generalCourts/${generalCourtNumber}/committees`) + .doc(committeeCode) + .get() + + if (!committeeSnap.exists) return [] as string[] + + const { members, content } = Committee.check(committeeSnap.data()) + const chairCodes = new Set() + const maybeHouse = content.HouseChairperson?.MemberCode + const maybeSenate = content.SenateChairperson?.MemberCode + + if (maybeHouse) chairCodes.add(maybeHouse) + if (maybeSenate) chairCodes.add(maybeSenate) + return (members ?? []) + .filter(member => chairCodes.has(member.id)) + .map(member => member.name) + } catch (error) { + console.warn( + `Failed to load committee chairs for ${committeeCode} (${generalCourtNumber}):`, + error + ) + return [] as string[] + } +} + + +export class HearingScraper extends EventScraper { + constructor() { + super("every 60 minutes", 480) + } + + async listEvents() { + const events = await api.listHearings() + return events.filter(HearingListItem.guard) + } + + async getEvent( + { EventId }: HearingListItem /* e.g. 4962 */ + ) { + const data = await api.getHearing(EventId) + const content = HearingContent.check(data) + + console.log("content in getEvent()", content) + + const host = content.HearingHost + const committeeChairs = + host?.CommitteeCode && host?.GeneralCourtNumber + ? await loadCommitteeChairNames( + host.GeneralCourtNumber, + host.CommitteeCode + ) + : [] + + return { + id: `hearing-${EventId}`, + type: "hearing", + content, + committeeChairs, + videos: [], + transcriptionIds: [], + ...this.timestamps(content) + } as Hearing + } +} + +export class HearingPostProcessor extends EventPostProcessor { + constructor() { + super("every 60 minutes", 480, "hearing", { memory: "4GB" }) + } + + + async getHearingVideos(EventId: number): Promise[]> { + const hearingErr = `An error collecting videos for hearing ${EventId} (webpage format changed?)`; + + const req = await fetch( + `https://malegislature.gov/Events/Hearings/Detail/${EventId}` + ) + const res = await req.text() + if (!res) throw new Error(`${hearingErr}: No response for request`); + const dom = new JSDOM(res); + if (!dom) throw new Error(`${hearingErr}: Could not create JSDOM of request`); + + const videoElements = [].slice.call(dom.window.document.querySelectorAll('#playWebcast')) as Element[]; + if (videoElements.length === 0) return [] + const videoURLs = videoElements.map((elem) => { + const onclick = elem.getAttribute("onclick"); + if (!onclick) throw new Error(`${hearingErr}: No onclick in ${elem}`); + const match = onclick.match(/switchVideo\('([^']+)'/); + if (!match || match.length < 2) throw new Error(`${hearingErr}: Could not match switchVideo in ${elem}`); + if (!isValidVideoUrl(match[1])) throw new Error(`${hearingErr}: ${match[1]} is not a valid video url`); + return match[1]; + }); + const tbody = videoElements[0].closest("tbody"); + if (!tbody) throw new Error(`${hearingErr}: Could not find parent tbody of #playWebcast`); + const titles = Array.from(tbody.querySelectorAll("tr")).map(tr => { + const item = tr.querySelector("td")?.textContent?.trim(); + if (!item) throw new Error(`${hearingErr}: Could not locate title in ${tr}`) + return item; + }); + if (titles.length !== videoURLs.length) throw new Error(`${hearingErr}: Number of video table rows did not equal number of #playWebcast elements`) + + let videos = videoURLs.map((url, i) => { + return { + url: url, title: titles[i] + } + }); + + let seen = new Set(); + videos = videos.filter(item => { + if (seen.has(item.url)) return false; + seen.add(item.url); + return true; + }); + + if (videos.length > 1) { + const order = videos.map(item => { + const title = item.title.toLowerCase(); + const match = title.match(/\b(?:(\d+)\s+of\s+\d+|part\s+(\d+)|pt\.?\s+(\d+))\b/); + if (!match) return -1; + const part = parseInt(match[1] || match[2] || match[3], 10); + return part-1; + }); + seen.clear(); + let validOrder = true; + for (const n of order) { + if (n < 0 || n >= order.length || seen.has(n)) { + validOrder = false; + break; + } + seen.add(n); + } + if (validOrder) { + const reordered = new Array(videos.length); + for (let i = 0; i < order.length; i++) { + reordered[order[i]] = videos[i]; + } + videos = reordered; + videos = videos.map((item, index) => { + item.title = `Part ${index+1}`; + return item; + }) + } else { + console.log(`While scraping hearing videos, the titles ${titles} could not be mapped to a numeric order`) + } + } else { + videos[0].title = `hearing-${EventId}` + } + return videos; + } + + updateIf(data: FirebaseFirestore.DocumentData): null | HearingListItem { + if (data.videos.length) return null + return { EventId: data.id } + } + + async getUpdate( + { EventId }: HearingListItem + ): Promise<{ transcriptionIds: string[], videos: Video[], videosFetchedAt: Timestamp }> { + const videos = await this.getHearingVideos(EventId) + const transcriptionIds = await assemblyAI.submitTranscriptions({videoUrls: videos.map(item => item.url), EventId}) + const videosWithTranscriptions = videos.map((item, index) => { + return { + transcriptionId: transcriptionIds[index], + ...item + } + }) + + return { + transcriptionIds, + videos: videosWithTranscriptions, + videosFetchedAt: Timestamp.now(), + } + } +} diff --git a/functions/src/events/index.ts b/functions/src/events/index.ts index 96ff5307d..2a1f508ad 100644 --- a/functions/src/events/index.ts +++ b/functions/src/events/index.ts @@ -1,3 +1,5 @@ export * from "./scrapeEvents" export { scrapeSingleHearing } from "./scrapeEvents" export { scrapeSingleHearingv2 } from "./scrapeEvents" +export { assemblyAI } from "./AssemblyAIHandler" +export { HearingScraper, HearingPostProcessor } from "./HearingScraper" diff --git a/functions/src/events/scrapeEvents.ts b/functions/src/events/scrapeEvents.ts index 419bd505b..701ccb567 100644 --- a/functions/src/events/scrapeEvents.ts +++ b/functions/src/events/scrapeEvents.ts @@ -1,418 +1,9 @@ import * as functions from "firebase-functions/v1" -import { RuntimeOptions, runWith } from "firebase-functions/v1" import { onCall, CallableRequest } from "firebase-functions/v2/https" -import { DateTime } from "luxon" -import { JSDOM } from "jsdom" -import { AssemblyAI } from "assemblyai" -import { checkAuth, checkAdmin, logFetchError } from "../common" -import { db, storage, Timestamp } from "../firebase" -import * as api from "../malegislature" -import { - BaseEvent, - BaseEventContent, - Hearing, - HearingContent, - HearingListItem, - Session, - SessionContent, - SpecialEvent, - SpecialEventContent -} from "./types" -import { currentGeneralCourt } from "../shared" -import { randomBytes } from "node:crypto" -import { sha256 } from "js-sha256" -import { isValidVideoUrl, withinCutoff } from "./helpers" -import ffmpeg from "fluent-ffmpeg" -import fs from "fs" -import { Committee } from "../committees/types" -abstract class EventScraper { - private schedule - private timeout - private memory - - constructor( - schedule: string, - timeout: number, - memory: RuntimeOptions["memory"] = "256MB" - ) { - this.schedule = schedule - this.timeout = timeout - this.memory = memory - } - - get function() { - return runWith({ - timeoutSeconds: this.timeout, - secrets: ["ASSEMBLY_API_KEY"], - memory: this.memory, - maxInstances: 1 - }) - .pubsub.schedule(this.schedule) - .onRun(() => this.run()) - } - - abstract listEvents(): Promise - abstract getEvent(item: ListItem): Promise - - private async run() { - const list = await this.listEvents().catch(logFetchError("event list")) - - if (!list) return - - const writer = db.bulkWriter() - const upcomingOrRecentCutoff = DateTime.now().minus({ days: 8 }) - - for (let item of list) { - const id = (item as any)?.EventId, - event = await this.getEvent(item).catch(logFetchError("event", id)) - - if (!event) continue - if (event.startsAt.toMillis() < upcomingOrRecentCutoff.toMillis()) break - - writer.set(db.doc(`/events/${event.id}`), event, { merge: true }) - - console.log("event in run()", event) - } - - await writer.close() - } - - /** Parse the event start time in the time zone of the API. */ - getEventStart(content: { EventDate: string; StartTime: string }) { - const { year, month, day } = DateTime.fromISO(content.EventDate, { - zone: api.timeZone - }) - const { hour, minute, second, millisecond } = DateTime.fromISO( - content.StartTime, - { zone: api.timeZone } - ) - const startsAt = DateTime.fromObject( - { year, month, day, hour, minute, second, millisecond }, - { zone: api.timeZone } - ) - return startsAt - } - - /** Return timestamps shared between event types. */ - timestamps(content: BaseEventContent) { - const startsAt = this.getEventStart(content) - return { - fetchedAt: Timestamp.now(), - startsAt: Timestamp.fromMillis(startsAt.toMillis()) - } - } -} - -class SpecialEventsScraper extends EventScraper< - SpecialEventContent, - SpecialEvent -> { - constructor() { - super("every 60 minutes", 540) - } - - async listEvents() { - const events = await api.getSpecialEvents() - return events.filter(SpecialEventContent.guard) - } - - getEvent(content: SpecialEventContent) { - const event: SpecialEvent = { - id: `specialEvent-${content.EventId}`, - type: "specialEvent", - content, - ...this.timestamps(content) - } - return Promise.resolve(event) - } -} - -class SessionScraper extends EventScraper { - private court = currentGeneralCourt - - constructor() { - super("every 60 minutes", 120) - } - - async listEvents() { - const events = await api.getSessions(this.court) - return events.filter(SessionContent.guard) - } - - getEvent(content: SessionContent) { - const event: Session = { - id: `session-${this.court}-${content.EventId}`, - type: "session", - content, - ...this.timestamps(content) - } - return Promise.resolve(event) - } -} - -const extractAudioFromVideo = async ( - EventId: number, - videoUrl: string, - bucketName?: string -): Promise => { - const tmpFilePath = `/tmp/hearing-${EventId}-${Date.now()}.m4a` - - // Stream directly from URL and copy audio codec - await new Promise((resolve, reject) => { - ffmpeg(videoUrl) - .noVideo() - .audioCodec("copy") - .format("mp4") - .on("start", commandLine => { - console.log(`Spawned FFmpeg with command: ${commandLine}`) - }) - .on("end", () => { - console.log("FFmpeg processing finished successfully") - resolve() - }) - .on("error", err => { - console.error("FFmpeg error:", err) - reject(err) - }) - .save(tmpFilePath) - }) - - // Upload the audio file - const bucket = bucketName ? storage.bucket(bucketName) : storage.bucket() - const audioFileName = `hearing-${EventId}-${Date.now()}.m4a` - const file = bucket.file(audioFileName) - - const fileContent = await fs.promises.readFile(tmpFilePath) - await file.save(fileContent, { - metadata: { - contentType: "audio/mp4" - } - }) - - // Clean up temporary file - await fs.promises.unlink(tmpFilePath) - - const [url] = await file.getSignedUrl({ - action: "read", - expires: Date.now() + 24 * 60 * 60 * 1000 - }) - - // Delete old files - const [files] = await bucket.getFiles({ - prefix: "hearing-", - maxResults: 1000 - }) - const oneDayAgo = Date.now() - 24 * 60 * 60 * 1000 - const oldFiles = files.filter(file => { - const timestamp = parseInt(file.name.split("-").pop()?.split(".")[0] || "0") - return timestamp < oneDayAgo - }) - await Promise.all(oldFiles.map(file => file.delete())) - - // Return the new audio url - return url -} - -export const submitTranscription = async ({ - EventId, - maybeVideoUrl, - bucketName -}: { - EventId: number - maybeVideoUrl: string - bucketName?: string -}) => { - const assembly = new AssemblyAI({ - apiKey: process.env.ASSEMBLY_API_KEY ? process.env.ASSEMBLY_API_KEY : "" - }) - - const newToken = randomBytes(16).toString("hex") - const audioUrl = await extractAudioFromVideo( - EventId, - maybeVideoUrl, - bucketName - ) - - const transcript = await assembly.transcripts.submit({ - audio: - // test with: "https://assemblyaiusercontent.com/playground/aKUqpEtmYmI.flac", - audioUrl, - webhook_url: - // make sure process.env.FUNCTIONS_API_BASE equals - // https://us-central1-digital-testimony-prod.cloudfunctions.net - // on prod. test with: - // "https://ngrokid.ngrok-free.app/demo-dtp/us-central1/transcription", - `${process.env.FUNCTIONS_API_BASE}/transcription`, - speaker_labels: true, - webhook_auth_header_name: "x-maple-webhook", - webhook_auth_header_value: newToken - }) - - await db - .collection("events") - .doc(`hearing-${String(EventId)}`) - .collection("private") - .doc("webhookAuth") - .set({ - videoAssemblyWebhookToken: sha256(newToken) - }) - - return transcript.id -} - -export const getHearingVideoUrl = async (EventId: number) => { - const req = await fetch( - `https://malegislature.gov/Events/Hearings/Detail/${EventId}` - ) - const res = await req.text() - if (res) { - const dom = new JSDOM(res) - if (dom) { - const maybeVideoSource = - dom.window.document.querySelectorAll("video source") - if (maybeVideoSource.length && maybeVideoSource[0]) { - const firstVideoSource = maybeVideoSource[0] as HTMLSourceElement - const maybeVideoUrl = firstVideoSource.src - - return isValidVideoUrl(maybeVideoUrl) ? maybeVideoUrl : null - } - } - } - return null -} - -const shouldScrapeVideo = async ( - EventId: number, - ignoreCutoff: boolean = false -) => { - const eventInDb = await db - .collection("events") - .doc(`hearing-${String(EventId)}`) - .get() - const eventData = eventInDb.data() - - console.log("eventData in shouldScrapeVideo()", eventData) - - if (!eventData) { - return false - } - if (!eventData.videoURL) { - return ( - ignoreCutoff || - withinCutoff(new Date(Hearing.check(eventData).startsAt.toDate())) - ) - } - return false -} - -const loadCommitteeChairNames = async ( - generalCourtNumber: number, - committeeCode: string -) => { - try { - const committeeSnap = await db - .collection(`generalCourts/${generalCourtNumber}/committees`) - .doc(committeeCode) - .get() - - if (!committeeSnap.exists) return [] as string[] - - const { members, content } = Committee.check(committeeSnap.data()) - const chairCodes = new Set() - const maybeHouse = content.HouseChairperson?.MemberCode - const maybeSenate = content.SenateChairperson?.MemberCode - - if (maybeHouse) chairCodes.add(maybeHouse) - if (maybeSenate) chairCodes.add(maybeSenate) - return (members ?? []) - .filter(member => chairCodes.has(member.id)) - .map(member => member.name) - } catch (error) { - console.warn( - `Failed to load committee chairs for ${committeeCode} (${generalCourtNumber}):`, - error - ) - return [] as string[] - } -} - -class HearingScraper extends EventScraper { - constructor() { - super("every 60 minutes", 480, "4GB") - } - - async listEvents() { - const events = await api.listHearings() - return events.filter(HearingListItem.guard) - } - - async getEvent( - { EventId }: HearingListItem /* e.g. 4962 */, - { ignoreCutoff = false }: { ignoreCutoff?: boolean } = {} - ) { - const data = await api.getHearing(EventId) - const content = HearingContent.check(data) - - console.log("content in getEvent()", content) - - const host = content.HearingHost - const committeeChairs = - host?.CommitteeCode && host?.GeneralCourtNumber - ? await loadCommitteeChairNames( - host.GeneralCourtNumber, - host.CommitteeCode - ) - : [] - - if (await shouldScrapeVideo(EventId, ignoreCutoff)) { - try { - const maybeVideoUrl = await getHearingVideoUrl(EventId) - if (maybeVideoUrl) { - const transcriptId = await submitTranscription({ - maybeVideoUrl, - EventId - }) - - // Immediately save video info to prevent reprocessing - // since the bulkWriter does not save the video properties - // returned from this method. - await db.collection("events").doc(`hearing-${EventId}`).update({ - videoURL: maybeVideoUrl, - videoFetchedAt: Timestamp.now(), - videoTranscriptionId: transcriptId - }) - - return { - id: `hearing-${EventId}`, - type: "hearing", - content, - ...this.timestamps(content), - videoURL: maybeVideoUrl, - videoFetchedAt: Timestamp.now(), - committeeChairs, - videoTranscriptionId: transcriptId // using the assembly Id as our transcriptionId - } as Hearing - } - } catch (error) { - console.error(`Failed to process audio for hearing ${EventId}:`, error) - return { - id: `hearing-${EventId}`, - type: "hearing", - content, - committeeChairs, - ...this.timestamps(content) - } as Hearing - } - } - return { - id: `hearing-${EventId}`, - type: "hearing", - content, - committeeChairs, - ...this.timestamps(content) - } as Hearing - } -} +import { checkAuth, checkAdmin } from "../common" +import { db } from "../firebase" +import { SpecialEventsScraper, SessionScraper } from "./EventScraper" +import { HearingScraper, HearingPostProcessor } from "./HearingScraper" /** * Callable cloud function to scrape a single hearing by EventId. @@ -442,12 +33,10 @@ export const scrapeSingleHearing = functions } try { - // Create a temporary scraper instance to reuse the existing logic - const scraper = new HearingScraper() - const hearing = await scraper.getEvent( - { EventId: eventId }, - { ignoreCutoff: true } - ) + const hearing = { + ...await new HearingScraper().getEvent({ EventId: eventId }), + ...await new HearingPostProcessor().getUpdate({ EventId: eventId }) // Videos + } // Save the hearing to Firestore await db.doc(`/events/${hearing.id}`).set(hearing, { merge: true }) @@ -487,12 +76,10 @@ export const scrapeSingleHearingv2 = onCall( } try { - // Create a temporary scraper instance to reuse the existing logic - const scraper = new HearingScraper() - const hearing = await scraper.getEvent( - { EventId: eventId }, - { ignoreCutoff: true } - ) + const hearing = { + ...await new HearingScraper().getEvent({ EventId: eventId }), + ...await new HearingPostProcessor().getUpdate({ EventId: eventId }) // Videos + } // Save the hearing to Firestore await db.doc(`/events/${hearing.id}`).set(hearing, { merge: true }) @@ -518,3 +105,4 @@ export const scrapeSingleHearingv2 = onCall( export const scrapeSpecialEvents = new SpecialEventsScraper().function export const scrapeSessions = new SessionScraper().function export const scrapeHearings = new HearingScraper().function +export const scrapeVideos = new HearingPostProcessor().function diff --git a/functions/src/events/types.ts b/functions/src/events/types.ts index 4101b41d1..90d40dfb5 100644 --- a/functions/src/events/types.ts +++ b/functions/src/events/types.ts @@ -97,13 +97,21 @@ export const HearingContent = BaseEventContent.extend({ export type HearingListItem = Static export const HearingListItem = Record({ EventId: Number }) +export type Video = Static +export const Video = Record({ + url: String, + title: String, + transcriptionId: String, +}) + + export type Hearing = Static export const Hearing = BaseEvent.extend({ type: L("hearing"), content: HearingContent, - videoURL: Optional(String), - videoTranscriptionId: Optional(String), - videoFetchedAt: Optional(InstanceOf(Timestamp)), + videos: Array(Video), + transcriptionIds: Array(String), + videosFetchedAt: Optional(InstanceOf(Timestamp)), committeeChairs: Optional(Array(String)) }) diff --git a/functions/src/hearings/search.ts b/functions/src/hearings/search.ts index fe26d0385..e5472eac8 100644 --- a/functions/src/hearings/search.ts +++ b/functions/src/hearings/search.ts @@ -57,7 +57,7 @@ export const { }, convert: data => { const hearing = Hearing.check(data) - const { content, startsAt: startsAtTimestamp, id, videoURL } = hearing + const { content, startsAt: startsAtTimestamp, id, videos } = hearing const startsAt = startsAtTimestamp.toMillis() const schedule = DateTime.fromMillis(startsAt, { zone: timeZone }) @@ -115,7 +115,7 @@ export const { bill => bill.slug || `${courtNumber}/${bill.number}` ), court: courtNumber, - hasVideo: Boolean(videoURL) + hasVideo: videos.length > 0 } } }) diff --git a/functions/src/index.ts b/functions/src/index.ts index 641255bf4..970e31b59 100644 --- a/functions/src/index.ts +++ b/functions/src/index.ts @@ -17,6 +17,7 @@ export { } from "./committees" export { scrapeHearings, + scrapeVideos, scrapeSessions, scrapeSpecialEvents, scrapeSingleHearing, diff --git a/functions/src/webhooks/transcription.ts b/functions/src/webhooks/transcription.ts index 04a5ed1f8..95174929f 100644 --- a/functions/src/webhooks/transcription.ts +++ b/functions/src/webhooks/transcription.ts @@ -1,5 +1,5 @@ import * as functions from "firebase-functions" -import { AssemblyAI } from "assemblyai" +import { assemblyAI } from "../events/AssemblyAIHandler" import { db, Timestamp } from "../firebase" import { sha256 } from "js-sha256" @@ -10,22 +10,15 @@ export const transcription = functions if (req.body.status === "completed") { // If we get a request with the right header and status, get the // transcription from the assembly API. - const assembly = new AssemblyAI({ - apiKey: process.env.ASSEMBLY_API_KEY - ? process.env.ASSEMBLY_API_KEY - : "" - }) - const transcript = await assembly.transcripts.get( - req.body.transcript_id - ) + const transcript = await assemblyAI.getTranscript(req.body.id) if (transcript && transcript.webhook_auth) { // If there is a transcript and the transcript has an auth property, // look for an event (aka Hearing) in the DB with a matching ID. const maybeEventsInDb = await db .collection("events") - .where("videoTranscriptionId", "==", transcript.id) + .where("transcriptionIds", "array-contains", transcript.id) .get() if (maybeEventsInDb.docs.length) { @@ -43,7 +36,7 @@ export const transcription = functions .collection("events") .doc(doc.id) .collection("private") - .doc("webhookAuth") + .doc(transcript.id) .get() const tokenDataInDb = @@ -69,12 +62,12 @@ export const transcription = functions // If there is one authenticated event, pull out the parts we want to // save and try to save them in the db. - const { paragraphs } = await assembly.transcripts.paragraphs( + const paragraphs = await assemblyAI.fetchParagraphs( transcript.id ) const { id, text, audio_url, utterances } = transcript try { - const transcriptionInDb = await db + const transcriptionInDb = db .collection("transcriptions") .doc(id) diff --git a/scripts/firebase-admin/backfillHearingTranscription.ts b/scripts/firebase-admin/backfillHearingTranscription.ts index a9c19daeb..1078e740a 100644 --- a/scripts/firebase-admin/backfillHearingTranscription.ts +++ b/scripts/firebase-admin/backfillHearingTranscription.ts @@ -1,7 +1,6 @@ -import { Timestamp } from "../../functions/src/firebase" import { Record, Number, String } from "runtypes" import { Script } from "./types" -import { getHearingVideoUrl, submitTranscription } from "functions/src/events" +import { HearingPostProcessor } from "functions/src/events" const Args = Record({ eventId: Number.optional(), @@ -20,30 +19,17 @@ export const script: Script = async ({ db, args }) => { return } const data = doc.data() - if (data?.videoTranscriptionId) { - console.log(`Hearing ${eventId} already has a transcription.`) - return - } + if (!data) return try { - const maybeVideoUrl = await getHearingVideoUrl(eventId) - if (maybeVideoUrl) { - const transcriptId = await submitTranscription({ - maybeVideoUrl, - EventId: eventId, - bucketName - }) - - await docRef.update({ - videoURL: maybeVideoUrl, - videoFetchedAt: Timestamp.now(), - videoTranscriptionId: transcriptId - }) + const update = await new HearingPostProcessor().getUpdate({ EventId: eventId }) + if (update !== null) { + await docRef.update(update) console.log( - `Transcription submitted for hearing ${eventId}: ${transcriptId}` + `Transcriptions submitted for hearing ${eventId}: ${update.transcriptionIds}` ) } else { - console.log(`No valid video URL found for hearing ${eventId}`) + console.log(`No additional videos to be processed for ${eventId}`) } } catch (error) { console.error(`Failed to process hearing ${eventId}:`, error) @@ -60,40 +46,25 @@ export const script: Script = async ({ db, args }) => { if (count >= 100) { break // Limit to 100 operations for this run } + const EventId = parseInt(doc.id.replace("hearing-", "")) + console.log(`Processing hearing ${EventId}...`) const data = doc.data() - if (!data.videoTranscriptionId) { - const EventId = parseInt(doc.id.replace("hearing-", "")) - console.log(`Processing hearing ${EventId}...`) - - try { - const maybeVideoUrl = await getHearingVideoUrl(EventId) - if (maybeVideoUrl) { - const transcriptId = await submitTranscription({ - maybeVideoUrl, - EventId, - bucketName - }) + if (data.empty) continue - await doc.ref.update({ - videoURL: maybeVideoUrl, - videoFetchedAt: Timestamp.now(), - videoTranscriptionId: transcriptId - }) + try { + const update = await new HearingPostProcessor().getUpdate({ EventId }) + if (update.videos.length > data.videos.length) { + await doc.ref.update(update) - console.log( - `Transcription submitted for hearing ${EventId}: ${transcriptId}` - ) - count++ - } else { - console.log(`No valid video URL found for hearing ${EventId}`) - } - } catch (error) { - console.error(`Failed to process hearing ${EventId}:`, error) + console.log( + `Transcriptions submitted for hearing ${EventId}: ${update.transcriptionIds}` + ) + count++ + } else { + console.log(`No additional videos to be processed for hearing ${EventId}`) } - } else { - console.log( - `Skipping hearing ${data.EventId}, already has transcription.` - ) + } catch (error) { + console.error(`Failed to process hearing ${EventId}:`, error) } } console.log("Done processing hearings without transcriptions.") diff --git a/scripts/firebase-admin/backfillHearingVideoFormat.ts b/scripts/firebase-admin/backfillHearingVideoFormat.ts new file mode 100644 index 000000000..f5630e485 --- /dev/null +++ b/scripts/firebase-admin/backfillHearingVideoFormat.ts @@ -0,0 +1,102 @@ +import { FieldValue, Timestamp } from "../../functions/src/firebase" +import { Record, Number } from "runtypes" +import { Script } from "./types" + +const Args = Record({ + eventId: Number.optional() +}) + +function migrateVideo( + data: FirebaseFirestore.DocumentData +): FirebaseFirestore.DocumentData | null { + if ('videos' in data) { + return null; + } + + if (!('videoURL' in data)) { + return { + videos: [], + transcriptionIds: [], + videoTranscriptionId: FieldValue.delete(), + videoFetchedAt: FieldValue.delete(), + videoURL: FieldValue.delete(), + }; + } + + const url = data.videoURL; + const fetchedAt = data?.videoFetchedAt; + const transcriptionId = data?.videoTranscriptionId; + + if (!fetchedAt) { + throw new Error( + `If videoURL is present for the video, it is expected that videoFetchedAt is also present (id: ${data.id})` + ); + } + + const transcriptionIds = transcriptionId ? [transcriptionId] : []; + + const videos = [ + { + // Default; not shown + title: data.id, + url, + transcriptionId, + fetchedAt, + }, + ]; + + return { + videos, + transcriptionIds, + videosFetchedAt: fetchedAt || Timestamp.now(), + videoTranscriptionId: FieldValue.delete(), + videoFetchedAt: FieldValue.delete(), + videoURL: FieldValue.delete(), + }; +} + +export const script: Script = async ({ db, args }) => { + const { eventId } = Args.check(args) + + // Process a single event by eventId + if (eventId) { + const snapshot = await db + .collection("events") + .where("type", "==", "hearing") + .where("id", "==", eventId) + .get() + + if (snapshot.empty || snapshot.docs.length !== 1) { + throw new Error(`The number of documents matching the event id ${eventId} must be exactly one`) + } + + const doc = snapshot.docs[0] + const modify = migrateVideo(doc.data()) + if (modify) { + doc.ref.update(modify) + } + } else { + const snapshot = await db + .collection("events") + .where("type", "==", "hearing") + .get(); + + if (snapshot.empty) { + throw new Error("Hearing backfill failed; no documents were found"); + } + + let bulkWriter = db.bulkWriter(); + + for (const doc of snapshot.docs) { + console.log(doc.data().id) + const modify = migrateVideo(doc.data()) + if (modify) { + // syncHearingToSearchIndex will temporarily complain due to the multiple updates of bulkWriter + bulkWriter.update(doc.ref, modify) + } + } + await bulkWriter.close(); + } + + console.log("Video backfill complete") +} From 9389560d9cc85466dbaf2211a68ba5caa6c02ae8 Mon Sep 17 00:00:00 2001 From: Sam DeMarrais Date: Tue, 2 Jun 2026 12:19:48 -0400 Subject: [PATCH 02/10] Prettier --- functions/src/events/AssemblyAIHandler.ts | 150 +++++++++--------- functions/src/events/EventScraper.ts | 3 +- functions/src/events/HearingScraper.ts | 141 ++++++++-------- functions/src/events/scrapeEvents.ts | 8 +- functions/src/events/types.ts | 3 +- functions/src/webhooks/transcription.ts | 4 +- .../backfillHearingTranscription.ts | 8 +- .../backfillHearingVideoFormat.ts | 46 +++--- 8 files changed, 188 insertions(+), 175 deletions(-) diff --git a/functions/src/events/AssemblyAIHandler.ts b/functions/src/events/AssemblyAIHandler.ts index a31648b37..b186e41b5 100644 --- a/functions/src/events/AssemblyAIHandler.ts +++ b/functions/src/events/AssemblyAIHandler.ts @@ -1,4 +1,10 @@ -import { AssemblyAI, Transcript, TranscriptParagraph, TranscriptUtterance, TranscriptWord } from "assemblyai" +import { + AssemblyAI, + Transcript, + TranscriptParagraph, + TranscriptUtterance, + TranscriptWord +} from "assemblyai" import { db, storage } from "../firebase" import { randomBytes } from "node:crypto" import { sha256 } from "js-sha256" @@ -14,7 +20,7 @@ abstract class AssemblyAIHandlerBase { EventId: number videoUrl: string bucketName?: string - }): Promise; + }): Promise async submitTranscriptions({ EventId, @@ -25,30 +31,29 @@ abstract class AssemblyAIHandlerBase { videoUrls: string[] bucketName?: string }): Promise { - const transcriptionIds = await Promise.all(videoUrls.map(item => { - return this.submitTranscription({videoUrl: item, EventId, bucketName}) - })); + const transcriptionIds = await Promise.all( + videoUrls.map(item => { + return this.submitTranscription({ videoUrl: item, EventId, bucketName }) + }) + ) return transcriptionIds } - abstract getTranscript(transcript_id: string): Promise; - abstract fetchParagraphs(transcript_id: string): Promise; + abstract getTranscript(transcript_id: string): Promise + abstract fetchParagraphs( + transcript_id: string + ): Promise } - export class AssemblyAIHandler extends AssemblyAIHandlerBase { - assembly: AssemblyAI; + assembly: AssemblyAI - constructor({ - apiKey - }: { - apiKey: string - }) { - super(); + constructor({ apiKey }: { apiKey: string }) { + super() this.assembly = new AssemblyAI({ apiKey - }); + }) } async submitTranscription({ @@ -61,11 +66,7 @@ export class AssemblyAIHandler extends AssemblyAIHandlerBase { bucketName?: string }): Promise { const newToken = randomBytes(16).toString("hex") - const audioUrl = await extractAudioFromVideo( - EventId, - videoUrl, - bucketName - ) + const audioUrl = await extractAudioFromVideo(EventId, videoUrl, bucketName) const transcript = await this.assembly.transcripts.submit({ audio: @@ -95,11 +96,12 @@ export class AssemblyAIHandler extends AssemblyAIHandlerBase { } async getTranscript(transcript_id: string): Promise { - return (await this.assembly.transcripts.get(transcript_id)); + return await this.assembly.transcripts.get(transcript_id) } async fetchParagraphs(transcript_id: string): Promise { - return (await this.assembly.transcripts.paragraphs(transcript_id)).paragraphs; + return (await this.assembly.transcripts.paragraphs(transcript_id)) + .paragraphs } } @@ -135,20 +137,19 @@ export class AssemblyAIHandlerDummy extends AssemblyAIHandlerBase { .set({ videoAssemblyWebhookToken: sha256(token) }) - - return transcriptionId; + + return transcriptionId } async getTranscript(transcriptId: string): Promise { - return getTranscript(transcriptId).transcript; + return getTranscript(transcriptId).transcript } async fetchParagraphs(transcriptId: string): Promise { - return getTranscript(transcriptId).paragraphs; + return getTranscript(transcriptId).paragraphs } } - const extractAudioFromVideo = async ( EventId: number, videoUrl: string, @@ -279,68 +280,63 @@ const WORD_BANK = [ "dolore", "magna", "aliqua" -]; +] -const SPEAKERS = ["A", "B", "C"]; +const SPEAKERS = ["A", "B", "C"] function randomInt(min: number, max: number) { - return Math.floor(Math.random() * (max - min + 1)) + min; + return Math.floor(Math.random() * (max - min + 1)) + min } function randomFloat(min: number, max: number, precision = 2) { - return Number((Math.random() * (max - min) + min).toFixed(precision)); + return Number((Math.random() * (max - min) + min).toFixed(precision)) } function mean(values: number[]) { - return values.reduce((a, b) => a + b, 0) / values.length; + return values.reduce((a, b) => a + b, 0) / values.length } function loremSentence(length: number) { return Array.from({ length }, () => { - return WORD_BANK[randomInt(0, WORD_BANK.length - 1)]; - }); + return WORD_BANK[randomInt(0, WORD_BANK.length - 1)] + }) } function loremParagraph(length: number) { - return Array.from({ length }, () => - loremSentence(randomInt(3, 10)) - ); + return Array.from({ length }, () => loremSentence(randomInt(3, 10))) } /** * paragraphs -> sentences -> words */ function loremTranscriptStructure() { - return Array.from( - { length: randomInt(10, 20) }, - () => loremParagraph(randomInt(3, 8)) - ); + return Array.from({ length: randomInt(10, 20) }, () => + loremParagraph(randomInt(3, 8)) + ) } -export function getTranscript( - transcript_id: string -): { - transcript: Transcript; - paragraphs: TranscriptParagraph[]; +export function getTranscript(transcript_id: string): { + transcript: Transcript + paragraphs: TranscriptParagraph[] } { - const structure = loremTranscriptStructure(); + const structure = loremTranscriptStructure() - const utterances: TranscriptUtterance[] = []; - const paragraphs: TranscriptParagraph[] = []; - const allWords: TranscriptWord[] = []; + const utterances: TranscriptUtterance[] = [] + const paragraphs: TranscriptParagraph[] = [] + const allWords: TranscriptWord[] = [] - let currentTime = 0; + let currentTime = 0 for (const paragraph of structure) { - const speaker = SPEAKERS[randomInt(0, SPEAKERS.length - 1)]; + const speaker = SPEAKERS[randomInt(0, SPEAKERS.length - 1)] - const paragraphWords: TranscriptWord[] = []; + const paragraphWords: TranscriptWord[] = [] for (const sentence of paragraph) { - const sentenceWords: TranscriptWord[] = []; + const sentenceWords: TranscriptWord[] = [] for (const token of sentence) { - const confidence = randomFloat(0.5, 0.99); + const confidence = randomFloat(0.5, 0.99) const word: TranscriptWord = { confidence, @@ -348,13 +344,13 @@ export function getTranscript( end: Number((currentTime + 1).toFixed(2)), speaker, text: token - }; + } - sentenceWords.push(word); - paragraphWords.push(word); - allWords.push(word); + sentenceWords.push(word) + paragraphWords.push(word) + allWords.push(word) - currentTime += 1; + currentTime += 1 } const utterance: TranscriptUtterance = { @@ -366,12 +362,12 @@ export function getTranscript( speaker, text: sentenceWords.map(w => w.text).join(" "), words: sentenceWords - }; + } - utterances.push(utterance); + utterances.push(utterance) // pause between sentences - currentTime += randomFloat(0.2, 1.2); + currentTime += randomFloat(0.2, 1.2) } // paragraph object @@ -383,12 +379,12 @@ export function getTranscript( end: paragraphWords[paragraphWords.length - 1].end, text: paragraphWords.map(w => w.text).join(" "), words: paragraphWords - }; + } - paragraphs.push(transcriptParagraph); + paragraphs.push(transcriptParagraph) // longer pause between paragraphs - currentTime += randomFloat(1, 3); + currentTime += randomFloat(1, 3) } const transcript: Transcript = { @@ -407,27 +403,25 @@ export function getTranscript( webhook_auth_header_name: "x-maple-webhook", text: utterances.map(u => u.text).join(". "), - confidence: Number( - mean(allWords.map(w => w.confidence)).toFixed(2) - ), + confidence: Number(mean(allWords.map(w => w.confidence)).toFixed(2)), utterances, words: allWords - }; + } return { transcript, paragraphs - }; + } } export const assemblyAI: AssemblyAIHandler | AssemblyAIHandlerDummy = (() => { - const apiKey = process.env.ASSEMBLY_API_KEY; + const apiKey = process.env.ASSEMBLY_API_KEY if (!apiKey || apiKey === "test-api-key") { - console.log("AssemblyAI is faked for this emulator"); - return new AssemblyAIHandlerDummy(); + console.log("AssemblyAI is faked for this emulator") + return new AssemblyAIHandlerDummy() } else { - console.log("AssemblyAI is real for this emulator"); - return new AssemblyAIHandler({ apiKey }); + console.log("AssemblyAI is real for this emulator") + return new AssemblyAIHandler({ apiKey }) } -})(); +})() diff --git a/functions/src/events/EventScraper.ts b/functions/src/events/EventScraper.ts index ebb4d8bfa..a09135df5 100644 --- a/functions/src/events/EventScraper.ts +++ b/functions/src/events/EventScraper.ts @@ -197,7 +197,8 @@ export abstract class EventPostProcessor { const begin = now.minus(this.pastEventBeginProcessing).toJSDate() const cutoff = now.minus(this.pastEventCutoff).toJSDate() - const snapshot = await db.collection("events") + const snapshot = await db + .collection("events") .where("type", "==", this.eventType) .where("startsAt", "<=", begin) .where("startsAt", ">=", cutoff) diff --git a/functions/src/events/HearingScraper.ts b/functions/src/events/HearingScraper.ts index 00c766f82..b58076c6e 100644 --- a/functions/src/events/HearingScraper.ts +++ b/functions/src/events/HearingScraper.ts @@ -1,12 +1,7 @@ import { JSDOM } from "jsdom" import { db, Timestamp } from "../firebase" import * as api from "../malegislature" -import { - Hearing, - HearingContent, - HearingListItem, - Video -} from "./types" +import { Hearing, HearingContent, HearingListItem, Video } from "./types" import { isValidVideoUrl } from "./helpers" import { Committee } from "../committees/types" import { EventPostProcessor, EventScraper } from "./EventScraper" @@ -43,7 +38,6 @@ const loadCommitteeChairNames = async ( } } - export class HearingScraper extends EventScraper { constructor() { super("every 60 minutes", 480) @@ -54,9 +48,7 @@ export class HearingScraper extends EventScraper { return events.filter(HearingListItem.guard) } - async getEvent( - { EventId }: HearingListItem /* e.g. 4962 */ - ) { + async getEvent({ EventId }: HearingListItem /* e.g. 4962 */) { const data = await api.getHearing(EventId) const content = HearingContent.check(data) @@ -88,84 +80,102 @@ export class HearingPostProcessor extends EventPostProcessor { super("every 60 minutes", 480, "hearing", { memory: "4GB" }) } - - async getHearingVideos(EventId: number): Promise[]> { - const hearingErr = `An error collecting videos for hearing ${EventId} (webpage format changed?)`; + async getHearingVideos( + EventId: number + ): Promise[]> { + const hearingErr = `An error collecting videos for hearing ${EventId} (webpage format changed?)` const req = await fetch( `https://malegislature.gov/Events/Hearings/Detail/${EventId}` ) const res = await req.text() - if (!res) throw new Error(`${hearingErr}: No response for request`); - const dom = new JSDOM(res); - if (!dom) throw new Error(`${hearingErr}: Could not create JSDOM of request`); - - const videoElements = [].slice.call(dom.window.document.querySelectorAll('#playWebcast')) as Element[]; + if (!res) throw new Error(`${hearingErr}: No response for request`) + const dom = new JSDOM(res) + if (!dom) + throw new Error(`${hearingErr}: Could not create JSDOM of request`) + + const videoElements = [].slice.call( + dom.window.document.querySelectorAll("#playWebcast") + ) as Element[] if (videoElements.length === 0) return [] - const videoURLs = videoElements.map((elem) => { - const onclick = elem.getAttribute("onclick"); - if (!onclick) throw new Error(`${hearingErr}: No onclick in ${elem}`); - const match = onclick.match(/switchVideo\('([^']+)'/); - if (!match || match.length < 2) throw new Error(`${hearingErr}: Could not match switchVideo in ${elem}`); - if (!isValidVideoUrl(match[1])) throw new Error(`${hearingErr}: ${match[1]} is not a valid video url`); - return match[1]; - }); - const tbody = videoElements[0].closest("tbody"); - if (!tbody) throw new Error(`${hearingErr}: Could not find parent tbody of #playWebcast`); + const videoURLs = videoElements.map(elem => { + const onclick = elem.getAttribute("onclick") + if (!onclick) throw new Error(`${hearingErr}: No onclick in ${elem}`) + const match = onclick.match(/switchVideo\('([^']+)'/) + if (!match || match.length < 2) + throw new Error(`${hearingErr}: Could not match switchVideo in ${elem}`) + if (!isValidVideoUrl(match[1])) + throw new Error(`${hearingErr}: ${match[1]} is not a valid video url`) + return match[1] + }) + const tbody = videoElements[0].closest("tbody") + if (!tbody) + throw new Error( + `${hearingErr}: Could not find parent tbody of #playWebcast` + ) const titles = Array.from(tbody.querySelectorAll("tr")).map(tr => { - const item = tr.querySelector("td")?.textContent?.trim(); - if (!item) throw new Error(`${hearingErr}: Could not locate title in ${tr}`) - return item; - }); - if (titles.length !== videoURLs.length) throw new Error(`${hearingErr}: Number of video table rows did not equal number of #playWebcast elements`) + const item = tr.querySelector("td")?.textContent?.trim() + if (!item) + throw new Error(`${hearingErr}: Could not locate title in ${tr}`) + return item + }) + if (titles.length !== videoURLs.length) + throw new Error( + `${hearingErr}: Number of video table rows did not equal number of #playWebcast elements` + ) let videos = videoURLs.map((url, i) => { return { - url: url, title: titles[i] + url: url, + title: titles[i] } - }); + }) - let seen = new Set(); + let seen = new Set() videos = videos.filter(item => { - if (seen.has(item.url)) return false; - seen.add(item.url); - return true; - }); + if (seen.has(item.url)) return false + seen.add(item.url) + return true + }) if (videos.length > 1) { const order = videos.map(item => { - const title = item.title.toLowerCase(); - const match = title.match(/\b(?:(\d+)\s+of\s+\d+|part\s+(\d+)|pt\.?\s+(\d+))\b/); - if (!match) return -1; - const part = parseInt(match[1] || match[2] || match[3], 10); - return part-1; - }); - seen.clear(); - let validOrder = true; + const title = item.title.toLowerCase() + const match = title.match( + /\b(?:(\d+)\s+of\s+\d+|part\s+(\d+)|pt\.?\s+(\d+))\b/ + ) + if (!match) return -1 + const part = parseInt(match[1] || match[2] || match[3], 10) + return part - 1 + }) + seen.clear() + let validOrder = true for (const n of order) { if (n < 0 || n >= order.length || seen.has(n)) { - validOrder = false; - break; + validOrder = false + break } - seen.add(n); + seen.add(n) } if (validOrder) { - const reordered = new Array(videos.length); + const reordered = new Array(videos.length) for (let i = 0; i < order.length; i++) { - reordered[order[i]] = videos[i]; + reordered[order[i]] = videos[i] } - videos = reordered; + videos = reordered videos = videos.map((item, index) => { - item.title = `Part ${index+1}`; - return item; + item.title = `Part ${index + 1}` + return item }) } else { - console.log(`While scraping hearing videos, the titles ${titles} could not be mapped to a numeric order`) + console.log( + `While scraping hearing videos, the titles ${titles} could not be mapped to a numeric order` + ) } } else { videos[0].title = `hearing-${EventId}` } - return videos; + return videos } updateIf(data: FirebaseFirestore.DocumentData): null | HearingListItem { @@ -173,11 +183,16 @@ export class HearingPostProcessor extends EventPostProcessor { return { EventId: data.id } } - async getUpdate( - { EventId }: HearingListItem - ): Promise<{ transcriptionIds: string[], videos: Video[], videosFetchedAt: Timestamp }> { + async getUpdate({ EventId }: HearingListItem): Promise<{ + transcriptionIds: string[] + videos: Video[] + videosFetchedAt: Timestamp + }> { const videos = await this.getHearingVideos(EventId) - const transcriptionIds = await assemblyAI.submitTranscriptions({videoUrls: videos.map(item => item.url), EventId}) + const transcriptionIds = await assemblyAI.submitTranscriptions({ + videoUrls: videos.map(item => item.url), + EventId + }) const videosWithTranscriptions = videos.map((item, index) => { return { transcriptionId: transcriptionIds[index], @@ -188,7 +203,7 @@ export class HearingPostProcessor extends EventPostProcessor { return { transcriptionIds, videos: videosWithTranscriptions, - videosFetchedAt: Timestamp.now(), + videosFetchedAt: Timestamp.now() } } } diff --git a/functions/src/events/scrapeEvents.ts b/functions/src/events/scrapeEvents.ts index 701ccb567..6f5476e7d 100644 --- a/functions/src/events/scrapeEvents.ts +++ b/functions/src/events/scrapeEvents.ts @@ -34,8 +34,8 @@ export const scrapeSingleHearing = functions try { const hearing = { - ...await new HearingScraper().getEvent({ EventId: eventId }), - ...await new HearingPostProcessor().getUpdate({ EventId: eventId }) // Videos + ...(await new HearingScraper().getEvent({ EventId: eventId })), + ...(await new HearingPostProcessor().getUpdate({ EventId: eventId })) // Videos } // Save the hearing to Firestore @@ -77,8 +77,8 @@ export const scrapeSingleHearingv2 = onCall( try { const hearing = { - ...await new HearingScraper().getEvent({ EventId: eventId }), - ...await new HearingPostProcessor().getUpdate({ EventId: eventId }) // Videos + ...(await new HearingScraper().getEvent({ EventId: eventId })), + ...(await new HearingPostProcessor().getUpdate({ EventId: eventId })) // Videos } // Save the hearing to Firestore diff --git a/functions/src/events/types.ts b/functions/src/events/types.ts index 90d40dfb5..9a00fe190 100644 --- a/functions/src/events/types.ts +++ b/functions/src/events/types.ts @@ -101,10 +101,9 @@ export type Video = Static export const Video = Record({ url: String, title: String, - transcriptionId: String, + transcriptionId: String }) - export type Hearing = Static export const Hearing = BaseEvent.extend({ type: L("hearing"), diff --git a/functions/src/webhooks/transcription.ts b/functions/src/webhooks/transcription.ts index 95174929f..c50584c69 100644 --- a/functions/src/webhooks/transcription.ts +++ b/functions/src/webhooks/transcription.ts @@ -62,9 +62,7 @@ export const transcription = functions // If there is one authenticated event, pull out the parts we want to // save and try to save them in the db. - const paragraphs = await assemblyAI.fetchParagraphs( - transcript.id - ) + const paragraphs = await assemblyAI.fetchParagraphs(transcript.id) const { id, text, audio_url, utterances } = transcript try { const transcriptionInDb = db diff --git a/scripts/firebase-admin/backfillHearingTranscription.ts b/scripts/firebase-admin/backfillHearingTranscription.ts index 1078e740a..46469b9e7 100644 --- a/scripts/firebase-admin/backfillHearingTranscription.ts +++ b/scripts/firebase-admin/backfillHearingTranscription.ts @@ -21,7 +21,9 @@ export const script: Script = async ({ db, args }) => { const data = doc.data() if (!data) return try { - const update = await new HearingPostProcessor().getUpdate({ EventId: eventId }) + const update = await new HearingPostProcessor().getUpdate({ + EventId: eventId + }) if (update !== null) { await docRef.update(update) @@ -61,7 +63,9 @@ export const script: Script = async ({ db, args }) => { ) count++ } else { - console.log(`No additional videos to be processed for hearing ${EventId}`) + console.log( + `No additional videos to be processed for hearing ${EventId}` + ) } } catch (error) { console.error(`Failed to process hearing ${EventId}:`, error) diff --git a/scripts/firebase-admin/backfillHearingVideoFormat.ts b/scripts/firebase-admin/backfillHearingVideoFormat.ts index f5630e485..015436b69 100644 --- a/scripts/firebase-admin/backfillHearingVideoFormat.ts +++ b/scripts/firebase-admin/backfillHearingVideoFormat.ts @@ -9,31 +9,31 @@ const Args = Record({ function migrateVideo( data: FirebaseFirestore.DocumentData ): FirebaseFirestore.DocumentData | null { - if ('videos' in data) { - return null; + if ("videos" in data) { + return null } - if (!('videoURL' in data)) { + if (!("videoURL" in data)) { return { videos: [], transcriptionIds: [], videoTranscriptionId: FieldValue.delete(), videoFetchedAt: FieldValue.delete(), - videoURL: FieldValue.delete(), - }; + videoURL: FieldValue.delete() + } } - const url = data.videoURL; - const fetchedAt = data?.videoFetchedAt; - const transcriptionId = data?.videoTranscriptionId; + const url = data.videoURL + const fetchedAt = data?.videoFetchedAt + const transcriptionId = data?.videoTranscriptionId if (!fetchedAt) { throw new Error( `If videoURL is present for the video, it is expected that videoFetchedAt is also present (id: ${data.id})` - ); + ) } - const transcriptionIds = transcriptionId ? [transcriptionId] : []; + const transcriptionIds = transcriptionId ? [transcriptionId] : [] const videos = [ { @@ -41,9 +41,9 @@ function migrateVideo( title: data.id, url, transcriptionId, - fetchedAt, - }, - ]; + fetchedAt + } + ] return { videos, @@ -51,8 +51,8 @@ function migrateVideo( videosFetchedAt: fetchedAt || Timestamp.now(), videoTranscriptionId: FieldValue.delete(), videoFetchedAt: FieldValue.delete(), - videoURL: FieldValue.delete(), - }; + videoURL: FieldValue.delete() + } } export const script: Script = async ({ db, args }) => { @@ -65,11 +65,13 @@ export const script: Script = async ({ db, args }) => { .where("type", "==", "hearing") .where("id", "==", eventId) .get() - + if (snapshot.empty || snapshot.docs.length !== 1) { - throw new Error(`The number of documents matching the event id ${eventId} must be exactly one`) + throw new Error( + `The number of documents matching the event id ${eventId} must be exactly one` + ) } - + const doc = snapshot.docs[0] const modify = migrateVideo(doc.data()) if (modify) { @@ -79,13 +81,13 @@ export const script: Script = async ({ db, args }) => { const snapshot = await db .collection("events") .where("type", "==", "hearing") - .get(); + .get() if (snapshot.empty) { - throw new Error("Hearing backfill failed; no documents were found"); + throw new Error("Hearing backfill failed; no documents were found") } - let bulkWriter = db.bulkWriter(); + let bulkWriter = db.bulkWriter() for (const doc of snapshot.docs) { console.log(doc.data().id) @@ -95,7 +97,7 @@ export const script: Script = async ({ db, args }) => { bulkWriter.update(doc.ref, modify) } } - await bulkWriter.close(); + await bulkWriter.close() } console.log("Video backfill complete") From d227c5f904a43ebc22c09d4419c12438b93b781d Mon Sep 17 00:00:00 2001 From: Sam DeMarrais Date: Tue, 2 Jun 2026 14:49:31 -0400 Subject: [PATCH 03/10] Small fixes --- functions/src/events/AssemblyAIHandler.ts | 31 ++++++++++++----------- functions/src/events/EventScraper.ts | 2 +- functions/src/events/HearingScraper.ts | 4 +-- functions/src/webhooks/transcription.ts | 6 +++-- 4 files changed, 23 insertions(+), 20 deletions(-) diff --git a/functions/src/events/AssemblyAIHandler.ts b/functions/src/events/AssemblyAIHandler.ts index b186e41b5..f9ad64ef0 100644 --- a/functions/src/events/AssemblyAIHandler.ts +++ b/functions/src/events/AssemblyAIHandler.ts @@ -350,7 +350,7 @@ export function getTranscript(transcript_id: string): { paragraphWords.push(word) allWords.push(word) - currentTime += 1 + currentTime += 300 } const utterance: TranscriptUtterance = { @@ -366,11 +366,9 @@ export function getTranscript(transcript_id: string): { utterances.push(utterance) - // pause between sentences - currentTime += randomFloat(0.2, 1.2) + currentTime += randomInt(100, 3000) } - // paragraph object const transcriptParagraph: TranscriptParagraph = { confidence: Number( mean(paragraphWords.map(w => w.confidence)).toFixed(2) @@ -383,8 +381,7 @@ export function getTranscript(transcript_id: string): { paragraphs.push(transcriptParagraph) - // longer pause between paragraphs - currentTime += randomFloat(1, 3) + currentTime += randomInt(500, 7000) } const transcript: Transcript = { @@ -415,13 +412,17 @@ export function getTranscript(transcript_id: string): { } } -export const assemblyAI: AssemblyAIHandler | AssemblyAIHandlerDummy = (() => { - const apiKey = process.env.ASSEMBLY_API_KEY - if (!apiKey || apiKey === "test-api-key") { - console.log("AssemblyAI is faked for this emulator") - return new AssemblyAIHandlerDummy() - } else { - console.log("AssemblyAI is real for this emulator") - return new AssemblyAIHandler({ apiKey }) +let assemblyInstance: AssemblyAIHandler | AssemblyAIHandlerDummy | undefined + +export function assemblyAI(): AssemblyAIHandler | AssemblyAIHandlerDummy { + if (!assemblyInstance) { + const apiKey = process.env.ASSEMBLY_API_KEY + if (!apiKey || apiKey === "test-api-key") { + console.log("AssemblyAI is faked for this emulator") + assemblyInstance = new AssemblyAIHandlerDummy() + } else { + assemblyInstance = new AssemblyAIHandler({ apiKey }) + } } -})() + return assemblyInstance +} diff --git a/functions/src/events/EventScraper.ts b/functions/src/events/EventScraper.ts index a09135df5..3f7e62850 100644 --- a/functions/src/events/EventScraper.ts +++ b/functions/src/events/EventScraper.ts @@ -212,7 +212,7 @@ export abstract class EventPostProcessor { const item = this.updateIf(data) if (!item) continue - writer.update(doc.ref, this.getUpdate(item)) + writer.update(doc.ref, await this.getUpdate(item)) console.log("event in run()", data) } diff --git a/functions/src/events/HearingScraper.ts b/functions/src/events/HearingScraper.ts index b58076c6e..066b05ae6 100644 --- a/functions/src/events/HearingScraper.ts +++ b/functions/src/events/HearingScraper.ts @@ -180,7 +180,7 @@ export class HearingPostProcessor extends EventPostProcessor { updateIf(data: FirebaseFirestore.DocumentData): null | HearingListItem { if (data.videos.length) return null - return { EventId: data.id } + return { EventId: data.content.EventId } } async getUpdate({ EventId }: HearingListItem): Promise<{ @@ -189,7 +189,7 @@ export class HearingPostProcessor extends EventPostProcessor { videosFetchedAt: Timestamp }> { const videos = await this.getHearingVideos(EventId) - const transcriptionIds = await assemblyAI.submitTranscriptions({ + const transcriptionIds = await assemblyAI().submitTranscriptions({ videoUrls: videos.map(item => item.url), EventId }) diff --git a/functions/src/webhooks/transcription.ts b/functions/src/webhooks/transcription.ts index c50584c69..774eba82d 100644 --- a/functions/src/webhooks/transcription.ts +++ b/functions/src/webhooks/transcription.ts @@ -11,7 +11,7 @@ export const transcription = functions // If we get a request with the right header and status, get the // transcription from the assembly API. - const transcript = await assemblyAI.getTranscript(req.body.id) + const transcript = await assemblyAI().getTranscript(req.body.id) if (transcript && transcript.webhook_auth) { // If there is a transcript and the transcript has an auth property, @@ -62,7 +62,9 @@ export const transcription = functions // If there is one authenticated event, pull out the parts we want to // save and try to save them in the db. - const paragraphs = await assemblyAI.fetchParagraphs(transcript.id) + const paragraphs = await assemblyAI().fetchParagraphs( + transcript.id + ) const { id, text, audio_url, utterances } = transcript try { const transcriptionInDb = db From 80fa337a9912bc79809316744ff7f83ef9103c28 Mon Sep 17 00:00:00 2001 From: Sam DeMarrais Date: Tue, 2 Jun 2026 17:00:44 -0400 Subject: [PATCH 04/10] Video title parsing --- functions/src/events/HearingScraper.ts | 49 +++++++++++++++++-- .../backfillHearingVideoFormat.ts | 1 - 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/functions/src/events/HearingScraper.ts b/functions/src/events/HearingScraper.ts index 066b05ae6..b7c2caaa5 100644 --- a/functions/src/events/HearingScraper.ts +++ b/functions/src/events/HearingScraper.ts @@ -75,6 +75,43 @@ export class HearingScraper extends EventScraper { } } +function removeCommonWords(strings: string[]) { + if (!strings.length) return []; + + // Normalize whitespace and split into words + const wordLists = strings.map(s => + s.trim().replace(/\s+/g, " ").split(" ") + ); + + let prefixLen = 0; + while ( + wordLists.every(words => + prefixLen < words.length && + words[prefixLen].toLowerCase() === + wordLists[0][prefixLen].toLowerCase() + ) + ) { + prefixLen++; + } + + let suffixLen = 0; + while ( + wordLists.every(words => + suffixLen < words.length - prefixLen && + words[words.length - 1 - suffixLen].toLowerCase() === + wordLists[0][wordLists[0].length - 1 - suffixLen].toLowerCase() + ) + ) { + suffixLen++; + } + + return wordLists.map(words => + words + .slice(prefixLen, words.length - suffixLen) + .join(" ") + ); +} + export class HearingPostProcessor extends EventPostProcessor { constructor() { super("every 60 minutes", 480, "hearing", { memory: "4GB" }) @@ -168,9 +205,15 @@ export class HearingPostProcessor extends EventPostProcessor { return item }) } else { - console.log( - `While scraping hearing videos, the titles ${titles} could not be mapped to a numeric order` - ) + let shortTitles = removeCommonWords(titles) + if (shortTitles[0].length === 0) { + shortTitles = shortTitles.map((_, i) => `Part ${i+1}`) + } + videos = videos.map((item, index) => { + item.title = shortTitles[index] + return item + }) + console.log(`Ordering not possible for hearing ${EventId} - fallback titles are ${JSON.stringify(shortTitles)}`) } } else { videos[0].title = `hearing-${EventId}` diff --git a/scripts/firebase-admin/backfillHearingVideoFormat.ts b/scripts/firebase-admin/backfillHearingVideoFormat.ts index 015436b69..d05a7e0a4 100644 --- a/scripts/firebase-admin/backfillHearingVideoFormat.ts +++ b/scripts/firebase-admin/backfillHearingVideoFormat.ts @@ -93,7 +93,6 @@ export const script: Script = async ({ db, args }) => { console.log(doc.data().id) const modify = migrateVideo(doc.data()) if (modify) { - // syncHearingToSearchIndex will temporarily complain due to the multiple updates of bulkWriter bulkWriter.update(doc.ref, modify) } } From fa46b2058200a1f2e36e2269a38c40a183a66719 Mon Sep 17 00:00:00 2001 From: Sam DeMarrais Date: Tue, 2 Jun 2026 18:43:43 -0400 Subject: [PATCH 05/10] Allowed to reuse existing transcripts --- functions/src/events/AssemblyAIHandler.ts | 22 +------------------ functions/src/events/HearingScraper.ts | 20 ++++++++++++----- functions/src/events/scrapeEvents.ts | 2 +- .../backfillHearingTranscription.ts | 17 ++++++++------ 4 files changed, 27 insertions(+), 34 deletions(-) diff --git a/functions/src/events/AssemblyAIHandler.ts b/functions/src/events/AssemblyAIHandler.ts index f9ad64ef0..2bc290114 100644 --- a/functions/src/events/AssemblyAIHandler.ts +++ b/functions/src/events/AssemblyAIHandler.ts @@ -22,24 +22,6 @@ abstract class AssemblyAIHandlerBase { bucketName?: string }): Promise - async submitTranscriptions({ - EventId, - videoUrls, - bucketName - }: { - EventId: number - videoUrls: string[] - bucketName?: string - }): Promise { - const transcriptionIds = await Promise.all( - videoUrls.map(item => { - return this.submitTranscription({ videoUrl: item, EventId, bucketName }) - }) - ) - - return transcriptionIds - } - abstract getTranscript(transcript_id: string): Promise abstract fetchParagraphs( transcript_id: string @@ -306,9 +288,7 @@ function loremParagraph(length: number) { return Array.from({ length }, () => loremSentence(randomInt(3, 10))) } -/** - * paragraphs -> sentences -> words - */ +// paragraphs -> sentences -> words function loremTranscriptStructure() { return Array.from({ length: randomInt(10, 20) }, () => loremParagraph(randomInt(3, 8)) diff --git a/functions/src/events/HearingScraper.ts b/functions/src/events/HearingScraper.ts index b7c2caaa5..09f6feab4 100644 --- a/functions/src/events/HearingScraper.ts +++ b/functions/src/events/HearingScraper.ts @@ -226,16 +226,26 @@ export class HearingPostProcessor extends EventPostProcessor { return { EventId: data.content.EventId } } - async getUpdate({ EventId }: HearingListItem): Promise<{ + async getUpdate({ EventId }: HearingListItem, existingVideos?: Video[]): Promise<{ transcriptionIds: string[] videos: Video[] videosFetchedAt: Timestamp }> { const videos = await this.getHearingVideos(EventId) - const transcriptionIds = await assemblyAI().submitTranscriptions({ - videoUrls: videos.map(item => item.url), - EventId - }) + + const prevURLs = existingVideos ? + Object.fromEntries(existingVideos.map(({ url, transcriptionId }) => + [url, transcriptionId] + )) : {} + + const transcriptionIds = await Promise.all( + videos.map(item => { + return prevURLs[item.url] !== undefined ? prevURLs[item.url] : assemblyAI().submitTranscription({ + EventId, videoUrl: item.url + }) + }) + ) + const videosWithTranscriptions = videos.map((item, index) => { return { transcriptionId: transcriptionIds[index], diff --git a/functions/src/events/scrapeEvents.ts b/functions/src/events/scrapeEvents.ts index 6f5476e7d..ecbdbce49 100644 --- a/functions/src/events/scrapeEvents.ts +++ b/functions/src/events/scrapeEvents.ts @@ -78,7 +78,7 @@ export const scrapeSingleHearingv2 = onCall( try { const hearing = { ...(await new HearingScraper().getEvent({ EventId: eventId })), - ...(await new HearingPostProcessor().getUpdate({ EventId: eventId })) // Videos + ...(await new HearingPostProcessor().getUpdate({ EventId: eventId })) } // Save the hearing to Firestore diff --git a/scripts/firebase-admin/backfillHearingTranscription.ts b/scripts/firebase-admin/backfillHearingTranscription.ts index 46469b9e7..b19e3500d 100644 --- a/scripts/firebase-admin/backfillHearingTranscription.ts +++ b/scripts/firebase-admin/backfillHearingTranscription.ts @@ -1,14 +1,15 @@ -import { Record, Number, String } from "runtypes" +import { Record, Number, String, Boolean } from "runtypes" import { Script } from "./types" import { HearingPostProcessor } from "functions/src/events" const Args = Record({ eventId: Number.optional(), - bucketName: String.optional() + bucketName: String.optional(), + recreateTranscripts: Boolean.optional() }) export const script: Script = async ({ db, args }) => { - const { eventId, bucketName } = Args.check(args) + const { eventId, bucketName, recreateTranscripts } = Args.check(args) // Process a single event by eventId if (eventId) { @@ -21,9 +22,9 @@ export const script: Script = async ({ db, args }) => { const data = doc.data() if (!data) return try { - const update = await new HearingPostProcessor().getUpdate({ - EventId: eventId - }) + const update = recreateTranscripts ? + await new HearingPostProcessor().getUpdate({ EventId: eventId }) : + await new HearingPostProcessor().getUpdate({ EventId: eventId }, data.videos) if (update !== null) { await docRef.update(update) @@ -54,7 +55,9 @@ export const script: Script = async ({ db, args }) => { if (data.empty) continue try { - const update = await new HearingPostProcessor().getUpdate({ EventId }) + const update = recreateTranscripts ? + await new HearingPostProcessor().getUpdate({ EventId }) : + await new HearingPostProcessor().getUpdate({ EventId }, data.videos) if (update.videos.length > data.videos.length) { await doc.ref.update(update) From 55693948911afeaff8a5b9ca8690c181948a5917 Mon Sep 17 00:00:00 2001 From: Sam DeMarrais Date: Tue, 2 Jun 2026 18:52:07 -0400 Subject: [PATCH 06/10] Prettier --- functions/src/events/HearingScraper.ts | 71 +++++++++++-------- .../backfillHearingTranscription.ts | 15 ++-- 2 files changed, 50 insertions(+), 36 deletions(-) diff --git a/functions/src/events/HearingScraper.ts b/functions/src/events/HearingScraper.ts index 09f6feab4..ea7b38bde 100644 --- a/functions/src/events/HearingScraper.ts +++ b/functions/src/events/HearingScraper.ts @@ -76,40 +76,37 @@ export class HearingScraper extends EventScraper { } function removeCommonWords(strings: string[]) { - if (!strings.length) return []; + if (!strings.length) return [] // Normalize whitespace and split into words - const wordLists = strings.map(s => - s.trim().replace(/\s+/g, " ").split(" ") - ); + const wordLists = strings.map(s => s.trim().replace(/\s+/g, " ").split(" ")) - let prefixLen = 0; + let prefixLen = 0 while ( - wordLists.every(words => - prefixLen < words.length && - words[prefixLen].toLowerCase() === - wordLists[0][prefixLen].toLowerCase() + wordLists.every( + words => + prefixLen < words.length && + words[prefixLen].toLowerCase() === wordLists[0][prefixLen].toLowerCase() ) ) { - prefixLen++; + prefixLen++ } - let suffixLen = 0; + let suffixLen = 0 while ( - wordLists.every(words => - suffixLen < words.length - prefixLen && - words[words.length - 1 - suffixLen].toLowerCase() === - wordLists[0][wordLists[0].length - 1 - suffixLen].toLowerCase() + wordLists.every( + words => + suffixLen < words.length - prefixLen && + words[words.length - 1 - suffixLen].toLowerCase() === + wordLists[0][wordLists[0].length - 1 - suffixLen].toLowerCase() ) ) { - suffixLen++; + suffixLen++ } return wordLists.map(words => - words - .slice(prefixLen, words.length - suffixLen) - .join(" ") - ); + words.slice(prefixLen, words.length - suffixLen).join(" ") + ) } export class HearingPostProcessor extends EventPostProcessor { @@ -207,13 +204,17 @@ export class HearingPostProcessor extends EventPostProcessor { } else { let shortTitles = removeCommonWords(titles) if (shortTitles[0].length === 0) { - shortTitles = shortTitles.map((_, i) => `Part ${i+1}`) + shortTitles = shortTitles.map((_, i) => `Part ${i + 1}`) } videos = videos.map((item, index) => { item.title = shortTitles[index] return item }) - console.log(`Ordering not possible for hearing ${EventId} - fallback titles are ${JSON.stringify(shortTitles)}`) + console.log( + `Ordering not possible for hearing ${EventId} - fallback titles are ${JSON.stringify( + shortTitles + )}` + ) } } else { videos[0].title = `hearing-${EventId}` @@ -226,23 +227,33 @@ export class HearingPostProcessor extends EventPostProcessor { return { EventId: data.content.EventId } } - async getUpdate({ EventId }: HearingListItem, existingVideos?: Video[]): Promise<{ + async getUpdate( + { EventId }: HearingListItem, + existingVideos?: Video[] + ): Promise<{ transcriptionIds: string[] videos: Video[] videosFetchedAt: Timestamp }> { const videos = await this.getHearingVideos(EventId) - const prevURLs = existingVideos ? - Object.fromEntries(existingVideos.map(({ url, transcriptionId }) => - [url, transcriptionId] - )) : {} + const prevURLs = existingVideos + ? Object.fromEntries( + existingVideos.map(({ url, transcriptionId }) => [ + url, + transcriptionId + ]) + ) + : {} const transcriptionIds = await Promise.all( videos.map(item => { - return prevURLs[item.url] !== undefined ? prevURLs[item.url] : assemblyAI().submitTranscription({ - EventId, videoUrl: item.url - }) + return prevURLs[item.url] !== undefined + ? prevURLs[item.url] + : assemblyAI().submitTranscription({ + EventId, + videoUrl: item.url + }) }) ) diff --git a/scripts/firebase-admin/backfillHearingTranscription.ts b/scripts/firebase-admin/backfillHearingTranscription.ts index b19e3500d..7612cf9d7 100644 --- a/scripts/firebase-admin/backfillHearingTranscription.ts +++ b/scripts/firebase-admin/backfillHearingTranscription.ts @@ -22,9 +22,12 @@ export const script: Script = async ({ db, args }) => { const data = doc.data() if (!data) return try { - const update = recreateTranscripts ? - await new HearingPostProcessor().getUpdate({ EventId: eventId }) : - await new HearingPostProcessor().getUpdate({ EventId: eventId }, data.videos) + const update = recreateTranscripts + ? await new HearingPostProcessor().getUpdate({ EventId: eventId }) + : await new HearingPostProcessor().getUpdate( + { EventId: eventId }, + data.videos + ) if (update !== null) { await docRef.update(update) @@ -55,9 +58,9 @@ export const script: Script = async ({ db, args }) => { if (data.empty) continue try { - const update = recreateTranscripts ? - await new HearingPostProcessor().getUpdate({ EventId }) : - await new HearingPostProcessor().getUpdate({ EventId }, data.videos) + const update = recreateTranscripts + ? await new HearingPostProcessor().getUpdate({ EventId }) + : await new HearingPostProcessor().getUpdate({ EventId }, data.videos) if (update.videos.length > data.videos.length) { await doc.ref.update(update) From e135a151d8371106c38ca374dfdc36855d9153c2 Mon Sep 17 00:00:00 2001 From: Sam DeMarrais Date: Tue, 2 Jun 2026 22:53:54 -0400 Subject: [PATCH 07/10] Rewrote additional files --- .../ballotquestions/CommitteeHearing.test.tsx | 22 +- components/ballotquestions/types.ts | 2 +- docs/ballot-questions-frontend.md | 6 +- pages/ballotQuestions/[id].tsx | 3 +- .../backfillHearingVideoFormat.ts | 9 +- .../migrateHearingTranscription.ts | 265 ++++++++---------- .../BallotQuestionDetails.stories.tsx | 5 +- 7 files changed, 152 insertions(+), 160 deletions(-) diff --git a/components/ballotquestions/CommitteeHearing.test.tsx b/components/ballotquestions/CommitteeHearing.test.tsx index 9777da7c5..ae41805e5 100644 --- a/components/ballotquestions/CommitteeHearing.test.tsx +++ b/components/ballotquestions/CommitteeHearing.test.tsx @@ -31,7 +31,11 @@ describe("CommitteeHearing", () => { }) it("shows hearing context copy", () => { - render() + render( + + ) expect(screen.getByText("Committee Hearing")).toBeInTheDocument() expect( screen.getByText("Committee hearings are public meetings.") @@ -39,13 +43,19 @@ describe("CommitteeHearing", () => { }) it("formats the hearing date", () => { - render() + render( + + ) expect(screen.getByText(/December 14, 2025/)).toBeInTheDocument() }) it("shows a hearing page link when an id is present", () => { render( - + ) expect( screen.getByRole("link", { name: /Open hearing page/i }) @@ -53,7 +63,11 @@ describe("CommitteeHearing", () => { }) it("hides the hearing page link when no hearing id is available", () => { - render() + render( + + ) expect(screen.queryByRole("link")).not.toBeInTheDocument() }) }) diff --git a/components/ballotquestions/types.ts b/components/ballotquestions/types.ts index cdc9e0b9f..b27cecb87 100644 --- a/components/ballotquestions/types.ts +++ b/components/ballotquestions/types.ts @@ -1,6 +1,6 @@ export type Hearing = { id: string - videoURL?: string + videoURLs: string[] startsAt: number // milliseconds since epoch, converted from Firestore Timestamp server-side } diff --git a/docs/ballot-questions-frontend.md b/docs/ballot-questions-frontend.md index 5e9ee756a..ff0cede3d 100644 --- a/docs/ballot-questions-frontend.md +++ b/docs/ballot-questions-frontend.md @@ -166,15 +166,15 @@ For each relevant hearing, display: - **Status**: "Occurred" if `hearing.content.startsAt` is in the past, "Scheduled" if in the future - **Date**: formatted from `hearing.content.startsAt` -- **Watch link**: "Watch the committee hearing here." linked to `hearing.videoURL` — hidden if no video +- **Watch link**: "Watch the committee hearing here." linked to `hearing.videoURLs` — hidden if no videos Since ballot questions are always under SJ42 and typically have one hearing, render a single hearing block. If there are multiple, render them in reverse chronological order (most recent first). **Hearing data model recap:** - `bill.hearingIds?: string[]` — event IDs; doc path is `/events/hearing-{id}` -- `bill.nextHearingAt?: Timestamp` — convenience field for upcoming hearing only (not sufficient alone — we need date + videoURL from the full document) -- `hearing.videoURL?: string` — link for the "Watch" CTA +- `bill.nextHearingAt?: Timestamp` — convenience field for upcoming hearing only (not sufficient alone — we need date + videoURLs from the full document) +- `hearing.videoURLs: string[]` — link for the "Watch" CTA - `hearing.content.startsAt` — determines "Occurred" vs. "Scheduled" status No new components are needed for hearing display — build a simple `CommitteeHearing` component local to `components/ballotquestions/`. diff --git a/pages/ballotQuestions/[id].tsx b/pages/ballotQuestions/[id].tsx index 5bbbe1ff9..418628e3e 100644 --- a/pages/ballotQuestions/[id].tsx +++ b/pages/ballotQuestions/[id].tsx @@ -10,6 +10,7 @@ import { Hearing } from "../../components/ballotquestions/types" import { BallotQuestion, Bill } from "../../components/db" +import { Video } from "../../components/hearing/hearing" import { createPage } from "../../components/page" import { usePublishService } from "../../components/publish/hooks" import { serverSideTranslations } from "next-i18next/serverSideTranslations" @@ -22,7 +23,7 @@ async function getHearing(id: string): Promise { const data = snap.data() return { id, - videoURL: data.videoURL ?? undefined, + videoURLs: data.videos.map((item: Video) => item.url), startsAt: data.startsAt?.toMillis() ?? 0 } } diff --git a/scripts/firebase-admin/backfillHearingVideoFormat.ts b/scripts/firebase-admin/backfillHearingVideoFormat.ts index d05a7e0a4..0596cb484 100644 --- a/scripts/firebase-admin/backfillHearingVideoFormat.ts +++ b/scripts/firebase-admin/backfillHearingVideoFormat.ts @@ -27,12 +27,6 @@ function migrateVideo( const fetchedAt = data?.videoFetchedAt const transcriptionId = data?.videoTranscriptionId - if (!fetchedAt) { - throw new Error( - `If videoURL is present for the video, it is expected that videoFetchedAt is also present (id: ${data.id})` - ) - } - const transcriptionIds = transcriptionId ? [transcriptionId] : [] const videos = [ @@ -40,8 +34,7 @@ function migrateVideo( // Default; not shown title: data.id, url, - transcriptionId, - fetchedAt + transcriptionId } ] diff --git a/scripts/firebase-admin/migrateHearingTranscription.ts b/scripts/firebase-admin/migrateHearingTranscription.ts index 910ba3943..776139bc0 100644 --- a/scripts/firebase-admin/migrateHearingTranscription.ts +++ b/scripts/firebase-admin/migrateHearingTranscription.ts @@ -39,6 +39,119 @@ function convertTimestamps(obj: any): any { return obj } +async function migrateTranscription( + db: admin.firestore.Firestore, + devDb: admin.firestore.Firestore, + transcriptionId: string, + bulkWriter?: FirebaseFirestore.BulkWriter +) { + const devTranscriptionDoc = await devDb + .collection("transcriptions") + .doc(transcriptionId) + .get() + + const devTranscriptionData = devTranscriptionDoc.exists + ? devTranscriptionDoc.data() + : null + + if (!devTranscriptionData) { + throw new Error( + `Transcription ${transcriptionId} not found in dev project.` + ) + } + + // Create transcription in target project instead of setting, in case it already exists, which will throw an error + const convertedData = convertTimestamps(devTranscriptionData) + console.log(`Creating transcription ${transcriptionId}...`) + if (bulkWriter) { + bulkWriter.create( + db.collection("transcriptions").doc(transcriptionId), + convertedData + ) + } else { + await db + .collection("transcriptions") + .doc(transcriptionId) + .create(convertedData) + } + + const subcollections = await devTranscriptionDoc.ref.listCollections() + for (const subcol of subcollections) { + const docs = await subcol.get() + for (const doc of docs.docs) { + const ref = db + .collection("transcriptions") + .doc(transcriptionId) + .collection(subcol.id) + .doc(doc.id) + if (bulkWriter) { + bulkWriter.set(ref, doc.data()) + } + await ref.set(doc.data()) + } + } +} + +async function migrateHearing( + db: admin.firestore.Firestore, + devDb: admin.firestore.Firestore, + devDoc: + | admin.firestore.DocumentSnapshot + | admin.firestore.QueryDocumentSnapshot, + bulkWriter?: FirebaseFirestore.BulkWriter +): Promise<"migrate" | "skip" | "fail"> { + const devData = devDoc.data() + + if (!devData || !devData.transcriptionIds.length) { + console.log(`Hearing ${devDoc.id} has no transcription to migrate.`) + return "skip" + } + const targetDoc = await db.collection("events").doc(devDoc.id).get() + const targetData = targetDoc.exists ? targetDoc.data() : null + + if (!targetData) { + console.log(`${devDoc.id} not found in target project.`) + return "skip" + } + + // Only migrate if hearing in target environment has less transcriptions than dev + if (devData.transcriptionIds.length <= targetData.transcriptionIds.length) { + console.log(`${devDoc.id} already has a transcription, skipping.`) + return "skip" + } + + for (const transcriptionId of devData.transcriptionIds) { + if (!targetData.transcriptionIds.includes(transcriptionId)) { + try { + await migrateTranscription(db, devDb, transcriptionId, bulkWriter) + } catch (err) { + console.error(`Error creating transcription ${transcriptionId}:`, err) + return "fail" + } + } + } + + console.log(`Updating hearing ${devDoc.id}...`) + if (bulkWriter) { + bulkWriter.update(db.collection("events").doc(devDoc.id), { + videos: devData.videos, + videosFetchedAt: convertTimestamps(devData.videosFetchedAt), + transcriptionIds: devData.transcriptionIds + }) + } else { + await db + .collection("events") + .doc(devDoc.id) + .update({ + videos: devData.videos, + videosFetchedAt: convertTimestamps(devData.videosFetchedAt), + transcriptionIds: devData.transcriptionIds + }) + } + + return "migrate" +} + const Args = Record({ sourceProject: String, hearing: Number.optional() @@ -66,78 +179,15 @@ export const script: Script = async ({ db, args }) => { if (hearing) { const hearingId = "hearing-" + hearing console.log(`Processing single hearing: ${hearingId}`) - const devHearingsSnapshot = await devDb - .collection("events") - .doc(hearingId) - .get() + const devDoc = await devDb.collection("events").doc(hearingId).get() - if (!devHearingsSnapshot.exists) { + if (!devDoc.exists) { console.error(`Hearing ${hearingId} not found in dev project.`) return } - const devData = devHearingsSnapshot.data() - if (!devData?.videoTranscriptionId) { - console.log(`Hearing ${hearingId} has no transcription to migrate.`) - return - } - const targetDoc = await db.collection("events").doc(hearingId).get() - const targetData = targetDoc.exists ? targetDoc.data() : null - - // Only migrate if hearing in target environment does not have a transcription yet - if (!targetData?.videoTranscriptionId) { - const transcriptionId = devData.videoTranscriptionId - const devTranscriptionDoc = await devDb - .collection("transcriptions") - .doc(transcriptionId) - .get() - - const devTranscriptionData = devTranscriptionDoc.exists - ? devTranscriptionDoc.data() - : null - - if (devTranscriptionData) { - // Create transcription in target project instead of setting, in case it already exists, which will throw an error - const convertedData = convertTimestamps(devTranscriptionData) - try { - console.log(`Creating transcription ${transcriptionId}...`) - await db - .collection("transcriptions") - .doc(transcriptionId) - .create(convertedData) - } catch (err) { - console.error(`Error creating transcription ${transcriptionId}:`, err) - return - } - - const subcollections = await devTranscriptionDoc.ref.listCollections() - for (const subcol of subcollections) { - const docs = await subcol.get() - for (const doc of docs.docs) { - await db - .collection("transcriptions") - .doc(transcriptionId) - .collection(subcol.id) - .doc(doc.id) - .set(doc.data()) - } - } - } else { - console.error( - `Transcription ${transcriptionId} not found in dev project.` - ) - } - - await db - .collection("events") - .doc(hearingId) - .update({ - videoURL: devData.videoURL, - videoFetchedAt: convertTimestamps(devData.videoFetchedAt), - videoTranscriptionId: devData.videoTranscriptionId - }) - console.log(`Migration complete for hearing ${hearingId}.`) - } + await migrateHearing(db, devDb, devDoc) + console.log(`Migration complete for hearing ${hearingId}.`) } else { // For full migration const devHearingsSnapshot = await devDb @@ -157,83 +207,14 @@ export const script: Script = async ({ db, args }) => { console.log(`Migration limit of ${limit} reached. Stopping.`) break } - const devData = devDoc.data() - if (!devData.videoTranscriptionId) { - skipped++ - console.log(`${devDoc.id} has no transcription to migrate.`) - continue - } - - const targetDoc = await db.collection("events").doc(devDoc.id).get() - const targetData = targetDoc.exists ? targetDoc.data() : null - - if (!targetData) { - skipped++ - console.log(`${devDoc.id} not found in target project.`) - continue - } - // Only migrate if hearing in target environment does not have a transcription yet - if (!targetData?.videoTranscriptionId) { - console.log(`Migrating ${devDoc.id}...`) - const transcriptionId = devData.videoTranscriptionId - const devTranscriptionDoc = await devDb - .collection("transcriptions") - .doc(transcriptionId) - .get() - - const devTranscriptionData = devTranscriptionDoc.exists - ? devTranscriptionDoc.data() - : null - - if (devTranscriptionData) { - // Create transcription in target project instead of setting, in case it already exists, which will throw an error - const convertedData = convertTimestamps(devTranscriptionData) - try { - console.log(`Creating transcription ${transcriptionId}...`) - bulkWriter.create( - db.collection("transcriptions").doc(transcriptionId), - convertedData - ) - } catch (err) { - failed++ - console.error( - `Error creating transcription ${transcriptionId}:`, - err - ) - continue - } - - const subcollections = await devTranscriptionDoc.ref.listCollections() - for (const subcol of subcollections) { - const docs = await subcol.get() - for (const doc of docs.docs) { - await db - .collection("transcriptions") - .doc(transcriptionId) - .collection(subcol.id) - .doc(doc.id) - .set(doc.data()) - } - } - } else { - failed++ - console.error( - `Transcription ${transcriptionId} not found in dev project.` - ) - continue - } - - console.log(`Updating ${devDoc.id}...`) - bulkWriter.update(db.collection("events").doc(devDoc.id), { - videoURL: devData.videoURL, - videoFetchedAt: convertTimestamps(devData.videoFetchedAt), - videoTranscriptionId: devData.videoTranscriptionId - }) - migrated++ + const result = await migrateHearing(db, devDb, devDoc, bulkWriter) + if (result === "migrate") { + migrated += 1 + } else if (result === "skip") { + skipped += 1 } else { - console.log(`${devDoc.id} already has a transcription, skipping.`) - skipped++ + failed += 1 } } diff --git a/stories/organisms/ballotquestions/BallotQuestionDetails.stories.tsx b/stories/organisms/ballotquestions/BallotQuestionDetails.stories.tsx index 40ec1154c..c26d83150 100644 --- a/stories/organisms/ballotquestions/BallotQuestionDetails.stories.tsx +++ b/stories/organisms/ballotquestions/BallotQuestionDetails.stories.tsx @@ -142,7 +142,10 @@ const sampleBill: Bill = { const sampleHearing = { id: "hearing-101", startsAt: new Date("2026-03-12T10:00:00-05:00").getTime(), - videoURL: "https://malegislature.gov/" + videoURLs: [ + "https://prodarchivevideo.blob.core.windows.net/video/2022/Hearings/Joint/April/12.mp4", + "https://prodarchivevideo.blob.core.windows.net/video/2022/Hearings/Joint/April/12_1.mp4" + ] } const emptyTestimonyListing: UsePublishedTestimonyListing = { From f44991234b3021fb25ed9ec55df7687fcefe8e26 Mon Sep 17 00:00:00 2001 From: Sam DeMarrais Date: Tue, 2 Jun 2026 22:53:54 -0400 Subject: [PATCH 08/10] Small bugfixing --- functions/src/events/HearingScraper.ts | 2 -- functions/src/hearings/search.ts | 2 +- .../firebase-admin/migrateHearingTranscription.ts | 14 +++++++------- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/functions/src/events/HearingScraper.ts b/functions/src/events/HearingScraper.ts index ea7b38bde..99383f187 100644 --- a/functions/src/events/HearingScraper.ts +++ b/functions/src/events/HearingScraper.ts @@ -52,8 +52,6 @@ export class HearingScraper extends EventScraper { const data = await api.getHearing(EventId) const content = HearingContent.check(data) - console.log("content in getEvent()", content) - const host = content.HearingHost const committeeChairs = host?.CommitteeCode && host?.GeneralCourtNumber diff --git a/functions/src/hearings/search.ts b/functions/src/hearings/search.ts index e5472eac8..ce2375042 100644 --- a/functions/src/hearings/search.ts +++ b/functions/src/hearings/search.ts @@ -33,7 +33,7 @@ export const { documentTrigger: "events/{eventId}", alias: "hearings", idField: "id", - filter: data => data.type === "hearing", + filter: data => data.type === "hearing" && "transcriptionIds" in data, schema: { fields: [ { name: "eventId", type: "int32", facet: false }, diff --git a/scripts/firebase-admin/migrateHearingTranscription.ts b/scripts/firebase-admin/migrateHearingTranscription.ts index 776139bc0..03dce0f1c 100644 --- a/scripts/firebase-admin/migrateHearingTranscription.ts +++ b/scripts/firebase-admin/migrateHearingTranscription.ts @@ -102,7 +102,7 @@ async function migrateHearing( ): Promise<"migrate" | "skip" | "fail"> { const devData = devDoc.data() - if (!devData || !devData.transcriptionIds.length) { + if (!devData || !devData?.transcriptionIds?.length) { console.log(`Hearing ${devDoc.id} has no transcription to migrate.`) return "skip" } @@ -114,14 +114,10 @@ async function migrateHearing( return "skip" } - // Only migrate if hearing in target environment has less transcriptions than dev - if (devData.transcriptionIds.length <= targetData.transcriptionIds.length) { - console.log(`${devDoc.id} already has a transcription, skipping.`) - return "skip" - } - + let found = false for (const transcriptionId of devData.transcriptionIds) { if (!targetData.transcriptionIds.includes(transcriptionId)) { + found = true try { await migrateTranscription(db, devDb, transcriptionId, bulkWriter) } catch (err) { @@ -130,6 +126,10 @@ async function migrateHearing( } } } + if (!found) { + console.log(`${devDoc.id} has no new transcriptions.`) + return "skip" + } console.log(`Updating hearing ${devDoc.id}...`) if (bulkWriter) { From 76f7afbed3066ac73b7b21334a79c4c60f98670a Mon Sep 17 00:00:00 2001 From: Sam DeMarrais Date: Tue, 9 Jun 2026 21:23:31 -0400 Subject: [PATCH 09/10] Align to Assembly AI return --- functions/src/events/AssemblyAIHandler.ts | 4 +++- functions/src/webhooks/transcription.ts | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/functions/src/events/AssemblyAIHandler.ts b/functions/src/events/AssemblyAIHandler.ts index 2bc290114..6e57318b5 100644 --- a/functions/src/events/AssemblyAIHandler.ts +++ b/functions/src/events/AssemblyAIHandler.ts @@ -101,13 +101,15 @@ export class AssemblyAIHandlerDummy extends AssemblyAIHandlerBase { const transcriptionId = `mock_${Math.random().toString(36).slice(2)}` setTimeout(async () => { + const transcript: any = await this.getTranscript(transcriptionId) + transcript["transcript_id"] = transcript.id await fetch("http://localhost:5001/demo-dtp/us-central1/transcription", { method: "POST", headers: { "Content-Type": "application/json", "x-maple-webhook": token }, - body: JSON.stringify(await this.getTranscript(transcriptionId)) + body: JSON.stringify(transcript) }) }, 10000) diff --git a/functions/src/webhooks/transcription.ts b/functions/src/webhooks/transcription.ts index 774eba82d..9c6a01608 100644 --- a/functions/src/webhooks/transcription.ts +++ b/functions/src/webhooks/transcription.ts @@ -11,7 +11,7 @@ export const transcription = functions // If we get a request with the right header and status, get the // transcription from the assembly API. - const transcript = await assemblyAI().getTranscript(req.body.id) + const transcript = await assemblyAI().getTranscript(req.body.transcript_id) if (transcript && transcript.webhook_auth) { // If there is a transcript and the transcript has an auth property, From 88df8e887b9ac7c81f14d1facd69d56c2a2868b7 Mon Sep 17 00:00:00 2001 From: Sam DeMarrais Date: Tue, 9 Jun 2026 21:36:50 -0400 Subject: [PATCH 10/10] Prettier --- functions/src/webhooks/transcription.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/functions/src/webhooks/transcription.ts b/functions/src/webhooks/transcription.ts index 9c6a01608..767f015af 100644 --- a/functions/src/webhooks/transcription.ts +++ b/functions/src/webhooks/transcription.ts @@ -11,7 +11,9 @@ export const transcription = functions // If we get a request with the right header and status, get the // transcription from the assembly API. - const transcript = await assemblyAI().getTranscript(req.body.transcript_id) + const transcript = await assemblyAI().getTranscript( + req.body.transcript_id + ) if (transcript && transcript.webhook_auth) { // If there is a transcript and the transcript has an auth property,