diff --git a/components/ballotquestions/CommitteeHearing.test.tsx b/components/ballotquestions/CommitteeHearing.test.tsx
index 9777da7c5..ae41805e5 100644
--- a/components/ballotquestions/CommitteeHearing.test.tsx
+++ b/components/ballotquestions/CommitteeHearing.test.tsx
@@ -31,7 +31,11 @@ describe("CommitteeHearing", () => {
})
it("shows hearing context copy", () => {
- render()
+ render(
+
+ )
expect(screen.getByText("Committee Hearing")).toBeInTheDocument()
expect(
screen.getByText("Committee hearings are public meetings.")
@@ -39,13 +43,19 @@ describe("CommitteeHearing", () => {
})
it("formats the hearing date", () => {
- render()
+ render(
+
+ )
expect(screen.getByText(/December 14, 2025/)).toBeInTheDocument()
})
it("shows a hearing page link when an id is present", () => {
render(
-
+
)
expect(
screen.getByRole("link", { name: /Open hearing page/i })
@@ -53,7 +63,11 @@ describe("CommitteeHearing", () => {
})
it("hides the hearing page link when no hearing id is available", () => {
- render()
+ render(
+
+ )
expect(screen.queryByRole("link")).not.toBeInTheDocument()
})
})
diff --git a/components/ballotquestions/types.ts b/components/ballotquestions/types.ts
index cdc9e0b9f..b27cecb87 100644
--- a/components/ballotquestions/types.ts
+++ b/components/ballotquestions/types.ts
@@ -1,6 +1,6 @@
export type Hearing = {
id: string
- videoURL?: string
+ videoURLs: string[]
startsAt: number // milliseconds since epoch, converted from Firestore Timestamp server-side
}
diff --git a/docs/ballot-questions-frontend.md b/docs/ballot-questions-frontend.md
index 5e9ee756a..ff0cede3d 100644
--- a/docs/ballot-questions-frontend.md
+++ b/docs/ballot-questions-frontend.md
@@ -166,15 +166,15 @@ For each relevant hearing, display:
- **Status**: "Occurred" if `hearing.content.startsAt` is in the past, "Scheduled" if in the future
- **Date**: formatted from `hearing.content.startsAt`
-- **Watch link**: "Watch the committee hearing here." linked to `hearing.videoURL` — hidden if no video
+- **Watch link**: "Watch the committee hearing here." linked to the URL(s) in `hearing.videoURLs` — hidden if the array is empty
Since ballot questions are always under SJ42 and typically have one hearing, render a single hearing block. If there are multiple, render them in reverse chronological order (most recent first).
**Hearing data model recap:**
- `bill.hearingIds?: string[]` — event IDs; doc path is `/events/hearing-{id}`
-- `bill.nextHearingAt?: Timestamp` — convenience field for upcoming hearing only (not sufficient alone — we need date + videoURL from the full document)
-- `hearing.videoURL?: string` — link for the "Watch" CTA
+- `bill.nextHearingAt?: Timestamp` — convenience field for upcoming hearing only (not sufficient alone — we need date + videoURLs from the full document)
+- `hearing.videoURLs: string[]` — links for the "Watch" CTA (an empty array when no video is available)
- `hearing.content.startsAt` — determines "Occurred" vs. "Scheduled" status
No new components are needed for hearing display — build a simple `CommitteeHearing` component local to `components/ballotquestions/`.
diff --git a/functions/src/bills/updateBillReferences.test.ts b/functions/src/bills/updateBillReferences.test.ts
index 04e7bb762..7e16873d5 100644
--- a/functions/src/bills/updateBillReferences.test.ts
+++ b/functions/src/bills/updateBillReferences.test.ts
@@ -13,6 +13,8 @@ function createHearing(
type: "hearing",
startsAt,
fetchedAt: Timestamp.fromMillis(Date.now()),
+ videos: [],
+ transcriptionIds: [],
content: {
EventId: 1,
EventDate: "2026-02-01T10:00:00",
diff --git a/functions/src/events/AssemblyAIHandler.ts b/functions/src/events/AssemblyAIHandler.ts
new file mode 100644
index 000000000..6e57318b5
--- /dev/null
+++ b/functions/src/events/AssemblyAIHandler.ts
@@ -0,0 +1,410 @@
+import {
+ AssemblyAI,
+ Transcript,
+ TranscriptParagraph,
+ TranscriptUtterance,
+ TranscriptWord
+} from "assemblyai"
+import { db, storage } from "../firebase"
+import { randomBytes } from "node:crypto"
+import { sha256 } from "js-sha256"
+import ffmpeg from "fluent-ffmpeg"
+import fs from "fs"
+
+/**
+ * Contract shared by the transcription handlers in this module (the real
+ * AssemblyAI-backed handler and the local dummy).
+ */
+abstract class AssemblyAIHandlerBase {
+  /**
+   * Starts a transcription for one hearing video.
+   *
+   * @param EventId hearing event id
+   * @param videoUrl URL of the video to transcribe
+   * @param bucketName optional storage bucket name
+   * @returns the id of the transcript that was created
+   */
+  abstract submitTranscription({
+    EventId,
+    videoUrl,
+    bucketName
+  }: {
+    EventId: number
+    videoUrl: string
+    bucketName?: string
+  }): Promise<string>
+
+  /** Returns the transcript identified by `transcript_id`. */
+  abstract getTranscript(transcript_id: string): Promise<Transcript>
+
+  /** Returns the paragraphs of the transcript identified by `transcript_id`. */
+  abstract fetchParagraphs(
+    transcript_id: string
+  ): Promise<TranscriptParagraph[]>
+}
+
+/** Transcription handler backed by the AssemblyAI SDK client. */
+export class AssemblyAIHandler extends AssemblyAIHandlerBase {
+  assembly: AssemblyAI
+
+  constructor({ apiKey }: { apiKey: string }) {
+    super()
+    this.assembly = new AssemblyAI({
+      apiKey
+    })
+  }
+
+  /**
+   * Extracts the audio of `videoUrl`, submits it to AssemblyAI with a webhook
+   * pointing at `${FUNCTIONS_API_BASE}/transcription`, and stores the SHA-256
+   * hash of a freshly generated webhook token at
+   * `events/hearing-{EventId}/private/{transcriptId}`.
+   *
+   * @returns the id of the submitted transcript
+   */
+  async submitTranscription({
+    EventId,
+    videoUrl,
+    bucketName
+  }: {
+    EventId: number
+    videoUrl: string
+    bucketName?: string
+  }): Promise<string> {
+    // Random per-transcript secret; only its hash is persisted.
+    const newToken = randomBytes(16).toString("hex")
+    const audioUrl = await extractAudioFromVideo(EventId, videoUrl, bucketName)
+
+    const transcript = await this.assembly.transcripts.submit({
+      audio:
+        // test with: "https://assemblyaiusercontent.com/playground/aKUqpEtmYmI.flac",
+        audioUrl,
+      webhook_url:
+        // make sure process.env.FUNCTIONS_API_BASE equals
+        // https://us-central1-digital-testimony-prod.cloudfunctions.net
+        // on prod. test with:
+        // "https://ngrokid.ngrok-free.app/demo-dtp/us-central1/transcription",
+        `${process.env.FUNCTIONS_API_BASE}/transcription`,
+      speaker_labels: true,
+      webhook_auth_header_name: "x-maple-webhook",
+      webhook_auth_header_value: newToken
+    })
+
+    await db
+      .collection("events")
+      .doc(`hearing-${String(EventId)}`)
+      .collection("private")
+      .doc(transcript.id)
+      .set({
+        videoAssemblyWebhookToken: sha256(newToken)
+      })
+
+    return transcript.id
+  }
+
+  /** Fetches the transcript with the given id from AssemblyAI. */
+  async getTranscript(transcript_id: string): Promise<Transcript> {
+    return await this.assembly.transcripts.get(transcript_id)
+  }
+
+  /** Fetches the paragraph breakdown of the given transcript. */
+  async fetchParagraphs(transcript_id: string): Promise<TranscriptParagraph[]> {
+    return (await this.assembly.transcripts.paragraphs(transcript_id))
+      .paragraphs
+  }
+}
+
+/**
+ * Stand-in handler that never contacts AssemblyAI. It fabricates a transcript
+ * id, stores the hashed webhook token like the real handler, and after ten
+ * seconds posts a generated transcript to the local `transcription` endpoint.
+ */
+export class AssemblyAIHandlerDummy extends AssemblyAIHandlerBase {
+  /**
+   * Registers a mock transcription and schedules the fake webhook delivery.
+   *
+   * @returns the generated `mock_…` transcript id
+   */
+  async submitTranscription({
+    EventId,
+    videoUrl,
+    bucketName
+  }: {
+    EventId: number
+    videoUrl: string
+    bucketName?: string
+  }): Promise<string> {
+    const token = randomBytes(16).toString("hex")
+    const transcriptionId = `mock_${Math.random().toString(36).slice(2)}`
+
+    setTimeout(async () => {
+      // The callback runs detached from the caller, so a failed delivery must
+      // be caught here; otherwise it surfaces as an unhandled rejection.
+      try {
+        const transcript = await this.getTranscript(transcriptionId)
+        await fetch(
+          "http://localhost:5001/demo-dtp/us-central1/transcription",
+          {
+            method: "POST",
+            headers: {
+              "Content-Type": "application/json",
+              "x-maple-webhook": token
+            },
+            // The payload carries the id under `transcript_id` as well.
+            body: JSON.stringify({ ...transcript, transcript_id: transcript.id })
+          }
+        )
+      } catch (error) {
+        console.error(
+          `Mock transcription webhook for ${transcriptionId} failed:`,
+          error
+        )
+      }
+    }, 10000)
+
+    await db
+      .collection("events")
+      .doc(`hearing-${String(EventId)}`)
+      .collection("private")
+      .doc(transcriptionId)
+      .set({
+        videoAssemblyWebhookToken: sha256(token)
+      })
+
+    return transcriptionId
+  }
+
+  /** Returns a freshly generated lorem-ipsum transcript carrying this id. */
+  async getTranscript(transcriptId: string): Promise<Transcript> {
+    return getTranscript(transcriptId).transcript
+  }
+
+  /** Returns the paragraphs of a freshly generated lorem-ipsum transcript. */
+  async fetchParagraphs(transcriptId: string): Promise<TranscriptParagraph[]> {
+    return getTranscript(transcriptId).paragraphs
+  }
+}
+
+/**
+ * Copies the audio stream of `videoUrl` into an `.m4a` file with ffmpeg,
+ * uploads it to the given (or default) storage bucket, and returns a signed
+ * read URL valid for 24 hours. Afterwards it removes previously uploaded
+ * hearing audio files whose embedded timestamp is more than a day old.
+ *
+ * @param EventId hearing event id, used in the file names
+ * @param videoUrl source video URL handed to ffmpeg
+ * @param bucketName optional bucket; the default bucket is used when omitted
+ * @returns signed URL of the uploaded audio file
+ * @throws when ffmpeg, the upload, or URL signing fails
+ */
+const extractAudioFromVideo = async (
+  EventId: number,
+  videoUrl: string,
+  bucketName?: string
+): Promise<string> => {
+  const tmpFilePath = `/tmp/hearing-${EventId}-${Date.now()}.m4a`
+
+  let fileContent: Buffer
+  try {
+    // Stream directly from URL and copy audio codec
+    await new Promise<void>((resolve, reject) => {
+      ffmpeg(videoUrl)
+        .noVideo()
+        .audioCodec("copy")
+        .format("mp4")
+        .on("start", commandLine => {
+          console.log(`Spawned FFmpeg with command: ${commandLine}`)
+        })
+        .on("end", () => {
+          console.log("FFmpeg processing finished successfully")
+          resolve()
+        })
+        .on("error", err => {
+          console.error("FFmpeg error:", err)
+          reject(err)
+        })
+        .save(tmpFilePath)
+    })
+
+    fileContent = await fs.promises.readFile(tmpFilePath)
+  } finally {
+    // Always remove the temporary file, including partial output left behind
+    // by a failed ffmpeg run; a missing file is not an error here.
+    await fs.promises.unlink(tmpFilePath).catch(() => undefined)
+  }
+
+  // Upload the audio file
+  const bucket = bucketName ? storage.bucket(bucketName) : storage.bucket()
+  const audioFileName = `hearing-${EventId}-${Date.now()}.m4a`
+  const file = bucket.file(audioFileName)
+
+  await file.save(fileContent, {
+    metadata: {
+      contentType: "audio/mp4"
+    }
+  })
+
+  const [url] = await file.getSignedUrl({
+    action: "read",
+    expires: Date.now() + 24 * 60 * 60 * 1000
+  })
+
+  // Delete old files. This is best-effort housekeeping: a failure must not
+  // discard the audio that was just uploaded successfully.
+  try {
+    const [files] = await bucket.getFiles({
+      prefix: "hearing-",
+      maxResults: 1000
+    })
+    const oneDayAgo = Date.now() - 24 * 60 * 60 * 1000
+    const oldFiles = files.filter(candidate => {
+      // Only touch objects named the way this function names them, so other
+      // objects sharing the "hearing-" prefix are never deleted.
+      const match = /^hearing-\d+-(\d+)\.m4a$/.exec(candidate.name)
+      return match !== null && parseInt(match[1], 10) < oneDayAgo
+    })
+    await Promise.all(
+      oldFiles.map(stale =>
+        stale.delete().catch(err => {
+          console.warn(`Could not delete old audio file ${stale.name}:`, err)
+        })
+      )
+    )
+  } catch (err) {
+    console.warn("Could not clean up old hearing audio files:", err)
+  }
+
+  // Return the new audio url
+  return url
+}
+
+/**
+ * Standalone variant of the transcription submission: builds its own
+ * AssemblyAI client from `ASSEMBLY_API_KEY`, submits the extracted audio and
+ * stores the hashed webhook token in the hearing's `private/webhookAuth`
+ * document. Resolves to the transcript id.
+ */
+export const submitTranscription = async ({
+  EventId,
+  maybeVideoUrl,
+  bucketName
+}: {
+  EventId: number
+  maybeVideoUrl: string
+  bucketName?: string
+}) => {
+  const apiKey = process.env.ASSEMBLY_API_KEY || ""
+  const assembly = new AssemblyAI({ apiKey })
+
+  const newToken = randomBytes(16).toString("hex")
+  const audioUrl = await extractAudioFromVideo(
+    EventId,
+    maybeVideoUrl,
+    bucketName
+  )
+
+  // make sure process.env.FUNCTIONS_API_BASE equals
+  // https://us-central1-digital-testimony-prod.cloudfunctions.net
+  // on prod. test with:
+  // "https://ngrokid.ngrok-free.app/demo-dtp/us-central1/transcription"
+  const webhookUrl = `${process.env.FUNCTIONS_API_BASE}/transcription`
+
+  const { id: transcriptId } = await assembly.transcripts.submit({
+    // test with: "https://assemblyaiusercontent.com/playground/aKUqpEtmYmI.flac"
+    audio: audioUrl,
+    webhook_url: webhookUrl,
+    speaker_labels: true,
+    webhook_auth_header_name: "x-maple-webhook",
+    webhook_auth_header_value: newToken
+  })
+
+  const authDoc = db
+    .collection("events")
+    .doc(`hearing-${String(EventId)}`)
+    .collection("private")
+    .doc("webhookAuth")
+  await authDoc.set({ videoAssemblyWebhookToken: sha256(newToken) })
+
+  return transcriptId
+}
+
+// Vocabulary the mock transcript generator draws its words from.
+const WORD_BANK = (
+  "lorem ipsum dolor sit amet consectetur adipiscing elit sed do " +
+  "eiusmod tempor incididunt ut labore et dolore magna aliqua"
+).split(" ")
+
+// Speaker labels assigned to generated paragraphs.
+const SPEAKERS = ["A", "B", "C"]
+
+/** Returns a random integer between `min` and `max`, both inclusive. */
+function randomInt(min: number, max: number) {
+  const span = max - min + 1
+  return min + Math.floor(Math.random() * span)
+}
+
+/** Returns a random number in [`min`, `max`] rounded to `precision` decimals. */
+function randomFloat(min: number, max: number, precision = 2) {
+  const sample = Math.random() * (max - min) + min
+  const rounded = sample.toFixed(precision)
+  return Number(rounded)
+}
+
+/** Arithmetic mean of `values`; an empty array yields NaN (0 / 0). */
+function mean(values: number[]) {
+  let total = 0
+  for (const value of values) {
+    total += value
+  }
+  return total / values.length
+}
+
+/** Builds a sentence as an array of `length` random words from WORD_BANK. */
+function loremSentence(length: number) {
+  return Array.from(
+    { length },
+    () => WORD_BANK[randomInt(0, WORD_BANK.length - 1)]
+  )
+}
+
+/** Builds a paragraph of `length` sentences, each 3 to 10 words long. */
+function loremParagraph(length: number) {
+  return Array.from({ length }, () => {
+    const wordCount = randomInt(3, 10)
+    return loremSentence(wordCount)
+  })
+}
+
+// Nested layout: paragraphs -> sentences -> words.
+// Produces 10 to 20 paragraphs of 3 to 8 sentences each.
+function loremTranscriptStructure() {
+  const paragraphCount = randomInt(10, 20)
+  return Array.from({ length: paragraphCount }, () => {
+    const sentenceCount = randomInt(3, 8)
+    return loremParagraph(sentenceCount)
+  })
+}
+
+/**
+ * Generates a random lorem-ipsum transcript carrying `transcript_id`, together
+ * with its paragraph breakdown. Every sentence becomes one utterance; all
+ * words of a paragraph share one randomly chosen speaker.
+ */
+export function getTranscript(transcript_id: string): {
+  transcript: Transcript
+  paragraphs: TranscriptParagraph[]
+} {
+  const round2 = (value: number) => Number(value.toFixed(2))
+  const averageConfidence = (words: TranscriptWord[]) =>
+    round2(mean(words.map(entry => entry.confidence)))
+  const joinedText = (items: { text: string }[], separator: string) =>
+    items.map(entry => entry.text).join(separator)
+
+  const layout = loremTranscriptStructure()
+
+  const utterances: TranscriptUtterance[] = []
+  const paragraphs: TranscriptParagraph[] = []
+  const allWords: TranscriptWord[] = []
+
+  // Running timestamp shared by every word, utterance and paragraph.
+  let clock = 0
+
+  layout.forEach(sentences => {
+    const speaker = SPEAKERS[randomInt(0, SPEAKERS.length - 1)]
+    const paragraphWords: TranscriptWord[] = []
+
+    sentences.forEach(tokens => {
+      const sentenceWords = tokens.map(text => {
+        const word: TranscriptWord = {
+          confidence: randomFloat(0.5, 0.99),
+          start: round2(clock),
+          end: round2(clock + 1),
+          speaker,
+          text
+        }
+        clock += 300
+        return word
+      })
+      paragraphWords.push(...sentenceWords)
+      allWords.push(...sentenceWords)
+
+      utterances.push({
+        confidence: averageConfidence(sentenceWords),
+        start: sentenceWords[0].start,
+        end: sentenceWords[sentenceWords.length - 1].end,
+        speaker,
+        text: joinedText(sentenceWords, " "),
+        words: sentenceWords
+      })
+
+      // Pause between sentences.
+      clock += randomInt(100, 3000)
+    })
+
+    paragraphs.push({
+      confidence: averageConfidence(paragraphWords),
+      start: paragraphWords[0].start,
+      end: paragraphWords[paragraphWords.length - 1].end,
+      text: joinedText(paragraphWords, " "),
+      words: paragraphWords
+    })
+
+    // Pause between paragraphs.
+    clock += randomInt(500, 7000)
+  })
+
+  const transcript: Transcript = {
+    acoustic_model: "no",
+    audio_url: "https://example.com/definitely-a-video",
+    auto_highlights: false,
+    id: transcript_id,
+    language_confidence: 0.95,
+    language_confidence_threshold: 0.03,
+    language_model: "no",
+    speech_model: null,
+    redact_pii: true,
+    status: "completed",
+    summarization: false,
+    webhook_auth: true,
+    webhook_auth_header_name: "x-maple-webhook",
+
+    text: joinedText(utterances, ". "),
+    confidence: averageConfidence(allWords),
+
+    utterances,
+    words: allWords
+  }
+
+  return { transcript, paragraphs }
+}
+
+// Lazily created handler, built on the first call to assemblyAI().
+let assemblyInstance: AssemblyAIHandler | AssemblyAIHandlerDummy | undefined
+
+/**
+ * Returns the shared transcription handler, creating it on first use. The
+ * dummy handler is chosen when `ASSEMBLY_API_KEY` is unset or equals
+ * "test-api-key"; otherwise the real handler is built with that key.
+ */
+export function assemblyAI(): AssemblyAIHandler | AssemblyAIHandlerDummy {
+  if (assemblyInstance) return assemblyInstance
+
+  const apiKey = process.env.ASSEMBLY_API_KEY
+  const useReal = Boolean(apiKey) && apiKey !== "test-api-key"
+  if (useReal) {
+    assemblyInstance = new AssemblyAIHandler({ apiKey: apiKey as string })
+    return assemblyInstance
+  }
+
+  console.log("AssemblyAI is faked for this emulator")
+  assemblyInstance = new AssemblyAIHandlerDummy()
+  return assemblyInstance
+}
diff --git a/functions/src/events/EventScraper.ts b/functions/src/events/EventScraper.ts
new file mode 100644
index 000000000..3f7e62850
--- /dev/null
+++ b/functions/src/events/EventScraper.ts
@@ -0,0 +1,222 @@
+import { RuntimeOptions, runWith } from "firebase-functions/v1"
+import { DateTime } from "luxon"
+import { logFetchError } from "../common"
+import { db, Timestamp } from "../firebase"
+import * as api from "../malegislature"
+import {
+ BaseEvent,
+ BaseEventContent,
+ Session,
+ SessionContent,
+ SpecialEvent,
+ SpecialEventContent
+} from "./types"
+import { currentGeneralCourt } from "../shared"
+
+/**
+ * Base class for scheduled scrapers that list events from the legislature
+ * API, fetch each event and upsert it into the `events` collection.
+ *
+ * `ListItem` is the element type returned by `listEvents`; `Event` is the
+ * document shape written to Firestore.
+ */
+export abstract class EventScraper<ListItem, Event extends BaseEvent> {
+  private schedule
+  private timeout
+  private memory
+  private pastEventCutoff
+
+  /**
+   * @param schedule pub/sub schedule expression, e.g. "every 60 minutes"
+   * @param timeout function timeout in seconds
+   * @param memory memory setting for the function (default "256MB")
+   * @param pastEventCutoff how far back events are still written (default 8 days)
+   */
+  constructor(
+    schedule: string,
+    timeout: number,
+    {
+      memory = "256MB",
+      pastEventCutoff = { days: 8 }
+    }: {
+      memory?: RuntimeOptions["memory"]
+      // NOTE(review): `Duration` is not among this file's visible imports —
+      // confirm it resolves (luxon's `DurationLike` accepts `{ days: 8 }`).
+      pastEventCutoff?: Duration
+    } = {}
+  ) {
+    this.schedule = schedule
+    this.timeout = timeout
+    this.memory = memory
+    this.pastEventCutoff = pastEventCutoff
+  }
+
+  /** The scheduled Cloud Function that runs this scraper. */
+  get function() {
+    return runWith({
+      timeoutSeconds: this.timeout,
+      secrets: ["ASSEMBLY_API_KEY"],
+      memory: this.memory,
+      maxInstances: 1
+    })
+      .pubsub.schedule(this.schedule)
+      .onRun(() => this.run())
+  }
+
+  /** Lists the items to scrape. */
+  abstract listEvents(): Promise<ListItem[]>
+  /** Builds the Firestore document for one listed item. */
+  abstract getEvent(item: ListItem): Promise<Event>
+
+  private async run() {
+    const list = await this.listEvents().catch(logFetchError("event list"))
+
+    if (!list) return
+
+    const writer = db.bulkWriter()
+    const upcomingOrRecentCutoff = DateTime.now().minus(this.pastEventCutoff)
+
+    for (const item of list) {
+      const id = (item as any)?.EventId
+      const event = await this.getEvent(item).catch(logFetchError("event", id))
+
+      if (!event) continue
+      // Stops at the first event older than the cutoff; this presumably
+      // relies on the list being ordered newest-first — TODO confirm.
+      if (event.startsAt.toMillis() < upcomingOrRecentCutoff.toMillis()) break
+
+      writer.set(db.doc(`/events/${event.id}`), event, { merge: true })
+
+      console.log("event in run()", event)
+    }
+
+    await writer.close()
+  }
+
+  /** Parse the event start time in the time zone of the API. */
+  getEventStart(content: { EventDate: string; StartTime: string }) {
+    // The date part comes from EventDate and the time part from StartTime.
+    const { year, month, day } = DateTime.fromISO(content.EventDate, {
+      zone: api.timeZone
+    })
+    const { hour, minute, second, millisecond } = DateTime.fromISO(
+      content.StartTime,
+      { zone: api.timeZone }
+    )
+    const startsAt = DateTime.fromObject(
+      { year, month, day, hour, minute, second, millisecond },
+      { zone: api.timeZone }
+    )
+    return startsAt
+  }
+
+  /** Return timestamps shared between event types. */
+  timestamps(content: BaseEventContent) {
+    const startsAt = this.getEventStart(content)
+    return {
+      fetchedAt: Timestamp.now(),
+      startsAt: Timestamp.fromMillis(startsAt.toMillis())
+    }
+  }
+}
+
+/** Hourly scraper for special events (540 second timeout). */
+export class SpecialEventsScraper extends EventScraper<
+  SpecialEventContent,
+  SpecialEvent
+> {
+  constructor() {
+    super("every 60 minutes", 540)
+  }
+
+  /** Fetches special events and keeps those passing the content guard. */
+  async listEvents() {
+    const fetched = await api.getSpecialEvents()
+    const valid = fetched.filter(SpecialEventContent.guard)
+    return valid
+  }
+
+  /** Wraps the content in a `specialEvent-{EventId}` document. */
+  getEvent(content: SpecialEventContent) {
+    const stamps = this.timestamps(content)
+    const event: SpecialEvent = {
+      id: `specialEvent-${content.EventId}`,
+      type: "specialEvent",
+      content,
+      ...stamps
+    }
+    return Promise.resolve(event)
+  }
+}
+
+/** Hourly scraper for sessions of the current general court (120 second timeout). */
+export class SessionScraper extends EventScraper<SessionContent, Session> {
+  private court = currentGeneralCourt
+
+  constructor() {
+    super("every 60 minutes", 120)
+  }
+
+  /** Fetches the court's sessions and keeps those passing the content guard. */
+  async listEvents() {
+    const events = await api.getSessions(this.court)
+    return events.filter(SessionContent.guard)
+  }
+
+  /** Wraps the content in a `session-{court}-{EventId}` document. */
+  getEvent(content: SessionContent) {
+    const event: Session = {
+      id: `session-${this.court}-${content.EventId}`,
+      type: "session",
+      content,
+      ...this.timestamps(content)
+    }
+    return Promise.resolve(event)
+  }
+}
+
+/**
+ * Base class for scheduled jobs that revisit already stored events of one
+ * type and apply follow-up updates to their documents.
+ *
+ * `ListItem` is the value `updateIf` hands to `getUpdate` for a document that
+ * needs processing.
+ */
+export abstract class EventPostProcessor<ListItem> {
+  private schedule
+  private timeout
+  private eventType
+  private memory
+  private pastEventBeginProcessing
+  private pastEventCutoff
+
+  /**
+   * @param schedule pub/sub schedule expression
+   * @param timeout function timeout in seconds
+   * @param eventType value of the `type` field to select documents by
+   * @param memory memory setting for the function (default "256MB")
+   * @param pastEventBeginProcessing how long after `startsAt` an event
+   *   becomes eligible (default: immediately)
+   * @param pastEventCutoff how long after `startsAt` an event stops being
+   *   eligible (default 8 days)
+   */
+  constructor(
+    schedule: string,
+    timeout: number,
+    eventType: string,
+    {
+      memory = "256MB",
+      pastEventBeginProcessing = {},
+      pastEventCutoff = { days: 8 }
+    }: {
+      memory?: RuntimeOptions["memory"]
+      // NOTE(review): `Duration` is not among this file's visible imports —
+      // confirm it resolves (luxon's `DurationLike` accepts these literals).
+      pastEventBeginProcessing?: Duration
+      pastEventCutoff?: Duration
+    } = {}
+  ) {
+    this.schedule = schedule
+    this.timeout = timeout
+    this.eventType = eventType
+    this.memory = memory
+    this.pastEventBeginProcessing = pastEventBeginProcessing
+    this.pastEventCutoff = pastEventCutoff
+  }
+
+  /** The scheduled Cloud Function that runs this post-processor. */
+  get function() {
+    return runWith({
+      timeoutSeconds: this.timeout,
+      secrets: ["ASSEMBLY_API_KEY"],
+      memory: this.memory,
+      maxInstances: 1
+    })
+      .pubsub.schedule(this.schedule)
+      .onRun(() => this.run())
+  }
+
+  /** Returns the item to process for this document, or null to skip it. */
+  abstract updateIf(data: FirebaseFirestore.DocumentData): null | ListItem
+  /** Computes the field updates for one item (may be asynchronous). */
+  abstract getUpdate(item: ListItem): any
+
+  private async run() {
+    const now = DateTime.now()
+    const begin = now.minus(this.pastEventBeginProcessing).toJSDate()
+    const cutoff = now.minus(this.pastEventCutoff).toJSDate()
+
+    const snapshot = await db
+      .collection("events")
+      .where("type", "==", this.eventType)
+      .where("startsAt", "<=", begin)
+      .where("startsAt", ">=", cutoff)
+      .get()
+
+    if (snapshot.empty) return
+
+    // Created only once there is work, so no writer is left unclosed.
+    const writer = db.bulkWriter()
+
+    try {
+      for (const doc of snapshot.docs) {
+        const data = doc.data()
+        if (!data) continue
+
+        // Isolate failures per document: one event that cannot be processed
+        // must not prevent the remaining events from being updated.
+        try {
+          const item = this.updateIf(data)
+          if (!item) continue
+
+          writer
+            .update(doc.ref, await this.getUpdate(item))
+            .catch(error =>
+              console.error(`Failed to write update for ${doc.id}:`, error)
+            )
+
+          console.log("event in run()", data)
+        } catch (error) {
+          console.error(`Failed to post-process event ${doc.id}:`, error)
+        }
+      }
+    } finally {
+      // Flush whatever was queued even if the loop exited abnormally.
+      await writer.close()
+    }
+  }
+}
diff --git a/functions/src/events/HearingScraper.ts b/functions/src/events/HearingScraper.ts
new file mode 100644
index 000000000..99383f187
--- /dev/null
+++ b/functions/src/events/HearingScraper.ts
@@ -0,0 +1,271 @@
+import { JSDOM } from "jsdom"
+import { db, Timestamp } from "../firebase"
+import * as api from "../malegislature"
+import { Hearing, HearingContent, HearingListItem, Video } from "./types"
+import { isValidVideoUrl } from "./helpers"
+import { Committee } from "../committees/types"
+import { EventPostProcessor, EventScraper } from "./EventScraper"
+import { assemblyAI } from "./AssemblyAIHandler"
+
+/**
+ * Looks up the committee document for the given court and returns the names
+ * of the members whose id matches the House or Senate chairperson's member
+ * code. Resolves to an empty array when the committee does not exist or the
+ * lookup fails (the failure is logged as a warning).
+ */
+const loadCommitteeChairNames = async (
+  generalCourtNumber: number,
+  committeeCode: string
+) => {
+  try {
+    const committeeRef = db
+      .collection(`generalCourts/${generalCourtNumber}/committees`)
+      .doc(committeeCode)
+    const committeeSnap = await committeeRef.get()
+
+    if (!committeeSnap.exists) return [] as string[]
+
+    const committee = Committee.check(committeeSnap.data())
+    const { members, content } = committee
+
+    // Collect whichever chair codes are present.
+    const chairCodes = new Set()
+    const houseCode = content.HouseChairperson?.MemberCode
+    const senateCode = content.SenateChairperson?.MemberCode
+    if (houseCode) chairCodes.add(houseCode)
+    if (senateCode) chairCodes.add(senateCode)
+
+    const roster = members ?? []
+    const chairs = roster.filter(member => chairCodes.has(member.id))
+    return chairs.map(member => member.name)
+  } catch (error) {
+    console.warn(
+      `Failed to load committee chairs for ${committeeCode} (${generalCourtNumber}):`,
+      error
+    )
+    return [] as string[]
+  }
+}
+
+/** Hourly scraper for hearings (480 second timeout). */
+export class HearingScraper extends EventScraper<HearingListItem, Hearing> {
+  constructor() {
+    super("every 60 minutes", 480)
+  }
+
+  /** Lists hearings and keeps the items passing the list-item guard. */
+  async listEvents() {
+    const events = await api.listHearings()
+    return events.filter(HearingListItem.guard)
+  }
+
+  /**
+   * Fetches one hearing and builds its `hearing-{EventId}` document,
+   * including the names of the hosting committee's chairs.
+   *
+   * `videos` and `transcriptionIds` are carried over from the stored document
+   * when one exists. The scraper writes with `{ merge: true }`, which replaces
+   * every field present in the payload, so emitting empty arrays on each run
+   * would erase videos recorded earlier and cause them to be transcribed again.
+   */
+  async getEvent({ EventId }: HearingListItem /* e.g. 4962 */) {
+    const data = await api.getHearing(EventId)
+    const content = HearingContent.check(data)
+
+    const host = content.HearingHost
+    const committeeChairs =
+      host?.CommitteeCode && host?.GeneralCourtNumber
+        ? await loadCommitteeChairNames(
+            host.GeneralCourtNumber,
+            host.CommitteeCode
+          )
+        : []
+
+    const id = `hearing-${EventId}`
+    const existing = (await db.doc(`/events/${id}`).get()).data()
+
+    return {
+      id,
+      type: "hearing",
+      content,
+      committeeChairs,
+      // Defaults apply only to hearings that have no stored values yet.
+      videos: existing?.videos ?? [],
+      transcriptionIds: existing?.transcriptionIds ?? [],
+      ...this.timestamps(content)
+    } as Hearing
+  }
+}
+
+/**
+ * Strips the words that all inputs share at the start and at the end
+ * (compared case-insensitively) and returns what remains of each input,
+ * joined with single spaces. An empty input list yields an empty list.
+ */
+function removeCommonWords(strings: string[]) {
+  if (strings.length === 0) return []
+
+  // Normalize whitespace and split into words
+  const tokenized = strings.map(text =>
+    text.trim().replace(/\s+/g, " ").split(" ")
+  )
+  const reference = tokenized[0]
+  const sameWord = (a: string, b: string) =>
+    a.toLowerCase() === b.toLowerCase()
+
+  // Count leading words shared by every input.
+  let leading = 0
+  for (;;) {
+    const shared = tokenized.every(
+      tokens =>
+        leading < tokens.length && sameWord(tokens[leading], reference[leading])
+    )
+    if (!shared) break
+    leading += 1
+  }
+
+  // Count trailing words shared by every input, never overlapping the prefix.
+  let trailing = 0
+  for (;;) {
+    const shared = tokenized.every(
+      tokens =>
+        trailing < tokens.length - leading &&
+        sameWord(
+          tokens[tokens.length - 1 - trailing],
+          reference[reference.length - 1 - trailing]
+        )
+    )
+    if (!shared) break
+    trailing += 1
+  }
+
+  return tokenized.map(tokens =>
+    tokens.slice(leading, tokens.length - trailing).join(" ")
+  )
+}
+
+/**
+ * Hourly post-processor for hearings: scrapes the hearing's web page for
+ * video links and submits each new video for transcription.
+ */
+export class HearingPostProcessor extends EventPostProcessor<HearingListItem> {
+  constructor() {
+    super("every 60 minutes", 480, "hearing", { memory: "4GB" })
+  }
+
+  /**
+   * Scrapes the hearing detail page and returns its videos (url + title),
+   * de-duplicated by URL. A single video is titled `hearing-{EventId}`.
+   * Several videos are ordered and titled "Part N" when every title carries a
+   * distinct, in-range part number; otherwise they keep page order and get
+   * their titles with the shared leading/trailing words removed.
+   *
+   * @returns an empty array when the page has no `#playWebcast` element
+   * @throws when the page structure does not match what the scraper expects
+   */
+  async getHearingVideos(
+    EventId: number
+  ): Promise<Omit<Video, "transcriptionId">[]> {
+    const hearingErr = `An error collecting videos for hearing ${EventId} (webpage format changed?)`
+
+    const req = await fetch(
+      `https://malegislature.gov/Events/Hearings/Detail/${EventId}`
+    )
+    const res = await req.text()
+    if (!res) throw new Error(`${hearingErr}: No response for request`)
+    const dom = new JSDOM(res)
+
+    const videoElements = Array.from(
+      dom.window.document.querySelectorAll("#playWebcast")
+    ) as Element[]
+    if (videoElements.length === 0) return []
+
+    // The video URL is the first argument of the element's switchVideo(...) handler.
+    const videoURLs = videoElements.map(elem => {
+      const onclick = elem.getAttribute("onclick")
+      if (!onclick) throw new Error(`${hearingErr}: No onclick in ${elem}`)
+      const match = onclick.match(/switchVideo\('([^']+)'/)
+      if (!match || match.length < 2)
+        throw new Error(`${hearingErr}: Could not match switchVideo in ${elem}`)
+      if (!isValidVideoUrl(match[1]))
+        throw new Error(`${hearingErr}: ${match[1]} is not a valid video url`)
+      return match[1]
+    })
+
+    const tbody = videoElements[0].closest("tbody")
+    if (!tbody)
+      throw new Error(
+        `${hearingErr}: Could not find parent tbody of #playWebcast`
+      )
+    // One title per table row, taken from the row's first cell.
+    const titles = Array.from(tbody.querySelectorAll("tr")).map(tr => {
+      const item = tr.querySelector("td")?.textContent?.trim()
+      if (!item)
+        throw new Error(`${hearingErr}: Could not locate title in ${tr}`)
+      return item
+    })
+    if (titles.length !== videoURLs.length)
+      throw new Error(
+        `${hearingErr}: Number of video table rows did not equal number of #playWebcast elements`
+      )
+
+    // Pair URLs with titles, keeping the first occurrence of each URL.
+    const seenUrls = new Set<string>()
+    const videos = videoURLs
+      .map((url, i) => ({ url, title: titles[i] }))
+      .filter(item => {
+        if (seenUrls.has(item.url)) return false
+        seenUrls.add(item.url)
+        return true
+      })
+
+    if (videos.length === 1) {
+      return [{ ...videos[0], title: `hearing-${EventId}` }]
+    }
+
+    // Zero-based position parsed from "N of M", "part N" or "pt N"; -1 if absent.
+    const order = videos.map(item => {
+      const title = item.title.toLowerCase()
+      const match = title.match(
+        /\b(?:(\d+)\s+of\s+\d+|part\s+(\d+)|pt\.?\s+(\d+))\b/
+      )
+      if (!match) return -1
+      const part = parseInt(match[1] || match[2] || match[3], 10)
+      return part - 1
+    })
+
+    // The parsed positions are usable only if they are distinct and in range.
+    const seenParts = new Set<number>()
+    const validOrder = order.every(n => {
+      if (n < 0 || n >= order.length || seenParts.has(n)) return false
+      seenParts.add(n)
+      return true
+    })
+
+    if (validOrder) {
+      const reordered = new Array<(typeof videos)[number]>(videos.length)
+      order.forEach((position, i) => {
+        reordered[position] = videos[i]
+      })
+      return reordered.map((item, index) => ({
+        ...item,
+        title: `Part ${index + 1}`
+      }))
+    }
+
+    // Derive the short titles from the de-duplicated videos so that index i
+    // of the result always belongs to videos[i].
+    let shortTitles = removeCommonWords(videos.map(item => item.title))
+    // If stripping leaves any video without a title, number them all instead.
+    if (shortTitles.some(title => title.length === 0)) {
+      shortTitles = shortTitles.map((_, i) => `Part ${i + 1}`)
+    }
+    console.log(
+      `Ordering not possible for hearing ${EventId} - fallback titles are ${JSON.stringify(
+        shortTitles
+      )}`
+    )
+    return videos.map((item, index) => ({
+      ...item,
+      title: shortTitles[index]
+    }))
+  }
+
+  /**
+   * Selects hearings that have no videos recorded yet. Documents without a
+   * `videos` field are treated the same as documents with an empty list.
+   */
+  updateIf(data: FirebaseFirestore.DocumentData): null | HearingListItem {
+    if (data.videos?.length) return null
+    return { EventId: data.content.EventId }
+  }
+
+  /**
+   * Scrapes the hearing's videos and produces the document update. A video
+   * whose URL appears in `existingVideos` with a transcription id keeps that
+   * id; every other video is submitted for transcription.
+   *
+   * @param existingVideos previously stored videos, if the caller has them
+   * @returns the transcription ids, the videos with their ids, and the time
+   *   of this fetch
+   */
+  async getUpdate(
+    { EventId }: HearingListItem,
+    existingVideos?: Video[]
+  ): Promise<{
+    transcriptionIds: string[]
+    videos: Video[]
+    videosFetchedAt: Timestamp
+  }> {
+    const videos = await this.getHearingVideos(EventId)
+
+    // URL -> transcription id for videos that were already processed.
+    const prevURLs = existingVideos
+      ? Object.fromEntries(
+          existingVideos.map(({ url, transcriptionId }) => [
+            url,
+            transcriptionId
+          ])
+        )
+      : {}
+
+    const transcriptionIds = await Promise.all(
+      videos.map(item => {
+        return prevURLs[item.url] !== undefined
+          ? prevURLs[item.url]
+          : assemblyAI().submitTranscription({
+              EventId,
+              videoUrl: item.url
+            })
+      })
+    )
+
+    const videosWithTranscriptions = videos.map((item, index) => {
+      return {
+        transcriptionId: transcriptionIds[index],
+        ...item
+      }
+    })
+
+    return {
+      transcriptionIds,
+      videos: videosWithTranscriptions,
+      videosFetchedAt: Timestamp.now()
+    }
+  }
+}
diff --git a/functions/src/events/index.ts b/functions/src/events/index.ts
index 96ff5307d..2a1f508ad 100644
--- a/functions/src/events/index.ts
+++ b/functions/src/events/index.ts
@@ -1,3 +1,5 @@
export * from "./scrapeEvents"
export { scrapeSingleHearing } from "./scrapeEvents"
export { scrapeSingleHearingv2 } from "./scrapeEvents"
+export { assemblyAI } from "./AssemblyAIHandler"
+export { HearingScraper, HearingPostProcessor } from "./HearingScraper"
diff --git a/functions/src/events/scrapeEvents.ts b/functions/src/events/scrapeEvents.ts
index 419bd505b..ecbdbce49 100644
--- a/functions/src/events/scrapeEvents.ts
+++ b/functions/src/events/scrapeEvents.ts
@@ -1,418 +1,9 @@
import * as functions from "firebase-functions/v1"
-import { RuntimeOptions, runWith } from "firebase-functions/v1"
import { onCall, CallableRequest } from "firebase-functions/v2/https"
-import { DateTime } from "luxon"
-import { JSDOM } from "jsdom"
-import { AssemblyAI } from "assemblyai"
-import { checkAuth, checkAdmin, logFetchError } from "../common"
-import { db, storage, Timestamp } from "../firebase"
-import * as api from "../malegislature"
-import {
- BaseEvent,
- BaseEventContent,
- Hearing,
- HearingContent,
- HearingListItem,
- Session,
- SessionContent,
- SpecialEvent,
- SpecialEventContent
-} from "./types"
-import { currentGeneralCourt } from "../shared"
-import { randomBytes } from "node:crypto"
-import { sha256 } from "js-sha256"
-import { isValidVideoUrl, withinCutoff } from "./helpers"
-import ffmpeg from "fluent-ffmpeg"
-import fs from "fs"
-import { Committee } from "../committees/types"
-abstract class EventScraper {
- private schedule
- private timeout
- private memory
-
- constructor(
- schedule: string,
- timeout: number,
- memory: RuntimeOptions["memory"] = "256MB"
- ) {
- this.schedule = schedule
- this.timeout = timeout
- this.memory = memory
- }
-
- get function() {
- return runWith({
- timeoutSeconds: this.timeout,
- secrets: ["ASSEMBLY_API_KEY"],
- memory: this.memory,
- maxInstances: 1
- })
- .pubsub.schedule(this.schedule)
- .onRun(() => this.run())
- }
-
- abstract listEvents(): Promise
- abstract getEvent(item: ListItem): Promise
-
- private async run() {
- const list = await this.listEvents().catch(logFetchError("event list"))
-
- if (!list) return
-
- const writer = db.bulkWriter()
- const upcomingOrRecentCutoff = DateTime.now().minus({ days: 8 })
-
- for (let item of list) {
- const id = (item as any)?.EventId,
- event = await this.getEvent(item).catch(logFetchError("event", id))
-
- if (!event) continue
- if (event.startsAt.toMillis() < upcomingOrRecentCutoff.toMillis()) break
-
- writer.set(db.doc(`/events/${event.id}`), event, { merge: true })
-
- console.log("event in run()", event)
- }
-
- await writer.close()
- }
-
- /** Parse the event start time in the time zone of the API. */
- getEventStart(content: { EventDate: string; StartTime: string }) {
- const { year, month, day } = DateTime.fromISO(content.EventDate, {
- zone: api.timeZone
- })
- const { hour, minute, second, millisecond } = DateTime.fromISO(
- content.StartTime,
- { zone: api.timeZone }
- )
- const startsAt = DateTime.fromObject(
- { year, month, day, hour, minute, second, millisecond },
- { zone: api.timeZone }
- )
- return startsAt
- }
-
- /** Return timestamps shared between event types. */
- timestamps(content: BaseEventContent) {
- const startsAt = this.getEventStart(content)
- return {
- fetchedAt: Timestamp.now(),
- startsAt: Timestamp.fromMillis(startsAt.toMillis())
- }
- }
-}
-
-class SpecialEventsScraper extends EventScraper<
- SpecialEventContent,
- SpecialEvent
-> {
- constructor() {
- super("every 60 minutes", 540)
- }
-
- async listEvents() {
- const events = await api.getSpecialEvents()
- return events.filter(SpecialEventContent.guard)
- }
-
- getEvent(content: SpecialEventContent) {
- const event: SpecialEvent = {
- id: `specialEvent-${content.EventId}`,
- type: "specialEvent",
- content,
- ...this.timestamps(content)
- }
- return Promise.resolve(event)
- }
-}
-
-class SessionScraper extends EventScraper {
- private court = currentGeneralCourt
-
- constructor() {
- super("every 60 minutes", 120)
- }
-
- async listEvents() {
- const events = await api.getSessions(this.court)
- return events.filter(SessionContent.guard)
- }
-
- getEvent(content: SessionContent) {
- const event: Session = {
- id: `session-${this.court}-${content.EventId}`,
- type: "session",
- content,
- ...this.timestamps(content)
- }
- return Promise.resolve(event)
- }
-}
-
-const extractAudioFromVideo = async (
- EventId: number,
- videoUrl: string,
- bucketName?: string
-): Promise => {
- const tmpFilePath = `/tmp/hearing-${EventId}-${Date.now()}.m4a`
-
- // Stream directly from URL and copy audio codec
- await new Promise((resolve, reject) => {
- ffmpeg(videoUrl)
- .noVideo()
- .audioCodec("copy")
- .format("mp4")
- .on("start", commandLine => {
- console.log(`Spawned FFmpeg with command: ${commandLine}`)
- })
- .on("end", () => {
- console.log("FFmpeg processing finished successfully")
- resolve()
- })
- .on("error", err => {
- console.error("FFmpeg error:", err)
- reject(err)
- })
- .save(tmpFilePath)
- })
-
- // Upload the audio file
- const bucket = bucketName ? storage.bucket(bucketName) : storage.bucket()
- const audioFileName = `hearing-${EventId}-${Date.now()}.m4a`
- const file = bucket.file(audioFileName)
-
- const fileContent = await fs.promises.readFile(tmpFilePath)
- await file.save(fileContent, {
- metadata: {
- contentType: "audio/mp4"
- }
- })
-
- // Clean up temporary file
- await fs.promises.unlink(tmpFilePath)
-
- const [url] = await file.getSignedUrl({
- action: "read",
- expires: Date.now() + 24 * 60 * 60 * 1000
- })
-
- // Delete old files
- const [files] = await bucket.getFiles({
- prefix: "hearing-",
- maxResults: 1000
- })
- const oneDayAgo = Date.now() - 24 * 60 * 60 * 1000
- const oldFiles = files.filter(file => {
- const timestamp = parseInt(file.name.split("-").pop()?.split(".")[0] || "0")
- return timestamp < oneDayAgo
- })
- await Promise.all(oldFiles.map(file => file.delete()))
-
- // Return the new audio url
- return url
-}
-
-export const submitTranscription = async ({
- EventId,
- maybeVideoUrl,
- bucketName
-}: {
- EventId: number
- maybeVideoUrl: string
- bucketName?: string
-}) => {
- const assembly = new AssemblyAI({
- apiKey: process.env.ASSEMBLY_API_KEY ? process.env.ASSEMBLY_API_KEY : ""
- })
-
- const newToken = randomBytes(16).toString("hex")
- const audioUrl = await extractAudioFromVideo(
- EventId,
- maybeVideoUrl,
- bucketName
- )
-
- const transcript = await assembly.transcripts.submit({
- audio:
- // test with: "https://assemblyaiusercontent.com/playground/aKUqpEtmYmI.flac",
- audioUrl,
- webhook_url:
- // make sure process.env.FUNCTIONS_API_BASE equals
- // https://us-central1-digital-testimony-prod.cloudfunctions.net
- // on prod. test with:
- // "https://ngrokid.ngrok-free.app/demo-dtp/us-central1/transcription",
- `${process.env.FUNCTIONS_API_BASE}/transcription`,
- speaker_labels: true,
- webhook_auth_header_name: "x-maple-webhook",
- webhook_auth_header_value: newToken
- })
-
- await db
- .collection("events")
- .doc(`hearing-${String(EventId)}`)
- .collection("private")
- .doc("webhookAuth")
- .set({
- videoAssemblyWebhookToken: sha256(newToken)
- })
-
- return transcript.id
-}
-
-export const getHearingVideoUrl = async (EventId: number) => {
- const req = await fetch(
- `https://malegislature.gov/Events/Hearings/Detail/${EventId}`
- )
- const res = await req.text()
- if (res) {
- const dom = new JSDOM(res)
- if (dom) {
- const maybeVideoSource =
- dom.window.document.querySelectorAll("video source")
- if (maybeVideoSource.length && maybeVideoSource[0]) {
- const firstVideoSource = maybeVideoSource[0] as HTMLSourceElement
- const maybeVideoUrl = firstVideoSource.src
-
- return isValidVideoUrl(maybeVideoUrl) ? maybeVideoUrl : null
- }
- }
- }
- return null
-}
-
-const shouldScrapeVideo = async (
- EventId: number,
- ignoreCutoff: boolean = false
-) => {
- const eventInDb = await db
- .collection("events")
- .doc(`hearing-${String(EventId)}`)
- .get()
- const eventData = eventInDb.data()
-
- console.log("eventData in shouldScrapeVideo()", eventData)
-
- if (!eventData) {
- return false
- }
- if (!eventData.videoURL) {
- return (
- ignoreCutoff ||
- withinCutoff(new Date(Hearing.check(eventData).startsAt.toDate()))
- )
- }
- return false
-}
-
-const loadCommitteeChairNames = async (
- generalCourtNumber: number,
- committeeCode: string
-) => {
- try {
- const committeeSnap = await db
- .collection(`generalCourts/${generalCourtNumber}/committees`)
- .doc(committeeCode)
- .get()
-
- if (!committeeSnap.exists) return [] as string[]
-
- const { members, content } = Committee.check(committeeSnap.data())
- const chairCodes = new Set()
- const maybeHouse = content.HouseChairperson?.MemberCode
- const maybeSenate = content.SenateChairperson?.MemberCode
-
- if (maybeHouse) chairCodes.add(maybeHouse)
- if (maybeSenate) chairCodes.add(maybeSenate)
- return (members ?? [])
- .filter(member => chairCodes.has(member.id))
- .map(member => member.name)
- } catch (error) {
- console.warn(
- `Failed to load committee chairs for ${committeeCode} (${generalCourtNumber}):`,
- error
- )
- return [] as string[]
- }
-}
-
-class HearingScraper extends EventScraper {
- constructor() {
- super("every 60 minutes", 480, "4GB")
- }
-
- async listEvents() {
- const events = await api.listHearings()
- return events.filter(HearingListItem.guard)
- }
-
- async getEvent(
- { EventId }: HearingListItem /* e.g. 4962 */,
- { ignoreCutoff = false }: { ignoreCutoff?: boolean } = {}
- ) {
- const data = await api.getHearing(EventId)
- const content = HearingContent.check(data)
-
- console.log("content in getEvent()", content)
-
- const host = content.HearingHost
- const committeeChairs =
- host?.CommitteeCode && host?.GeneralCourtNumber
- ? await loadCommitteeChairNames(
- host.GeneralCourtNumber,
- host.CommitteeCode
- )
- : []
-
- if (await shouldScrapeVideo(EventId, ignoreCutoff)) {
- try {
- const maybeVideoUrl = await getHearingVideoUrl(EventId)
- if (maybeVideoUrl) {
- const transcriptId = await submitTranscription({
- maybeVideoUrl,
- EventId
- })
-
- // Immediately save video info to prevent reprocessing
- // since the bulkWriter does not save the video properties
- // returned from this method.
- await db.collection("events").doc(`hearing-${EventId}`).update({
- videoURL: maybeVideoUrl,
- videoFetchedAt: Timestamp.now(),
- videoTranscriptionId: transcriptId
- })
-
- return {
- id: `hearing-${EventId}`,
- type: "hearing",
- content,
- ...this.timestamps(content),
- videoURL: maybeVideoUrl,
- videoFetchedAt: Timestamp.now(),
- committeeChairs,
- videoTranscriptionId: transcriptId // using the assembly Id as our transcriptionId
- } as Hearing
- }
- } catch (error) {
- console.error(`Failed to process audio for hearing ${EventId}:`, error)
- return {
- id: `hearing-${EventId}`,
- type: "hearing",
- content,
- committeeChairs,
- ...this.timestamps(content)
- } as Hearing
- }
- }
- return {
- id: `hearing-${EventId}`,
- type: "hearing",
- content,
- committeeChairs,
- ...this.timestamps(content)
- } as Hearing
- }
-}
+import { checkAuth, checkAdmin } from "../common"
+import { db } from "../firebase"
+import { SpecialEventsScraper, SessionScraper } from "./EventScraper"
+import { HearingScraper, HearingPostProcessor } from "./HearingScraper"
/**
* Callable cloud function to scrape a single hearing by EventId.
@@ -442,12 +33,10 @@ export const scrapeSingleHearing = functions
}
try {
- // Create a temporary scraper instance to reuse the existing logic
- const scraper = new HearingScraper()
- const hearing = await scraper.getEvent(
- { EventId: eventId },
- { ignoreCutoff: true }
- )
+ const hearing = {
+ ...(await new HearingScraper().getEvent({ EventId: eventId })),
+ ...(await new HearingPostProcessor().getUpdate({ EventId: eventId })) // Videos
+ }
// Save the hearing to Firestore
await db.doc(`/events/${hearing.id}`).set(hearing, { merge: true })
@@ -487,12 +76,10 @@ export const scrapeSingleHearingv2 = onCall(
}
try {
- // Create a temporary scraper instance to reuse the existing logic
- const scraper = new HearingScraper()
- const hearing = await scraper.getEvent(
- { EventId: eventId },
- { ignoreCutoff: true }
- )
+ const hearing = {
+ ...(await new HearingScraper().getEvent({ EventId: eventId })),
+ ...(await new HearingPostProcessor().getUpdate({ EventId: eventId }))
+ }
// Save the hearing to Firestore
await db.doc(`/events/${hearing.id}`).set(hearing, { merge: true })
@@ -518,3 +105,4 @@ export const scrapeSingleHearingv2 = onCall(
export const scrapeSpecialEvents = new SpecialEventsScraper().function
export const scrapeSessions = new SessionScraper().function
export const scrapeHearings = new HearingScraper().function
+export const scrapeVideos = new HearingPostProcessor().function
diff --git a/functions/src/events/types.ts b/functions/src/events/types.ts
index 4101b41d1..9a00fe190 100644
--- a/functions/src/events/types.ts
+++ b/functions/src/events/types.ts
@@ -97,13 +97,20 @@ export const HearingContent = BaseEventContent.extend({
export type HearingListItem = Static
export const HearingListItem = Record({ EventId: Number })
+/**
+ * One video attached to a hearing.
+ *
+ * - `url`: location of the video file.
+ * - `title`: label for the video.
+ * - `transcriptionId`: id of the transcription associated with this video
+ *   (presumably the id returned when the transcription was submitted —
+ *   TODO confirm against the scraper).
+ */
+export type Video = Static
+export const Video = Record({
+ url: String,
+ title: String,
+ transcriptionId: String
+})
+
export type Hearing = Static
export const Hearing = BaseEvent.extend({
type: L("hearing"),
content: HearingContent,
- videoURL: Optional(String),
- videoTranscriptionId: Optional(String),
- videoFetchedAt: Optional(InstanceOf(Timestamp)),
+ videos: Array(Video),
+ transcriptionIds: Array(String),
+ videosFetchedAt: Optional(InstanceOf(Timestamp)),
committeeChairs: Optional(Array(String))
})
diff --git a/functions/src/hearings/search.ts b/functions/src/hearings/search.ts
index fe26d0385..ce2375042 100644
--- a/functions/src/hearings/search.ts
+++ b/functions/src/hearings/search.ts
@@ -33,7 +33,7 @@ export const {
documentTrigger: "events/{eventId}",
alias: "hearings",
idField: "id",
- filter: data => data.type === "hearing",
+ filter: data => data.type === "hearing" && "transcriptionIds" in data,
schema: {
fields: [
{ name: "eventId", type: "int32", facet: false },
@@ -57,7 +57,7 @@ export const {
},
convert: data => {
const hearing = Hearing.check(data)
- const { content, startsAt: startsAtTimestamp, id, videoURL } = hearing
+ const { content, startsAt: startsAtTimestamp, id, videos } = hearing
const startsAt = startsAtTimestamp.toMillis()
const schedule = DateTime.fromMillis(startsAt, { zone: timeZone })
@@ -115,7 +115,7 @@ export const {
bill => bill.slug || `${courtNumber}/${bill.number}`
),
court: courtNumber,
- hasVideo: Boolean(videoURL)
+ hasVideo: videos.length > 0
}
}
})
diff --git a/functions/src/index.ts b/functions/src/index.ts
index 641255bf4..970e31b59 100644
--- a/functions/src/index.ts
+++ b/functions/src/index.ts
@@ -17,6 +17,7 @@ export {
} from "./committees"
export {
scrapeHearings,
+ scrapeVideos,
scrapeSessions,
scrapeSpecialEvents,
scrapeSingleHearing,
diff --git a/functions/src/webhooks/transcription.ts b/functions/src/webhooks/transcription.ts
index 04a5ed1f8..767f015af 100644
--- a/functions/src/webhooks/transcription.ts
+++ b/functions/src/webhooks/transcription.ts
@@ -1,5 +1,5 @@
import * as functions from "firebase-functions"
-import { AssemblyAI } from "assemblyai"
+import { assemblyAI } from "../events/AssemblyAIHandler"
import { db, Timestamp } from "../firebase"
import { sha256 } from "js-sha256"
@@ -10,13 +10,8 @@ export const transcription = functions
if (req.body.status === "completed") {
// If we get a request with the right header and status, get the
// transcription from the assembly API.
- const assembly = new AssemblyAI({
- apiKey: process.env.ASSEMBLY_API_KEY
- ? process.env.ASSEMBLY_API_KEY
- : ""
- })
- const transcript = await assembly.transcripts.get(
+ const transcript = await assemblyAI().getTranscript(
req.body.transcript_id
)
@@ -25,7 +20,7 @@ export const transcription = functions
// look for an event (aka Hearing) in the DB with a matching ID.
const maybeEventsInDb = await db
.collection("events")
- .where("videoTranscriptionId", "==", transcript.id)
+ .where("transcriptionIds", "array-contains", transcript.id)
.get()
if (maybeEventsInDb.docs.length) {
@@ -43,7 +38,7 @@ export const transcription = functions
.collection("events")
.doc(doc.id)
.collection("private")
- .doc("webhookAuth")
+ .doc(transcript.id)
.get()
const tokenDataInDb =
@@ -69,12 +64,12 @@ export const transcription = functions
// If there is one authenticated event, pull out the parts we want to
// save and try to save them in the db.
- const { paragraphs } = await assembly.transcripts.paragraphs(
+ const paragraphs = await assemblyAI().fetchParagraphs(
transcript.id
)
const { id, text, audio_url, utterances } = transcript
try {
- const transcriptionInDb = await db
+ const transcriptionInDb = db
.collection("transcriptions")
.doc(id)
diff --git a/pages/ballotQuestions/[id].tsx b/pages/ballotQuestions/[id].tsx
index 5bbbe1ff9..418628e3e 100644
--- a/pages/ballotQuestions/[id].tsx
+++ b/pages/ballotQuestions/[id].tsx
@@ -10,6 +10,7 @@ import {
Hearing
} from "../../components/ballotquestions/types"
import { BallotQuestion, Bill } from "../../components/db"
+import { Video } from "../../components/hearing/hearing"
import { createPage } from "../../components/page"
import { usePublishService } from "../../components/publish/hooks"
import { serverSideTranslations } from "next-i18next/serverSideTranslations"
@@ -22,7 +23,7 @@ async function getHearing(id: string): Promise {
const data = snap.data()
return {
id,
- videoURL: data.videoURL ?? undefined,
+ videoURLs: data.videos.map((item: Video) => item.url),
startsAt: data.startsAt?.toMillis() ?? 0
}
}
diff --git a/scripts/firebase-admin/backfillHearingTranscription.ts b/scripts/firebase-admin/backfillHearingTranscription.ts
index a9c19daeb..7612cf9d7 100644
--- a/scripts/firebase-admin/backfillHearingTranscription.ts
+++ b/scripts/firebase-admin/backfillHearingTranscription.ts
@@ -1,15 +1,15 @@
-import { Timestamp } from "../../functions/src/firebase"
-import { Record, Number, String } from "runtypes"
+import { Record, Number, String, Boolean } from "runtypes"
import { Script } from "./types"
-import { getHearingVideoUrl, submitTranscription } from "functions/src/events"
+import { HearingPostProcessor } from "functions/src/events"
const Args = Record({
eventId: Number.optional(),
- bucketName: String.optional()
+ bucketName: String.optional(),
+ recreateTranscripts: Boolean.optional()
})
export const script: Script = async ({ db, args }) => {
- const { eventId, bucketName } = Args.check(args)
+ const { eventId, bucketName, recreateTranscripts } = Args.check(args)
// Process a single event by eventId
if (eventId) {
@@ -20,30 +20,22 @@ export const script: Script = async ({ db, args }) => {
return
}
const data = doc.data()
- if (data?.videoTranscriptionId) {
- console.log(`Hearing ${eventId} already has a transcription.`)
- return
- }
+ if (!data) return
try {
- const maybeVideoUrl = await getHearingVideoUrl(eventId)
- if (maybeVideoUrl) {
- const transcriptId = await submitTranscription({
- maybeVideoUrl,
- EventId: eventId,
- bucketName
- })
-
- await docRef.update({
- videoURL: maybeVideoUrl,
- videoFetchedAt: Timestamp.now(),
- videoTranscriptionId: transcriptId
- })
+ const update = recreateTranscripts
+ ? await new HearingPostProcessor().getUpdate({ EventId: eventId })
+ : await new HearingPostProcessor().getUpdate(
+ { EventId: eventId },
+ data.videos
+ )
+ if (update !== null) {
+ await docRef.update(update)
console.log(
- `Transcription submitted for hearing ${eventId}: ${transcriptId}`
+ `Transcriptions submitted for hearing ${eventId}: ${update.transcriptionIds}`
)
} else {
- console.log(`No valid video URL found for hearing ${eventId}`)
+ console.log(`No additional videos to be processed for ${eventId}`)
}
} catch (error) {
console.error(`Failed to process hearing ${eventId}:`, error)
@@ -60,40 +52,29 @@ export const script: Script = async ({ db, args }) => {
if (count >= 100) {
break // Limit to 100 operations for this run
}
+ const EventId = parseInt(doc.id.replace("hearing-", ""))
+ console.log(`Processing hearing ${EventId}...`)
const data = doc.data()
- if (!data.videoTranscriptionId) {
- const EventId = parseInt(doc.id.replace("hearing-", ""))
- console.log(`Processing hearing ${EventId}...`)
-
- try {
- const maybeVideoUrl = await getHearingVideoUrl(EventId)
- if (maybeVideoUrl) {
- const transcriptId = await submitTranscription({
- maybeVideoUrl,
- EventId,
- bucketName
- })
+ if (data.empty) continue
- await doc.ref.update({
- videoURL: maybeVideoUrl,
- videoFetchedAt: Timestamp.now(),
- videoTranscriptionId: transcriptId
- })
+ try {
+ const update = recreateTranscripts
+ ? await new HearingPostProcessor().getUpdate({ EventId })
+ : await new HearingPostProcessor().getUpdate({ EventId }, data.videos)
+ if (update.videos.length > data.videos.length) {
+ await doc.ref.update(update)
- console.log(
- `Transcription submitted for hearing ${EventId}: ${transcriptId}`
- )
- count++
- } else {
- console.log(`No valid video URL found for hearing ${EventId}`)
- }
- } catch (error) {
- console.error(`Failed to process hearing ${EventId}:`, error)
+ console.log(
+ `Transcriptions submitted for hearing ${EventId}: ${update.transcriptionIds}`
+ )
+ count++
+ } else {
+ console.log(
+ `No additional videos to be processed for hearing ${EventId}`
+ )
}
- } else {
- console.log(
- `Skipping hearing ${data.EventId}, already has transcription.`
- )
+ } catch (error) {
+ console.error(`Failed to process hearing ${EventId}:`, error)
}
}
console.log("Done processing hearings without transcriptions.")
diff --git a/scripts/firebase-admin/backfillHearingVideoFormat.ts b/scripts/firebase-admin/backfillHearingVideoFormat.ts
new file mode 100644
index 000000000..0596cb484
--- /dev/null
+++ b/scripts/firebase-admin/backfillHearingVideoFormat.ts
@@ -0,0 +1,96 @@
+import { FieldValue, Timestamp } from "../../functions/src/firebase"
+import { Record, Number } from "runtypes"
+import { Script } from "./types"
+
+const Args = Record({
+ eventId: Number.optional()
+})
+
+/**
+ * Builds the Firestore update that converts a hearing document from the
+ * legacy single-video fields (`videoURL`, `videoFetchedAt`,
+ * `videoTranscriptionId`) to the `videos` / `transcriptionIds` /
+ * `videosFetchedAt` layout.
+ *
+ * @param data Current contents of the hearing document.
+ * @returns The fields to pass to `update()`, or `null` when the document
+ *   already has a `videos` field and needs no change.
+ */
+function migrateVideo(
+  data: FirebaseFirestore.DocumentData
+): FirebaseFirestore.DocumentData | null {
+  if ("videos" in data) {
+    return null
+  }
+
+  // The legacy fields are removed from every migrated document.
+  const removeLegacyFields = {
+    videoTranscriptionId: FieldValue.delete(),
+    videoFetchedAt: FieldValue.delete(),
+    videoURL: FieldValue.delete()
+  }
+
+  // A missing or empty videoURL both mean "no video was ever recorded".
+  const url = data.videoURL
+  if (!url) {
+    return {
+      videos: [],
+      transcriptionIds: [],
+      ...removeLegacyFields
+    }
+  }
+
+  const transcriptionId = data.videoTranscriptionId
+
+  const video = {
+    // Default; not shown
+    title: data.id,
+    url,
+    // Firestore rejects `undefined` field values unless
+    // `ignoreUndefinedProperties` is enabled, so the key is left out when the
+    // legacy document never received a transcription id.
+    ...(transcriptionId ? { transcriptionId } : {})
+  }
+
+  return {
+    videos: [video],
+    transcriptionIds: transcriptionId ? [transcriptionId] : [],
+    videosFetchedAt: data.videoFetchedAt || Timestamp.now(),
+    ...removeLegacyFields
+  }
+}
+
+/**
+ * Converts hearing documents in the `events` collection to the multi-video
+ * format produced by `migrateVideo`.
+ *
+ * With `eventId`, exactly one hearing is converted and the script throws if
+ * the lookup does not return exactly one document. Without it, every document
+ * of type "hearing" is converted through a bulk writer.
+ */
+export const script: Script = async ({ db, args }) => {
+  const { eventId } = Args.check(args)
+
+  // Process a single event by eventId
+  if (eventId !== undefined) {
+    // Hearing documents store their id as the string `hearing-<EventId>`, so
+    // the numeric argument has to be converted before it can match.
+    const snapshot = await db
+      .collection("events")
+      .where("type", "==", "hearing")
+      .where("id", "==", `hearing-${eventId}`)
+      .get()
+
+    if (snapshot.docs.length !== 1) {
+      throw new Error(
+        `The number of documents matching the event id ${eventId} must be exactly one`
+      )
+    }
+
+    const doc = snapshot.docs[0]
+    const modify = migrateVideo(doc.data())
+    if (modify) {
+      // Awaited so the write completes (and any failure surfaces) before the
+      // script reports success.
+      await doc.ref.update(modify)
+    }
+  } else {
+    const snapshot = await db
+      .collection("events")
+      .where("type", "==", "hearing")
+      .get()
+
+    if (snapshot.empty) {
+      throw new Error("Hearing backfill failed; no documents were found")
+    }
+
+    const bulkWriter = db.bulkWriter()
+
+    for (const doc of snapshot.docs) {
+      const data = doc.data()
+      console.log(data.id)
+      const modify = migrateVideo(data)
+      if (modify) {
+        bulkWriter.update(doc.ref, modify)
+      }
+    }
+    // close() flushes every queued update before resolving.
+    await bulkWriter.close()
+  }
+
+  console.log("Video backfill complete")
+}
diff --git a/scripts/firebase-admin/migrateHearingTranscription.ts b/scripts/firebase-admin/migrateHearingTranscription.ts
index 910ba3943..03dce0f1c 100644
--- a/scripts/firebase-admin/migrateHearingTranscription.ts
+++ b/scripts/firebase-admin/migrateHearingTranscription.ts
@@ -39,6 +39,119 @@ function convertTimestamps(obj: any): any {
return obj
}
+/**
+ * Copies one transcription document, and the documents in its direct
+ * subcollections, from the source project into the target project.
+ *
+ * @param db Firestore of the target project.
+ * @param devDb Firestore of the source project.
+ * @param transcriptionId Id of the document in `transcriptions` to copy.
+ * @param bulkWriter When given, all writes are queued on it instead of being
+ *   awaited one by one; the caller is responsible for flushing/closing it.
+ * @throws Error when the transcription does not exist in the source project.
+ *   Without a bulk writer, the awaited `create` also rejects when the
+ *   transcription already exists in the target project.
+ */
+async function migrateTranscription(
+  db: admin.firestore.Firestore,
+  devDb: admin.firestore.Firestore,
+  transcriptionId: string,
+  bulkWriter?: FirebaseFirestore.BulkWriter
+) {
+  const devTranscriptionDoc = await devDb
+    .collection("transcriptions")
+    .doc(transcriptionId)
+    .get()
+
+  const devTranscriptionData = devTranscriptionDoc.exists
+    ? devTranscriptionDoc.data()
+    : null
+
+  if (!devTranscriptionData) {
+    throw new Error(
+      `Transcription ${transcriptionId} not found in dev project.`
+    )
+  }
+
+  const targetRef = db.collection("transcriptions").doc(transcriptionId)
+
+  // Create transcription in target project instead of setting, in case it already exists, which will throw an error
+  const convertedData = convertTimestamps(devTranscriptionData)
+  console.log(`Creating transcription ${transcriptionId}...`)
+  if (bulkWriter) {
+    bulkWriter.create(targetRef, convertedData)
+  } else {
+    await targetRef.create(convertedData)
+  }
+
+  const subcollections = await devTranscriptionDoc.ref.listCollections()
+  for (const subcol of subcollections) {
+    const docs = await subcol.get()
+    for (const doc of docs.docs) {
+      const ref = targetRef.collection(subcol.id).doc(doc.id)
+      // Write each document exactly once: queued when a bulk writer is
+      // supplied, awaited directly otherwise.
+      if (bulkWriter) {
+        bulkWriter.set(ref, doc.data())
+      } else {
+        await ref.set(doc.data())
+      }
+    }
+  }
+}
+
+/**
+ * Migrates the transcriptions of one hearing from the source project to the
+ * target project, then copies the hearing's video fields onto the target
+ * hearing document.
+ *
+ * Only transcription ids present on the source hearing but absent from the
+ * target hearing are copied.
+ *
+ * @param db Firestore of the target project.
+ * @param devDb Firestore of the source project.
+ * @param devDoc Snapshot of the hearing in the source project.
+ * @param bulkWriter When given, writes are queued on it instead of awaited.
+ * @returns "skip" when there is nothing to migrate (no source transcriptions,
+ *   no target hearing, or no new transcription ids), "fail" when copying a
+ *   transcription threw, and "migrate" once the hearing update is issued.
+ */
+async function migrateHearing(
+  db: admin.firestore.Firestore,
+  devDb: admin.firestore.Firestore,
+  devDoc:
+    | admin.firestore.DocumentSnapshot
+    | admin.firestore.QueryDocumentSnapshot,
+  bulkWriter?: FirebaseFirestore.BulkWriter
+): Promise<"migrate" | "skip" | "fail"> {
+  const devData = devDoc.data()
+
+  if (!devData?.transcriptionIds?.length) {
+    console.log(`Hearing ${devDoc.id} has no transcription to migrate.`)
+    return "skip"
+  }
+
+  const targetRef = db.collection("events").doc(devDoc.id)
+  const targetDoc = await targetRef.get()
+  const targetData = targetDoc.exists ? targetDoc.data() : null
+
+  if (!targetData) {
+    console.log(`${devDoc.id} not found in target project.`)
+    return "skip"
+  }
+
+  // A target hearing without a `transcriptionIds` array simply has none of
+  // the source transcriptions yet, so it must not crash the lookup.
+  const existingIds: string[] = Array.isArray(targetData.transcriptionIds)
+    ? targetData.transcriptionIds
+    : []
+  const missingIds: string[] = devData.transcriptionIds.filter(
+    (transcriptionId: string) => !existingIds.includes(transcriptionId)
+  )
+
+  if (missingIds.length === 0) {
+    console.log(`${devDoc.id} has no new transcriptions.`)
+    return "skip"
+  }
+
+  for (const transcriptionId of missingIds) {
+    try {
+      await migrateTranscription(db, devDb, transcriptionId, bulkWriter)
+    } catch (err) {
+      console.error(`Error creating transcription ${transcriptionId}:`, err)
+      return "fail"
+    }
+  }
+
+  console.log(`Updating hearing ${devDoc.id}...`)
+  const update = {
+    videos: devData.videos,
+    videosFetchedAt: convertTimestamps(devData.videosFetchedAt),
+    transcriptionIds: devData.transcriptionIds
+  }
+  if (bulkWriter) {
+    bulkWriter.update(targetRef, update)
+  } else {
+    await targetRef.update(update)
+  }
+
+  return "migrate"
+}
+
const Args = Record({
sourceProject: String,
hearing: Number.optional()
@@ -66,78 +179,15 @@ export const script: Script = async ({ db, args }) => {
if (hearing) {
const hearingId = "hearing-" + hearing
console.log(`Processing single hearing: ${hearingId}`)
- const devHearingsSnapshot = await devDb
- .collection("events")
- .doc(hearingId)
- .get()
+ const devDoc = await devDb.collection("events").doc(hearingId).get()
- if (!devHearingsSnapshot.exists) {
+ if (!devDoc.exists) {
console.error(`Hearing ${hearingId} not found in dev project.`)
return
}
- const devData = devHearingsSnapshot.data()
-
- if (!devData?.videoTranscriptionId) {
- console.log(`Hearing ${hearingId} has no transcription to migrate.`)
- return
- }
- const targetDoc = await db.collection("events").doc(hearingId).get()
- const targetData = targetDoc.exists ? targetDoc.data() : null
-
- // Only migrate if hearing in target environment does not have a transcription yet
- if (!targetData?.videoTranscriptionId) {
- const transcriptionId = devData.videoTranscriptionId
- const devTranscriptionDoc = await devDb
- .collection("transcriptions")
- .doc(transcriptionId)
- .get()
-
- const devTranscriptionData = devTranscriptionDoc.exists
- ? devTranscriptionDoc.data()
- : null
-
- if (devTranscriptionData) {
- // Create transcription in target project instead of setting, in case it already exists, which will throw an error
- const convertedData = convertTimestamps(devTranscriptionData)
- try {
- console.log(`Creating transcription ${transcriptionId}...`)
- await db
- .collection("transcriptions")
- .doc(transcriptionId)
- .create(convertedData)
- } catch (err) {
- console.error(`Error creating transcription ${transcriptionId}:`, err)
- return
- }
-
- const subcollections = await devTranscriptionDoc.ref.listCollections()
- for (const subcol of subcollections) {
- const docs = await subcol.get()
- for (const doc of docs.docs) {
- await db
- .collection("transcriptions")
- .doc(transcriptionId)
- .collection(subcol.id)
- .doc(doc.id)
- .set(doc.data())
- }
- }
- } else {
- console.error(
- `Transcription ${transcriptionId} not found in dev project.`
- )
- }
- await db
- .collection("events")
- .doc(hearingId)
- .update({
- videoURL: devData.videoURL,
- videoFetchedAt: convertTimestamps(devData.videoFetchedAt),
- videoTranscriptionId: devData.videoTranscriptionId
- })
- console.log(`Migration complete for hearing ${hearingId}.`)
- }
+ await migrateHearing(db, devDb, devDoc)
+ console.log(`Migration complete for hearing ${hearingId}.`)
} else {
// For full migration
const devHearingsSnapshot = await devDb
@@ -157,83 +207,14 @@ export const script: Script = async ({ db, args }) => {
console.log(`Migration limit of ${limit} reached. Stopping.`)
break
}
- const devData = devDoc.data()
- if (!devData.videoTranscriptionId) {
- skipped++
- console.log(`${devDoc.id} has no transcription to migrate.`)
- continue
- }
-
- const targetDoc = await db.collection("events").doc(devDoc.id).get()
- const targetData = targetDoc.exists ? targetDoc.data() : null
-
- if (!targetData) {
- skipped++
- console.log(`${devDoc.id} not found in target project.`)
- continue
- }
- // Only migrate if hearing in target environment does not have a transcription yet
- if (!targetData?.videoTranscriptionId) {
- console.log(`Migrating ${devDoc.id}...`)
- const transcriptionId = devData.videoTranscriptionId
- const devTranscriptionDoc = await devDb
- .collection("transcriptions")
- .doc(transcriptionId)
- .get()
-
- const devTranscriptionData = devTranscriptionDoc.exists
- ? devTranscriptionDoc.data()
- : null
-
- if (devTranscriptionData) {
- // Create transcription in target project instead of setting, in case it already exists, which will throw an error
- const convertedData = convertTimestamps(devTranscriptionData)
- try {
- console.log(`Creating transcription ${transcriptionId}...`)
- bulkWriter.create(
- db.collection("transcriptions").doc(transcriptionId),
- convertedData
- )
- } catch (err) {
- failed++
- console.error(
- `Error creating transcription ${transcriptionId}:`,
- err
- )
- continue
- }
-
- const subcollections = await devTranscriptionDoc.ref.listCollections()
- for (const subcol of subcollections) {
- const docs = await subcol.get()
- for (const doc of docs.docs) {
- await db
- .collection("transcriptions")
- .doc(transcriptionId)
- .collection(subcol.id)
- .doc(doc.id)
- .set(doc.data())
- }
- }
- } else {
- failed++
- console.error(
- `Transcription ${transcriptionId} not found in dev project.`
- )
- continue
- }
-
- console.log(`Updating ${devDoc.id}...`)
- bulkWriter.update(db.collection("events").doc(devDoc.id), {
- videoURL: devData.videoURL,
- videoFetchedAt: convertTimestamps(devData.videoFetchedAt),
- videoTranscriptionId: devData.videoTranscriptionId
- })
- migrated++
+ const result = await migrateHearing(db, devDb, devDoc, bulkWriter)
+ if (result === "migrate") {
+ migrated += 1
+ } else if (result === "skip") {
+ skipped += 1
} else {
- console.log(`${devDoc.id} already has a transcription, skipping.`)
- skipped++
+ failed += 1
}
}
diff --git a/stories/organisms/ballotquestions/BallotQuestionDetails.stories.tsx b/stories/organisms/ballotquestions/BallotQuestionDetails.stories.tsx
index 40ec1154c..c26d83150 100644
--- a/stories/organisms/ballotquestions/BallotQuestionDetails.stories.tsx
+++ b/stories/organisms/ballotquestions/BallotQuestionDetails.stories.tsx
@@ -142,7 +142,10 @@ const sampleBill: Bill = {
const sampleHearing = {
id: "hearing-101",
startsAt: new Date("2026-03-12T10:00:00-05:00").getTime(),
- videoURL: "https://malegislature.gov/"
+ videoURLs: [
+ "https://prodarchivevideo.blob.core.windows.net/video/2022/Hearings/Joint/April/12.mp4",
+ "https://prodarchivevideo.blob.core.windows.net/video/2022/Hearings/Joint/April/12_1.mp4"
+ ]
}
const emptyTestimonyListing: UsePublishedTestimonyListing = {