From 9e191bc349f89ac08e6d97935feebbc92958bcc4 Mon Sep 17 00:00:00 2001 From: kiminkim724 Date: Wed, 24 Sep 2025 21:49:23 -0400 Subject: [PATCH 1/5] Initial version of backfill --- functions/src/events/scrapeEvents.ts | 4 +- .../backfillHearingTranscription.ts | 97 +++++++++++++++++++ 2 files changed, 99 insertions(+), 2 deletions(-) create mode 100644 scripts/firebase-admin/backfillHearingTranscription.ts diff --git a/functions/src/events/scrapeEvents.ts b/functions/src/events/scrapeEvents.ts index 5398728f5..5e875a486 100644 --- a/functions/src/events/scrapeEvents.ts +++ b/functions/src/events/scrapeEvents.ts @@ -217,7 +217,7 @@ const extractAudioFromVideo = async ( return url } -const submitTranscription = async ({ +export const submitTranscription = async ({ EventId, maybeVideoUrl }: { @@ -258,7 +258,7 @@ const submitTranscription = async ({ return transcript.id } -const getHearingVideoUrl = async (EventId: number) => { +export const getHearingVideoUrl = async (EventId: number) => { const req = await fetch( `https://malegislature.gov/Events/Hearings/Detail/${EventId}` ) diff --git a/scripts/firebase-admin/backfillHearingTranscription.ts b/scripts/firebase-admin/backfillHearingTranscription.ts new file mode 100644 index 000000000..44e40a752 --- /dev/null +++ b/scripts/firebase-admin/backfillHearingTranscription.ts @@ -0,0 +1,97 @@ +import { Timestamp } from "../../functions/src/firebase" +import { Record, Number } from "runtypes" +import { Script } from "./types" +import { getHearingVideoUrl, submitTranscription } from "functions/src/events" + +const Args = Record({ + eventId: Number.optional() +}) + +export const script: Script = async ({ db, args }) => { + const { eventId } = Args.check(args) + + // Process a single event by eventId + if (eventId) { + const docRef = db.collection("events").doc(`hearing-${eventId}`) + const doc = await docRef.get() + if (!doc.exists) { + console.log(`No hearing found with EventId ${eventId}`) + return + } + const data = doc.data() + if (data?.videoTranscriptionId) { + console.log(`Hearing ${eventId} already has a transcription.`) + return + } + try { + const maybeVideoUrl = await getHearingVideoUrl(eventId) + if (maybeVideoUrl) { + const transcriptId = await submitTranscription({ + maybeVideoUrl, + EventId: eventId + }) + + await docRef.update({ + videoURL: maybeVideoUrl, + videoFetchedAt: Timestamp.now(), + videoTranscriptionId: transcriptId + }) + + console.log( + `Transcription submitted for hearing ${eventId}: ${transcriptId}` + ) + } else { + console.log(`No valid video URL found for hearing ${eventId}`) + } + } catch (error) { + console.error(`Failed to process hearing ${eventId}:`, error) + } + } else { + // Bulk process events + const hearingsSnapshot = await db + .collection("events") + .where("type", "==", "hearing") + .get() + + const writer = db.bulkWriter() + let count = 0 + + for (const doc of hearingsSnapshot.docs) { + if (count >= 200) { + break // Limit to 200 operations for this run + } + const data = doc.data() + if (!data.videoTranscriptionId) { + const EventId = parseInt(doc.id.replace("hearing-", "")) + console.log(`Processing hearing ${EventId}...`) + + try { + const maybeVideoUrl = await getHearingVideoUrl(EventId) + if (maybeVideoUrl) { + const transcriptId = await submitTranscription({ + maybeVideoUrl, + EventId + }) + + writer.update(doc.ref, { + videoURL: maybeVideoUrl, + videoFetchedAt: Timestamp.now(), + videoTranscriptionId: transcriptId + }) + + console.log( + `Transcription submitted for hearing ${EventId}: ${transcriptId}` + ) + count++ + } else { + console.log(`No valid video URL found for hearing ${EventId}`) + } + } catch (error) { + console.error(`Failed to process hearing ${EventId}:`, error) + } + } + } + await writer.close() + console.log("Done processing hearings without transcriptions.") + } +} From 751c3b473fe8d5075fd91dfb14ea347ce6dc241f Mon Sep 17 00:00:00 2001 From: kiminkim724 Date: Tue, 7 Oct 2025 19:49:53 -0400 Subject: [PATCH 2/5] Add bucketName arguments to specify Firebase bucket --- functions/src/events/scrapeEvents.ts | 15 +++++++++++---- .../backfillHearingTranscription.ts | 13 ++++++++----- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/functions/src/events/scrapeEvents.ts b/functions/src/events/scrapeEvents.ts index 5e875a486..11e1686bf 100644 --- a/functions/src/events/scrapeEvents.ts +++ b/functions/src/events/scrapeEvents.ts @@ -148,7 +148,8 @@ class SessionScraper extends EventScraper { const extractAudioFromVideo = async ( EventId: number, - videoUrl: string + videoUrl: string, + bucketName?: string ): Promise => { const tmpFilePath = `/tmp/hearing-${EventId}-${Date.now()}.m4a` @@ -182,7 +183,7 @@ const extractAudioFromVideo = async ( }) // Upload the audio file - const bucket = storage.bucket() + const bucket = bucketName ? storage.bucket(bucketName) : storage.bucket() const audioFileName = `hearing-${EventId}-${Date.now()}.m4a` const file = bucket.file(audioFileName) @@ -219,17 +220,23 @@ const extractAudioFromVideo = async ( export const submitTranscription = async ({ EventId, - maybeVideoUrl + maybeVideoUrl, + bucketName }: { EventId: number maybeVideoUrl: string + bucketName?: string }) => { const assembly = new AssemblyAI({ apiKey: process.env.ASSEMBLY_API_KEY ? process.env.ASSEMBLY_API_KEY : "" }) const newToken = randomBytes(16).toString("hex") - const audioUrl = await extractAudioFromVideo(EventId, maybeVideoUrl) + const audioUrl = await extractAudioFromVideo( + EventId, + maybeVideoUrl, + bucketName + ) const transcript = await assembly.transcripts.submit({ audio: diff --git a/scripts/firebase-admin/backfillHearingTranscription.ts b/scripts/firebase-admin/backfillHearingTranscription.ts index 44e40a752..2cd698adb 100644 --- a/scripts/firebase-admin/backfillHearingTranscription.ts +++ b/scripts/firebase-admin/backfillHearingTranscription.ts @@ -1,14 +1,15 @@ import { Timestamp } from "../../functions/src/firebase" -import { Record, Number } from "runtypes" +import { Record, Number, String } from "runtypes" import { Script } from "./types" import { getHearingVideoUrl, submitTranscription } from "functions/src/events" const Args = Record({ - eventId: Number.optional() + eventId: Number.optional(), + bucketName: String.optional() }) export const script: Script = async ({ db, args }) => { - const { eventId } = Args.check(args) + const { eventId, bucketName } = Args.check(args) // Process a single event by eventId if (eventId) { @@ -28,7 +29,8 @@ export const script: Script = async ({ db, args }) => { if (maybeVideoUrl) { const transcriptId = await submitTranscription({ maybeVideoUrl, - EventId: eventId + EventId: eventId, + bucketName }) await docRef.update({ @@ -70,7 +72,8 @@ export const script: Script = async ({ db, args }) => { if (maybeVideoUrl) { const transcriptId = await submitTranscription({ maybeVideoUrl, - EventId + EventId, + bucketName }) writer.update(doc.ref, { From 31eba648e473809f7894875203c09e13d17a468e Mon Sep 17 00:00:00 2001 From: kiminkim724 Date: Tue, 7 Oct 2025 19:57:47 -0400 Subject: [PATCH 3/5] Change to sequential processing instead of parallel to prevent memory overload --- .../backfillHearingTranscription.ts | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/scripts/firebase-admin/backfillHearingTranscription.ts b/scripts/firebase-admin/backfillHearingTranscription.ts index 2cd698adb..bf5417cc3 100644 --- a/scripts/firebase-admin/backfillHearingTranscription.ts +++ b/scripts/firebase-admin/backfillHearingTranscription.ts @@ -49,19 +49,13 @@ export const script: Script = async ({ db, args }) => { console.error(`Failed to process hearing ${eventId}:`, error) } } else { - // Bulk process events + // Bulk process events sequentially const hearingsSnapshot = await db .collection("events") .where("type", "==", "hearing") .get() - const writer = db.bulkWriter() - let count = 0 - for (const doc of hearingsSnapshot.docs) { - if (count >= 200) { - break // Limit to 200 operations for this run - } const data = doc.data() if (!data.videoTranscriptionId) { const EventId = parseInt(doc.id.replace("hearing-", "")) @@ -76,7 +70,7 @@ export const script: Script = async ({ db, args }) => { bucketName }) - writer.update(doc.ref, { + await doc.ref.update({ videoURL: maybeVideoUrl, videoFetchedAt: Timestamp.now(), videoTranscriptionId: transcriptId @@ -85,16 +79,18 @@ export const script: Script = async ({ db, args }) => { console.log( `Transcription submitted for hearing ${EventId}: ${transcriptId}` ) - count++ } else { console.log(`No valid video URL found for hearing ${EventId}`) } } catch (error) { console.error(`Failed to process hearing ${EventId}:`, error) } + } else { + console.log( + `Skipping hearing ${data.EventId}, already has transcription.` + ) } } - await writer.close() console.log("Done processing hearings without transcriptions.") } } From 90747b92b3b3f76b392705ada98bbad54801cbc2 Mon Sep 17 00:00:00 2001 From: kiminkim724 Date: Tue, 7 Oct 2025 21:18:11 -0400 Subject: [PATCH 4/5] Add limit to prevent transcription overload --- scripts/firebase-admin/backfillHearingTranscription.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/firebase-admin/backfillHearingTranscription.ts b/scripts/firebase-admin/backfillHearingTranscription.ts index bf5417cc3..a9c19daeb 100644 --- a/scripts/firebase-admin/backfillHearingTranscription.ts +++ b/scripts/firebase-admin/backfillHearingTranscription.ts @@ -49,13 +49,17 @@ export const script: Script = async ({ db, args }) => { console.error(`Failed to process hearing ${eventId}:`, error) } } else { - // Bulk process events sequentially + // Run events sequentially to avoid overloading the transcription service const hearingsSnapshot = await db .collection("events") .where("type", "==", "hearing") .get() + let count = 0 for (const doc of hearingsSnapshot.docs) { + if (count >= 100) { + break // Limit to 100 operations for this run + } const data = doc.data() if (!data.videoTranscriptionId) { const EventId = parseInt(doc.id.replace("hearing-", "")) @@ -79,6 +83,7 @@ export const script: Script = async ({ db, args }) => { console.log( `Transcription submitted for hearing ${EventId}: ${transcriptId}` ) + count++ } else { console.log(`No valid video URL found for hearing ${EventId}`) } From 6e1f4ea42dccd6566986137e0cdfa2c3ba5ac006 Mon Sep 17 00:00:00 2001 From: kiminkim724 Date: Tue, 11 Nov 2025 21:21:45 -0500 Subject: [PATCH 5/5] Initial migration script --- .../migrateHearingTranscription.ts | 175 ++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 scripts/firebase-admin/migrateHearingTranscription.ts diff --git a/scripts/firebase-admin/migrateHearingTranscription.ts b/scripts/firebase-admin/migrateHearingTranscription.ts new file mode 100644 index 000000000..285b3875f --- /dev/null +++ b/scripts/firebase-admin/migrateHearingTranscription.ts @@ -0,0 +1,175 @@ +import * as admin from "firebase-admin" +import { Number, Record, String } from "runtypes" +import { Script } from "./types" + +function getDevServiceAccount(path: string) { + return require(path) +} + +// Initialize source (dev) Firebase +function initDevApp(devServiceAccountPath: string) { + const devServiceAccount = getDevServiceAccount(devServiceAccountPath) + return admin.initializeApp( + { + credential: admin.credential.cert(devServiceAccount) + }, + "dev" + ) +} + +const Args = Record({ + sourceProject: String, + hearing: Number.optional() +}) + +export const script: Script = async ({ db, args }) => { + const { sourceProject, hearing } = Args.check(args) + if (!sourceProject) { + console.error( + "Please provide the path to the dev service account JSON file as an argument." + ) + process.exit(1) + } + + // Initialize dev app and db (digital-testimony-dev) + const devApp = initDevApp(sourceProject) + const devDb = devApp.firestore() + + // For single hearing migration + if (hearing) { + const hearingId = "hearing-" + hearing + console.log(`Processing single hearing: ${hearingId}`) + const devHearingsSnapshot = await devDb + .collection("events") + .doc(hearingId) + .get() + + if (!devHearingsSnapshot.exists) { + console.error(`Hearing ${hearingId} not found in dev project.`) + return + } + const devData = devHearingsSnapshot.data() + + if (!devData?.videoTranscriptionId) { + console.log(`Hearing ${hearingId} has no transcription to migrate.`) + return + } + const targetDoc = await db.collection("events").doc(hearingId).get() + const targetData = targetDoc.exists ? targetDoc.data() : null + + // Only migrate if hearing in target environment does not have a transcription yet + if (!targetData?.videoTranscriptionId) { + const transcriptionId = devData.videoTranscriptionId + const devTranscriptionDoc = await devDb + .collection("transcriptions") + .doc(transcriptionId) + .get() + + const devTranscriptionData = devTranscriptionDoc.exists + ? devTranscriptionDoc.data() + : null + + if (devTranscriptionData) { + // Create transcription in target project instead of setting, in case it already exists, which will throw an error + try { + await db + .collection("transcriptions") + .doc(transcriptionId) + .create(devTranscriptionData) + } catch (err) { + console.error(`Error creating transcription ${transcriptionId}:`, err) + return + } + } else { + console.error( + `Transcription ${transcriptionId} not found in dev project.` + ) + } + + await db.collection("events").doc(hearingId).update({ + videoURL: devData.videoURL, + videoFetchedAt: devData.videoFetchedAt, + videoTranscriptionId: devData.videoTranscriptionId + }) + console.log(`Migration complete for hearing ${hearingId}.`) + } + } else { + // For full migration + const devHearingsSnapshot = await devDb + .collection("events") + .where("type", "==", "hearing") + .get() + + let migrated = 0, + skipped = 0, + failed = 0 + + const bulkWriter = db.bulkWriter() + + for (const devDoc of devHearingsSnapshot.docs) { + const devData = devDoc.data() + if (!devData.videoTranscriptionId) { + skipped++ + continue + } + + const targetDoc = await db.collection("events").doc(devDoc.id).get() + const targetData = targetDoc.exists ? targetDoc.data() : null + + if (!targetData) { + skipped++ + continue + } + + // Only migrate if hearing in target environment does not have a transcription yet + if (!targetData?.videoTranscriptionId) { + const transcriptionId = devData.videoTranscriptionId + const devTranscriptionDoc = await devDb + .collection("transcriptions") + .doc(transcriptionId) + .get() + + const devTranscriptionData = devTranscriptionDoc.exists + ? devTranscriptionDoc.data() + : null + + if (devTranscriptionData) { + // Create transcription in target project instead of setting, in case it already exists, which will throw an error + try { + bulkWriter.create( + db.collection("transcriptions").doc(transcriptionId), + devTranscriptionData + ) + } catch (err) { + failed++ + console.error( + `Error creating transcription ${transcriptionId}:`, + err + ) + continue + } + } else { + failed++ + console.error( + `Transcription ${transcriptionId} not found in dev project.` + ) + continue + } + + bulkWriter.update(db.collection("events").doc(devDoc.id), { + videoURL: devData.videoURL, + videoFetchedAt: devData.videoFetchedAt, + videoTranscriptionId: devData.videoTranscriptionId + }) + migrated++ + } else { + skipped++ + } + } + + await bulkWriter.close() + console.log( + `Migration complete. Migrated: ${migrated}, Skipped: ${skipped}, Failed: ${failed}` + ) + } +}