From 0115f21dc0471c6ba5b9e3835550f871a7cdc624 Mon Sep 17 00:00:00 2001 From: Gregor Martynus <39992+gr2m@users.noreply.github.com> Date: Wed, 1 Jul 2026 22:42:18 -0700 Subject: [PATCH] Backport: feat(assemblyai): support universal-3-5-pro and expand the transcription provider (#16548) Backport of #16548 to release-v6.0. Adapted to the v6 provider spec: the transcription model targets TranscriptionModelV3 (not V4), so the deprecation/nudge warnings are emitted as `type: 'other'` warnings (SharedV3Warning has no `deprecated` variant), and the workflow serialize/deserialize hooks from the original PR are omitted (not available in v6). The provider options schema remains inline in the model file, matching v6's layout. Co-Authored-By: Claude Opus 4.8 (1M context) --- .changeset/assemblyai-provider-update.md | 26 ++ .../docs/03-ai-sdk-core/36-transcription.mdx | 4 +- .../01-ai-sdk-providers/100-assemblyai.mdx | 143 ++++++- .../src/transcribe/assemblyai/basic.ts | 2 +- .../src/transcribe/assemblyai/string.ts | 2 +- .../src/transcribe/assemblyai/url.ts | 2 +- packages/assemblyai/README.md | 2 +- .../assemblyai/src/assemblyai-api-types.ts | 85 ++++- .../assemblyai/src/assemblyai-provider.ts | 2 +- .../assemblyai-transcription-model.test.ts | 359 +++++++++++++++++- .../src/assemblyai-transcription-model.ts | 345 ++++++++++++++++- .../src/assemblyai-transcription-settings.ts | 16 +- 12 files changed, 948 insertions(+), 40 deletions(-) create mode 100644 .changeset/assemblyai-provider-update.md diff --git a/.changeset/assemblyai-provider-update.md b/.changeset/assemblyai-provider-update.md new file mode 100644 index 000000000000..9019c00d1c88 --- /dev/null +++ b/.changeset/assemblyai-provider-update.md @@ -0,0 +1,26 @@ +--- +'@ai-sdk/assemblyai': patch +--- + +feat(assemblyai): support universal-3-5-pro and expand the transcription provider + +- Add current speech models `universal-3-5-pro`, `universal-3-pro`, and + `universal-2`, routed via AssemblyAI's `speech_models` 
parameter (the + deprecated singular `speech_model` is used only for the legacy `best` model). + Using `universal-3-pro`/`universal-2` emits an informational warning + suggesting `universal-3-5-pro`. +- Deprecate the legacy `best` model (still works, warns) and remove `nano`, + which AssemblyAI no longer accepts. +- Surface speaker diarization and audio-intelligence results: `doGenerate` now + returns the full raw response on `response.body` and populates + `providerMetadata.assemblyai` with `utterances`, `entities`, + `sentimentAnalysisResults`, `contentSafetyLabels`, `iabCategoriesResult`, and + `autoHighlightsResult`. +- Add provider options for newer request parameters: `prompt`, `keytermsPrompt`, + `temperature`, `removeAudioTags`, `domain`, `speakerOptions`, + `languageDetectionOptions`, `redactPiiAudioOptions`, + `redactPiiReturnUnredacted`, and `redactStaticEntities`. Deprecate + `wordBoost`/`boostParam` in favor of `keytermsPrompt` (AssemblyAI rejects + `word_boost` on the newer models). +- Fix transcription segment timings, which were reported in milliseconds instead + of seconds. 
diff --git a/content/docs/03-ai-sdk-core/36-transcription.mdx b/content/docs/03-ai-sdk-core/36-transcription.mdx index 143b04bdb002..839b89d53d75 100644 --- a/content/docs/03-ai-sdk-core/36-transcription.mdx +++ b/content/docs/03-ai-sdk-core/36-transcription.mdx @@ -219,8 +219,8 @@ try { | [Deepgram](/providers/ai-sdk-providers/deepgram#transcription-models) | `nova-2` (+ variants) | | [Deepgram](/providers/ai-sdk-providers/deepgram#transcription-models) | `nova-3` (+ variants) | | [Gladia](/providers/ai-sdk-providers/gladia#transcription-models) | `default` | -| [AssemblyAI](/providers/ai-sdk-providers/assemblyai#transcription-models) | `best` | -| [AssemblyAI](/providers/ai-sdk-providers/assemblyai#transcription-models) | `nano` | +| [AssemblyAI](/providers/ai-sdk-providers/assemblyai#transcription-models) | `universal-3-5-pro` | +| [AssemblyAI](/providers/ai-sdk-providers/assemblyai#transcription-models) | `universal-3-pro` | | [Fal](/providers/ai-sdk-providers/fal#transcription-models) | `whisper` | | [Fal](/providers/ai-sdk-providers/fal#transcription-models) | `wizper` | | [Google Vertex](/providers/ai-sdk-providers/google-vertex#transcription-models) | `chirp_2` | diff --git a/content/providers/01-ai-sdk-providers/100-assemblyai.mdx b/content/providers/01-ai-sdk-providers/100-assemblyai.mdx index 27e3c651604d..d90869a97c4a 100644 --- a/content/providers/01-ai-sdk-providers/100-assemblyai.mdx +++ b/content/providers/01-ai-sdk-providers/100-assemblyai.mdx @@ -5,7 +5,7 @@ description: Learn how to use the AssemblyAI provider for the AI SDK. # AssemblyAI Provider -The [AssemblyAI](https://assemblyai.com/) provider contains language model support for the AssemblyAI transcription API. +The [AssemblyAI](https://assemblyai.com/) provider contains transcription model support for the AssemblyAI transcription API. 
## Setup @@ -69,12 +69,23 @@ You can use the following optional settings to customize the AssemblyAI provider You can create models that call the [AssemblyAI transcription API](https://www.assemblyai.com/docs/getting-started/transcribe-an-audio-file/typescript) using the `.transcription()` factory method. -The first argument is the model id e.g. `best`. +The first argument is the model id, e.g. `universal-3-5-pro`. ```ts -const model = assemblyai.transcription('best'); +const model = assemblyai.transcription('universal-3-5-pro'); ``` +The `best` model is a **deprecated** legacy model, sent using AssemblyAI's +deprecated `speech_model` request parameter. It still works, but using it emits a +deprecation warning — prefer `universal-3-5-pro`. The older `nano` model has been +removed by AssemblyAI and is no longer available. All newer models are sent using +the [`speech_models`](https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model) +request parameter, which the provider selects automatically based on the model id. + +`universal-3-pro` and `universal-2` are fully supported and continue to work, but +using them emits an informational warning (not a deprecation) suggesting +`universal-3-5-pro`, AssemblyAI's latest flagship model. + You can also pass additional provider-specific options using the `providerOptions` argument. For example, supplying the `contentSafety` option will enable content safety filtering. ```ts highlight="7" @@ -84,7 +95,7 @@ import { type AssemblyAITranscriptionModelOptions } from '@ai-sdk/assemblyai'; import { readFile } from 'fs/promises'; const result = await transcribe({ - model: assemblyai.transcription('best'), + model: assemblyai.transcription('universal-3-5-pro'), audio: await readFile('audio.mp3'), providerOptions: { assemblyai: { @@ -118,8 +129,9 @@ The following provider options are available: - **boostParam** _enum_ - Boost parameter for the transcription. + Boost parameter for `wordBoost`. 
Allowed values: `'low'`, `'default'`, `'high'`. + **Deprecated** — only applies to the deprecated `wordBoost`; use `keytermsPrompt` instead. Optional. - **contentSafety** _boolean_ @@ -143,6 +155,12 @@ The following provider options are available: Whether to include disfluencies (um, uh, etc.) in the transcription. Optional. +- **domain** _string_ + + Enable a domain-specific model for specialized terminology. + Currently supports `'medical-v1'` (Medical Mode). + Optional. + - **entityDetection** _boolean_ Whether to detect entities in the transcription. @@ -163,6 +181,13 @@ The following provider options are available: Whether to include IAB categories in the transcription. Optional. +- **keytermsPrompt** _array of strings_ + + Domain-specific keyterms to boost recognition for (max 6 words per phrase). + Replaces `wordBoost` for newer models — supported by `universal-3-pro`, + `universal-3-5-pro`, and `slam-1` (and `universal-2` when enabled). + Optional. + - **languageCode** _string_ Language code for the audio. @@ -179,11 +204,24 @@ The following provider options are available: Whether to enable language detection. Optional. +- **languageDetectionOptions** _object_ + + Options for automatic language detection: `expectedLanguages` (array of + strings), `fallbackLanguage` (string), `codeSwitching` (boolean), + `codeSwitchingConfidenceThreshold` (number, 0-1). + Optional. + - **multichannel** _boolean_ Whether to process multiple audio channels separately. Optional. +- **prompt** _string_ + + Natural-language context (up to 1,500 words) to steer the model. + Only supported by `universal-3-pro`, `universal-3-5-pro`, and `slam-1`. + Optional. + - **punctuate** _boolean_ Whether to add punctuation to the transcription. @@ -199,6 +237,12 @@ The following provider options are available: Whether to redact PII in the audio file. Optional. 
+- **redactPiiAudioOptions** _object_ + + Options for PII-redacted audio: `returnRedactedNoSpeechAudio` (boolean), + `overrideAudioRedactionMethod` (`'silence'`). Requires `redactPiiAudio`. + Optional. + - **redactPiiAudioQuality** _enum_ Quality of the redacted audio file. @@ -211,12 +255,32 @@ The following provider options are available: Supports numerous types like `'person_name'`, `'phone_number'`, etc. Optional. +- **redactPiiReturnUnredacted** _boolean_ + + Return the original unredacted transcript alongside the redacted one. + Requires `redactPii`. + Optional. + - **redactPiiSub** _enum_ Substitution method for redacted PII. Allowed values: `'entity_name'`, `'hash'`. Optional. +- **redactStaticEntities** _object_ + + Map of user-defined labels to exact terms to redact, e.g. + `{ INTERNAL_TOOL: ['Bearclaw'] }`. Applied on top of standard PII redaction. + Requires `redactPii`. + Optional. + +- **removeAudioTags** _enum_ + + Remove inline annotations from rich transcripts. + Allowed values: `'all'` (all annotations), `'speaker'` (speaker cues only). + Applies to Universal-3 Pro models. + Optional. + - **sentimentAnalysis** _boolean_ Whether to perform sentiment analysis on the transcription. @@ -227,6 +291,12 @@ The following provider options are available: Whether to label different speakers in the transcription. Optional. +- **speakerOptions** _object_ + + Options for speaker diarization: `minSpeakersExpected` (number), + `maxSpeakersExpected` (number). + Optional. + - **speakersExpected** _number_ Expected number of speakers in the audio. @@ -254,6 +324,12 @@ The following provider options are available: Allowed values: `'bullets'`, `'bullets_verbose'`, `'gist'`, `'headline'`, `'paragraph'`. Optional. +- **temperature** _number_ + + Sampling temperature (0-1) controlling randomness. + Applies to Universal-3 Pro models. + Optional. + - **webhookAuthHeaderName** _string_ Name of the authentication header for webhook requests. 
@@ -272,11 +348,60 @@ The following provider options are available: - **wordBoost** _array of strings_ List of words to boost in the transcription. + **Deprecated** — rejected by `universal-3-pro`, `universal-3-5-pro`, and `slam-1` + (works only on `universal-2`/`best`); use `keytermsPrompt` instead. Optional. +### Speaker diarization and audio-intelligence results + +The AI SDK's `transcribe` result exposes `text`, `segments`, `language`, and +`durationInSeconds`. AssemblyAI's richer results — speaker diarization and +audio-intelligence features — don't fit that shape, so they are surfaced in two +places: + +- **`providerMetadata.assemblyai`** — structured results for the features you + enabled: `utterances` (speaker-diarized segments, when `speakerLabels` is set), + `entities`, `sentimentAnalysisResults`, `contentSafetyLabels`, + `iabCategoriesResult`, and `autoHighlightsResult`. +- **`response.body`** — the complete, raw AssemblyAI transcript response, so any + field not surfaced above (e.g. `chapters`, word-level `speaker` labels) is + still available. + +Note: timings inside `providerMetadata` and `response.body` (e.g. +`utterances[].start`) are in **milliseconds**, matching the AssemblyAI API — +whereas the top-level `segments` use **seconds**. + +```ts +import { experimental_transcribe as transcribe } from 'ai'; +import { assemblyai } from '@ai-sdk/assemblyai'; +import { readFile } from 'fs/promises'; + +const result = await transcribe({ + model: assemblyai.transcription('universal-3-5-pro'), + audio: await readFile('audio.mp3'), + providerOptions: { + assemblyai: { + speakerLabels: true, + entityDetection: true, + }, + }, +}); + +const { utterances, entities } = result.providerMetadata?.assemblyai ?? 
{}; +// utterances: [{ speaker: 'A', text: '…', start, end, … }, …] (start/end in ms) +``` + +The following AssemblyAI features are **deprecated** by the API and not surfaced +in `providerMetadata` (their output remains on the raw `response.body` if +enabled): Summarization, Auto Chapters, and Custom Topics. Note also that some +features are language-gated (e.g. sentiment analysis is English-centric); see +AssemblyAI's documentation for per-language availability. + ### Model Capabilities -| Model | Transcription | Duration | Segments | Language | -| ------ | ------------------- | ------------------- | ------------------- | ------------------- | -| `best` | | | | | -| `nano` | | | | | +| Model | Transcription | Duration | Segments | Language | +| ------------------- | ------------------- | ------------------- | ------------------- | ------------------- | +| `universal-3-5-pro` | | | | | +| `universal-3-pro` | | | | | +| `universal-2` | | | | | +| `best` | | | | | diff --git a/examples/ai-functions/src/transcribe/assemblyai/basic.ts b/examples/ai-functions/src/transcribe/assemblyai/basic.ts index 0c7ec77faf70..2f4abafce37e 100644 --- a/examples/ai-functions/src/transcribe/assemblyai/basic.ts +++ b/examples/ai-functions/src/transcribe/assemblyai/basic.ts @@ -5,7 +5,7 @@ import { run } from '../../lib/run'; run(async () => { const result = await transcribe({ - model: assemblyai.transcription('best'), + model: assemblyai.transcription('universal-3-5-pro'), audio: await readFile('data/galileo.mp3'), }); diff --git a/examples/ai-functions/src/transcribe/assemblyai/string.ts b/examples/ai-functions/src/transcribe/assemblyai/string.ts index 664091306b78..a77334febd36 100644 --- a/examples/ai-functions/src/transcribe/assemblyai/string.ts +++ b/examples/ai-functions/src/transcribe/assemblyai/string.ts @@ -5,7 +5,7 @@ import { run } from '../../lib/run'; run(async () => { const result = await transcribe({ - model: assemblyai.transcription('best'), + model: 
assemblyai.transcription('universal-3-5-pro'), audio: Buffer.from(await readFile('./data/galileo.mp3')).toString('base64'), }); diff --git a/examples/ai-functions/src/transcribe/assemblyai/url.ts b/examples/ai-functions/src/transcribe/assemblyai/url.ts index 6eab50c11acf..d675470c9869 100644 --- a/examples/ai-functions/src/transcribe/assemblyai/url.ts +++ b/examples/ai-functions/src/transcribe/assemblyai/url.ts @@ -4,7 +4,7 @@ import { run } from '../../lib/run'; run(async () => { const result = await transcribe({ - model: assemblyai.transcription('best'), + model: assemblyai.transcription('universal-3-5-pro'), audio: new URL( 'https://github.com/vercel/ai/raw/refs/heads/main/examples/ai-functions/data/galileo.mp3', ), diff --git a/packages/assemblyai/README.md b/packages/assemblyai/README.md index 81df1dbb2313..9c387e5394e1 100644 --- a/packages/assemblyai/README.md +++ b/packages/assemblyai/README.md @@ -36,7 +36,7 @@ import { assemblyai } from '@ai-sdk/assemblyai'; import { experimental_transcribe as transcribe } from 'ai'; const { text } = await transcribe({ - model: assemblyai.transcription('best'), + model: assemblyai.transcription('universal-3-5-pro'), audio: new URL( 'https://github.com/vercel/ai/raw/refs/heads/main/examples/ai-functions/data/galileo.mp3', ), diff --git a/packages/assemblyai/src/assemblyai-api-types.ts b/packages/assemblyai/src/assemblyai-api-types.ts index 22fd08b0f30c..17d690374cc8 100644 --- a/packages/assemblyai/src/assemblyai-api-types.ts +++ b/packages/assemblyai/src/assemblyai-api-types.ts @@ -28,6 +28,7 @@ export type AssemblyAITranscriptionAPITypes = { /** * How much to boost specified words + * @deprecated Only used with the deprecated `word_boost`. Use `keyterms_prompt`. */ boost_param?: 'low' | 'default' | 'high'; @@ -207,6 +208,16 @@ export type AssemblyAITranscriptionAPITypes = { */ language_detection?: boolean; + /** + * Options for automatic language detection. 
+ */ + language_detection_options?: { + expected_languages?: string[]; + fallback_language?: string; + code_switching?: boolean; + code_switching_confidence_threshold?: number; + }; + /** * Enable Multichannel transcription, can be true or false. * @default false @@ -231,6 +242,14 @@ export type AssemblyAITranscriptionAPITypes = { */ redact_pii_audio?: boolean; + /** + * Options for PII-redacted audio files. Requires redact_pii_audio. + */ + redact_pii_audio_options?: { + return_redacted_no_speech_audio?: boolean; + override_audio_redaction_method?: 'silence'; + }; + /** * Controls the filetype of the audio created by redact_pii_audio. Currently supports mp3 (default) and wav. */ @@ -286,11 +305,23 @@ export type AssemblyAITranscriptionAPITypes = { | 'zodiac_sign' >; + /** + * Return the original unredacted transcript alongside the redacted one. + * Requires redact_pii. + */ + redact_pii_return_unredacted?: boolean; + /** * The replacement logic for detected PII, can be "entity_name" or "hash". */ redact_pii_sub?: 'entity_name' | 'hash'; + /** + * Map of user-defined labels to exact terms to redact, applied on top of + * standard PII redaction. Requires redact_pii. + */ + redact_static_entities?: Record<string, string[]>; + /** * Enable Sentiment Analysis, can be true or false * @default false @@ -303,6 +334,14 @@ export type AssemblyAITranscriptionAPITypes = { */ speaker_labels?: boolean; + /** + * Options for speaker diarization, e.g. a range of possible speakers. + */ + speaker_options?: { + min_speakers_expected?: number; + max_speakers_expected?: number; + }; + /** * Tells the speaker label model how many speakers it should attempt to identify, up to 10. */ @@ -310,8 +349,21 @@ export type AssemblyAITranscriptionAPITypes = { /** * The speech model to use for the transcription. + * + * @deprecated This parameter has been replaced with `speech_models`. It only + * supports the legacy `best` model. Use `speech_models` for `universal-2`, + * `universal-3-pro`, `universal-3-5-pro`, etc. 
+ * @see https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model */ - speech_model?: 'best' | 'nano'; + speech_model?: 'best'; + + /** + * List of speech models in priority order, allowing the system to + * automatically route the audio to the best available option. When omitted, + * the API defaults to `['universal-3-pro', 'universal-2']`. + * @see https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model + */ + speech_models?: string[]; /** * Reject audio files that contain less than this fraction of speech. Valid values are in the range [0, 1] inclusive. @@ -357,6 +409,37 @@ export type AssemblyAITranscriptionAPITypes = { /** * The list of custom vocabulary to boost transcription probability for + * @deprecated Rejected by `universal-3-pro` / `universal-3-5-pro` and + * `slam-1` (works only on `universal-2`/`best`). Use `keyterms_prompt`. */ word_boost?: string[]; + + /** + * Domain-specific keyterms to boost (max 6 words per phrase). Replaces + * `word_boost` for `universal-3-pro` / `universal-3-5-pro` and `slam-1`. + */ + keyterms_prompt?: string[]; + + /** + * Natural-language context (up to 1,500 words) to steer the model. + * Only supported by `universal-3-pro` / `universal-3-5-pro` and `slam-1`. + */ + prompt?: string; + + /** + * Sampling temperature (0-1) controlling randomness. Universal-3 Pro models. + */ + temperature?: number; + + /** + * Remove inline annotations from rich transcripts: `'all'` removes all + * annotations, `'speaker'` removes only speaker cues. Universal-3 Pro models. + */ + remove_audio_tags?: 'all' | 'speaker'; + + /** + * Enable a domain-specific model to improve accuracy for specialized + * terminology, e.g. `'medical-v1'` for Medical Mode. 
+ */ + domain?: string; }; diff --git a/packages/assemblyai/src/assemblyai-provider.ts b/packages/assemblyai/src/assemblyai-provider.ts index 9522b55ecc50..f938e1aa7821 100644 --- a/packages/assemblyai/src/assemblyai-provider.ts +++ b/packages/assemblyai/src/assemblyai-provider.ts @@ -14,7 +14,7 @@ import { VERSION } from './version'; export interface AssemblyAIProvider extends ProviderV3 { ( - modelId: 'best', + modelId: AssemblyAITranscriptionModelId, settings?: {}, ): { transcription: AssemblyAITranscriptionModel; diff --git a/packages/assemblyai/src/assemblyai-transcription-model.test.ts b/packages/assemblyai/src/assemblyai-transcription-model.test.ts index 2147a6c05b66..048e32b9df0f 100644 --- a/packages/assemblyai/src/assemblyai-transcription-model.test.ts +++ b/packages/assemblyai/src/assemblyai-transcription-model.test.ts @@ -233,6 +233,16 @@ describe('doGenerate', () => { summary_model: 'informative', summary: '- Hello, world!', sentiment_analysis: true, + sentiment_analysis_results: [ + { + text: 'Hello, world!', + start: 250, + end: 26950, + sentiment: 'POSITIVE', + confidence: 0.9, + speaker: 'A', + }, + ], entity_detection: true, entities: [ { @@ -256,18 +266,361 @@ describe('doGenerate', () => { }; } - it('should pass the model', async () => { + it('should pass the legacy model via the speech_model parameter', async () => { prepareJsonResponse(); - await model.doGenerate({ + const result = await model.doGenerate({ audio: audioData, mediaType: 'audio/wav', }); - expect(await server.calls[1].requestBodyJson).toMatchObject({ + const requestBody = await server.calls[1].requestBodyJson; + expect(requestBody).toMatchObject({ audio_url: 'https://storage.assemblyai.com/mock-upload-url', speech_model: 'best', }); + expect(requestBody.speech_models).toBeUndefined(); + + const [deprecation] = result.warnings.filter( + warning => warning.type === 'other', + ); + expect(deprecation?.message).toContain('universal-3-5-pro'); + expect(deprecation?.message).toContain( 
+ 'https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model', + ); + }); + + it('should pass newer models via the speech_models parameter', async () => { + prepareJsonResponse(); + + const result = await provider + .transcription('universal-3-5-pro') + .doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + }); + + const requestBody = await server.calls[1].requestBodyJson; + expect(requestBody).toMatchObject({ + audio_url: 'https://storage.assemblyai.com/mock-upload-url', + speech_models: ['universal-3-5-pro'], + }); + expect(requestBody.speech_model).toBeUndefined(); + + // No deprecation and no nudge for the latest flagship model. + expect(result.warnings).toEqual([]); + }); + + it('should route universal-3-pro via speech_models and nudge to universal-3-5-pro', async () => { + prepareJsonResponse(); + + const result = await provider.transcription('universal-3-pro').doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + }); + + const requestBody = await server.calls[1].requestBodyJson; + expect(requestBody.speech_models).toEqual(['universal-3-pro']); + expect(requestBody.speech_model).toBeUndefined(); + + // universal-3-pro is the model universal-3-5-pro replaces, so the message + // names it explicitly. + const [nudge] = result.warnings.filter(w => w.type === 'other'); + expect(nudge?.message).toContain('universal-3-5-pro'); + expect(nudge?.message).toContain("replace 'universal-3-pro'"); + }); + + it('should nudge universal-2 users toward universal-3-5-pro', async () => { + prepareJsonResponse(); + + const result = await provider.transcription('universal-2').doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + }); + + // The nudge for universal-2 must not claim it is replaced by universal-3-pro. 
+ const [nudge] = result.warnings.filter(w => w.type === 'other'); + expect(nudge?.message).toContain('universal-3-5-pro'); + expect(nudge?.message).not.toContain("replace 'universal-3-pro'"); + }); + + it('should not special-case the removed nano model', async () => { + prepareJsonResponse(); + + const result = await provider.transcription('nano').doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + }); + + // `nano` is no longer a legacy `speech_model` alias: it falls through to + // `speech_models` (where the live API rejects it) and emits no warning. + const requestBody = await server.calls[1].requestBodyJson; + expect(requestBody.speech_models).toEqual(['nano']); + expect(requestBody.speech_model).toBeUndefined(); + expect(result.warnings).toEqual([]); + }); + + it('should still send provider options alongside speech_models', async () => { + prepareJsonResponse(); + + await provider.transcription('universal-3-5-pro').doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + providerOptions: { + assemblyai: { + languageDetection: true, + punctuate: false, + }, + }, + }); + + const requestBody = await server.calls[1].requestBodyJson; + expect(requestBody).toMatchObject({ + speech_models: ['universal-3-5-pro'], + language_detection: true, + punctuate: false, + }); + }); + + it('should surface diarization + audio-intelligence via providerMetadata', async () => { + prepareJsonResponse(); + + const result = await provider + .transcription('universal-3-5-pro') + .doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + }); + + const metadata = result.providerMetadata?.assemblyai as + | Record + | undefined; + expect(metadata).toBeDefined(); + + // Speaker diarization + expect(metadata?.utterances?.[0]).toMatchObject({ + speaker: 'A', + text: 'Hello, world!', + }); + + // Audio-intelligence results + expect(metadata?.entities?.[0]).toMatchObject({ + entity_type: 'location', + text: 'Canada', + }); + 
expect(metadata?.sentimentAnalysisResults?.[0]).toMatchObject({ + sentiment: 'POSITIVE', + text: 'Hello, world!', + }); + expect(metadata?.contentSafetyLabels).toBeDefined(); + expect(metadata?.iabCategoriesResult).toBeDefined(); + expect(metadata?.autoHighlightsResult).toBeDefined(); + }); + + it('should preserve the full raw response on response.body', async () => { + prepareJsonResponse(); + + const result = await provider + .transcription('universal-3-5-pro') + .doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + }); + + const body = result.response.body as Record; + // Word-level speaker label survives on the raw body. + expect(body.words[0].speaker).toBe('speaker'); + // Fields not modeled in our schema (e.g. chapters, summary) are no longer + // stripped — proves response.body is the raw response, not the parsed one. + expect(body.chapters).toBeDefined(); + expect(body.summary).toBe('- Hello, world!'); + }); + + it('should pass the Universal-3-Pro input params', async () => { + prepareJsonResponse(); + + await provider.transcription('universal-3-5-pro').doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + providerOptions: { + assemblyai: { + prompt: 'This is a conversation about the AI SDK.', + keytermsPrompt: ['Vercel', 'AI SDK'], + temperature: 0.2, + removeAudioTags: 'speaker', + domain: 'medical-v1', + }, + }, + }); + + const requestBody = await server.calls[1].requestBodyJson; + expect(requestBody).toMatchObject({ + speech_models: ['universal-3-5-pro'], + prompt: 'This is a conversation about the AI SDK.', + keyterms_prompt: ['Vercel', 'AI SDK'], + temperature: 0.2, + remove_audio_tags: 'speaker', + domain: 'medical-v1', + }); + }); + + it('should pass the GA nested input params', async () => { + prepareJsonResponse(); + + await provider.transcription('universal-3-5-pro').doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + providerOptions: { + assemblyai: { + redactPii: true, + speakerOptions: { minSpeakersExpected: 1, 
maxSpeakersExpected: 3 }, + languageDetectionOptions: { + expectedLanguages: ['en', 'es'], + fallbackLanguage: 'en', + codeSwitching: true, + codeSwitchingConfidenceThreshold: 0.5, + }, + redactPiiAudioOptions: { + returnRedactedNoSpeechAudio: true, + overrideAudioRedactionMethod: 'silence', + }, + redactPiiReturnUnredacted: true, + redactStaticEntities: { INTERNAL_TOOL: ['Bearclaw'] }, + }, + }, + }); + + const requestBody = await server.calls[1].requestBodyJson; + expect(requestBody).toMatchObject({ + speaker_options: { min_speakers_expected: 1, max_speakers_expected: 3 }, + language_detection_options: { + expected_languages: ['en', 'es'], + fallback_language: 'en', + code_switching: true, + code_switching_confidence_threshold: 0.5, + }, + redact_pii_audio_options: { + return_redacted_no_speech_audio: true, + override_audio_redaction_method: 'silence', + }, + redact_pii_return_unredacted: true, + redact_static_entities: { INTERNAL_TOOL: ['Bearclaw'] }, + }); + }); + + it('should warn when deprecated wordBoost/boostParam options are used', async () => { + prepareJsonResponse(); + + const result = await provider + .transcription('universal-3-5-pro') + .doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + providerOptions: { + assemblyai: { wordBoost: ['Vercel'], boostParam: 'high' }, + }, + }); + + const [boostWarning] = result.warnings.filter(w => w.type === 'other'); + expect(boostWarning?.message).toContain('wordBoost'); + expect(boostWarning?.message).toContain('boostParam'); + expect(boostWarning?.message).toContain('keytermsPrompt'); + }); + + it('should attribute the deprecation warning to boostParam when only boostParam is set', async () => { + prepareJsonResponse(); + + const result = await provider + .transcription('universal-3-5-pro') + .doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + providerOptions: { assemblyai: { boostParam: 'high' } }, + }); + + const [boostWarning] = result.warnings.filter(w => w.type === 'other'); + 
expect(boostWarning?.message).toContain('boostParam'); + expect(boostWarning?.message).not.toContain('wordBoost'); + expect(boostWarning?.message).toContain('keytermsPrompt'); + }); + + it('should warn when redactPii-dependent options are set without redactPii', async () => { + prepareJsonResponse(); + + const result = await provider + .transcription('universal-3-5-pro') + .doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + providerOptions: { + assemblyai: { redactStaticEntities: { TOOL: ['Vercel'] } }, + }, + }); + + expect( + result.warnings.some( + w => w.type === 'other' && w.message.includes('redactPii'), + ), + ).toBe(true); + }); + + it('should warn when redactPiiAudioOptions is set without redactPiiAudio', async () => { + prepareJsonResponse(); + + const result = await provider + .transcription('universal-3-5-pro') + .doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + providerOptions: { + assemblyai: { + redactPii: true, + redactPiiAudioOptions: { overrideAudioRedactionMethod: 'silence' }, + }, + }, + }); + + expect( + result.warnings.some( + w => w.type === 'other' && w.message.includes('redactPiiAudio'), + ), + ).toBe(true); + }); + + it('should warn when languageCode and languageDetection are combined', async () => { + prepareJsonResponse(); + + const result = await provider + .transcription('universal-3-5-pro') + .doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + providerOptions: { + assemblyai: { languageCode: 'en', languageDetection: true }, + }, + }); + + expect( + result.warnings.some( + w => w.type === 'other' && w.message.includes('languageDetection'), + ), + ).toBe(true); + }); + + it('should report segment timings in seconds (ms converted)', async () => { + prepareJsonResponse(); + + const result = await model.doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + }); + + // Fixture word[0] is start: 250ms, end: 650ms → 0.25s / 0.65s. 
+ expect(result.segments[0]).toEqual({ + text: 'Hello,', + startSecond: 0.25, + endSecond: 0.65, + }); }); it('should pass headers', async () => { diff --git a/packages/assemblyai/src/assemblyai-transcription-model.ts b/packages/assemblyai/src/assemblyai-transcription-model.ts index 3d39255e32a4..1326186384fe 100644 --- a/packages/assemblyai/src/assemblyai-transcription-model.ts +++ b/packages/assemblyai/src/assemblyai-transcription-model.ts @@ -1,4 +1,8 @@ -import type { TranscriptionModelV3, SharedV3Warning } from '@ai-sdk/provider'; +import type { + TranscriptionModelV3, + SharedV3Warning, + SharedV3ProviderMetadata, +} from '@ai-sdk/provider'; import { combineHeaders, createJsonResponseHandler, @@ -32,8 +36,12 @@ const assemblyaiTranscriptionModelOptionsSchema = z.object({ */ autoHighlights: z.boolean().nullish(), /** - * Boost parameter for the transcription. + * Boost parameter for word boost (used with `wordBoost`). * Allowed values: 'low', 'default', 'high'. + * + * @deprecated Only applies to the deprecated `wordBoost` option. Use + * `keytermsPrompt` instead, which works with the recommended `universal-*` + * models. */ boostParam: z.string().nullish(), /** @@ -59,6 +67,11 @@ const assemblyaiTranscriptionModelOptionsSchema = z.object({ * Whether to include filler words (um, uh, etc.) in the transcription. */ disfluencies: z.boolean().nullish(), + /** + * Enable a domain-specific model to improve accuracy for specialized + * terminology. Currently supports `'medical-v1'` (Medical Mode). + */ + domain: z.string().nullish(), /** * Whether to enable entity detection. */ @@ -75,6 +88,13 @@ const assemblyaiTranscriptionModelOptionsSchema = z.object({ * Whether to enable IAB categories detection. */ iabCategories: z.boolean().nullish(), + /** + * Domain-specific keyterms to boost recognition for (max 6 words per phrase). 
+ * Replaces `wordBoost` for newer models: supported by `universal-3-pro` / + * `universal-3-5-pro` and `slam-1` (and `universal-2` when metaphone is + * enabled for the account). + */ + keytermsPrompt: z.array(z.string()).nullish(), /** * Language code for the transcription. */ @@ -87,10 +107,30 @@ const assemblyaiTranscriptionModelOptionsSchema = z.object({ * Whether to enable language detection. */ languageDetection: z.boolean().nullish(), + /** + * Options for automatic language detection. + */ + languageDetectionOptions: z + .object({ + /** List of languages expected in the audio file. */ + expectedLanguages: z.array(z.string()).nullish(), + /** Fallback language if the detected language is not expected. */ + fallbackLanguage: z.string().nullish(), + /** Whether code switching should be detected. */ + codeSwitching: z.boolean().nullish(), + /** Confidence threshold for code switching detection (0-1). */ + codeSwitchingConfidenceThreshold: z.number().min(0).max(1).nullish(), + }) + .nullish(), /** * Whether to process audio as multichannel. */ multichannel: z.boolean().nullish(), + /** + * Provide natural-language context (up to 1,500 words) to steer the model. + * Only supported by `universal-3-pro` / `universal-3-5-pro` and `slam-1`. + */ + prompt: z.string().nullish(), /** * Whether to add punctuation to the transcription. */ @@ -103,6 +143,17 @@ const assemblyaiTranscriptionModelOptionsSchema = z.object({ * Whether to redact PII in the audio file. */ redactPiiAudio: z.boolean().nullish(), + /** + * Options for PII-redacted audio files. Requires `redactPiiAudio`. + */ + redactPiiAudioOptions: z + .object({ + /** Return redacted audio even for files without detected speech. */ + returnRedactedNoSpeechAudio: z.boolean().nullish(), + /** Redaction method; set to `'silence'` to replace PII with silence. */ + overrideAudioRedactionMethod: z.enum(['silence']).nullish(), + }) + .nullish(), /** * Audio format for PII redaction. 
*/ @@ -111,10 +162,26 @@ const assemblyaiTranscriptionModelOptionsSchema = z.object({ * List of PII types to redact. */ redactPiiPolicies: z.array(z.string()).nullish(), + /** + * Return the original unredacted transcript alongside the redacted one. + * Requires `redactPii`. + */ + redactPiiReturnUnredacted: z.boolean().nullish(), /** * Substitution method for redacted PII. */ redactPiiSub: z.string().nullish(), + /** + * Map of user-defined labels to exact terms to redact, e.g. + * `{ INTERNAL_TOOL: ['Bearclaw'] }`. Applied on top of standard PII redaction + * using `redactPiiSub`. Requires `redactPii`. + */ + redactStaticEntities: z.record(z.string(), z.array(z.string())).nullish(), + /** + * Remove inline annotations from rich transcripts. `'all'` removes all inline + * annotations; `'speaker'` removes only speaker cues. Universal-3 Pro models. + */ + removeAudioTags: z.enum(['all', 'speaker']).nullish(), /** * Whether to enable sentiment analysis. */ @@ -123,6 +190,17 @@ const assemblyaiTranscriptionModelOptionsSchema = z.object({ * Whether to identify different speakers in the audio. */ speakerLabels: z.boolean().nullish(), + /** + * Options for speaker diarization, e.g. a range of possible speakers. + */ + speakerOptions: z + .object({ + /** Minimum number of speakers expected in the audio file. */ + minSpeakersExpected: z.number().int().nullish(), + /** Maximum number of speakers expected in the audio file. */ + maxSpeakersExpected: z.number().int().nullish(), + }) + .nullish(), /** * Number of speakers expected in the audio. */ @@ -143,6 +221,10 @@ const assemblyaiTranscriptionModelOptionsSchema = z.object({ * Type of summary to generate. */ summaryType: z.string().nullish(), + /** + * Sampling temperature (0-1) controlling randomness. Universal-3 Pro models. + */ + temperature: z.number().min(0).max(1).nullish(), /** * Name of the authentication header for webhook requests. 
*/ @@ -157,6 +239,10 @@ const assemblyaiTranscriptionModelOptionsSchema = z.object({ webhookUrl: z.string().nullish(), /** * List of words to boost recognition for. + * + * @deprecated `wordBoost` is rejected by `universal-3-pro` / + * `universal-3-5-pro` and `slam-1` (it only works on `universal-2`/`best`). + * Use `keytermsPrompt` instead. */ wordBoost: z.array(z.string()).nullish(), }); @@ -200,9 +286,41 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV3 { schema: assemblyaiTranscriptionModelOptionsSchema, }); - const body: Omit<AssemblyAITranscriptionAPITypes, 'audio_url'> = { - speech_model: this.modelId, - }; + const body: Omit<AssemblyAITranscriptionAPITypes, 'audio_url'> = {}; + + // The legacy `best` model is selected via the deprecated singular + // `speech_model` parameter. All other models (e.g. `universal-2`, + // `universal-3-pro`, `universal-3-5-pro`) are only accessible via the + // `speech_models` array and are rejected by `speech_model`. + // See https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model + if (this.modelId === 'best') { + body.speech_model = this.modelId as 'best'; + warnings.push({ + type: 'other', + message: + "The 'best' model is a legacy AssemblyAI model. Use 'universal-3-5-pro' instead. See documentation: https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model", + }); + } else { + body.speech_models = [this.modelId]; + + // Forward-looking nudge: universal-3-5-pro is AssemblyAI's latest + // flagship and is set to replace universal-3-pro. Not a deprecation — + // both models still work — so this is an informational warning only. + if ( + this.modelId === 'universal-3-pro' || + this.modelId === 'universal-2' + ) { + const docsUrl = + 'https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model'; + warnings.push({ + type: 'other', + message: + this.modelId === 'universal-3-pro' + ? `'universal-3-5-pro' is AssemblyAI's latest flagship model and is set to replace 'universal-3-pro'. 
See ${docsUrl}` + : `'universal-3-5-pro' is AssemblyAI's latest flagship model. See ${docsUrl}`, + }); + } + } // Add provider-specific options if (assemblyaiOptions) { @@ -252,6 +370,103 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV3 { assemblyaiOptions.webhookAuthHeaderValue ?? undefined; body.webhook_url = assemblyaiOptions.webhookUrl ?? undefined; body.word_boost = assemblyaiOptions.wordBoost ?? undefined; + body.keyterms_prompt = assemblyaiOptions.keytermsPrompt ?? undefined; + body.prompt = assemblyaiOptions.prompt ?? undefined; + body.temperature = assemblyaiOptions.temperature ?? undefined; + body.remove_audio_tags = assemblyaiOptions.removeAudioTags ?? undefined; + body.domain = assemblyaiOptions.domain ?? undefined; + body.redact_pii_return_unredacted = + assemblyaiOptions.redactPiiReturnUnredacted ?? undefined; + body.redact_static_entities = + assemblyaiOptions.redactStaticEntities ?? undefined; + + if (assemblyaiOptions.speakerOptions) { + body.speaker_options = { + min_speakers_expected: + assemblyaiOptions.speakerOptions.minSpeakersExpected ?? undefined, + max_speakers_expected: + assemblyaiOptions.speakerOptions.maxSpeakersExpected ?? undefined, + }; + } + + if (assemblyaiOptions.languageDetectionOptions) { + body.language_detection_options = { + expected_languages: + assemblyaiOptions.languageDetectionOptions.expectedLanguages ?? + undefined, + fallback_language: + assemblyaiOptions.languageDetectionOptions.fallbackLanguage ?? + undefined, + code_switching: + assemblyaiOptions.languageDetectionOptions.codeSwitching ?? + undefined, + code_switching_confidence_threshold: + assemblyaiOptions.languageDetectionOptions + .codeSwitchingConfidenceThreshold ?? undefined, + }; + } + + if (assemblyaiOptions.redactPiiAudioOptions) { + body.redact_pii_audio_options = { + return_redacted_no_speech_audio: + assemblyaiOptions.redactPiiAudioOptions + .returnRedactedNoSpeechAudio ?? 
undefined, + override_audio_redaction_method: + assemblyaiOptions.redactPiiAudioOptions + .overrideAudioRedactionMethod ?? undefined, + }; + } + + const deprecatedBoostOptions: string[] = []; + if (assemblyaiOptions.wordBoost != null) { + deprecatedBoostOptions.push('wordBoost'); + } + if (assemblyaiOptions.boostParam != null) { + deprecatedBoostOptions.push('boostParam'); + } + if (deprecatedBoostOptions.length > 0) { + warnings.push({ + type: 'other', + message: `${deprecatedBoostOptions.join(', ')} ${ + deprecatedBoostOptions.length > 1 ? 'are' : 'is' + } deprecated and rejected by 'universal-3-pro' / 'universal-3-5-pro' and 'slam-1'. Use 'keytermsPrompt' instead.`, + }); + } + + // The following options only take effect alongside a prerequisite + // option; without it AssemblyAI either rejects the request (400) or + // silently ignores the option. Warn rather than mutate user input. + if ( + (assemblyaiOptions.redactPiiReturnUnredacted != null || + assemblyaiOptions.redactStaticEntities != null) && + !assemblyaiOptions.redactPii + ) { + warnings.push({ + type: 'other', + message: + "'redactPiiReturnUnredacted' and 'redactStaticEntities' require 'redactPii' to be enabled; AssemblyAI rejects the request otherwise.", + }); + } + if ( + assemblyaiOptions.redactPiiAudioOptions != null && + !assemblyaiOptions.redactPiiAudio + ) { + warnings.push({ + type: 'other', + message: + "'redactPiiAudioOptions' only applies when 'redactPiiAudio' is enabled; it is otherwise ignored.", + }); + } + if ( + assemblyaiOptions.languageCode != null && + assemblyaiOptions.languageDetection + ) { + warnings.push({ + type: 'other', + message: + "'languageDetection' cannot be combined with an explicit 'languageCode'; AssemblyAI rejects requests that set both.", + }); + } } return { @@ -271,17 +486,22 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV3 { abortSignal?: AbortSignal, ): Promise<{ transcript: z.infer<typeof assemblyaiTranscriptionResponseSchema>; + rawTranscript: unknown; responseHeaders: Record<string, string>; 
}> { const pollingInterval = this.config.pollingInterval ?? this.POLLING_INTERVAL_MS; + // Honor a caller-provided fetch (proxy, auth injection, tests) for the + // polling GETs, matching the upload/submit calls that use config.fetch. + const fetchImpl = this.config.fetch ?? globalThis.fetch; + while (true) { if (abortSignal?.aborted) { throw new Error('Transcription request was aborted'); } - const response = await fetch( + const response = await fetchImpl( this.config.url({ path: `/v2/transcript/${transcriptId}`, modelId: this.modelId, @@ -307,13 +527,14 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV3 { }); } - const transcript = assemblyaiTranscriptionResponseSchema.parse( - await response.json(), - ); + const rawTranscript = await response.json(); + const transcript = + assemblyaiTranscriptionResponseSchema.parse(rawTranscript); if (transcript.status === 'completed') { return { transcript, + rawTranscript, responseHeaders: extractResponseHeaders(response), }; } @@ -374,29 +595,70 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV3 { fetch: this.config.fetch, }); - const { transcript, responseHeaders } = await this.waitForCompletion( - submitResponse.id, - options.headers, - options.abortSignal, - ); + const { transcript, rawTranscript, responseHeaders } = + await this.waitForCompletion( + submitResponse.id, + options.headers, + options.abortSignal, + ); + + // Surface diarization and audio-intelligence results that the AI SDK's + // `segments` shape can't represent, keyed under `assemblyai`. Presence is + // gated on the parsed transcript, but values are taken from the raw + // response so no fields are stripped by the schema. + // + // NOTE: timings inside these objects (e.g. `utterances[].start`) are in + // milliseconds, matching the AssemblyAI API — unlike the top-level + // `segments`, whose `startSecond`/`endSecond` are in seconds. + const raw = (rawTranscript ?? 
{}) as Record<string, unknown>; + const assemblyaiMetadata: Record<string, unknown> = {}; + if (transcript.utterances != null) { + assemblyaiMetadata.utterances = raw.utterances; + } + if (transcript.sentiment_analysis_results != null) { + assemblyaiMetadata.sentimentAnalysisResults = + raw.sentiment_analysis_results; + } + if (transcript.entities != null) { + assemblyaiMetadata.entities = raw.entities; + } + if (transcript.content_safety_labels != null) { + assemblyaiMetadata.contentSafetyLabels = raw.content_safety_labels; + } + if (transcript.iab_categories_result != null) { + assemblyaiMetadata.iabCategoriesResult = raw.iab_categories_result; + } + if (transcript.auto_highlights_result != null) { + assemblyaiMetadata.autoHighlightsResult = raw.auto_highlights_result; + } + + const lastWordEndMs = transcript.words?.at(-1)?.end; return { text: transcript.text ?? '', + // AssemblyAI returns word timings in milliseconds; the AI SDK reports + // segment timings in seconds. segments: transcript.words?.map(word => ({ text: word.text, - startSecond: word.start, - endSecond: word.end, + startSecond: word.start / 1000, + endSecond: word.end / 1000, })) ?? [], language: transcript.language_code ?? undefined, durationInSeconds: - transcript.audio_duration ?? transcript.words?.at(-1)?.end ?? undefined, + transcript.audio_duration ?? + (lastWordEndMs != null ? 
lastWordEndMs / 1000 : undefined), warnings, + ...(Object.keys(assemblyaiMetadata).length > 0 && { + providerMetadata: { + assemblyai: assemblyaiMetadata, + } as SharedV3ProviderMetadata, + }), response: { timestamp: currentDate, modelId: this.modelId, headers: responseHeaders, // Headers from final GET request - body: transcript, // Raw response from final GET request + body: rawTranscript, // Full raw response from final GET request }, }; } @@ -411,20 +673,65 @@ const assemblyaiSubmitResponseSchema = z.object({ status: z.enum(['queued', 'processing', 'completed', 'error']), }); +const assemblyaiWordSchema = z.object({ + start: z.number(), + end: z.number(), + text: z.string(), + confidence: z.number().nullish(), + // Speaker label (e.g. 'A', 'B') when speaker diarization is enabled, else null. + speaker: z.string().nullish(), + channel: z.string().nullish(), +}); + const assemblyaiTranscriptionResponseSchema = z.object({ id: z.string(), status: z.enum(['queued', 'processing', 'completed', 'error']), text: z.string().nullish(), language_code: z.string().nullish(), - words: z + speech_model_used: z.string().nullish(), + words: z.array(assemblyaiWordSchema).nullish(), + // Speaker-diarized utterances (present when `speaker_labels` is enabled). + utterances: z .array( z.object({ start: z.number(), end: z.number(), text: z.string(), + confidence: z.number().nullish(), + speaker: z.string().nullish(), + channel: z.string().nullish(), + words: z.array(assemblyaiWordSchema).nullish(), + }), + ) + .nullish(), + // Audio-intelligence results, present only when the matching feature is + // enabled. Kept intentionally permissive (the full structures are also + // available on the raw `response.body`). 
+ sentiment_analysis_results: z + .array( + z.object({ + text: z.string(), + start: z.number().nullish(), + end: z.number().nullish(), + sentiment: z.string(), + confidence: z.number().nullish(), + speaker: z.string().nullish(), + }), + ) + .nullish(), + entities: z + .array( + z.object({ + entity_type: z.string(), + text: z.string(), + start: z.number().nullish(), + end: z.number().nullish(), }), ) .nullish(), + content_safety_labels: z.record(z.string(), z.any()).nullish(), + iab_categories_result: z.record(z.string(), z.any()).nullish(), + auto_highlights_result: z.record(z.string(), z.any()).nullish(), audio_duration: z.number().nullish(), error: z.string().nullish(), }); diff --git a/packages/assemblyai/src/assemblyai-transcription-settings.ts b/packages/assemblyai/src/assemblyai-transcription-settings.ts index f83e8d2e297a..ef3ba0c1ba1a 100644 --- a/packages/assemblyai/src/assemblyai-transcription-settings.ts +++ b/packages/assemblyai/src/assemblyai-transcription-settings.ts @@ -1 +1,15 @@ -export type AssemblyAITranscriptionModelId = 'best' | 'nano'; +/** + * Legacy AssemblyAI speech model, sent via the deprecated singular + * `speech_model` request parameter. + * + * @deprecated Use `universal-3-5-pro` instead. + * @see https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model + */ +export type AssemblyAIDeprecatedTranscriptionModelId = 'best'; + +export type AssemblyAITranscriptionModelId = + | 'universal-2' + | 'universal-3-pro' + | 'universal-3-5-pro' + | AssemblyAIDeprecatedTranscriptionModelId + | (string & {});