From 7b3f4efa8b59e65b76e00352b2eb121b0d489982 Mon Sep 17 00:00:00 2001 From: David Lange Date: Mon, 29 Jun 2026 17:39:26 -0400 Subject: [PATCH 01/12] feat(assemblyai): support universal-3-5-pro and current speech models Add universal-3-5-pro, universal-3-pro, and universal-2 to the transcription model ids. These newer models are only accessible through AssemblyAI's speech_models request parameter (the singular speech_model parameter is deprecated and rejects them), so the provider now routes the model id to the correct parameter automatically: legacy best/nano use speech_model, all other models use speech_models. Co-Authored-By: Claude Opus 4.8 (1M context) --- .changeset/assemblyai-universal-3-5-pro.md | 12 ++++++++++ .../docs/03-ai-sdk-core/36-transcription.mdx | 2 ++ .../01-ai-sdk-providers/100-assemblyai.mdx | 17 +++++++++----- .../assemblyai/universal-3-5-pro.ts | 18 +++++++++++++++ .../assemblyai/src/assemblyai-api-types.ts | 13 +++++++++++ .../assemblyai-transcription-model.test.ts | 22 +++++++++++++++++-- .../src/assemblyai-transcription-model.ts | 16 +++++++++++--- .../src/assemblyai-transcription-settings.ts | 8 ++++++- 8 files changed, 96 insertions(+), 12 deletions(-) create mode 100644 .changeset/assemblyai-universal-3-5-pro.md create mode 100644 examples/ai-functions/src/transcribe/assemblyai/universal-3-5-pro.ts diff --git a/.changeset/assemblyai-universal-3-5-pro.md b/.changeset/assemblyai-universal-3-5-pro.md new file mode 100644 index 000000000000..37251ae95d2d --- /dev/null +++ b/.changeset/assemblyai-universal-3-5-pro.md @@ -0,0 +1,12 @@ +--- +'@ai-sdk/assemblyai': patch +--- + +feat(assemblyai): support `universal-3-5-pro` and other current speech models + +Adds `universal-3-5-pro`, `universal-3-pro`, and `universal-2` to the +transcription model ids. 
These newer models are only accessible through +AssemblyAI's `speech_models` request parameter (the singular `speech_model` +parameter is deprecated and rejects them), so the provider now routes the model +id to the correct parameter automatically: the legacy `best` and `nano` models +continue to use `speech_model`, while all other models use `speech_models`. diff --git a/content/docs/03-ai-sdk-core/36-transcription.mdx b/content/docs/03-ai-sdk-core/36-transcription.mdx index da049e93a9e4..d6a2b3573a34 100644 --- a/content/docs/03-ai-sdk-core/36-transcription.mdx +++ b/content/docs/03-ai-sdk-core/36-transcription.mdx @@ -214,6 +214,8 @@ try { | [Deepgram](/providers/ai-sdk-providers/deepgram#transcription-models) | `nova-2` (+ variants) | | [Deepgram](/providers/ai-sdk-providers/deepgram#transcription-models) | `nova-3` (+ variants) | | [Gladia](/providers/ai-sdk-providers/gladia#transcription-models) | `default` | +| [AssemblyAI](/providers/ai-sdk-providers/assemblyai#transcription-models) | `universal-3-5-pro` | +| [AssemblyAI](/providers/ai-sdk-providers/assemblyai#transcription-models) | `universal-3-pro` | | [AssemblyAI](/providers/ai-sdk-providers/assemblyai#transcription-models) | `best` | | [AssemblyAI](/providers/ai-sdk-providers/assemblyai#transcription-models) | `nano` | | [Fal](/providers/ai-sdk-providers/fal#transcription-models) | `whisper` | diff --git a/content/providers/01-ai-sdk-providers/100-assemblyai.mdx b/content/providers/01-ai-sdk-providers/100-assemblyai.mdx index 23fb292986d0..d1611b55cbdf 100644 --- a/content/providers/01-ai-sdk-providers/100-assemblyai.mdx +++ b/content/providers/01-ai-sdk-providers/100-assemblyai.mdx @@ -69,12 +69,17 @@ You can use the following optional settings to customize the AssemblyAI provider You can create models that call the [AssemblyAI transcription API](https://www.assemblyai.com/docs/getting-started/transcribe-an-audio-file/typescript) using the `.transcription()` factory method. 
-The first argument is the model id e.g. `best`. +The first argument is the model id, e.g. `universal-3-5-pro`. ```ts -const model = assemblyai.transcription('best'); +const model = assemblyai.transcription('universal-3-5-pro'); ``` +The legacy `best` and `nano` models are sent using AssemblyAI's deprecated +`speech_model` request parameter. All newer models (e.g. `universal-3-5-pro`) are sent using the +[`speech_models`](https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model) +request parameter, which the provider selects automatically based on the model id. + You can also pass additional provider-specific options using the `providerOptions` argument. For example, supplying the `contentSafety` option will enable content safety filtering. ```ts highlight="7" @@ -276,7 +281,7 @@ The following provider options are available: ### Model Capabilities -| Model | Transcription | Duration | Segments | Language | -| ------ | ------------------- | ------------------- | ------------------- | ------------------- | -| `best` | | | | | -| `nano` | | | | | +| Model | Transcription | Duration | Segments | Language | +| ------------------- | ------------------- | ------------------- | ------------------- | ------------------- | +| `universal-3-5-pro` | | | | | + diff --git a/examples/ai-functions/src/transcribe/assemblyai/universal-3-5-pro.ts b/examples/ai-functions/src/transcribe/assemblyai/universal-3-5-pro.ts new file mode 100644 index 000000000000..f50e5f519232 --- /dev/null +++ b/examples/ai-functions/src/transcribe/assemblyai/universal-3-5-pro.ts @@ -0,0 +1,18 @@ +import { assemblyai } from '@ai-sdk/assemblyai'; +import { transcribe } from 'ai'; +import { readFile } from 'fs/promises'; +import { run } from '../../lib/run'; + +run(async () => { + const result = await transcribe({ + model: assemblyai.transcription('universal-3-5-pro'), + audio: await readFile('data/galileo.mp3'), + }); + + console.log('Text:', result.text); + console.log('Duration:', 
result.durationInSeconds); + console.log('Language:', result.language); + console.log('Segments:', result.segments); + console.log('Warnings:', result.warnings); + console.log('Responses:', result.responses); +}); diff --git a/packages/assemblyai/src/assemblyai-api-types.ts b/packages/assemblyai/src/assemblyai-api-types.ts index 22fd08b0f30c..99fa2f1c474e 100644 --- a/packages/assemblyai/src/assemblyai-api-types.ts +++ b/packages/assemblyai/src/assemblyai-api-types.ts @@ -310,9 +310,22 @@ export type AssemblyAITranscriptionAPITypes = { /** * The speech model to use for the transcription. + * + * @deprecated This parameter has been replaced with `speech_models`. It only + * supports the legacy `best` and `nano` models. Use `speech_models` for + * `universal-2`, `universal-3-pro`, `universal-3-5-pro`, etc. + * @see https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model */ speech_model?: 'best' | 'nano'; + /** + * List of speech models in priority order, allowing the system to + * automatically route the audio to the best available option. When omitted, + * the API defaults to `['universal-3-pro', 'universal-2']`. + * @see https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model + */ + speech_models?: string[]; + /** * Reject audio files that contain less than this fraction of speech. Valid values are in the range [0, 1] inclusive. 
*/ diff --git a/packages/assemblyai/src/assemblyai-transcription-model.test.ts b/packages/assemblyai/src/assemblyai-transcription-model.test.ts index 2147a6c05b66..c6563b4b3fd8 100644 --- a/packages/assemblyai/src/assemblyai-transcription-model.test.ts +++ b/packages/assemblyai/src/assemblyai-transcription-model.test.ts @@ -256,7 +256,7 @@ describe('doGenerate', () => { }; } - it('should pass the model', async () => { + it('should pass the legacy model via the speech_model parameter', async () => { prepareJsonResponse(); await model.doGenerate({ @@ -264,10 +264,28 @@ describe('doGenerate', () => { mediaType: 'audio/wav', }); - expect(await server.calls[1].requestBodyJson).toMatchObject({ + const requestBody = await server.calls[1].requestBodyJson; + expect(requestBody).toMatchObject({ audio_url: 'https://storage.assemblyai.com/mock-upload-url', speech_model: 'best', }); + expect(requestBody.speech_models).toBeUndefined(); + }); + + it('should pass newer models via the speech_models parameter', async () => { + prepareJsonResponse(); + + await provider.transcription('universal-3-5-pro').doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + }); + + const requestBody = await server.calls[1].requestBodyJson; + expect(requestBody).toMatchObject({ + audio_url: 'https://storage.assemblyai.com/mock-upload-url', + speech_models: ['universal-3-5-pro'], + }); + expect(requestBody.speech_model).toBeUndefined(); }); it('should pass headers', async () => { diff --git a/packages/assemblyai/src/assemblyai-transcription-model.ts b/packages/assemblyai/src/assemblyai-transcription-model.ts index f6b6ce2e3b69..91e0efb772b9 100644 --- a/packages/assemblyai/src/assemblyai-transcription-model.ts +++ b/packages/assemblyai/src/assemblyai-transcription-model.ts @@ -66,9 +66,18 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV4 { schema: assemblyaiTranscriptionModelOptionsSchema, }); - const body: Omit = { - speech_model: this.modelId as 'best' | 'nano', - }; 
+ const body: Omit = {}; + + // The legacy `best` and `nano` models are selected via the deprecated + // singular `speech_model` parameter. All newer models (e.g. `universal-2`, + // `universal-3-pro`, `universal-3-5-pro`) are only accessible via the + // `speech_models` array and are rejected by `speech_model`. + // See https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model + if (this.modelId === 'best' || this.modelId === 'nano') { + body.speech_model = this.modelId as 'best' | 'nano'; + } else { + body.speech_models = [this.modelId]; + } // Add provider-specific options if (assemblyaiOptions) { @@ -282,6 +291,7 @@ const assemblyaiTranscriptionResponseSchema = z.object({ status: z.enum(['queued', 'processing', 'completed', 'error']), text: z.string().nullish(), language_code: z.string().nullish(), + speech_model_used: z.string().nullish(), words: z .array( z.object({ diff --git a/packages/assemblyai/src/assemblyai-transcription-settings.ts b/packages/assemblyai/src/assemblyai-transcription-settings.ts index 61ee8e09f29d..9be2a7f011e1 100644 --- a/packages/assemblyai/src/assemblyai-transcription-settings.ts +++ b/packages/assemblyai/src/assemblyai-transcription-settings.ts @@ -1 +1,7 @@ -export type AssemblyAITranscriptionModelId = 'best' | 'nano' | (string & {}); +export type AssemblyAITranscriptionModelId = + | 'best' + | 'nano' + | 'universal-2' + | 'universal-3-pro' + | 'universal-3-5-pro' + | (string & {}); From a8f198cc1d88087570598414df8b9d2d9eb3fee3 Mon Sep 17 00:00:00 2001 From: David Lange Date: Tue, 30 Jun 2026 00:28:28 -0400 Subject: [PATCH 02/12] feat(assemblyai): deprecate best, drop unavailable nano MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The legacy `best` model is deprecated (still functional, routes via the deprecated singular `speech_model` parameter): the model id type marks it `@deprecated` and `doGenerate` emits a deprecation warning pointing to `universal-3-5-pro`. 
The `nano` model is removed entirely — AssemblyAI's API now rejects it with a 400 ("the 'nano' speech model has been deprecated and is no longer available"), confirmed end-to-end against the live API. Repoint examples, docs, and README to `universal-3-5-pro`, generalize the callable provider overload to the full model id type, and expand tests. Co-Authored-By: Claude Opus 4.8 (1M context) --- .changeset/assemblyai-universal-3-5-pro.md | 9 +++- .../docs/03-ai-sdk-core/36-transcription.mdx | 2 - .../01-ai-sdk-providers/100-assemblyai.mdx | 10 ++-- .../src/transcribe/assemblyai/basic.ts | 2 +- .../src/transcribe/assemblyai/string.ts | 2 +- .../assemblyai/universal-3-5-pro.ts | 18 -------- .../src/transcribe/assemblyai/url.ts | 2 +- packages/assemblyai/README.md | 2 +- .../assemblyai/src/assemblyai-api-types.ts | 6 +-- .../assemblyai/src/assemblyai-provider.ts | 2 +- .../assemblyai-transcription-model.test.ts | 46 +++++++++++++++++-- .../src/assemblyai-transcription-model.ts | 14 ++++-- .../src/assemblyai-transcription-settings.ts | 12 ++++- 13 files changed, 84 insertions(+), 43 deletions(-) delete mode 100644 examples/ai-functions/src/transcribe/assemblyai/universal-3-5-pro.ts diff --git a/.changeset/assemblyai-universal-3-5-pro.md b/.changeset/assemblyai-universal-3-5-pro.md index 37251ae95d2d..7b869c182f34 100644 --- a/.changeset/assemblyai-universal-3-5-pro.md +++ b/.changeset/assemblyai-universal-3-5-pro.md @@ -8,5 +8,10 @@ Adds `universal-3-5-pro`, `universal-3-pro`, and `universal-2` to the transcription model ids. These newer models are only accessible through AssemblyAI's `speech_models` request parameter (the singular `speech_model` parameter is deprecated and rejects them), so the provider now routes the model -id to the correct parameter automatically: the legacy `best` and `nano` models -continue to use `speech_model`, while all other models use `speech_models`. 
+id to the correct parameter automatically: the legacy `best` model continues to +use `speech_model`, while all other models use `speech_models`. + +The `best` model is now deprecated. It continues to work, but the model id type +marks it `@deprecated` and the provider emits a deprecation warning when it is +used. Prefer `universal-3-5-pro` instead. The `nano` model has been removed, as +AssemblyAI no longer supports it (the API now rejects it). diff --git a/content/docs/03-ai-sdk-core/36-transcription.mdx b/content/docs/03-ai-sdk-core/36-transcription.mdx index d6a2b3573a34..141a589ab849 100644 --- a/content/docs/03-ai-sdk-core/36-transcription.mdx +++ b/content/docs/03-ai-sdk-core/36-transcription.mdx @@ -216,8 +216,6 @@ try { | [Gladia](/providers/ai-sdk-providers/gladia#transcription-models) | `default` | | [AssemblyAI](/providers/ai-sdk-providers/assemblyai#transcription-models) | `universal-3-5-pro` | | [AssemblyAI](/providers/ai-sdk-providers/assemblyai#transcription-models) | `universal-3-pro` | -| [AssemblyAI](/providers/ai-sdk-providers/assemblyai#transcription-models) | `best` | -| [AssemblyAI](/providers/ai-sdk-providers/assemblyai#transcription-models) | `nano` | | [Fal](/providers/ai-sdk-providers/fal#transcription-models) | `whisper` | | [Fal](/providers/ai-sdk-providers/fal#transcription-models) | `wizper` | | [Google Vertex](/providers/ai-sdk-providers/google-vertex#transcription-models) | `chirp_2` | diff --git a/content/providers/01-ai-sdk-providers/100-assemblyai.mdx b/content/providers/01-ai-sdk-providers/100-assemblyai.mdx index d1611b55cbdf..435906321bff 100644 --- a/content/providers/01-ai-sdk-providers/100-assemblyai.mdx +++ b/content/providers/01-ai-sdk-providers/100-assemblyai.mdx @@ -75,9 +75,11 @@ The first argument is the model id, e.g. `universal-3-5-pro`. const model = assemblyai.transcription('universal-3-5-pro'); ``` -The legacy `best` and `nano` models are sent using AssemblyAI's deprecated -`speech_model` request parameter. 
All newer models (e.g. `universal-3-5-pro`) are sent using the -[`speech_models`](https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model) +The `best` model is a **deprecated** legacy model, sent using AssemblyAI's +deprecated `speech_model` request parameter. It still works, but using it emits a +deprecation warning — prefer `universal-3-5-pro`. The older `nano` model has been +removed by AssemblyAI and is no longer available. All other models are sent using +the [`speech_models`](https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model) request parameter, which the provider selects automatically based on the model id. You can also pass additional provider-specific options using the `providerOptions` argument. For example, supplying the `contentSafety` option will enable content safety filtering. @@ -89,7 +91,7 @@ import { type AssemblyAITranscriptionModelOptions } from '@ai-sdk/assemblyai'; import { readFile } from 'fs/promises'; const result = await transcribe({ - model: assemblyai.transcription('best'), + model: assemblyai.transcription('universal-3-5-pro'), audio: await readFile('audio.mp3'), providerOptions: { assemblyai: { diff --git a/examples/ai-functions/src/transcribe/assemblyai/basic.ts b/examples/ai-functions/src/transcribe/assemblyai/basic.ts index 5d1ebf221ef4..f50e5f519232 100644 --- a/examples/ai-functions/src/transcribe/assemblyai/basic.ts +++ b/examples/ai-functions/src/transcribe/assemblyai/basic.ts @@ -5,7 +5,7 @@ import { run } from '../../lib/run'; run(async () => { const result = await transcribe({ - model: assemblyai.transcription('best'), + model: assemblyai.transcription('universal-3-5-pro'), audio: await readFile('data/galileo.mp3'), }); diff --git a/examples/ai-functions/src/transcribe/assemblyai/string.ts b/examples/ai-functions/src/transcribe/assemblyai/string.ts index 9210cf1f263a..01a1d8172dd2 100644 --- a/examples/ai-functions/src/transcribe/assemblyai/string.ts +++
b/examples/ai-functions/src/transcribe/assemblyai/string.ts @@ -5,7 +5,7 @@ import { run } from '../../lib/run'; run(async () => { const result = await transcribe({ - model: assemblyai.transcription('best'), + model: assemblyai.transcription('universal-3-5-pro'), audio: Buffer.from(await readFile('./data/galileo.mp3')).toString('base64'), }); diff --git a/examples/ai-functions/src/transcribe/assemblyai/universal-3-5-pro.ts b/examples/ai-functions/src/transcribe/assemblyai/universal-3-5-pro.ts deleted file mode 100644 index f50e5f519232..000000000000 --- a/examples/ai-functions/src/transcribe/assemblyai/universal-3-5-pro.ts +++ /dev/null @@ -1,18 +0,0 @@ -import { assemblyai } from '@ai-sdk/assemblyai'; -import { transcribe } from 'ai'; -import { readFile } from 'fs/promises'; -import { run } from '../../lib/run'; - -run(async () => { - const result = await transcribe({ - model: assemblyai.transcription('universal-3-5-pro'), - audio: await readFile('data/galileo.mp3'), - }); - - console.log('Text:', result.text); - console.log('Duration:', result.durationInSeconds); - console.log('Language:', result.language); - console.log('Segments:', result.segments); - console.log('Warnings:', result.warnings); - console.log('Responses:', result.responses); -}); diff --git a/examples/ai-functions/src/transcribe/assemblyai/url.ts b/examples/ai-functions/src/transcribe/assemblyai/url.ts index 6493a81b7786..c48905c936b1 100644 --- a/examples/ai-functions/src/transcribe/assemblyai/url.ts +++ b/examples/ai-functions/src/transcribe/assemblyai/url.ts @@ -4,7 +4,7 @@ import { run } from '../../lib/run'; run(async () => { const result = await transcribe({ - model: assemblyai.transcription('best'), + model: assemblyai.transcription('universal-3-5-pro'), audio: new URL( 'https://github.com/vercel/ai/raw/refs/heads/main/examples/ai-functions/data/galileo.mp3', ), diff --git a/packages/assemblyai/README.md b/packages/assemblyai/README.md index a9de04ae4bc1..fc4930387235 100644 --- 
a/packages/assemblyai/README.md +++ b/packages/assemblyai/README.md @@ -36,7 +36,7 @@ import { assemblyai } from '@ai-sdk/assemblyai'; import { transcribe } from 'ai'; const { text } = await transcribe({ - model: assemblyai.transcription('best'), + model: assemblyai.transcription('universal-3-5-pro'), audio: new URL( 'https://github.com/vercel/ai/raw/refs/heads/main/examples/ai-functions/data/galileo.mp3', ), diff --git a/packages/assemblyai/src/assemblyai-api-types.ts b/packages/assemblyai/src/assemblyai-api-types.ts index 99fa2f1c474e..85c9cf125ebc 100644 --- a/packages/assemblyai/src/assemblyai-api-types.ts +++ b/packages/assemblyai/src/assemblyai-api-types.ts @@ -312,11 +312,11 @@ export type AssemblyAITranscriptionAPITypes = { * The speech model to use for the transcription. * * @deprecated This parameter has been replaced with `speech_models`. It only - * supports the legacy `best` and `nano` models. Use `speech_models` for - * `universal-2`, `universal-3-pro`, `universal-3-5-pro`, etc. + * supports the legacy `best` model. Use `speech_models` for `universal-2`, + * `universal-3-pro`, `universal-3-5-pro`, etc. 
* @see https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model */ - speech_model?: 'best' | 'nano'; + speech_model?: 'best'; /** * List of speech models in priority order, allowing the system to diff --git a/packages/assemblyai/src/assemblyai-provider.ts b/packages/assemblyai/src/assemblyai-provider.ts index 8c517e5d932d..ff4779284953 100644 --- a/packages/assemblyai/src/assemblyai-provider.ts +++ b/packages/assemblyai/src/assemblyai-provider.ts @@ -14,7 +14,7 @@ import { VERSION } from './version'; export interface AssemblyAIProvider extends ProviderV4 { ( - modelId: 'best', + modelId: AssemblyAITranscriptionModelId, settings?: {}, ): { transcription: AssemblyAITranscriptionModel; diff --git a/packages/assemblyai/src/assemblyai-transcription-model.test.ts b/packages/assemblyai/src/assemblyai-transcription-model.test.ts index c6563b4b3fd8..2c559b3ce92b 100644 --- a/packages/assemblyai/src/assemblyai-transcription-model.test.ts +++ b/packages/assemblyai/src/assemblyai-transcription-model.test.ts @@ -259,7 +259,7 @@ describe('doGenerate', () => { it('should pass the legacy model via the speech_model parameter', async () => { prepareJsonResponse(); - await model.doGenerate({ + const result = await model.doGenerate({ audio: audioData, mediaType: 'audio/wav', }); @@ -270,22 +270,62 @@ describe('doGenerate', () => { speech_model: 'best', }); expect(requestBody.speech_models).toBeUndefined(); + + expect(result.warnings).toContainEqual({ + type: 'deprecated', + setting: "model 'best'", + message: expect.stringContaining('universal-3-5-pro'), + }); + const [deprecation] = result.warnings.filter( + warning => warning.type === 'deprecated', + ); + expect(deprecation?.message).toContain( + 'https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model', + ); }); it('should pass newer models via the speech_models parameter', async () => { prepareJsonResponse(); + const result = await provider + .transcription('universal-3-5-pro') + .doGenerate({ 
+ audio: audioData, + mediaType: 'audio/wav', + }); + + const requestBody = await server.calls[1].requestBodyJson; + expect(requestBody).toMatchObject({ + audio_url: 'https://storage.assemblyai.com/mock-upload-url', + speech_models: ['universal-3-5-pro'], + }); + expect(requestBody.speech_model).toBeUndefined(); + + expect( + result.warnings.filter(warning => warning.type === 'deprecated'), + ).toEqual([]); + }); + + it('should still send provider options alongside speech_models', async () => { + prepareJsonResponse(); + await provider.transcription('universal-3-5-pro').doGenerate({ audio: audioData, mediaType: 'audio/wav', + providerOptions: { + assemblyai: { + languageDetection: true, + punctuate: false, + }, + }, }); const requestBody = await server.calls[1].requestBodyJson; expect(requestBody).toMatchObject({ - audio_url: 'https://storage.assemblyai.com/mock-upload-url', speech_models: ['universal-3-5-pro'], + language_detection: true, + punctuate: false, }); - expect(requestBody.speech_model).toBeUndefined(); }); it('should pass headers', async () => { diff --git a/packages/assemblyai/src/assemblyai-transcription-model.ts b/packages/assemblyai/src/assemblyai-transcription-model.ts index 91e0efb772b9..a48459ff2ec8 100644 --- a/packages/assemblyai/src/assemblyai-transcription-model.ts +++ b/packages/assemblyai/src/assemblyai-transcription-model.ts @@ -68,13 +68,19 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV4 { const body: Omit = {}; - // The legacy `best` and `nano` models are selected via the deprecated - // singular `speech_model` parameter. All newer models (e.g. `universal-2`, + // The legacy `best` model is selected via the deprecated singular + // `speech_model` parameter. All other models (e.g. `universal-2`, // `universal-3-pro`, `universal-3-5-pro`) are only accessible via the // `speech_models` array and are rejected by `speech_model`. 
// See https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model - if (this.modelId === 'best' || this.modelId === 'nano') { - body.speech_model = this.modelId as 'best' | 'nano'; + if (this.modelId === 'best') { + body.speech_model = this.modelId as 'best'; + warnings.push({ + type: 'deprecated', + setting: `model '${this.modelId}'`, + message: + "The 'best' model is a legacy AssemblyAI model. Use 'universal-3-5-pro' instead. See documentation: https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model", + }); } else { body.speech_models = [this.modelId]; } diff --git a/packages/assemblyai/src/assemblyai-transcription-settings.ts b/packages/assemblyai/src/assemblyai-transcription-settings.ts index 9be2a7f011e1..ef3ba0c1ba1a 100644 --- a/packages/assemblyai/src/assemblyai-transcription-settings.ts +++ b/packages/assemblyai/src/assemblyai-transcription-settings.ts @@ -1,7 +1,15 @@ +/** + * Legacy AssemblyAI speech model, sent via the deprecated singular + * `speech_model` request parameter. + * + * @deprecated Use `universal-3-5-pro` instead. + * @see https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model + */ +export type AssemblyAIDeprecatedTranscriptionModelId = 'best'; + export type AssemblyAITranscriptionModelId = - | 'best' - | 'nano' | 'universal-2' | 'universal-3-pro' | 'universal-3-5-pro' + | AssemblyAIDeprecatedTranscriptionModelId | (string & {}); From b696b328d533b619aeaf99c89c4caf1da3b12310 Mon Sep 17 00:00:00 2001 From: David Lange Date: Tue, 30 Jun 2026 01:57:04 -0400 Subject: [PATCH 03/12] feat(assemblyai): surface speaker diarization and audio-intelligence output The provider previously returned a Zod-parsed (stripped) transcript as response.body, dropping speaker labels, utterances, and all audio-intelligence results even when enabled via providerOptions. 
Now doGenerate returns the full raw AssemblyAI response on response.body, and populates providerMetadata.assemblyai with structured results for the currently-available features: utterances (diarization), entities, sentimentAnalysisResults, contentSafetyLabels, iabCategoriesResult, and autoHighlightsResult. The words schema gains speaker/channel/confidence and a typed utterances array. Verified availability against the DeepLearning backend and the public API reference: deprecated features (Summarization, Auto Chapters, Custom Topics) are intentionally left off providerMetadata but remain on the raw body. Co-Authored-By: Claude Opus 4.8 (1M context) --- .changeset/assemblyai-diarization-output.md | 24 ++++ .../01-ai-sdk-providers/100-assemblyai.mdx | 41 +++++++ .../assemblyai-transcription-model.test.ts | 50 +++++++++ .../src/assemblyai-transcription-model.ts | 103 ++++++++++++++++-- 4 files changed, 207 insertions(+), 11 deletions(-) create mode 100644 .changeset/assemblyai-diarization-output.md diff --git a/.changeset/assemblyai-diarization-output.md b/.changeset/assemblyai-diarization-output.md new file mode 100644 index 000000000000..6bfe68696071 --- /dev/null +++ b/.changeset/assemblyai-diarization-output.md @@ -0,0 +1,24 @@ +--- +'@ai-sdk/assemblyai': patch +--- + +feat(assemblyai): surface speaker diarization and audio-intelligence results + +Previously the AssemblyAI provider parsed the transcript response with a +restrictive schema and returned that parsed object as `response.body`, which +silently dropped speaker labels, utterances, and all audio-intelligence results +(even though the matching `providerOptions` could enable them). 
+ +The provider now: + +- returns the complete, raw AssemblyAI response on `response.body` (nothing is + stripped), and +- surfaces structured results for currently-available features on + `providerMetadata.assemblyai`: `utterances` (speaker diarization), + `entities`, `sentimentAnalysisResults`, `contentSafetyLabels`, + `iabCategoriesResult`, and `autoHighlightsResult`. + +Word-level `speaker`/`channel`/`confidence` and `utterances` are now parsed. +Deprecated AssemblyAI features (Summarization, Auto Chapters, Custom Topics) are +intentionally not promoted to `providerMetadata` but remain on the raw +`response.body` when enabled. diff --git a/content/providers/01-ai-sdk-providers/100-assemblyai.mdx b/content/providers/01-ai-sdk-providers/100-assemblyai.mdx index 435906321bff..0911b70b7fbd 100644 --- a/content/providers/01-ai-sdk-providers/100-assemblyai.mdx +++ b/content/providers/01-ai-sdk-providers/100-assemblyai.mdx @@ -281,6 +281,47 @@ The following provider options are available: List of words to boost in the transcription. Optional. +### Speaker diarization and audio-intelligence results + +The AI SDK's `transcribe` result exposes `text`, `segments`, `language`, and +`durationInSeconds`. AssemblyAI's richer results — speaker diarization and +audio-intelligence features — don't fit that shape, so they are surfaced in two +places: + +- **`providerMetadata.assemblyai`** — structured results for the features you + enabled: `utterances` (speaker-diarized segments, when `speakerLabels` is set), + `entities`, `sentimentAnalysisResults`, `contentSafetyLabels`, + `iabCategoriesResult`, and `autoHighlightsResult`. +- **`response.body`** — the complete, raw AssemblyAI transcript response, so any + field not surfaced above (e.g. `chapters`, word-level `speaker` labels) is + still available. 
+ +```ts +import { transcribe } from 'ai'; +import { assemblyai } from '@ai-sdk/assemblyai'; +import { readFile } from 'fs/promises'; + +const result = await transcribe({ + model: assemblyai.transcription('universal-3-5-pro'), + audio: await readFile('audio.mp3'), + providerOptions: { + assemblyai: { + speakerLabels: true, + entityDetection: true, + }, + }, +}); + +const { utterances, entities } = result.providerMetadata?.assemblyai ?? {}; +// utterances: [{ speaker: 'A', text: '…', start, end, … }, …] +``` + +The following AssemblyAI features are **deprecated** by the API and not surfaced +in `providerMetadata` (their output remains on the raw `response.body` if +enabled): Summarization, Auto Chapters, and Custom Topics. Note also that some +features are language-gated (e.g. sentiment analysis is English-centric); see +AssemblyAI's documentation for per-language availability. + ### Model Capabilities | Model | Transcription | Duration | Segments | Language | diff --git a/packages/assemblyai/src/assemblyai-transcription-model.test.ts b/packages/assemblyai/src/assemblyai-transcription-model.test.ts index 2c559b3ce92b..e8b55665f81f 100644 --- a/packages/assemblyai/src/assemblyai-transcription-model.test.ts +++ b/packages/assemblyai/src/assemblyai-transcription-model.test.ts @@ -328,6 +328,56 @@ describe('doGenerate', () => { }); }); + it('should surface diarization + audio-intelligence via providerMetadata', async () => { + prepareJsonResponse(); + + const result = await provider + .transcription('universal-3-5-pro') + .doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + }); + + const metadata = result.providerMetadata?.assemblyai as + | Record + | undefined; + expect(metadata).toBeDefined(); + + // Speaker diarization + expect(metadata?.utterances?.[0]).toMatchObject({ + speaker: 'A', + text: 'Hello, world!', + }); + + // Audio-intelligence results + expect(metadata?.entities?.[0]).toMatchObject({ + entity_type: 'location', + text: 'Canada', + }); + 
expect(metadata?.contentSafetyLabels).toBeDefined(); + expect(metadata?.iabCategoriesResult).toBeDefined(); + expect(metadata?.autoHighlightsResult).toBeDefined(); + }); + + it('should preserve the full raw response on response.body', async () => { + prepareJsonResponse(); + + const result = await provider + .transcription('universal-3-5-pro') + .doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + }); + + const body = result.response.body as Record; + // Word-level speaker label survives on the raw body. + expect(body.words[0].speaker).toBe('speaker'); + // Fields not modeled in our schema (e.g. chapters, summary) are no longer + // stripped — proves response.body is the raw response, not the parsed one. + expect(body.chapters).toBeDefined(); + expect(body.summary).toBe('- Hello, world!'); + }); + it('should pass headers', async () => { prepareJsonResponse(); diff --git a/packages/assemblyai/src/assemblyai-transcription-model.ts b/packages/assemblyai/src/assemblyai-transcription-model.ts index a48459ff2ec8..611fa2ea6dec 100644 --- a/packages/assemblyai/src/assemblyai-transcription-model.ts +++ b/packages/assemblyai/src/assemblyai-transcription-model.ts @@ -1,4 +1,8 @@ -import type { TranscriptionModelV4, SharedV4Warning } from '@ai-sdk/provider'; +import type { + TranscriptionModelV4, + SharedV4Warning, + SharedV4ProviderMetadata, +} from '@ai-sdk/provider'; import { combineHeaders, createJsonResponseHandler, @@ -152,6 +156,7 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV4 { abortSignal?: AbortSignal, ): Promise<{ transcript: z.infer; + rawTranscript: unknown; responseHeaders: Record; }> { const pollingInterval = @@ -188,13 +193,14 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV4 { }); } - const transcript = assemblyaiTranscriptionResponseSchema.parse( - await response.json(), - ); + const rawTranscript = await response.json(); + const transcript = + 
assemblyaiTranscriptionResponseSchema.parse(rawTranscript); if (transcript.status === 'completed') { return { transcript, + rawTranscript, responseHeaders: extractResponseHeaders(response), }; } @@ -255,11 +261,37 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV4 { fetch: this.config.fetch, }); - const { transcript, responseHeaders } = await this.waitForCompletion( - submitResponse.id, - options.headers, - options.abortSignal, - ); + const { transcript, rawTranscript, responseHeaders } = + await this.waitForCompletion( + submitResponse.id, + options.headers, + options.abortSignal, + ); + + // Surface diarization and audio-intelligence results that the AI SDK's + // `segments` shape can't represent. Only included when the corresponding + // feature was enabled (and thus present in the response). + const assemblyaiMetadata: Record = {}; + if (transcript.utterances != null) { + assemblyaiMetadata.utterances = transcript.utterances; + } + if (transcript.sentiment_analysis_results != null) { + assemblyaiMetadata.sentimentAnalysisResults = + transcript.sentiment_analysis_results; + } + if (transcript.entities != null) { + assemblyaiMetadata.entities = transcript.entities; + } + if (transcript.content_safety_labels != null) { + assemblyaiMetadata.contentSafetyLabels = transcript.content_safety_labels; + } + if (transcript.iab_categories_result != null) { + assemblyaiMetadata.iabCategoriesResult = transcript.iab_categories_result; + } + if (transcript.auto_highlights_result != null) { + assemblyaiMetadata.autoHighlightsResult = + transcript.auto_highlights_result; + } return { text: transcript.text ?? '', @@ -273,11 +305,16 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV4 { durationInSeconds: transcript.audio_duration ?? transcript.words?.at(-1)?.end ?? 
undefined, warnings, + ...(Object.keys(assemblyaiMetadata).length > 0 && { + providerMetadata: { + assemblyai: assemblyaiMetadata, + } as SharedV4ProviderMetadata, + }), response: { timestamp: currentDate, modelId: this.modelId, headers: responseHeaders, // Headers from final GET request - body: transcript, // Raw response from final GET request + body: rawTranscript, // Full raw response from final GET request }, }; } @@ -292,21 +329,65 @@ const assemblyaiSubmitResponseSchema = z.object({ status: z.enum(['queued', 'processing', 'completed', 'error']), }); +const assemblyaiWordSchema = z.object({ + start: z.number(), + end: z.number(), + text: z.string(), + confidence: z.number().nullish(), + // Speaker label (e.g. 'A', 'B') when speaker diarization is enabled, else null. + speaker: z.string().nullish(), + channel: z.string().nullish(), +}); + const assemblyaiTranscriptionResponseSchema = z.object({ id: z.string(), status: z.enum(['queued', 'processing', 'completed', 'error']), text: z.string().nullish(), language_code: z.string().nullish(), speech_model_used: z.string().nullish(), - words: z + words: z.array(assemblyaiWordSchema).nullish(), + // Speaker-diarized utterances (present when `speaker_labels` is enabled). + utterances: z .array( z.object({ start: z.number(), end: z.number(), text: z.string(), + confidence: z.number().nullish(), + speaker: z.string().nullish(), + channel: z.string().nullish(), + words: z.array(assemblyaiWordSchema).nullish(), + }), + ) + .nullish(), + // Audio-intelligence results, present only when the matching feature is + // enabled. Kept intentionally permissive (the full structures are also + // available on the raw `response.body`). 
+ sentiment_analysis_results: z + .array( + z.object({ + text: z.string(), + start: z.number().nullish(), + end: z.number().nullish(), + sentiment: z.string(), + confidence: z.number().nullish(), + speaker: z.string().nullish(), + }), + ) + .nullish(), + entities: z + .array( + z.object({ + entity_type: z.string(), + text: z.string(), + start: z.number().nullish(), + end: z.number().nullish(), }), ) .nullish(), + content_safety_labels: z.record(z.string(), z.any()).nullish(), + iab_categories_result: z.record(z.string(), z.any()).nullish(), + auto_highlights_result: z.record(z.string(), z.any()).nullish(), audio_duration: z.number().nullish(), error: z.string().nullish(), }); From e143eb14397df7953b89c3ecc63843b6df608d6e Mon Sep 17 00:00:00 2001 From: David Lange Date: Tue, 30 Jun 2026 02:01:11 -0400 Subject: [PATCH 04/12] fix(assemblyai): convert segment timings from milliseconds to seconds AssemblyAI returns word start/end in milliseconds; the provider put them directly into segments' startSecond/endSecond (and the durationInSeconds fallback), making timings 1000x too large. Confirmed against the live API (a 3s clip reported a first word at startSecond: 183). Now divided by 1000. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .changeset/assemblyai-segment-timing.md | 10 ++++++++++ .../src/assemblyai-transcription-model.test.ts | 16 ++++++++++++++++ .../src/assemblyai-transcription-model.ts | 11 ++++++++--- 3 files changed, 34 insertions(+), 3 deletions(-) create mode 100644 .changeset/assemblyai-segment-timing.md diff --git a/.changeset/assemblyai-segment-timing.md b/.changeset/assemblyai-segment-timing.md new file mode 100644 index 000000000000..f1aafe8d2afa --- /dev/null +++ b/.changeset/assemblyai-segment-timing.md @@ -0,0 +1,10 @@ +--- +'@ai-sdk/assemblyai': patch +--- + +fix(assemblyai): report transcription segment timings in seconds + +AssemblyAI returns word timings in milliseconds, but the provider placed them +directly into the `startSecond`/`endSecond` fields of `transcribe` segments (and +into the `durationInSeconds` fallback), so segment timings were off by 1000×. +The provider now converts milliseconds to seconds. diff --git a/packages/assemblyai/src/assemblyai-transcription-model.test.ts b/packages/assemblyai/src/assemblyai-transcription-model.test.ts index e8b55665f81f..3fe1d64ed4d7 100644 --- a/packages/assemblyai/src/assemblyai-transcription-model.test.ts +++ b/packages/assemblyai/src/assemblyai-transcription-model.test.ts @@ -378,6 +378,22 @@ describe('doGenerate', () => { expect(body.summary).toBe('- Hello, world!'); }); + it('should report segment timings in seconds (ms converted)', async () => { + prepareJsonResponse(); + + const result = await model.doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + }); + + // Fixture word[0] is start: 250ms, end: 650ms → 0.25s / 0.65s. 
+ expect(result.segments[0]).toEqual({ + text: 'Hello,', + startSecond: 0.25, + endSecond: 0.65, + }); + }); + it('should pass headers', async () => { prepareJsonResponse(); diff --git a/packages/assemblyai/src/assemblyai-transcription-model.ts b/packages/assemblyai/src/assemblyai-transcription-model.ts index 611fa2ea6dec..7db06c773ce0 100644 --- a/packages/assemblyai/src/assemblyai-transcription-model.ts +++ b/packages/assemblyai/src/assemblyai-transcription-model.ts @@ -293,17 +293,22 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV4 { transcript.auto_highlights_result; } + const lastWordEndMs = transcript.words?.at(-1)?.end; + return { text: transcript.text ?? '', + // AssemblyAI returns word timings in milliseconds; the AI SDK reports + // segment timings in seconds. segments: transcript.words?.map(word => ({ text: word.text, - startSecond: word.start, - endSecond: word.end, + startSecond: word.start / 1000, + endSecond: word.end / 1000, })) ?? [], language: transcript.language_code ?? undefined, durationInSeconds: - transcript.audio_duration ?? transcript.words?.at(-1)?.end ?? undefined, + transcript.audio_duration ?? + (lastWordEndMs != null ? lastWordEndMs / 1000 : undefined), warnings, ...(Object.keys(assemblyaiMetadata).length > 0 && { providerMetadata: { From 0e647f345a64281321cacaa3659e9946ae5ccf13 Mon Sep 17 00:00:00 2001 From: David Lange Date: Tue, 30 Jun 2026 02:21:02 -0400 Subject: [PATCH 05/12] feat(assemblyai): add Universal-3-Pro input params, deprecate wordBoost Add provider options for newer AssemblyAI request params: prompt, keytermsPrompt, temperature, removeAudioTags, and domain (wired through api-types + getArgs). Deprecate wordBoost/boostParam: AssemblyAI rejects word_boost with a 400 on universal-3-pro / universal-3-5-pro / slam-1 (works only on universal-2/best), so using either now emits a deprecation warning pointing to keytermsPrompt. 
Verified param availability and model-gating against the DeepLearning backend and the public API reference. Co-Authored-By: Claude Opus 4.8 (1M context) --- .changeset/assemblyai-input-params.md | 18 +++++++ .../01-ai-sdk-providers/100-assemblyai.mdx | 37 +++++++++++++- .../assemblyai/src/assemblyai-api-types.ts | 32 +++++++++++++ .../assemblyai-transcription-model-options.ts | 36 +++++++++++++- .../assemblyai-transcription-model.test.ts | 48 +++++++++++++++++++ .../src/assemblyai-transcription-model.ts | 18 +++++++ 6 files changed, 187 insertions(+), 2 deletions(-) create mode 100644 .changeset/assemblyai-input-params.md diff --git a/.changeset/assemblyai-input-params.md b/.changeset/assemblyai-input-params.md new file mode 100644 index 000000000000..0171192bd468 --- /dev/null +++ b/.changeset/assemblyai-input-params.md @@ -0,0 +1,18 @@ +--- +'@ai-sdk/assemblyai': patch +--- + +feat(assemblyai): add Universal-3-Pro input params and deprecate wordBoost + +Adds provider options for the newer AssemblyAI request parameters: + +- `prompt` — natural-language prompting (Universal-3 Pro / SLAM-1) +- `keytermsPrompt` — domain keyterm boosting (replaces `wordBoost` for newer models) +- `temperature` — sampling temperature (Universal-3 Pro) +- `removeAudioTags` — strip inline annotations (Universal-3 Pro) +- `domain` — domain-specific model, e.g. `'medical-v1'` + +Deprecates `wordBoost` and `boostParam`: AssemblyAI rejects `word_boost` with a +400 on `universal-3-pro` / `universal-3-5-pro` / `slam-1` (it only works on the +legacy `universal-2`/`best` models). Using either option now emits a deprecation +warning pointing to `keytermsPrompt`. 
diff --git a/content/providers/01-ai-sdk-providers/100-assemblyai.mdx b/content/providers/01-ai-sdk-providers/100-assemblyai.mdx index 0911b70b7fbd..251f06f65755 100644 --- a/content/providers/01-ai-sdk-providers/100-assemblyai.mdx +++ b/content/providers/01-ai-sdk-providers/100-assemblyai.mdx @@ -125,8 +125,9 @@ The following provider options are available: - **boostParam** _enum_ - Boost parameter for the transcription. + Boost parameter for `wordBoost`. Allowed values: `'low'`, `'default'`, `'high'`. + **Deprecated** — only applies to the deprecated `wordBoost`; use `keytermsPrompt` instead. Optional. - **contentSafety** _boolean_ @@ -150,6 +151,12 @@ The following provider options are available: Whether to include disfluencies (um, uh, etc.) in the transcription. Optional. +- **domain** _string_ + + Enable a domain-specific model for specialized terminology. + Currently supports `'medical-v1'` (Medical Mode). + Optional. + - **entityDetection** _boolean_ Whether to detect entities in the transcription. @@ -170,6 +177,13 @@ The following provider options are available: Whether to include IAB categories in the transcription. Optional. +- **keytermsPrompt** _array of strings_ + + Domain-specific keyterms to boost recognition for (max 6 words per phrase). + Replaces `wordBoost` for newer models — supported by `universal-3-pro`, + `universal-3-5-pro`, and `slam-1` (and `universal-2` when enabled). + Optional. + - **languageCode** _string_ Language code for the audio. @@ -191,6 +205,12 @@ The following provider options are available: Whether to process multiple audio channels separately. Optional. +- **prompt** _string_ + + Natural-language context (up to 1,500 words) to steer the model. + Only supported by `universal-3-pro`, `universal-3-5-pro`, and `slam-1`. + Optional. + - **punctuate** _boolean_ Whether to add punctuation to the transcription. @@ -224,6 +244,13 @@ The following provider options are available: Allowed values: `'entity_name'`, `'hash'`. Optional. 
+- **removeAudioTags** _enum_ + + Remove inline annotations from rich transcripts. + Allowed values: `'all'` (all annotations), `'speaker'` (speaker cues only). + Universal-3 Pro models. + Optional. + - **sentimentAnalysis** _boolean_ Whether to perform sentiment analysis on the transcription. @@ -261,6 +288,12 @@ The following provider options are available: Allowed values: `'bullets'`, `'bullets_verbose'`, `'gist'`, `'headline'`, `'paragraph'`. Optional. +- **temperature** _number_ + + Sampling temperature (0-1) controlling randomness. + Universal-3 Pro models. + Optional. + - **webhookAuthHeaderName** _string_ Name of the authentication header for webhook requests. @@ -279,6 +312,8 @@ The following provider options are available: - **wordBoost** _array of strings_ List of words to boost in the transcription. + **Deprecated** — rejected by `universal-3-pro`, `universal-3-5-pro`, and `slam-1` + (works only on `universal-2`/`best`); use `keytermsPrompt` instead. Optional. ### Speaker diarization and audio-intelligence results diff --git a/packages/assemblyai/src/assemblyai-api-types.ts b/packages/assemblyai/src/assemblyai-api-types.ts index 85c9cf125ebc..13badf051ece 100644 --- a/packages/assemblyai/src/assemblyai-api-types.ts +++ b/packages/assemblyai/src/assemblyai-api-types.ts @@ -28,6 +28,7 @@ export type AssemblyAITranscriptionAPITypes = { /** * How much to boost specified words + * @deprecated Only used with the deprecated `word_boost`. Use `keyterms_prompt`. */ boost_param?: 'low' | 'default' | 'high'; @@ -370,6 +371,37 @@ export type AssemblyAITranscriptionAPITypes = { /** * The list of custom vocabulary to boost transcription probability for + * @deprecated Rejected by `universal-3-pro` / `universal-3-5-pro` and + * `slam-1` (works only on `universal-2`/`best`). Use `keyterms_prompt`. */ word_boost?: string[]; + + /** + * Domain-specific keyterms to boost (max 6 words per phrase). 
Replaces + * `word_boost` for `universal-3-pro` / `universal-3-5-pro` and `slam-1`. + */ + keyterms_prompt?: string[]; + + /** + * Natural-language context (up to 1,500 words) to steer the model. + * Only supported by `universal-3-pro` / `universal-3-5-pro` and `slam-1`. + */ + prompt?: string; + + /** + * Sampling temperature (0-1) controlling randomness. Universal-3 Pro models. + */ + temperature?: number; + + /** + * Remove inline annotations from rich transcripts: `'all'` removes all + * annotations, `'speaker'` removes only speaker cues. Universal-3 Pro models. + */ + remove_audio_tags?: 'all' | 'speaker'; + + /** + * Enable a domain-specific model to improve accuracy for specialized + * terminology, e.g. `'medical-v1'` for Medical Mode. + */ + domain?: string; }; diff --git a/packages/assemblyai/src/assemblyai-transcription-model-options.ts b/packages/assemblyai/src/assemblyai-transcription-model-options.ts index 7fc753ea86cd..97a137e7fb5c 100644 --- a/packages/assemblyai/src/assemblyai-transcription-model-options.ts +++ b/packages/assemblyai/src/assemblyai-transcription-model-options.ts @@ -19,8 +19,12 @@ export const assemblyaiTranscriptionModelOptionsSchema = z.object({ */ autoHighlights: z.boolean().nullish(), /** - * Boost parameter for the transcription. + * Boost parameter for word boost (used with `wordBoost`). * Allowed values: 'low', 'default', 'high'. + * + * @deprecated Only applies to the deprecated `wordBoost` option. Use + * `keytermsPrompt` instead, which works with the recommended `universal-*` + * models. */ boostParam: z.string().nullish(), /** @@ -46,6 +50,11 @@ export const assemblyaiTranscriptionModelOptionsSchema = z.object({ * Whether to include filler words (um, uh, etc.) in the transcription. */ disfluencies: z.boolean().nullish(), + /** + * Enable a domain-specific model to improve accuracy for specialized + * terminology. Currently supports `'medical-v1'` (Medical Mode). 
+ */ + domain: z.string().nullish(), /** * Whether to enable entity detection. */ @@ -62,6 +71,13 @@ export const assemblyaiTranscriptionModelOptionsSchema = z.object({ * Whether to enable IAB categories detection. */ iabCategories: z.boolean().nullish(), + /** + * Domain-specific keyterms to boost recognition for (max 6 words per phrase). + * Replaces `wordBoost` for newer models: supported by `universal-3-pro` / + * `universal-3-5-pro` and `slam-1` (and `universal-2` when metaphone is + * enabled for the account). + */ + keytermsPrompt: z.array(z.string()).nullish(), /** * Language code for the transcription. */ @@ -78,6 +94,11 @@ export const assemblyaiTranscriptionModelOptionsSchema = z.object({ * Whether to process audio as multichannel. */ multichannel: z.boolean().nullish(), + /** + * Provide natural-language context (up to 1,500 words) to steer the model. + * Only supported by `universal-3-pro` / `universal-3-5-pro` and `slam-1`. + */ + prompt: z.string().nullish(), /** * Whether to add punctuation to the transcription. */ @@ -102,6 +123,11 @@ export const assemblyaiTranscriptionModelOptionsSchema = z.object({ * Substitution method for redacted PII. */ redactPiiSub: z.string().nullish(), + /** + * Remove inline annotations from rich transcripts. `'all'` removes all inline + * annotations; `'speaker'` removes only speaker cues. Universal-3 Pro models. + */ + removeAudioTags: z.string().nullish(), /** * Whether to enable sentiment analysis. */ @@ -130,6 +156,10 @@ export const assemblyaiTranscriptionModelOptionsSchema = z.object({ * Type of summary to generate. */ summaryType: z.string().nullish(), + /** + * Sampling temperature (0-1) controlling randomness. Universal-3 Pro models. + */ + temperature: z.number().min(0).max(1).nullish(), /** * Name of the authentication header for webhook requests. 
*/ @@ -144,6 +174,10 @@ export const assemblyaiTranscriptionModelOptionsSchema = z.object({ webhookUrl: z.string().nullish(), /** * List of words to boost recognition for. + * + * @deprecated `wordBoost` is rejected by `universal-3-pro` / + * `universal-3-5-pro` and `slam-1` (it only works on `universal-2`/`best`). + * Use `keytermsPrompt` instead. */ wordBoost: z.array(z.string()).nullish(), }); diff --git a/packages/assemblyai/src/assemblyai-transcription-model.test.ts b/packages/assemblyai/src/assemblyai-transcription-model.test.ts index 3fe1d64ed4d7..49f7035a6eaf 100644 --- a/packages/assemblyai/src/assemblyai-transcription-model.test.ts +++ b/packages/assemblyai/src/assemblyai-transcription-model.test.ts @@ -378,6 +378,54 @@ describe('doGenerate', () => { expect(body.summary).toBe('- Hello, world!'); }); + it('should pass the Universal-3-Pro input params', async () => { + prepareJsonResponse(); + + await provider.transcription('universal-3-5-pro').doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + providerOptions: { + assemblyai: { + prompt: 'This is a conversation about the AI SDK.', + keytermsPrompt: ['Vercel', 'AI SDK'], + temperature: 0.2, + removeAudioTags: 'speaker', + domain: 'medical-v1', + }, + }, + }); + + const requestBody = await server.calls[1].requestBodyJson; + expect(requestBody).toMatchObject({ + speech_models: ['universal-3-5-pro'], + prompt: 'This is a conversation about the AI SDK.', + keyterms_prompt: ['Vercel', 'AI SDK'], + temperature: 0.2, + remove_audio_tags: 'speaker', + domain: 'medical-v1', + }); + }); + + it('should warn when deprecated wordBoost/boostParam options are used', async () => { + prepareJsonResponse(); + + const result = await provider + .transcription('universal-3-5-pro') + .doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + providerOptions: { + assemblyai: { wordBoost: ['Vercel'], boostParam: 'high' }, + }, + }); + + expect(result.warnings).toContainEqual({ + type: 'deprecated', + setting: 
'wordBoost', + message: expect.stringContaining('keytermsPrompt'), + }); + }); + it('should report segment timings in seconds (ms converted)', async () => { prepareJsonResponse(); diff --git a/packages/assemblyai/src/assemblyai-transcription-model.ts b/packages/assemblyai/src/assemblyai-transcription-model.ts index 7db06c773ce0..e2df63c053e0 100644 --- a/packages/assemblyai/src/assemblyai-transcription-model.ts +++ b/packages/assemblyai/src/assemblyai-transcription-model.ts @@ -137,6 +137,24 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV4 { assemblyaiOptions.webhookAuthHeaderValue ?? undefined; body.webhook_url = assemblyaiOptions.webhookUrl ?? undefined; body.word_boost = assemblyaiOptions.wordBoost ?? undefined; + body.keyterms_prompt = assemblyaiOptions.keytermsPrompt ?? undefined; + body.prompt = assemblyaiOptions.prompt ?? undefined; + body.temperature = assemblyaiOptions.temperature ?? undefined; + body.remove_audio_tags = + (assemblyaiOptions.removeAudioTags as never) ?? undefined; + body.domain = assemblyaiOptions.domain ?? undefined; + + if ( + assemblyaiOptions.wordBoost != null || + assemblyaiOptions.boostParam != null + ) { + warnings.push({ + type: 'deprecated', + setting: 'wordBoost', + message: + "The 'wordBoost' and 'boostParam' options are deprecated and are rejected by 'universal-3-pro' / 'universal-3-5-pro' and 'slam-1'. Use 'keytermsPrompt' instead.", + }); + } } return { From c328a001a64f87740a00c2a494d3450987a72b98 Mon Sep 17 00:00:00 2001 From: David Lange Date: Tue, 30 Jun 2026 02:33:45 -0400 Subject: [PATCH 06/12] feat(assemblyai): add speaker, language-detection, and PII-redaction options Add provider options for AssemblyAI's GA nested request params: speakerOptions, languageDetectionOptions, redactPiiAudioOptions, redactPiiReturnUnredacted, and redactStaticEntities (wired through api-types + getArgs with camelCase->snake_case mapping). 
Shapes verified against the live AssemblyAI docs OpenAPI (the assemblyai-api-spec repo was stale for redact_static_entities). Co-Authored-By: Claude Opus 4.8 (1M context) --- .changeset/assemblyai-nested-params.md | 15 ++++++ .../01-ai-sdk-providers/100-assemblyai.mdx | 32 +++++++++++++ .../assemblyai/src/assemblyai-api-types.ts | 38 +++++++++++++++ .../assemblyai-transcription-model-options.ts | 48 +++++++++++++++++++ .../assemblyai-transcription-model.test.ts | 44 +++++++++++++++++ .../src/assemblyai-transcription-model.ts | 41 ++++++++++++++++ 6 files changed, 218 insertions(+) create mode 100644 .changeset/assemblyai-nested-params.md diff --git a/.changeset/assemblyai-nested-params.md b/.changeset/assemblyai-nested-params.md new file mode 100644 index 000000000000..cf1b642d1981 --- /dev/null +++ b/.changeset/assemblyai-nested-params.md @@ -0,0 +1,15 @@ +--- +'@ai-sdk/assemblyai': patch +--- + +feat(assemblyai): add speaker, language-detection, and PII-redaction options + +Adds provider options for AssemblyAI's nested request parameters: + +- `speakerOptions` — `{ minSpeakersExpected, maxSpeakersExpected }` +- `languageDetectionOptions` — `{ expectedLanguages, fallbackLanguage, codeSwitching, codeSwitchingConfidenceThreshold }` +- `redactPiiAudioOptions` — `{ returnRedactedNoSpeechAudio, overrideAudioRedactionMethod }` +- `redactPiiReturnUnredacted` — return the unredacted transcript alongside the redacted one +- `redactStaticEntities` — map of labels to exact terms to redact (e.g. `{ INTERNAL_TOOL: ['Bearclaw'] }`) + +The `redactPii*` options require `redactPii` to be enabled. 
diff --git a/content/providers/01-ai-sdk-providers/100-assemblyai.mdx b/content/providers/01-ai-sdk-providers/100-assemblyai.mdx index 251f06f65755..e2fd195df0a1 100644 --- a/content/providers/01-ai-sdk-providers/100-assemblyai.mdx +++ b/content/providers/01-ai-sdk-providers/100-assemblyai.mdx @@ -200,6 +200,13 @@ The following provider options are available: Whether to enable language detection. Optional. +- **languageDetectionOptions** _object_ + + Options for automatic language detection: `expectedLanguages` (array of + strings), `fallbackLanguage` (string), `codeSwitching` (boolean), + `codeSwitchingConfidenceThreshold` (number, 0-1). + Optional. + - **multichannel** _boolean_ Whether to process multiple audio channels separately. @@ -226,6 +233,12 @@ The following provider options are available: Whether to redact PII in the audio file. Optional. +- **redactPiiAudioOptions** _object_ + + Options for PII-redacted audio: `returnRedactedNoSpeechAudio` (boolean), + `overrideAudioRedactionMethod` (`'silence'`). Requires `redactPii`. + Optional. + - **redactPiiAudioQuality** _enum_ Quality of the redacted audio file. @@ -238,12 +251,25 @@ The following provider options are available: Supports numerous types like `'person_name'`, `'phone_number'`, etc. Optional. +- **redactPiiReturnUnredacted** _boolean_ + + Return the original unredacted transcript alongside the redacted one. + Requires `redactPii`. + Optional. + - **redactPiiSub** _enum_ Substitution method for redacted PII. Allowed values: `'entity_name'`, `'hash'`. Optional. +- **redactStaticEntities** _object_ + + Map of user-defined labels to exact terms to redact, e.g. + `{ INTERNAL_TOOL: ['Bearclaw'] }`. Applied on top of standard PII redaction. + Requires `redactPii`. + Optional. + - **removeAudioTags** _enum_ Remove inline annotations from rich transcripts. @@ -261,6 +287,12 @@ The following provider options are available: Whether to label different speakers in the transcription. Optional. 
+- **speakerOptions** _object_ + + Options for speaker diarization: `minSpeakersExpected` (number), + `maxSpeakersExpected` (number). + Optional. + - **speakersExpected** _number_ Expected number of speakers in the audio. diff --git a/packages/assemblyai/src/assemblyai-api-types.ts b/packages/assemblyai/src/assemblyai-api-types.ts index 13badf051ece..17d690374cc8 100644 --- a/packages/assemblyai/src/assemblyai-api-types.ts +++ b/packages/assemblyai/src/assemblyai-api-types.ts @@ -208,6 +208,16 @@ export type AssemblyAITranscriptionAPITypes = { */ language_detection?: boolean; + /** + * Options for automatic language detection. + */ + language_detection_options?: { + expected_languages?: string[]; + fallback_language?: string; + code_switching?: boolean; + code_switching_confidence_threshold?: number; + }; + /** * Enable Multichannel transcription, can be true or false. * @default false @@ -232,6 +242,14 @@ export type AssemblyAITranscriptionAPITypes = { */ redact_pii_audio?: boolean; + /** + * Options for PII-redacted audio files. Requires redact_pii. + */ + redact_pii_audio_options?: { + return_redacted_no_speech_audio?: boolean; + override_audio_redaction_method?: 'silence'; + }; + /** * Controls the filetype of the audio created by redact_pii_audio. Currently supports mp3 (default) and wav. */ @@ -287,11 +305,23 @@ export type AssemblyAITranscriptionAPITypes = { | 'zodiac_sign' >; + /** + * Return the original unredacted transcript alongside the redacted one. + * Requires redact_pii. + */ + redact_pii_return_unredacted?: boolean; + /** * The replacement logic for detected PII, can be "entity_name" or "hash". */ redact_pii_sub?: 'entity_name' | 'hash'; + /** + * Map of user-defined labels to exact terms to redact, applied on top of + * standard PII redaction. Requires redact_pii. 
+ */ + redact_static_entities?: Record<string, string[]>; + /** * Enable Sentiment Analysis, can be true or false * @default false @@ -304,6 +334,14 @@ export type AssemblyAITranscriptionAPITypes = { */ speaker_labels?: boolean; + /** + * Options for speaker diarization, e.g. a range of possible speakers. + */ + speaker_options?: { + min_speakers_expected?: number; + max_speakers_expected?: number; + }; + /** * Tells the speaker label model how many speakers it should attempt to identify, up to 10. */ diff --git a/packages/assemblyai/src/assemblyai-transcription-model-options.ts b/packages/assemblyai/src/assemblyai-transcription-model-options.ts index 97a137e7fb5c..ef3df22b71aa 100644 --- a/packages/assemblyai/src/assemblyai-transcription-model-options.ts +++ b/packages/assemblyai/src/assemblyai-transcription-model-options.ts @@ -90,6 +90,21 @@ export const assemblyaiTranscriptionModelOptionsSchema = z.object({ * Whether to enable language detection. */ languageDetection: z.boolean().nullish(), + /** + * Options for automatic language detection. + */ + languageDetectionOptions: z + .object({ + /** List of languages expected in the audio file. */ + expectedLanguages: z.array(z.string()).nullish(), + /** Fallback language if the detected language is not expected. */ + fallbackLanguage: z.string().nullish(), + /** Whether code switching should be detected. */ + codeSwitching: z.boolean().nullish(), + /** Confidence threshold for code switching detection (0-1). */ + codeSwitchingConfidenceThreshold: z.number().min(0).max(1).nullish(), + }) + .nullish(), /** * Whether to process audio as multichannel. */ @@ -111,6 +126,17 @@ export const assemblyaiTranscriptionModelOptionsSchema = z.object({ * Whether to redact PII in the audio file. */ redactPiiAudio: z.boolean().nullish(), + /** + * Options for PII-redacted audio files. Requires `redactPii`. + */ + redactPiiAudioOptions: z + .object({ + /** Return redacted audio even for files without detected speech.
*/ + returnRedactedNoSpeechAudio: z.boolean().nullish(), + /** Redaction method; set to `'silence'` to replace PII with silence. */ + overrideAudioRedactionMethod: z.string().nullish(), + }) + .nullish(), /** * Audio format for PII redaction. */ @@ -119,10 +145,21 @@ export const assemblyaiTranscriptionModelOptionsSchema = z.object({ * List of PII types to redact. */ redactPiiPolicies: z.array(z.string()).nullish(), + /** + * Return the original unredacted transcript alongside the redacted one. + * Requires `redactPii`. + */ + redactPiiReturnUnredacted: z.boolean().nullish(), /** * Substitution method for redacted PII. */ redactPiiSub: z.string().nullish(), + /** + * Map of user-defined labels to exact terms to redact, e.g. + * `{ INTERNAL_TOOL: ['Bearclaw'] }`. Applied on top of standard PII redaction + * using `redactPiiSub`. Requires `redactPii`. + */ + redactStaticEntities: z.record(z.string(), z.array(z.string())).nullish(), /** * Remove inline annotations from rich transcripts. `'all'` removes all inline * annotations; `'speaker'` removes only speaker cues. Universal-3 Pro models. @@ -136,6 +173,17 @@ export const assemblyaiTranscriptionModelOptionsSchema = z.object({ * Whether to identify different speakers in the audio. */ speakerLabels: z.boolean().nullish(), + /** + * Options for speaker diarization, e.g. a range of possible speakers. + */ + speakerOptions: z + .object({ + /** Minimum number of speakers expected in the audio file. */ + minSpeakersExpected: z.number().int().nullish(), + /** Maximum number of speakers expected in the audio file. */ + maxSpeakersExpected: z.number().int().nullish(), + }) + .nullish(), /** * Number of speakers expected in the audio. 
*/ diff --git a/packages/assemblyai/src/assemblyai-transcription-model.test.ts b/packages/assemblyai/src/assemblyai-transcription-model.test.ts index 49f7035a6eaf..e263ddff0f46 100644 --- a/packages/assemblyai/src/assemblyai-transcription-model.test.ts +++ b/packages/assemblyai/src/assemblyai-transcription-model.test.ts @@ -406,6 +406,50 @@ describe('doGenerate', () => { }); }); + it('should pass the GA nested input params', async () => { + prepareJsonResponse(); + + await provider.transcription('universal-3-5-pro').doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + providerOptions: { + assemblyai: { + redactPii: true, + speakerOptions: { minSpeakersExpected: 1, maxSpeakersExpected: 3 }, + languageDetectionOptions: { + expectedLanguages: ['en', 'es'], + fallbackLanguage: 'en', + codeSwitching: true, + codeSwitchingConfidenceThreshold: 0.5, + }, + redactPiiAudioOptions: { + returnRedactedNoSpeechAudio: true, + overrideAudioRedactionMethod: 'silence', + }, + redactPiiReturnUnredacted: true, + redactStaticEntities: { INTERNAL_TOOL: ['Bearclaw'] }, + }, + }, + }); + + const requestBody = await server.calls[1].requestBodyJson; + expect(requestBody).toMatchObject({ + speaker_options: { min_speakers_expected: 1, max_speakers_expected: 3 }, + language_detection_options: { + expected_languages: ['en', 'es'], + fallback_language: 'en', + code_switching: true, + code_switching_confidence_threshold: 0.5, + }, + redact_pii_audio_options: { + return_redacted_no_speech_audio: true, + override_audio_redaction_method: 'silence', + }, + redact_pii_return_unredacted: true, + redact_static_entities: { INTERNAL_TOOL: ['Bearclaw'] }, + }); + }); + it('should warn when deprecated wordBoost/boostParam options are used', async () => { prepareJsonResponse(); diff --git a/packages/assemblyai/src/assemblyai-transcription-model.ts b/packages/assemblyai/src/assemblyai-transcription-model.ts index e2df63c053e0..27b9d1415980 100644 --- 
a/packages/assemblyai/src/assemblyai-transcription-model.ts +++ b/packages/assemblyai/src/assemblyai-transcription-model.ts @@ -143,6 +143,47 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV4 { body.remove_audio_tags = (assemblyaiOptions.removeAudioTags as never) ?? undefined; body.domain = assemblyaiOptions.domain ?? undefined; + body.redact_pii_return_unredacted = + assemblyaiOptions.redactPiiReturnUnredacted ?? undefined; + body.redact_static_entities = + assemblyaiOptions.redactStaticEntities ?? undefined; + + if (assemblyaiOptions.speakerOptions) { + body.speaker_options = { + min_speakers_expected: + assemblyaiOptions.speakerOptions.minSpeakersExpected ?? undefined, + max_speakers_expected: + assemblyaiOptions.speakerOptions.maxSpeakersExpected ?? undefined, + }; + } + + if (assemblyaiOptions.languageDetectionOptions) { + body.language_detection_options = { + expected_languages: + assemblyaiOptions.languageDetectionOptions.expectedLanguages ?? + undefined, + fallback_language: + assemblyaiOptions.languageDetectionOptions.fallbackLanguage ?? + undefined, + code_switching: + assemblyaiOptions.languageDetectionOptions.codeSwitching ?? + undefined, + code_switching_confidence_threshold: + assemblyaiOptions.languageDetectionOptions + .codeSwitchingConfidenceThreshold ?? undefined, + }; + } + + if (assemblyaiOptions.redactPiiAudioOptions) { + body.redact_pii_audio_options = { + return_redacted_no_speech_audio: + assemblyaiOptions.redactPiiAudioOptions + .returnRedactedNoSpeechAudio ?? undefined, + override_audio_redaction_method: + (assemblyaiOptions.redactPiiAudioOptions + .overrideAudioRedactionMethod as never) ?? 
undefined, + }; + } if ( assemblyaiOptions.wordBoost != null || From 743584b607c3a3c47f2072f15e4cfa8923d7f0d8 Mon Sep 17 00:00:00 2001 From: David Lange Date: Tue, 30 Jun 2026 11:47:25 -0400 Subject: [PATCH 07/12] test(assemblyai): cover nano removal and universal-3-pro routing Add a regression test asserting nano is no longer special-cased (routes via speech_models, no deprecation warning) and that universal-3-pro routes via speech_models. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../assemblyai-transcription-model.test.ts | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/packages/assemblyai/src/assemblyai-transcription-model.test.ts b/packages/assemblyai/src/assemblyai-transcription-model.test.ts index e263ddff0f46..a9ab505fb8be 100644 --- a/packages/assemblyai/src/assemblyai-transcription-model.test.ts +++ b/packages/assemblyai/src/assemblyai-transcription-model.test.ts @@ -306,6 +306,37 @@ describe('doGenerate', () => { ).toEqual([]); }); + it('should route universal-3-pro via the speech_models parameter', async () => { + prepareJsonResponse(); + + await provider.transcription('universal-3-pro').doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + }); + + const requestBody = await server.calls[1].requestBodyJson; + expect(requestBody.speech_models).toEqual(['universal-3-pro']); + expect(requestBody.speech_model).toBeUndefined(); + }); + + it('should not special-case the removed nano model', async () => { + prepareJsonResponse(); + + const result = await provider.transcription('nano').doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + }); + + // `nano` is no longer a legacy `speech_model` alias: it falls through to + // `speech_models` (where the live API rejects it) and emits no warning. 
+ const requestBody = await server.calls[1].requestBodyJson; + expect(requestBody.speech_models).toEqual(['nano']); + expect(requestBody.speech_model).toBeUndefined(); + expect( + result.warnings.filter(warning => warning.type === 'deprecated'), + ).toEqual([]); + }); + it('should still send provider options alongside speech_models', async () => { prepareJsonResponse(); From 69a22ccdf7497296cf023672b74c118dcf5ab506 Mon Sep 17 00:00:00 2001 From: David Lange Date: Tue, 30 Jun 2026 13:22:41 -0400 Subject: [PATCH 08/12] feat(assemblyai): nudge universal-3-pro/universal-2 users to universal-3-5-pro Emit an informational warning (type: 'other', not a deprecation) when universal-3-pro or universal-2 is used, noting that universal-3-5-pro is the latest flagship and is set to replace universal-3-pro. Both models remain fully supported; universal-3-5-pro emits no warning. Co-Authored-By: Claude Opus 4.8 (1M context) --- .changeset/assemblyai-latest-model-nudge.md | 10 +++++++ .../01-ai-sdk-providers/100-assemblyai.mdx | 4 +++ .../assemblyai-transcription-model.test.ts | 28 +++++++++++++++---- .../src/assemblyai-transcription-model.ts | 14 ++++++++++ 4 files changed, 51 insertions(+), 5 deletions(-) create mode 100644 .changeset/assemblyai-latest-model-nudge.md diff --git a/.changeset/assemblyai-latest-model-nudge.md b/.changeset/assemblyai-latest-model-nudge.md new file mode 100644 index 000000000000..283afc1b0802 --- /dev/null +++ b/.changeset/assemblyai-latest-model-nudge.md @@ -0,0 +1,10 @@ +--- +'@ai-sdk/assemblyai': patch +--- + +feat(assemblyai): nudge universal-3-pro/universal-2 users toward universal-3-5-pro + +Using `universal-3-pro` or `universal-2` now emits an informational warning +(`type: 'other'`, not a deprecation) noting that `universal-3-5-pro` is +AssemblyAI's latest flagship model and is set to replace `universal-3-pro`. Both +models remain fully supported. 
diff --git a/content/providers/01-ai-sdk-providers/100-assemblyai.mdx b/content/providers/01-ai-sdk-providers/100-assemblyai.mdx index e2fd195df0a1..a864c8f875d7 100644 --- a/content/providers/01-ai-sdk-providers/100-assemblyai.mdx +++ b/content/providers/01-ai-sdk-providers/100-assemblyai.mdx @@ -82,6 +82,10 @@ removed by AssemblyAI and is no longer available. All newer models are sent usin the [`speech_models`](https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model) request parameter, which the provider selects automatically based on the model id. +`universal-3-pro` and `universal-2` are fully supported and continue to work, but +using them emits an informational warning (not a deprecation) suggesting +`universal-3-5-pro`, AssemblyAI's latest flagship model. + You can also pass additional provider-specific options using the `providerOptions` argument. For example, supplying the `contentSafety` option will enable content safety filtering. ```ts highlight="7" diff --git a/packages/assemblyai/src/assemblyai-transcription-model.test.ts b/packages/assemblyai/src/assemblyai-transcription-model.test.ts index a9ab505fb8be..4ca9b511f102 100644 --- a/packages/assemblyai/src/assemblyai-transcription-model.test.ts +++ b/packages/assemblyai/src/assemblyai-transcription-model.test.ts @@ -301,15 +301,14 @@ describe('doGenerate', () => { }); expect(requestBody.speech_model).toBeUndefined(); - expect( - result.warnings.filter(warning => warning.type === 'deprecated'), - ).toEqual([]); + // No deprecation and no nudge for the latest flagship model. 
+ expect(result.warnings).toEqual([]); }); - it('should route universal-3-pro via the speech_models parameter', async () => { + it('should route universal-3-pro via speech_models and nudge to universal-3-5-pro', async () => { prepareJsonResponse(); - await provider.transcription('universal-3-pro').doGenerate({ + const result = await provider.transcription('universal-3-pro').doGenerate({ audio: audioData, mediaType: 'audio/wav', }); @@ -317,6 +316,25 @@ describe('doGenerate', () => { const requestBody = await server.calls[1].requestBodyJson; expect(requestBody.speech_models).toEqual(['universal-3-pro']); expect(requestBody.speech_model).toBeUndefined(); + + expect(result.warnings).toContainEqual({ + type: 'other', + message: expect.stringContaining('universal-3-5-pro'), + }); + }); + + it('should nudge universal-2 users toward universal-3-5-pro', async () => { + prepareJsonResponse(); + + const result = await provider.transcription('universal-2').doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + }); + + expect(result.warnings).toContainEqual({ + type: 'other', + message: expect.stringContaining('universal-3-5-pro'), + }); }); it('should not special-case the removed nano model', async () => { diff --git a/packages/assemblyai/src/assemblyai-transcription-model.ts b/packages/assemblyai/src/assemblyai-transcription-model.ts index 27b9d1415980..8a1bb3ec91b3 100644 --- a/packages/assemblyai/src/assemblyai-transcription-model.ts +++ b/packages/assemblyai/src/assemblyai-transcription-model.ts @@ -87,6 +87,20 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV4 { }); } else { body.speech_models = [this.modelId]; + + // Forward-looking nudge: universal-3-5-pro is AssemblyAI's latest + // flagship and is set to replace universal-3-pro. Not a deprecation — + // both models still work — so this is an informational warning only. 
+ if ( + this.modelId === 'universal-3-pro' || + this.modelId === 'universal-2' + ) { + warnings.push({ + type: 'other', + message: + "'universal-3-5-pro' is AssemblyAI's latest flagship model and is set to replace 'universal-3-pro'. See https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model", + }); + } } // Add provider-specific options From be6ee349f96e1e881ff3e79a411f47285eaad63b Mon Sep 17 00:00:00 2001 From: David Lange Date: Wed, 1 Jul 2026 02:06:04 -0400 Subject: [PATCH 09/12] fix(assemblyai): address code-review findings - warn on options missing prerequisites (redactPii*, languageCode+languageDetection) - fix universal-2 nudge message and wordBoost/boostParam warning attribution - type removeAudioTags / overrideAudioRedactionMethod as enums (drop `as never`) - honor config.fetch for polling GETs - source providerMetadata from the raw response (no field stripping); document its timings are in ms while segments are in seconds - fix redactPiiAudioOptions docs (requires redactPiiAudio); restore the Model Capabilities table rows Co-Authored-By: Claude Opus 4.8 (1M context) --- .changeset/assemblyai-review-fixes.md | 22 ++++ .../01-ai-sdk-providers/100-assemblyai.mdx | 11 +- .../assemblyai-transcription-model-options.ts | 6 +- .../assemblyai-transcription-model.test.ts | 114 ++++++++++++++++-- .../src/assemblyai-transcription-model.ts | 91 +++++++++++--- 5 files changed, 210 insertions(+), 34 deletions(-) create mode 100644 .changeset/assemblyai-review-fixes.md diff --git a/.changeset/assemblyai-review-fixes.md b/.changeset/assemblyai-review-fixes.md new file mode 100644 index 000000000000..fb58f3c11a22 --- /dev/null +++ b/.changeset/assemblyai-review-fixes.md @@ -0,0 +1,22 @@ +--- +'@ai-sdk/assemblyai': patch +--- + +fix(assemblyai): address code-review findings for the transcription provider + +- Warn when options are set without their required prerequisite + (`redactPiiReturnUnredacted`/`redactStaticEntities` without `redactPii`; + 
`redactPiiAudioOptions` without `redactPiiAudio`; `languageCode` together with + `languageDetection`), since AssemblyAI otherwise 400s or silently ignores them. +- Fix the `universal-2` nudge message (it previously claimed `universal-2` was + being replaced by `universal-3-pro`). +- Attribute the `wordBoost`/`boostParam` deprecation warning to whichever option + was actually set. +- Type `removeAudioTags` and `overrideAudioRedactionMethod` as enums (dropping + the `as never` casts) so invalid values fail validation client-side. +- Honor a caller-provided `fetch` for the polling requests (not just upload/submit). +- Populate `providerMetadata.assemblyai` from the raw response so nested fields + aren't stripped; document that its timings are in milliseconds while + `segments` are in seconds. +- Correct the `redactPiiAudioOptions` docs (requires `redactPiiAudio`) and + restore the Model Capabilities table rows for the supported models. diff --git a/content/providers/01-ai-sdk-providers/100-assemblyai.mdx b/content/providers/01-ai-sdk-providers/100-assemblyai.mdx index a864c8f875d7..b898a24c366a 100644 --- a/content/providers/01-ai-sdk-providers/100-assemblyai.mdx +++ b/content/providers/01-ai-sdk-providers/100-assemblyai.mdx @@ -240,7 +240,7 @@ The following provider options are available: - **redactPiiAudioOptions** _object_ Options for PII-redacted audio: `returnRedactedNoSpeechAudio` (boolean), - `overrideAudioRedactionMethod` (`'silence'`). Requires `redactPii`. + `overrideAudioRedactionMethod` (`'silence'`). Requires `redactPiiAudio`. Optional. - **redactPiiAudioQuality** _enum_ @@ -367,6 +367,10 @@ places: field not surfaced above (e.g. `chapters`, word-level `speaker` labels) is still available. +Note: timings inside `providerMetadata` and `response.body` (e.g. +`utterances[].start`) are in **milliseconds**, matching the AssemblyAI API — +whereas the top-level `segments` use **seconds**. 
+ ```ts import { transcribe } from 'ai'; import { assemblyai } from '@ai-sdk/assemblyai'; @@ -384,7 +388,7 @@ const result = await transcribe({ }); const { utterances, entities } = result.providerMetadata?.assemblyai ?? {}; -// utterances: [{ speaker: 'A', text: '…', start, end, … }, …] +// utterances: [{ speaker: 'A', text: '…', start, end, … }, …] (start/end in ms) ``` The following AssemblyAI features are **deprecated** by the API and not surfaced @@ -398,4 +402,7 @@ AssemblyAI's documentation for per-language availability. | Model | Transcription | Duration | Segments | Language | | ------------------- | ------------------- | ------------------- | ------------------- | ------------------- | | `universal-3-5-pro` | | | | | +| `universal-3-pro` | | | | | +| `universal-2` | | | | | +| `best` | | | | | diff --git a/packages/assemblyai/src/assemblyai-transcription-model-options.ts b/packages/assemblyai/src/assemblyai-transcription-model-options.ts index ef3df22b71aa..ef04e94f9e3a 100644 --- a/packages/assemblyai/src/assemblyai-transcription-model-options.ts +++ b/packages/assemblyai/src/assemblyai-transcription-model-options.ts @@ -127,14 +127,14 @@ export const assemblyaiTranscriptionModelOptionsSchema = z.object({ */ redactPiiAudio: z.boolean().nullish(), /** - * Options for PII-redacted audio files. Requires `redactPii`. + * Options for PII-redacted audio files. Requires `redactPiiAudio`. */ redactPiiAudioOptions: z .object({ /** Return redacted audio even for files without detected speech. */ returnRedactedNoSpeechAudio: z.boolean().nullish(), /** Redaction method; set to `'silence'` to replace PII with silence. */ - overrideAudioRedactionMethod: z.string().nullish(), + overrideAudioRedactionMethod: z.enum(['silence']).nullish(), }) .nullish(), /** @@ -164,7 +164,7 @@ export const assemblyaiTranscriptionModelOptionsSchema = z.object({ * Remove inline annotations from rich transcripts. 
`'all'` removes all inline * annotations; `'speaker'` removes only speaker cues. Universal-3 Pro models. */ - removeAudioTags: z.string().nullish(), + removeAudioTags: z.enum(['all', 'speaker']).nullish(), /** * Whether to enable sentiment analysis. */ diff --git a/packages/assemblyai/src/assemblyai-transcription-model.test.ts b/packages/assemblyai/src/assemblyai-transcription-model.test.ts index 4ca9b511f102..ddc9cabb390d 100644 --- a/packages/assemblyai/src/assemblyai-transcription-model.test.ts +++ b/packages/assemblyai/src/assemblyai-transcription-model.test.ts @@ -233,6 +233,16 @@ describe('doGenerate', () => { summary_model: 'informative', summary: '- Hello, world!', sentiment_analysis: true, + sentiment_analysis_results: [ + { + text: 'Hello, world!', + start: 250, + end: 26950, + sentiment: 'POSITIVE', + confidence: 0.9, + speaker: 'A', + }, + ], entity_detection: true, entities: [ { @@ -317,10 +327,11 @@ describe('doGenerate', () => { expect(requestBody.speech_models).toEqual(['universal-3-pro']); expect(requestBody.speech_model).toBeUndefined(); - expect(result.warnings).toContainEqual({ - type: 'other', - message: expect.stringContaining('universal-3-5-pro'), - }); + // universal-3-pro is the model universal-3-5-pro replaces, so the message + // names it explicitly. + const [nudge] = result.warnings.filter(w => w.type === 'other'); + expect(nudge?.message).toContain('universal-3-5-pro'); + expect(nudge?.message).toContain("replace 'universal-3-pro'"); }); it('should nudge universal-2 users toward universal-3-5-pro', async () => { @@ -331,10 +342,10 @@ describe('doGenerate', () => { mediaType: 'audio/wav', }); - expect(result.warnings).toContainEqual({ - type: 'other', - message: expect.stringContaining('universal-3-5-pro'), - }); + // The nudge for universal-2 must not claim it is replaced by universal-3-pro. 
+ const [nudge] = result.warnings.filter(w => w.type === 'other'); + expect(nudge?.message).toContain('universal-3-5-pro'); + expect(nudge?.message).not.toContain("replace 'universal-3-pro'"); }); it('should not special-case the removed nano model', async () => { @@ -403,6 +414,10 @@ describe('doGenerate', () => { entity_type: 'location', text: 'Canada', }); + expect(metadata?.sentimentAnalysisResults?.[0]).toMatchObject({ + sentiment: 'POSITIVE', + text: 'Hello, world!', + }); expect(metadata?.contentSafetyLabels).toBeDefined(); expect(metadata?.iabCategoriesResult).toBeDefined(); expect(metadata?.autoHighlightsResult).toBeDefined(); @@ -514,11 +529,92 @@ describe('doGenerate', () => { expect(result.warnings).toContainEqual({ type: 'deprecated', - setting: 'wordBoost', + setting: 'wordBoost, boostParam', message: expect.stringContaining('keytermsPrompt'), }); }); + it('should attribute the deprecation warning to boostParam when only boostParam is set', async () => { + prepareJsonResponse(); + + const result = await provider + .transcription('universal-3-5-pro') + .doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + providerOptions: { assemblyai: { boostParam: 'high' } }, + }); + + expect(result.warnings).toContainEqual({ + type: 'deprecated', + setting: 'boostParam', + message: expect.stringContaining('keytermsPrompt'), + }); + }); + + it('should warn when redactPii-dependent options are set without redactPii', async () => { + prepareJsonResponse(); + + const result = await provider + .transcription('universal-3-5-pro') + .doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + providerOptions: { + assemblyai: { redactStaticEntities: { TOOL: ['Vercel'] } }, + }, + }); + + expect( + result.warnings.some( + w => w.type === 'other' && w.message.includes('redactPii'), + ), + ).toBe(true); + }); + + it('should warn when redactPiiAudioOptions is set without redactPiiAudio', async () => { + prepareJsonResponse(); + + const result = await provider + 
.transcription('universal-3-5-pro') + .doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + providerOptions: { + assemblyai: { + redactPii: true, + redactPiiAudioOptions: { overrideAudioRedactionMethod: 'silence' }, + }, + }, + }); + + expect( + result.warnings.some( + w => w.type === 'other' && w.message.includes('redactPiiAudio'), + ), + ).toBe(true); + }); + + it('should warn when languageCode and languageDetection are combined', async () => { + prepareJsonResponse(); + + const result = await provider + .transcription('universal-3-5-pro') + .doGenerate({ + audio: audioData, + mediaType: 'audio/wav', + providerOptions: { + assemblyai: { languageCode: 'en', languageDetection: true }, + }, + }); + + expect( + result.warnings.some( + w => w.type === 'other' && w.message.includes('languageDetection'), + ), + ).toBe(true); + }); + it('should report segment timings in seconds (ms converted)', async () => { prepareJsonResponse(); diff --git a/packages/assemblyai/src/assemblyai-transcription-model.ts b/packages/assemblyai/src/assemblyai-transcription-model.ts index 8a1bb3ec91b3..4059430e5058 100644 --- a/packages/assemblyai/src/assemblyai-transcription-model.ts +++ b/packages/assemblyai/src/assemblyai-transcription-model.ts @@ -95,10 +95,14 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV4 { this.modelId === 'universal-3-pro' || this.modelId === 'universal-2' ) { + const docsUrl = + 'https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model'; warnings.push({ type: 'other', message: - "'universal-3-5-pro' is AssemblyAI's latest flagship model and is set to replace 'universal-3-pro'. See https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model", + this.modelId === 'universal-3-pro' + ? `'universal-3-5-pro' is AssemblyAI's latest flagship model and is set to replace 'universal-3-pro'. See ${docsUrl}` + : `'universal-3-5-pro' is AssemblyAI's latest flagship model. 
See ${docsUrl}`, }); } } @@ -154,8 +158,7 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV4 { body.keyterms_prompt = assemblyaiOptions.keytermsPrompt ?? undefined; body.prompt = assemblyaiOptions.prompt ?? undefined; body.temperature = assemblyaiOptions.temperature ?? undefined; - body.remove_audio_tags = - (assemblyaiOptions.removeAudioTags as never) ?? undefined; + body.remove_audio_tags = assemblyaiOptions.removeAudioTags ?? undefined; body.domain = assemblyaiOptions.domain ?? undefined; body.redact_pii_return_unredacted = assemblyaiOptions.redactPiiReturnUnredacted ?? undefined; @@ -194,20 +197,59 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV4 { assemblyaiOptions.redactPiiAudioOptions .returnRedactedNoSpeechAudio ?? undefined, override_audio_redaction_method: - (assemblyaiOptions.redactPiiAudioOptions - .overrideAudioRedactionMethod as never) ?? undefined, + assemblyaiOptions.redactPiiAudioOptions + .overrideAudioRedactionMethod ?? undefined, }; } + const deprecatedBoostOptions: string[] = []; + if (assemblyaiOptions.wordBoost != null) { + deprecatedBoostOptions.push('wordBoost'); + } + if (assemblyaiOptions.boostParam != null) { + deprecatedBoostOptions.push('boostParam'); + } + if (deprecatedBoostOptions.length > 0) { + warnings.push({ + type: 'deprecated', + setting: deprecatedBoostOptions.join(', '), + message: + "'wordBoost' and 'boostParam' are deprecated and are rejected by 'universal-3-pro' / 'universal-3-5-pro' and 'slam-1'. Use 'keytermsPrompt' instead.", + }); + } + + // The following options only take effect alongside a prerequisite + // option; without it AssemblyAI either rejects the request (400) or + // silently ignores the option. Warn rather than mutate user input. 
if ( - assemblyaiOptions.wordBoost != null || - assemblyaiOptions.boostParam != null + (assemblyaiOptions.redactPiiReturnUnredacted != null || + assemblyaiOptions.redactStaticEntities != null) && + !assemblyaiOptions.redactPii ) { warnings.push({ - type: 'deprecated', - setting: 'wordBoost', + type: 'other', + message: + "'redactPiiReturnUnredacted' and 'redactStaticEntities' require 'redactPii' to be enabled; AssemblyAI rejects the request otherwise.", + }); + } + if ( + assemblyaiOptions.redactPiiAudioOptions != null && + !assemblyaiOptions.redactPiiAudio + ) { + warnings.push({ + type: 'other', message: - "The 'wordBoost' and 'boostParam' options are deprecated and are rejected by 'universal-3-pro' / 'universal-3-5-pro' and 'slam-1'. Use 'keytermsPrompt' instead.", + "'redactPiiAudioOptions' only applies when 'redactPiiAudio' is enabled; it is otherwise ignored.", + }); + } + if ( + assemblyaiOptions.languageCode != null && + assemblyaiOptions.languageDetection + ) { + warnings.push({ + type: 'other', + message: + "'languageDetection' cannot be combined with an explicit 'languageCode'; AssemblyAI rejects requests that set both.", }); } } @@ -235,12 +277,16 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV4 { const pollingInterval = this.config.pollingInterval ?? this.POLLING_INTERVAL_MS; + // Honor a caller-provided fetch (proxy, auth injection, tests) for the + // polling GETs, matching the upload/submit calls that use config.fetch. + const fetchImpl = this.config.fetch ?? 
globalThis.fetch; + while (true) { if (abortSignal?.aborted) { throw new Error('Transcription request was aborted'); } - const response = await fetch( + const response = await fetchImpl( this.config.url({ path: `/v2/transcript/${transcriptId}`, modelId: this.modelId, @@ -342,28 +388,33 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV4 { ); // Surface diarization and audio-intelligence results that the AI SDK's - // `segments` shape can't represent. Only included when the corresponding - // feature was enabled (and thus present in the response). + // `segments` shape can't represent, keyed under `assemblyai`. Presence is + // gated on the parsed transcript, but values are taken from the raw + // response so no fields are stripped by the schema. + // + // NOTE: timings inside these objects (e.g. `utterances[].start`) are in + // milliseconds, matching the AssemblyAI API — unlike the top-level + // `segments`, whose `startSecond`/`endSecond` are in seconds. + const raw = (rawTranscript ?? 
{}) as Record; const assemblyaiMetadata: Record = {}; if (transcript.utterances != null) { - assemblyaiMetadata.utterances = transcript.utterances; + assemblyaiMetadata.utterances = raw.utterances; } if (transcript.sentiment_analysis_results != null) { assemblyaiMetadata.sentimentAnalysisResults = - transcript.sentiment_analysis_results; + raw.sentiment_analysis_results; } if (transcript.entities != null) { - assemblyaiMetadata.entities = transcript.entities; + assemblyaiMetadata.entities = raw.entities; } if (transcript.content_safety_labels != null) { - assemblyaiMetadata.contentSafetyLabels = transcript.content_safety_labels; + assemblyaiMetadata.contentSafetyLabels = raw.content_safety_labels; } if (transcript.iab_categories_result != null) { - assemblyaiMetadata.iabCategoriesResult = transcript.iab_categories_result; + assemblyaiMetadata.iabCategoriesResult = raw.iab_categories_result; } if (transcript.auto_highlights_result != null) { - assemblyaiMetadata.autoHighlightsResult = - transcript.auto_highlights_result; + assemblyaiMetadata.autoHighlightsResult = raw.auto_highlights_result; } const lastWordEndMs = transcript.words?.at(-1)?.end; From 782584d90a8f2e5f8e7ec40ebb5564b0e4a2f14f Mon Sep 17 00:00:00 2001 From: David Lange Date: Wed, 1 Jul 2026 14:00:19 -0400 Subject: [PATCH 10/12] docs(assemblyai): describe the provider as transcription, not language, models The provider only exposes transcription models (it throws on languageModel), so the intro line is corrected to match the README and actual behavior. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- content/providers/01-ai-sdk-providers/100-assemblyai.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/providers/01-ai-sdk-providers/100-assemblyai.mdx b/content/providers/01-ai-sdk-providers/100-assemblyai.mdx index b898a24c366a..256ac97c3ffa 100644 --- a/content/providers/01-ai-sdk-providers/100-assemblyai.mdx +++ b/content/providers/01-ai-sdk-providers/100-assemblyai.mdx @@ -5,7 +5,7 @@ description: Learn how to use the AssemblyAI provider for the AI SDK. # AssemblyAI Provider -The [AssemblyAI](https://assemblyai.com/) provider contains language model support for the AssemblyAI transcription API. +The [AssemblyAI](https://assemblyai.com/) provider contains transcription model support for the AssemblyAI transcription API. ## Setup From c80634b149a003783c41813c7a0e2b1950b355f5 Mon Sep 17 00:00:00 2001 From: David Lange Date: Wed, 1 Jul 2026 14:10:21 -0400 Subject: [PATCH 11/12] chore(assemblyai): consolidate changesets into one Combine the per-change changeset files into a single @ai-sdk/assemblyai patch entry describing the provider update, matching the repo's one-changeset-per-PR convention. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .changeset/assemblyai-diarization-output.md | 24 ------------------- .changeset/assemblyai-input-params.md | 18 -------------- .changeset/assemblyai-latest-model-nudge.md | 10 -------- .changeset/assemblyai-nested-params.md | 15 ------------ .changeset/assemblyai-provider-update.md | 26 +++++++++++++++++++++ .changeset/assemblyai-review-fixes.md | 22 ----------------- .changeset/assemblyai-segment-timing.md | 10 -------- .changeset/assemblyai-universal-3-5-pro.md | 17 -------------- 8 files changed, 26 insertions(+), 116 deletions(-) delete mode 100644 .changeset/assemblyai-diarization-output.md delete mode 100644 .changeset/assemblyai-input-params.md delete mode 100644 .changeset/assemblyai-latest-model-nudge.md delete mode 100644 .changeset/assemblyai-nested-params.md create mode 100644 .changeset/assemblyai-provider-update.md delete mode 100644 .changeset/assemblyai-review-fixes.md delete mode 100644 .changeset/assemblyai-segment-timing.md delete mode 100644 .changeset/assemblyai-universal-3-5-pro.md diff --git a/.changeset/assemblyai-diarization-output.md b/.changeset/assemblyai-diarization-output.md deleted file mode 100644 index 6bfe68696071..000000000000 --- a/.changeset/assemblyai-diarization-output.md +++ /dev/null @@ -1,24 +0,0 @@ ---- -'@ai-sdk/assemblyai': patch ---- - -feat(assemblyai): surface speaker diarization and audio-intelligence results - -Previously the AssemblyAI provider parsed the transcript response with a -restrictive schema and returned that parsed object as `response.body`, which -silently dropped speaker labels, utterances, and all audio-intelligence results -(even though the matching `providerOptions` could enable them). 
- -The provider now: - -- returns the complete, raw AssemblyAI response on `response.body` (nothing is - stripped), and -- surfaces structured results for currently-available features on - `providerMetadata.assemblyai`: `utterances` (speaker diarization), - `entities`, `sentimentAnalysisResults`, `contentSafetyLabels`, - `iabCategoriesResult`, and `autoHighlightsResult`. - -Word-level `speaker`/`channel`/`confidence` and `utterances` are now parsed. -Deprecated AssemblyAI features (Summarization, Auto Chapters, Custom Topics) are -intentionally not promoted to `providerMetadata` but remain on the raw -`response.body` when enabled. diff --git a/.changeset/assemblyai-input-params.md b/.changeset/assemblyai-input-params.md deleted file mode 100644 index 0171192bd468..000000000000 --- a/.changeset/assemblyai-input-params.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -'@ai-sdk/assemblyai': patch ---- - -feat(assemblyai): add Universal-3-Pro input params and deprecate wordBoost - -Adds provider options for the newer AssemblyAI request parameters: - -- `prompt` — natural-language prompting (Universal-3 Pro / SLAM-1) -- `keytermsPrompt` — domain keyterm boosting (replaces `wordBoost` for newer models) -- `temperature` — sampling temperature (Universal-3 Pro) -- `removeAudioTags` — strip inline annotations (Universal-3 Pro) -- `domain` — domain-specific model, e.g. `'medical-v1'` - -Deprecates `wordBoost` and `boostParam`: AssemblyAI rejects `word_boost` with a -400 on `universal-3-pro` / `universal-3-5-pro` / `slam-1` (it only works on the -legacy `universal-2`/`best` models). Using either option now emits a deprecation -warning pointing to `keytermsPrompt`. 
diff --git a/.changeset/assemblyai-latest-model-nudge.md b/.changeset/assemblyai-latest-model-nudge.md deleted file mode 100644 index 283afc1b0802..000000000000 --- a/.changeset/assemblyai-latest-model-nudge.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -'@ai-sdk/assemblyai': patch ---- - -feat(assemblyai): nudge universal-3-pro/universal-2 users toward universal-3-5-pro - -Using `universal-3-pro` or `universal-2` now emits an informational warning -(`type: 'other'`, not a deprecation) noting that `universal-3-5-pro` is -AssemblyAI's latest flagship model and is set to replace `universal-3-pro`. Both -models remain fully supported. diff --git a/.changeset/assemblyai-nested-params.md b/.changeset/assemblyai-nested-params.md deleted file mode 100644 index cf1b642d1981..000000000000 --- a/.changeset/assemblyai-nested-params.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -'@ai-sdk/assemblyai': patch ---- - -feat(assemblyai): add speaker, language-detection, and PII-redaction options - -Adds provider options for AssemblyAI's nested request parameters: - -- `speakerOptions` — `{ minSpeakersExpected, maxSpeakersExpected }` -- `languageDetectionOptions` — `{ expectedLanguages, fallbackLanguage, codeSwitching, codeSwitchingConfidenceThreshold }` -- `redactPiiAudioOptions` — `{ returnRedactedNoSpeechAudio, overrideAudioRedactionMethod }` -- `redactPiiReturnUnredacted` — return the unredacted transcript alongside the redacted one -- `redactStaticEntities` — map of labels to exact terms to redact (e.g. `{ INTERNAL_TOOL: ['Bearclaw'] }`) - -The `redactPii*` options require `redactPii` to be enabled. 
diff --git a/.changeset/assemblyai-provider-update.md b/.changeset/assemblyai-provider-update.md new file mode 100644 index 000000000000..9019c00d1c88 --- /dev/null +++ b/.changeset/assemblyai-provider-update.md @@ -0,0 +1,26 @@ +--- +'@ai-sdk/assemblyai': patch +--- + +feat(assemblyai): support universal-3-5-pro and expand the transcription provider + +- Add current speech models `universal-3-5-pro`, `universal-3-pro`, and + `universal-2`, routed via AssemblyAI's `speech_models` parameter (the + deprecated singular `speech_model` is used only for the legacy `best` model). + Using `universal-3-pro`/`universal-2` emits an informational warning + suggesting `universal-3-5-pro`. +- Deprecate the legacy `best` model (still works, warns) and remove `nano`, + which AssemblyAI no longer accepts. +- Surface speaker diarization and audio-intelligence results: `doGenerate` now + returns the full raw response on `response.body` and populates + `providerMetadata.assemblyai` with `utterances`, `entities`, + `sentimentAnalysisResults`, `contentSafetyLabels`, `iabCategoriesResult`, and + `autoHighlightsResult`. +- Add provider options for newer request parameters: `prompt`, `keytermsPrompt`, + `temperature`, `removeAudioTags`, `domain`, `speakerOptions`, + `languageDetectionOptions`, `redactPiiAudioOptions`, + `redactPiiReturnUnredacted`, and `redactStaticEntities`. Deprecate + `wordBoost`/`boostParam` in favor of `keytermsPrompt` (AssemblyAI rejects + `word_boost` on the newer models). +- Fix transcription segment timings, which were reported in milliseconds instead + of seconds. 
diff --git a/.changeset/assemblyai-review-fixes.md b/.changeset/assemblyai-review-fixes.md deleted file mode 100644 index fb58f3c11a22..000000000000 --- a/.changeset/assemblyai-review-fixes.md +++ /dev/null @@ -1,22 +0,0 @@ ---- -'@ai-sdk/assemblyai': patch ---- - -fix(assemblyai): address code-review findings for the transcription provider - -- Warn when options are set without their required prerequisite - (`redactPiiReturnUnredacted`/`redactStaticEntities` without `redactPii`; - `redactPiiAudioOptions` without `redactPiiAudio`; `languageCode` together with - `languageDetection`), since AssemblyAI otherwise 400s or silently ignores them. -- Fix the `universal-2` nudge message (it previously claimed `universal-2` was - being replaced by `universal-3-pro`). -- Attribute the `wordBoost`/`boostParam` deprecation warning to whichever option - was actually set. -- Type `removeAudioTags` and `overrideAudioRedactionMethod` as enums (dropping - the `as never` casts) so invalid values fail validation client-side. -- Honor a caller-provided `fetch` for the polling requests (not just upload/submit). -- Populate `providerMetadata.assemblyai` from the raw response so nested fields - aren't stripped; document that its timings are in milliseconds while - `segments` are in seconds. -- Correct the `redactPiiAudioOptions` docs (requires `redactPiiAudio`) and - restore the Model Capabilities table rows for the supported models. 
diff --git a/.changeset/assemblyai-segment-timing.md b/.changeset/assemblyai-segment-timing.md deleted file mode 100644 index f1aafe8d2afa..000000000000 --- a/.changeset/assemblyai-segment-timing.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -'@ai-sdk/assemblyai': patch ---- - -fix(assemblyai): report transcription segment timings in seconds - -AssemblyAI returns word timings in milliseconds, but the provider placed them -directly into the `startSecond`/`endSecond` fields of `transcribe` segments (and -into the `durationInSeconds` fallback), so segment timings were off by 1000×. -The provider now converts milliseconds to seconds. diff --git a/.changeset/assemblyai-universal-3-5-pro.md b/.changeset/assemblyai-universal-3-5-pro.md deleted file mode 100644 index 7b869c182f34..000000000000 --- a/.changeset/assemblyai-universal-3-5-pro.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -'@ai-sdk/assemblyai': patch ---- - -feat(assemblyai): support `universal-3-5-pro` and other current speech models - -Adds `universal-3-5-pro`, `universal-3-pro`, and `universal-2` to the -transcription model ids. These newer models are only accessible through -AssemblyAI's `speech_models` request parameter (the singular `speech_model` -parameter is deprecated and rejects them), so the provider now routes the model -id to the correct parameter automatically: the legacy `best` model continues to -use `speech_model`, while all other models use `speech_models`. - -The `best` model is now deprecated. It continues to work, but the model id type -marks it `@deprecated` and the provider emits a deprecation warning when it is -used. Prefer `universal-3-5-pro` instead. The `nano` model has been removed, as -AssemblyAI no longer supports it (the API now rejects it). 
From 98e591997de11993c1c36ea23d7532ca4e6a2857 Mon Sep 17 00:00:00 2001 From: Gregor Martynus <39992+gr2m@users.noreply.github.com> Date: Wed, 1 Jul 2026 21:16:37 -0700 Subject: [PATCH 12/12] style --- content/providers/01-ai-sdk-providers/100-assemblyai.mdx | 1 - 1 file changed, 1 deletion(-) diff --git a/content/providers/01-ai-sdk-providers/100-assemblyai.mdx b/content/providers/01-ai-sdk-providers/100-assemblyai.mdx index 256ac97c3ffa..568888ddcb93 100644 --- a/content/providers/01-ai-sdk-providers/100-assemblyai.mdx +++ b/content/providers/01-ai-sdk-providers/100-assemblyai.mdx @@ -405,4 +405,3 @@ AssemblyAI's documentation for per-language availability. | `universal-3-pro` | | | | | | `universal-2` | | | | | | `best` | | | | | -