diff --git a/.changeset/assemblyai-provider-update.md b/.changeset/assemblyai-provider-update.md
new file mode 100644
index 000000000000..9019c00d1c88
--- /dev/null
+++ b/.changeset/assemblyai-provider-update.md
@@ -0,0 +1,26 @@
+---
+'@ai-sdk/assemblyai': patch
+---
+
+feat(assemblyai): support universal-3-5-pro and expand the transcription provider
+
+- Add current speech models `universal-3-5-pro`, `universal-3-pro`, and
+ `universal-2`, routed via AssemblyAI's `speech_models` parameter (the
+ deprecated singular `speech_model` is used only for the legacy `best` model).
+ Using `universal-3-pro`/`universal-2` emits an informational warning
+ suggesting `universal-3-5-pro`.
+- Deprecate the legacy `best` model (still works, warns) and remove `nano`,
+ which AssemblyAI no longer accepts.
+- Surface speaker diarization and audio-intelligence results: `doGenerate` now
+ returns the full raw response on `response.body` and populates
+ `providerMetadata.assemblyai` with `utterances`, `entities`,
+ `sentimentAnalysisResults`, `contentSafetyLabels`, `iabCategoriesResult`, and
+ `autoHighlightsResult`.
+- Add provider options for newer request parameters: `prompt`, `keytermsPrompt`,
+ `temperature`, `removeAudioTags`, `domain`, `speakerOptions`,
+ `languageDetectionOptions`, `redactPiiAudioOptions`,
+ `redactPiiReturnUnredacted`, and `redactStaticEntities`. Deprecate
+ `wordBoost`/`boostParam` in favor of `keytermsPrompt` (AssemblyAI rejects
+ `word_boost` on the newer models).
+- Fix transcription segment timings, which were reported in milliseconds instead
+ of seconds.
diff --git a/content/docs/03-ai-sdk-core/36-transcription.mdx b/content/docs/03-ai-sdk-core/36-transcription.mdx
index da049e93a9e4..141a589ab849 100644
--- a/content/docs/03-ai-sdk-core/36-transcription.mdx
+++ b/content/docs/03-ai-sdk-core/36-transcription.mdx
@@ -214,8 +214,8 @@ try {
| [Deepgram](/providers/ai-sdk-providers/deepgram#transcription-models) | `nova-2` (+ variants) |
| [Deepgram](/providers/ai-sdk-providers/deepgram#transcription-models) | `nova-3` (+ variants) |
| [Gladia](/providers/ai-sdk-providers/gladia#transcription-models) | `default` |
-| [AssemblyAI](/providers/ai-sdk-providers/assemblyai#transcription-models) | `best` |
-| [AssemblyAI](/providers/ai-sdk-providers/assemblyai#transcription-models) | `nano` |
+| [AssemblyAI](/providers/ai-sdk-providers/assemblyai#transcription-models) | `universal-3-5-pro` |
+| [AssemblyAI](/providers/ai-sdk-providers/assemblyai#transcription-models) | `universal-3-pro` |
| [Fal](/providers/ai-sdk-providers/fal#transcription-models) | `whisper` |
| [Fal](/providers/ai-sdk-providers/fal#transcription-models) | `wizper` |
| [Google Vertex](/providers/ai-sdk-providers/google-vertex#transcription-models) | `chirp_2` |
diff --git a/content/providers/01-ai-sdk-providers/100-assemblyai.mdx b/content/providers/01-ai-sdk-providers/100-assemblyai.mdx
index 23fb292986d0..568888ddcb93 100644
--- a/content/providers/01-ai-sdk-providers/100-assemblyai.mdx
+++ b/content/providers/01-ai-sdk-providers/100-assemblyai.mdx
@@ -5,7 +5,7 @@ description: Learn how to use the AssemblyAI provider for the AI SDK.
# AssemblyAI Provider
-The [AssemblyAI](https://assemblyai.com/) provider contains language model support for the AssemblyAI transcription API.
+The [AssemblyAI](https://assemblyai.com/) provider contains transcription model support for the AssemblyAI transcription API.
## Setup
@@ -69,12 +69,23 @@ You can use the following optional settings to customize the AssemblyAI provider
You can create models that call the [AssemblyAI transcription API](https://www.assemblyai.com/docs/getting-started/transcribe-an-audio-file/typescript)
using the `.transcription()` factory method.
-The first argument is the model id e.g. `best`.
+The first argument is the model id, e.g. `universal-3-5-pro`.
```ts
-const model = assemblyai.transcription('best');
+const model = assemblyai.transcription('universal-3-5-pro');
```
+The `best` model is a **deprecated** legacy model, sent using AssemblyAI's
+deprecated `speech_model` request parameter. It still works, but using it emits a
+deprecation warning — prefer `universal-3-5-pro`. The older `nano` model has been
+removed by AssemblyAI and is no longer available. All newer models are sent using
+the [`speech_models`](https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model)
+request parameter, which the provider selects automatically based on the model id.
+
+`universal-3-pro` and `universal-2` are fully supported and continue to work, but
+using them emits an informational warning (not a deprecation) suggesting
+`universal-3-5-pro`, AssemblyAI's latest flagship model.
+
You can also pass additional provider-specific options using the `providerOptions` argument. For example, supplying the `contentSafety` option will enable content safety filtering.
```ts highlight="7"
@@ -84,7 +95,7 @@ import { type AssemblyAITranscriptionModelOptions } from '@ai-sdk/assemblyai';
import { readFile } from 'fs/promises';
const result = await transcribe({
- model: assemblyai.transcription('best'),
+ model: assemblyai.transcription('universal-3-5-pro'),
audio: await readFile('audio.mp3'),
providerOptions: {
assemblyai: {
@@ -118,8 +129,9 @@ The following provider options are available:
- **boostParam** _enum_
- Boost parameter for the transcription.
+ Boost parameter for `wordBoost`.
Allowed values: `'low'`, `'default'`, `'high'`.
+ **Deprecated** — only applies to the deprecated `wordBoost`; use `keytermsPrompt` instead.
Optional.
- **contentSafety** _boolean_
@@ -143,6 +155,12 @@ The following provider options are available:
Whether to include disfluencies (um, uh, etc.) in the transcription.
Optional.
+- **domain** _string_
+
+ Enable a domain-specific model for specialized terminology.
+ Currently supports `'medical-v1'` (Medical Mode).
+ Optional.
+
- **entityDetection** _boolean_
Whether to detect entities in the transcription.
@@ -163,6 +181,13 @@ The following provider options are available:
Whether to include IAB categories in the transcription.
Optional.
+- **keytermsPrompt** _array of strings_
+
+ Domain-specific keyterms to boost recognition for (max 6 words per phrase).
+ Replaces `wordBoost` for newer models — supported by `universal-3-pro`,
+ `universal-3-5-pro`, and `slam-1` (and `universal-2` when metaphone is enabled for your account).
+ Optional.
+
- **languageCode** _string_
Language code for the audio.
@@ -179,11 +204,24 @@ The following provider options are available:
Whether to enable language detection.
Optional.
+- **languageDetectionOptions** _object_
+
+ Options for automatic language detection: `expectedLanguages` (array of
+ strings), `fallbackLanguage` (string), `codeSwitching` (boolean),
+ `codeSwitchingConfidenceThreshold` (number, 0-1).
+ Optional.
+
- **multichannel** _boolean_
Whether to process multiple audio channels separately.
Optional.
+- **prompt** _string_
+
+ Natural-language context (up to 1,500 words) to steer the model.
+ Only supported by `universal-3-pro`, `universal-3-5-pro`, and `slam-1`.
+ Optional.
+
- **punctuate** _boolean_
Whether to add punctuation to the transcription.
@@ -199,6 +237,12 @@ The following provider options are available:
Whether to redact PII in the audio file.
Optional.
+- **redactPiiAudioOptions** _object_
+
+ Options for PII-redacted audio: `returnRedactedNoSpeechAudio` (boolean),
+ `overrideAudioRedactionMethod` (`'silence'`). Requires `redactPiiAudio`.
+ Optional.
+
- **redactPiiAudioQuality** _enum_
Quality of the redacted audio file.
@@ -211,12 +255,32 @@ The following provider options are available:
Supports numerous types like `'person_name'`, `'phone_number'`, etc.
Optional.
+- **redactPiiReturnUnredacted** _boolean_
+
+ Return the original unredacted transcript alongside the redacted one.
+ Requires `redactPii`.
+ Optional.
+
- **redactPiiSub** _enum_
Substitution method for redacted PII.
Allowed values: `'entity_name'`, `'hash'`.
Optional.
+- **redactStaticEntities** _object_
+
+ Map of user-defined labels to exact terms to redact, e.g.
+ `{ INTERNAL_TOOL: ['Bearclaw'] }`. Applied on top of standard PII redaction.
+ Requires `redactPii`.
+ Optional.
+
+- **removeAudioTags** _enum_
+
+ Remove inline annotations from rich transcripts.
+ Allowed values: `'all'` (all annotations), `'speaker'` (speaker cues only).
+ Applies to Universal-3 Pro models.
+ Optional.
+
- **sentimentAnalysis** _boolean_
Whether to perform sentiment analysis on the transcription.
@@ -227,6 +291,12 @@ The following provider options are available:
Whether to label different speakers in the transcription.
Optional.
+- **speakerOptions** _object_
+
+ Options for speaker diarization: `minSpeakersExpected` (number),
+ `maxSpeakersExpected` (number).
+ Optional.
+
- **speakersExpected** _number_
Expected number of speakers in the audio.
@@ -254,6 +324,12 @@ The following provider options are available:
Allowed values: `'bullets'`, `'bullets_verbose'`, `'gist'`, `'headline'`, `'paragraph'`.
Optional.
+- **temperature** _number_
+
+ Sampling temperature (0-1) controlling randomness.
+ Applies to Universal-3 Pro models.
+ Optional.
+
- **webhookAuthHeaderName** _string_
Name of the authentication header for webhook requests.
@@ -272,11 +348,60 @@ The following provider options are available:
- **wordBoost** _array of strings_
List of words to boost in the transcription.
+ **Deprecated** — rejected by `universal-3-pro`, `universal-3-5-pro`, and `slam-1`
+ (works only on `universal-2`/`best`); use `keytermsPrompt` instead.
Optional.
+### Speaker diarization and audio-intelligence results
+
+The AI SDK's `transcribe` result exposes `text`, `segments`, `language`, and
+`durationInSeconds`. AssemblyAI's richer results — speaker diarization and
+audio-intelligence features — don't fit that shape, so they are surfaced in two
+places:
+
+- **`providerMetadata.assemblyai`** — structured results for the features you
+ enabled: `utterances` (speaker-diarized segments, when `speakerLabels` is set),
+ `entities`, `sentimentAnalysisResults`, `contentSafetyLabels`,
+ `iabCategoriesResult`, and `autoHighlightsResult`.
+- **`response.body`** — the complete, raw AssemblyAI transcript response, so any
+ field not surfaced above (e.g. `chapters`, word-level `speaker` labels) is
+ still available.
+
+Note: timings inside `providerMetadata` and `response.body` (e.g.
+`utterances[].start`) are in **milliseconds**, matching the AssemblyAI API —
+whereas the top-level `segments` use **seconds**.
+
+```ts
+import { transcribe } from 'ai';
+import { assemblyai } from '@ai-sdk/assemblyai';
+import { readFile } from 'fs/promises';
+
+const result = await transcribe({
+ model: assemblyai.transcription('universal-3-5-pro'),
+ audio: await readFile('audio.mp3'),
+ providerOptions: {
+ assemblyai: {
+ speakerLabels: true,
+ entityDetection: true,
+ },
+ },
+});
+
+const { utterances, entities } = result.providerMetadata?.assemblyai ?? {};
+// utterances: [{ speaker: 'A', text: '…', start, end, … }, …] (start/end in ms)
+```
+
+The following AssemblyAI features are **deprecated** by the API and not surfaced
+in `providerMetadata` (their output remains on the raw `response.body` if
+enabled): Summarization, Auto Chapters, and Custom Topics. Note also that some
+features are language-gated (e.g. sentiment analysis is English-centric); see
+AssemblyAI's documentation for per-language availability.
+
### Model Capabilities
-| Model | Transcription | Duration | Segments | Language |
-| ------ | ------------------- | ------------------- | ------------------- | ------------------- |
-| `best` | | | | |
-| `nano` | | | | |
+| Model | Transcription | Duration | Segments | Language |
+| ------------------- | ------------------- | ------------------- | ------------------- | ------------------- |
+| `universal-3-5-pro` | | | | |
+| `universal-3-pro` | | | | |
+| `universal-2` | | | | |
+| `best` | | | | |
diff --git a/examples/ai-functions/src/transcribe/assemblyai/basic.ts b/examples/ai-functions/src/transcribe/assemblyai/basic.ts
index 5d1ebf221ef4..f50e5f519232 100644
--- a/examples/ai-functions/src/transcribe/assemblyai/basic.ts
+++ b/examples/ai-functions/src/transcribe/assemblyai/basic.ts
@@ -5,7 +5,7 @@ import { run } from '../../lib/run';
run(async () => {
const result = await transcribe({
- model: assemblyai.transcription('best'),
+ model: assemblyai.transcription('universal-3-5-pro'),
audio: await readFile('data/galileo.mp3'),
});
diff --git a/examples/ai-functions/src/transcribe/assemblyai/string.ts b/examples/ai-functions/src/transcribe/assemblyai/string.ts
index 9210cf1f263a..01a1d8172dd2 100644
--- a/examples/ai-functions/src/transcribe/assemblyai/string.ts
+++ b/examples/ai-functions/src/transcribe/assemblyai/string.ts
@@ -5,7 +5,7 @@ import { run } from '../../lib/run';
run(async () => {
const result = await transcribe({
- model: assemblyai.transcription('best'),
+ model: assemblyai.transcription('universal-3-5-pro'),
audio: Buffer.from(await readFile('./data/galileo.mp3')).toString('base64'),
});
diff --git a/examples/ai-functions/src/transcribe/assemblyai/url.ts b/examples/ai-functions/src/transcribe/assemblyai/url.ts
index 6493a81b7786..c48905c936b1 100644
--- a/examples/ai-functions/src/transcribe/assemblyai/url.ts
+++ b/examples/ai-functions/src/transcribe/assemblyai/url.ts
@@ -4,7 +4,7 @@ import { run } from '../../lib/run';
run(async () => {
const result = await transcribe({
- model: assemblyai.transcription('best'),
+ model: assemblyai.transcription('universal-3-5-pro'),
audio: new URL(
'https://github.com/vercel/ai/raw/refs/heads/main/examples/ai-functions/data/galileo.mp3',
),
diff --git a/packages/assemblyai/README.md b/packages/assemblyai/README.md
index a9de04ae4bc1..fc4930387235 100644
--- a/packages/assemblyai/README.md
+++ b/packages/assemblyai/README.md
@@ -36,7 +36,7 @@ import { assemblyai } from '@ai-sdk/assemblyai';
import { transcribe } from 'ai';
const { text } = await transcribe({
- model: assemblyai.transcription('best'),
+ model: assemblyai.transcription('universal-3-5-pro'),
audio: new URL(
'https://github.com/vercel/ai/raw/refs/heads/main/examples/ai-functions/data/galileo.mp3',
),
diff --git a/packages/assemblyai/src/assemblyai-api-types.ts b/packages/assemblyai/src/assemblyai-api-types.ts
index 22fd08b0f30c..17d690374cc8 100644
--- a/packages/assemblyai/src/assemblyai-api-types.ts
+++ b/packages/assemblyai/src/assemblyai-api-types.ts
@@ -28,6 +28,7 @@ export type AssemblyAITranscriptionAPITypes = {
/**
* How much to boost specified words
+ * @deprecated Only used with the deprecated `word_boost`. Use `keyterms_prompt`.
*/
boost_param?: 'low' | 'default' | 'high';
@@ -207,6 +208,16 @@ export type AssemblyAITranscriptionAPITypes = {
*/
language_detection?: boolean;
+ /**
+ * Options for automatic language detection.
+ */
+ language_detection_options?: {
+ expected_languages?: string[];
+ fallback_language?: string;
+ code_switching?: boolean;
+ code_switching_confidence_threshold?: number;
+ };
+
/**
* Enable Multichannel transcription, can be true or false.
* @default false
@@ -231,6 +242,14 @@ export type AssemblyAITranscriptionAPITypes = {
*/
redact_pii_audio?: boolean;
+ /**
+ * Options for PII-redacted audio files. Requires redact_pii_audio.
+ */
+ redact_pii_audio_options?: {
+ return_redacted_no_speech_audio?: boolean;
+ override_audio_redaction_method?: 'silence';
+ };
+
/**
* Controls the filetype of the audio created by redact_pii_audio. Currently supports mp3 (default) and wav.
*/
@@ -286,11 +305,23 @@ export type AssemblyAITranscriptionAPITypes = {
| 'zodiac_sign'
>;
+ /**
+ * Return the original unredacted transcript alongside the redacted one.
+ * Requires redact_pii.
+ */
+ redact_pii_return_unredacted?: boolean;
+
/**
* The replacement logic for detected PII, can be "entity_name" or "hash".
*/
redact_pii_sub?: 'entity_name' | 'hash';
+ /**
+ * Map of user-defined labels to exact terms to redact, applied on top of
+ * standard PII redaction. Requires redact_pii.
+ */
+ redact_static_entities?: Record<string, string[]>;
+
/**
* Enable Sentiment Analysis, can be true or false
* @default false
@@ -303,6 +334,14 @@ export type AssemblyAITranscriptionAPITypes = {
*/
speaker_labels?: boolean;
+ /**
+ * Options for speaker diarization, e.g. a range of possible speakers.
+ */
+ speaker_options?: {
+ min_speakers_expected?: number;
+ max_speakers_expected?: number;
+ };
+
/**
* Tells the speaker label model how many speakers it should attempt to identify, up to 10.
*/
@@ -310,8 +349,21 @@ export type AssemblyAITranscriptionAPITypes = {
/**
* The speech model to use for the transcription.
+ *
+ * @deprecated This parameter has been replaced with `speech_models`. It only
+ * supports the legacy `best` model. Use `speech_models` for `universal-2`,
+ * `universal-3-pro`, `universal-3-5-pro`, etc.
+ * @see https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model
*/
- speech_model?: 'best' | 'nano';
+ speech_model?: 'best';
+
+ /**
+ * List of speech models in priority order, allowing the system to
+ * automatically route the audio to the best available option. When omitted,
+ * the API defaults to `['universal-3-pro', 'universal-2']`.
+ * @see https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model
+ */
+ speech_models?: string[];
/**
* Reject audio files that contain less than this fraction of speech. Valid values are in the range [0, 1] inclusive.
@@ -357,6 +409,37 @@ export type AssemblyAITranscriptionAPITypes = {
/**
* The list of custom vocabulary to boost transcription probability for
+ * @deprecated Rejected by `universal-3-pro` / `universal-3-5-pro` and
+ * `slam-1` (works only on `universal-2`/`best`). Use `keyterms_prompt`.
*/
word_boost?: string[];
+
+ /**
+ * Domain-specific keyterms to boost (max 6 words per phrase). Replaces
+ * `word_boost` for `universal-3-pro` / `universal-3-5-pro` and `slam-1`.
+ */
+ keyterms_prompt?: string[];
+
+ /**
+ * Natural-language context (up to 1,500 words) to steer the model.
+ * Only supported by `universal-3-pro` / `universal-3-5-pro` and `slam-1`.
+ */
+ prompt?: string;
+
+ /**
+ * Sampling temperature (0-1) controlling randomness. Universal-3 Pro models.
+ */
+ temperature?: number;
+
+ /**
+ * Remove inline annotations from rich transcripts: `'all'` removes all
+ * annotations, `'speaker'` removes only speaker cues. Universal-3 Pro models.
+ */
+ remove_audio_tags?: 'all' | 'speaker';
+
+ /**
+ * Enable a domain-specific model to improve accuracy for specialized
+ * terminology, e.g. `'medical-v1'` for Medical Mode.
+ */
+ domain?: string;
};
diff --git a/packages/assemblyai/src/assemblyai-provider.ts b/packages/assemblyai/src/assemblyai-provider.ts
index 8c517e5d932d..ff4779284953 100644
--- a/packages/assemblyai/src/assemblyai-provider.ts
+++ b/packages/assemblyai/src/assemblyai-provider.ts
@@ -14,7 +14,7 @@ import { VERSION } from './version';
export interface AssemblyAIProvider extends ProviderV4 {
(
- modelId: 'best',
+ modelId: AssemblyAITranscriptionModelId,
settings?: {},
): {
transcription: AssemblyAITranscriptionModel;
diff --git a/packages/assemblyai/src/assemblyai-transcription-model-options.ts b/packages/assemblyai/src/assemblyai-transcription-model-options.ts
index 7fc753ea86cd..ef04e94f9e3a 100644
--- a/packages/assemblyai/src/assemblyai-transcription-model-options.ts
+++ b/packages/assemblyai/src/assemblyai-transcription-model-options.ts
@@ -19,8 +19,12 @@ export const assemblyaiTranscriptionModelOptionsSchema = z.object({
*/
autoHighlights: z.boolean().nullish(),
/**
- * Boost parameter for the transcription.
+ * Boost parameter for word boost (used with `wordBoost`).
* Allowed values: 'low', 'default', 'high'.
+ *
+ * @deprecated Only applies to the deprecated `wordBoost` option. Use
+ * `keytermsPrompt` instead, which works with the recommended `universal-*`
+ * models.
*/
boostParam: z.string().nullish(),
/**
@@ -46,6 +50,11 @@ export const assemblyaiTranscriptionModelOptionsSchema = z.object({
* Whether to include filler words (um, uh, etc.) in the transcription.
*/
disfluencies: z.boolean().nullish(),
+ /**
+ * Enable a domain-specific model to improve accuracy for specialized
+ * terminology. Currently supports `'medical-v1'` (Medical Mode).
+ */
+ domain: z.string().nullish(),
/**
* Whether to enable entity detection.
*/
@@ -62,6 +71,13 @@ export const assemblyaiTranscriptionModelOptionsSchema = z.object({
* Whether to enable IAB categories detection.
*/
iabCategories: z.boolean().nullish(),
+ /**
+ * Domain-specific keyterms to boost recognition for (max 6 words per phrase).
+ * Replaces `wordBoost` for newer models: supported by `universal-3-pro` /
+ * `universal-3-5-pro` and `slam-1` (and `universal-2` when metaphone is
+ * enabled for the account).
+ */
+ keytermsPrompt: z.array(z.string()).nullish(),
/**
* Language code for the transcription.
*/
@@ -74,10 +90,30 @@ export const assemblyaiTranscriptionModelOptionsSchema = z.object({
* Whether to enable language detection.
*/
languageDetection: z.boolean().nullish(),
+ /**
+ * Options for automatic language detection.
+ */
+ languageDetectionOptions: z
+ .object({
+ /** List of languages expected in the audio file. */
+ expectedLanguages: z.array(z.string()).nullish(),
+ /** Fallback language if the detected language is not expected. */
+ fallbackLanguage: z.string().nullish(),
+ /** Whether code switching should be detected. */
+ codeSwitching: z.boolean().nullish(),
+ /** Confidence threshold for code switching detection (0-1). */
+ codeSwitchingConfidenceThreshold: z.number().min(0).max(1).nullish(),
+ })
+ .nullish(),
/**
* Whether to process audio as multichannel.
*/
multichannel: z.boolean().nullish(),
+ /**
+ * Provide natural-language context (up to 1,500 words) to steer the model.
+ * Only supported by `universal-3-pro` / `universal-3-5-pro` and `slam-1`.
+ */
+ prompt: z.string().nullish(),
/**
* Whether to add punctuation to the transcription.
*/
@@ -90,6 +126,17 @@ export const assemblyaiTranscriptionModelOptionsSchema = z.object({
* Whether to redact PII in the audio file.
*/
redactPiiAudio: z.boolean().nullish(),
+ /**
+ * Options for PII-redacted audio files. Requires `redactPiiAudio`.
+ */
+ redactPiiAudioOptions: z
+ .object({
+ /** Return redacted audio even for files without detected speech. */
+ returnRedactedNoSpeechAudio: z.boolean().nullish(),
+ /** Redaction method; set to `'silence'` to replace PII with silence. */
+ overrideAudioRedactionMethod: z.enum(['silence']).nullish(),
+ })
+ .nullish(),
/**
* Audio format for PII redaction.
*/
@@ -98,10 +145,26 @@ export const assemblyaiTranscriptionModelOptionsSchema = z.object({
* List of PII types to redact.
*/
redactPiiPolicies: z.array(z.string()).nullish(),
+ /**
+ * Return the original unredacted transcript alongside the redacted one.
+ * Requires `redactPii`.
+ */
+ redactPiiReturnUnredacted: z.boolean().nullish(),
/**
* Substitution method for redacted PII.
*/
redactPiiSub: z.string().nullish(),
+ /**
+ * Map of user-defined labels to exact terms to redact, e.g.
+ * `{ INTERNAL_TOOL: ['Bearclaw'] }`. Applied on top of standard PII redaction
+ * using `redactPiiSub`. Requires `redactPii`.
+ */
+ redactStaticEntities: z.record(z.string(), z.array(z.string())).nullish(),
+ /**
+ * Remove inline annotations from rich transcripts. `'all'` removes all inline
+ * annotations; `'speaker'` removes only speaker cues. Universal-3 Pro models.
+ */
+ removeAudioTags: z.enum(['all', 'speaker']).nullish(),
/**
* Whether to enable sentiment analysis.
*/
@@ -110,6 +173,17 @@ export const assemblyaiTranscriptionModelOptionsSchema = z.object({
* Whether to identify different speakers in the audio.
*/
speakerLabels: z.boolean().nullish(),
+ /**
+ * Options for speaker diarization, e.g. a range of possible speakers.
+ */
+ speakerOptions: z
+ .object({
+ /** Minimum number of speakers expected in the audio file. */
+ minSpeakersExpected: z.number().int().nullish(),
+ /** Maximum number of speakers expected in the audio file. */
+ maxSpeakersExpected: z.number().int().nullish(),
+ })
+ .nullish(),
/**
* Number of speakers expected in the audio.
*/
@@ -130,6 +204,10 @@ export const assemblyaiTranscriptionModelOptionsSchema = z.object({
* Type of summary to generate.
*/
summaryType: z.string().nullish(),
+ /**
+ * Sampling temperature (0-1) controlling randomness. Universal-3 Pro models.
+ */
+ temperature: z.number().min(0).max(1).nullish(),
/**
* Name of the authentication header for webhook requests.
*/
@@ -144,6 +222,10 @@ export const assemblyaiTranscriptionModelOptionsSchema = z.object({
webhookUrl: z.string().nullish(),
/**
* List of words to boost recognition for.
+ *
+ * @deprecated `wordBoost` is rejected by `universal-3-pro` /
+ * `universal-3-5-pro` and `slam-1` (it only works on `universal-2`/`best`).
+ * Use `keytermsPrompt` instead.
*/
wordBoost: z.array(z.string()).nullish(),
});
diff --git a/packages/assemblyai/src/assemblyai-transcription-model.test.ts b/packages/assemblyai/src/assemblyai-transcription-model.test.ts
index 2147a6c05b66..ddc9cabb390d 100644
--- a/packages/assemblyai/src/assemblyai-transcription-model.test.ts
+++ b/packages/assemblyai/src/assemblyai-transcription-model.test.ts
@@ -233,6 +233,16 @@ describe('doGenerate', () => {
summary_model: 'informative',
summary: '- Hello, world!',
sentiment_analysis: true,
+ sentiment_analysis_results: [
+ {
+ text: 'Hello, world!',
+ start: 250,
+ end: 26950,
+ sentiment: 'POSITIVE',
+ confidence: 0.9,
+ speaker: 'A',
+ },
+ ],
entity_detection: true,
entities: [
{
@@ -256,18 +266,369 @@ describe('doGenerate', () => {
};
}
- it('should pass the model', async () => {
+ it('should pass the legacy model via the speech_model parameter', async () => {
prepareJsonResponse();
- await model.doGenerate({
+ const result = await model.doGenerate({
audio: audioData,
mediaType: 'audio/wav',
});
- expect(await server.calls[1].requestBodyJson).toMatchObject({
+ const requestBody = await server.calls[1].requestBodyJson;
+ expect(requestBody).toMatchObject({
audio_url: 'https://storage.assemblyai.com/mock-upload-url',
speech_model: 'best',
});
+ expect(requestBody.speech_models).toBeUndefined();
+
+ expect(result.warnings).toContainEqual({
+ type: 'deprecated',
+ setting: "model 'best'",
+ message: expect.stringContaining('universal-3-5-pro'),
+ });
+ const [deprecation] = result.warnings.filter(
+ warning => warning.type === 'deprecated',
+ );
+ expect(deprecation?.message).toContain(
+ 'https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model',
+ );
+ });
+
+ it('should pass newer models via the speech_models parameter', async () => {
+ prepareJsonResponse();
+
+ const result = await provider
+ .transcription('universal-3-5-pro')
+ .doGenerate({
+ audio: audioData,
+ mediaType: 'audio/wav',
+ });
+
+ const requestBody = await server.calls[1].requestBodyJson;
+ expect(requestBody).toMatchObject({
+ audio_url: 'https://storage.assemblyai.com/mock-upload-url',
+ speech_models: ['universal-3-5-pro'],
+ });
+ expect(requestBody.speech_model).toBeUndefined();
+
+ // No deprecation and no nudge for the latest flagship model.
+ expect(result.warnings).toEqual([]);
+ });
+
+ it('should route universal-3-pro via speech_models and nudge to universal-3-5-pro', async () => {
+ prepareJsonResponse();
+
+ const result = await provider.transcription('universal-3-pro').doGenerate({
+ audio: audioData,
+ mediaType: 'audio/wav',
+ });
+
+ const requestBody = await server.calls[1].requestBodyJson;
+ expect(requestBody.speech_models).toEqual(['universal-3-pro']);
+ expect(requestBody.speech_model).toBeUndefined();
+
+ // universal-3-pro is the model universal-3-5-pro replaces, so the message
+ // names it explicitly.
+ const [nudge] = result.warnings.filter(w => w.type === 'other');
+ expect(nudge?.message).toContain('universal-3-5-pro');
+ expect(nudge?.message).toContain("replace 'universal-3-pro'");
+ });
+
+ it('should nudge universal-2 users toward universal-3-5-pro', async () => {
+ prepareJsonResponse();
+
+ const result = await provider.transcription('universal-2').doGenerate({
+ audio: audioData,
+ mediaType: 'audio/wav',
+ });
+
+ // The nudge for universal-2 must not claim it is replaced by universal-3-pro.
+ const [nudge] = result.warnings.filter(w => w.type === 'other');
+ expect(nudge?.message).toContain('universal-3-5-pro');
+ expect(nudge?.message).not.toContain("replace 'universal-3-pro'");
+ });
+
+ it('should not special-case the removed nano model', async () => {
+ prepareJsonResponse();
+
+ const result = await provider.transcription('nano').doGenerate({
+ audio: audioData,
+ mediaType: 'audio/wav',
+ });
+
+ // `nano` is no longer a legacy `speech_model` alias: it falls through to
+ // `speech_models` (where the live API rejects it) and emits no warning.
+ const requestBody = await server.calls[1].requestBodyJson;
+ expect(requestBody.speech_models).toEqual(['nano']);
+ expect(requestBody.speech_model).toBeUndefined();
+ expect(
+ result.warnings.filter(warning => warning.type === 'deprecated'),
+ ).toEqual([]);
+ });
+
+ it('should still send provider options alongside speech_models', async () => {
+ prepareJsonResponse();
+
+ await provider.transcription('universal-3-5-pro').doGenerate({
+ audio: audioData,
+ mediaType: 'audio/wav',
+ providerOptions: {
+ assemblyai: {
+ languageDetection: true,
+ punctuate: false,
+ },
+ },
+ });
+
+ const requestBody = await server.calls[1].requestBodyJson;
+ expect(requestBody).toMatchObject({
+ speech_models: ['universal-3-5-pro'],
+ language_detection: true,
+ punctuate: false,
+ });
+ });
+
+ it('should surface diarization + audio-intelligence via providerMetadata', async () => {
+ prepareJsonResponse();
+
+ const result = await provider
+ .transcription('universal-3-5-pro')
+ .doGenerate({
+ audio: audioData,
+ mediaType: 'audio/wav',
+ });
+
+ const metadata = result.providerMetadata?.assemblyai as
+ | Record<string, any>
+ | undefined;
+ expect(metadata).toBeDefined();
+
+ // Speaker diarization
+ expect(metadata?.utterances?.[0]).toMatchObject({
+ speaker: 'A',
+ text: 'Hello, world!',
+ });
+
+ // Audio-intelligence results
+ expect(metadata?.entities?.[0]).toMatchObject({
+ entity_type: 'location',
+ text: 'Canada',
+ });
+ expect(metadata?.sentimentAnalysisResults?.[0]).toMatchObject({
+ sentiment: 'POSITIVE',
+ text: 'Hello, world!',
+ });
+ expect(metadata?.contentSafetyLabels).toBeDefined();
+ expect(metadata?.iabCategoriesResult).toBeDefined();
+ expect(metadata?.autoHighlightsResult).toBeDefined();
+ });
+
+ it('should preserve the full raw response on response.body', async () => {
+ prepareJsonResponse();
+
+ const result = await provider
+ .transcription('universal-3-5-pro')
+ .doGenerate({
+ audio: audioData,
+ mediaType: 'audio/wav',
+ });
+
+ const body = result.response.body as Record<string, any>;
+ // Word-level speaker label survives on the raw body.
+ expect(body.words[0].speaker).toBe('speaker');
+ // Fields not modeled in our schema (e.g. chapters, summary) are no longer
+ // stripped — proves response.body is the raw response, not the parsed one.
+ expect(body.chapters).toBeDefined();
+ expect(body.summary).toBe('- Hello, world!');
+ });
+
+ it('should pass the Universal-3-Pro input params', async () => {
+ prepareJsonResponse();
+
+ await provider.transcription('universal-3-5-pro').doGenerate({
+ audio: audioData,
+ mediaType: 'audio/wav',
+ providerOptions: {
+ assemblyai: {
+ prompt: 'This is a conversation about the AI SDK.',
+ keytermsPrompt: ['Vercel', 'AI SDK'],
+ temperature: 0.2,
+ removeAudioTags: 'speaker',
+ domain: 'medical-v1',
+ },
+ },
+ });
+
+ const requestBody = await server.calls[1].requestBodyJson;
+ expect(requestBody).toMatchObject({
+ speech_models: ['universal-3-5-pro'],
+ prompt: 'This is a conversation about the AI SDK.',
+ keyterms_prompt: ['Vercel', 'AI SDK'],
+ temperature: 0.2,
+ remove_audio_tags: 'speaker',
+ domain: 'medical-v1',
+ });
+ });
+
+ it('should pass the GA nested input params', async () => { // nested option objects must be converted field-by-field to snake_case
+ prepareJsonResponse();
+
+ await provider.transcription('universal-3-5-pro').doGenerate({
+ audio: audioData,
+ mediaType: 'audio/wav',
+ providerOptions: {
+ assemblyai: {
+ redactPii: true, // prerequisite for redactPiiReturnUnredacted / redactStaticEntities; a warning is added when it is missing
+ speakerOptions: { minSpeakersExpected: 1, maxSpeakersExpected: 3 },
+ languageDetectionOptions: {
+ expectedLanguages: ['en', 'es'],
+ fallbackLanguage: 'en',
+ codeSwitching: true,
+ codeSwitchingConfidenceThreshold: 0.5,
+ },
+ redactPiiAudioOptions: {
+ returnRedactedNoSpeechAudio: true,
+ overrideAudioRedactionMethod: 'silence',
+ },
+ redactPiiReturnUnredacted: true,
+ redactStaticEntities: { INTERNAL_TOOL: ['Bearclaw'] },
+ },
+ },
+ });
+
+ const requestBody = await server.calls[1].requestBodyJson; // NOTE(review): presumably the transcript-submit request — confirm against the server setup
+ expect(requestBody).toMatchObject({
+ speaker_options: { min_speakers_expected: 1, max_speakers_expected: 3 },
+ language_detection_options: {
+ expected_languages: ['en', 'es'],
+ fallback_language: 'en',
+ code_switching: true,
+ code_switching_confidence_threshold: 0.5,
+ },
+ redact_pii_audio_options: {
+ return_redacted_no_speech_audio: true,
+ override_audio_redaction_method: 'silence',
+ },
+ redact_pii_return_unredacted: true,
+ redact_static_entities: { INTERNAL_TOOL: ['Bearclaw'] }, // user-defined entity map is forwarded as-is; its keys are not case-converted
+ });
+ });
+
+ it('should warn when deprecated wordBoost/boostParam options are used', async () => {
+ prepareJsonResponse();
+
+ const result = await provider
+ .transcription('universal-3-5-pro')
+ .doGenerate({
+ audio: audioData,
+ mediaType: 'audio/wav',
+ providerOptions: {
+ assemblyai: { wordBoost: ['Vercel'], boostParam: 'high' }, // both deprecated options supplied
+ },
+ });
+
+ expect(result.warnings).toContainEqual({
+ type: 'deprecated',
+ setting: 'wordBoost, boostParam', // names of the options that were set, joined with ', '
+ message: expect.stringContaining('keytermsPrompt'), // message must point at the replacement option
+ });
+ });
+
+ it('should attribute the deprecation warning to boostParam when only boostParam is set', async () => {
+ prepareJsonResponse();
+
+ const result = await provider
+ .transcription('universal-3-5-pro')
+ .doGenerate({
+ audio: audioData,
+ mediaType: 'audio/wav',
+ providerOptions: { assemblyai: { boostParam: 'high' } }, // wordBoost deliberately absent
+ });
+
+ expect(result.warnings).toContainEqual({
+ type: 'deprecated',
+ setting: 'boostParam', // only the option actually provided is named
+ message: expect.stringContaining('keytermsPrompt'),
+ });
+ });
+
+ it('should warn when redactPii-dependent options are set without redactPii', async () => {
+ prepareJsonResponse();
+
+ const result = await provider
+ .transcription('universal-3-5-pro')
+ .doGenerate({
+ audio: audioData,
+ mediaType: 'audio/wav',
+ providerOptions: {
+ assemblyai: { redactStaticEntities: { TOOL: ['Vercel'] } }, // dependent option set, redactPii omitted
+ },
+ });
+
+ expect(
+ result.warnings.some(
+ w => w.type === 'other' && w.message.includes('redactPii'), // loose check: any 'other' warning whose text mentions redactPii
+ ),
+ ).toBe(true);
+ });
+
+ it('should warn when redactPiiAudioOptions is set without redactPiiAudio', async () => {
+ prepareJsonResponse();
+
+ const result = await provider
+ .transcription('universal-3-5-pro')
+ .doGenerate({
+ audio: audioData,
+ mediaType: 'audio/wav',
+ providerOptions: {
+ assemblyai: {
+ redactPii: true,
+ redactPiiAudioOptions: { overrideAudioRedactionMethod: 'silence' }, // redactPiiAudio itself is left unset
+ },
+ },
+ });
+
+ expect(
+ result.warnings.some(
+ w => w.type === 'other' && w.message.includes('redactPiiAudio'), // NOTE(review): 'redactPiiAudio' is also a prefix of 'redactPiiAudioOptions', so this substring check is loose
+ ),
+ ).toBe(true);
+ });
+
+ it('should warn when languageCode and languageDetection are combined', async () => {
+ prepareJsonResponse();
+
+ const result = await provider
+ .transcription('universal-3-5-pro')
+ .doGenerate({
+ audio: audioData,
+ mediaType: 'audio/wav',
+ providerOptions: {
+ assemblyai: { languageCode: 'en', languageDetection: true }, // explicit language plus auto-detection
+ },
+ });
+
+ expect(
+ result.warnings.some(
+ w => w.type === 'other' && w.message.includes('languageDetection'), // loose check: any 'other' warning whose text mentions languageDetection
+ ),
+ ).toBe(true);
+ });
+
+ it('should report segment timings in seconds (ms converted)', async () => {
+ prepareJsonResponse();
+
+ const result = await model.doGenerate({ // NOTE(review): `model` is declared outside this hunk — presumably the suite's shared transcription model; confirm
+ audio: audioData,
+ mediaType: 'audio/wav',
+ });
+
+ // Fixture word[0] is start: 250ms, end: 650ms → 0.25s / 0.65s.
+ expect(result.segments[0]).toEqual({ // a segment carries only text, startSecond and endSecond
+ text: 'Hello,',
+ startSecond: 0.25,
+ endSecond: 0.65,
+ });
});
it('should pass headers', async () => {
diff --git a/packages/assemblyai/src/assemblyai-transcription-model.ts b/packages/assemblyai/src/assemblyai-transcription-model.ts
index f6b6ce2e3b69..4059430e5058 100644
--- a/packages/assemblyai/src/assemblyai-transcription-model.ts
+++ b/packages/assemblyai/src/assemblyai-transcription-model.ts
@@ -1,4 +1,8 @@
-import type { TranscriptionModelV4, SharedV4Warning } from '@ai-sdk/provider';
+import type {
+ TranscriptionModelV4,
+ SharedV4Warning,
+ SharedV4ProviderMetadata,
+} from '@ai-sdk/provider';
import {
combineHeaders,
createJsonResponseHandler,
@@ -66,9 +70,42 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV4 {
schema: assemblyaiTranscriptionModelOptionsSchema,
});
- const body: Omit = {
- speech_model: this.modelId as 'best' | 'nano',
- };
+ const body: Omit = {};
+
+ // The legacy `best` model is selected via the deprecated singular
+ // `speech_model` parameter. All other models (e.g. `universal-2`,
+ // `universal-3-pro`, `universal-3-5-pro`) are only accessible via the
+ // `speech_models` array and are rejected by `speech_model`.
+ // See https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model
+ if (this.modelId === 'best') {
+ body.speech_model = this.modelId as 'best';
+ warnings.push({
+ type: 'deprecated',
+ setting: `model '${this.modelId}'`,
+ message:
+ "The 'best' model is a legacy AssemblyAI model. Use 'universal-3-5-pro' instead. See documentation: https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model",
+ });
+ } else {
+ body.speech_models = [this.modelId];
+
+ // Forward-looking nudge: universal-3-5-pro is AssemblyAI's latest
+ // flagship and is set to replace universal-3-pro. Not a deprecation —
+ // both models still work — so this is an informational warning only.
+ if (
+ this.modelId === 'universal-3-pro' ||
+ this.modelId === 'universal-2'
+ ) {
+ const docsUrl =
+ 'https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model';
+ warnings.push({
+ type: 'other',
+ message:
+ this.modelId === 'universal-3-pro'
+ ? `'universal-3-5-pro' is AssemblyAI's latest flagship model and is set to replace 'universal-3-pro'. See ${docsUrl}`
+ : `'universal-3-5-pro' is AssemblyAI's latest flagship model. See ${docsUrl}`,
+ });
+ }
+ }
// Add provider-specific options
if (assemblyaiOptions) {
@@ -118,6 +155,103 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV4 {
assemblyaiOptions.webhookAuthHeaderValue ?? undefined;
body.webhook_url = assemblyaiOptions.webhookUrl ?? undefined;
body.word_boost = assemblyaiOptions.wordBoost ?? undefined;
+ body.keyterms_prompt = assemblyaiOptions.keytermsPrompt ?? undefined;
+ body.prompt = assemblyaiOptions.prompt ?? undefined;
+ body.temperature = assemblyaiOptions.temperature ?? undefined;
+ body.remove_audio_tags = assemblyaiOptions.removeAudioTags ?? undefined;
+ body.domain = assemblyaiOptions.domain ?? undefined;
+ body.redact_pii_return_unredacted =
+ assemblyaiOptions.redactPiiReturnUnredacted ?? undefined;
+ body.redact_static_entities =
+ assemblyaiOptions.redactStaticEntities ?? undefined;
+
+ if (assemblyaiOptions.speakerOptions) {
+ body.speaker_options = {
+ min_speakers_expected:
+ assemblyaiOptions.speakerOptions.minSpeakersExpected ?? undefined,
+ max_speakers_expected:
+ assemblyaiOptions.speakerOptions.maxSpeakersExpected ?? undefined,
+ };
+ }
+
+ if (assemblyaiOptions.languageDetectionOptions) {
+ body.language_detection_options = {
+ expected_languages:
+ assemblyaiOptions.languageDetectionOptions.expectedLanguages ??
+ undefined,
+ fallback_language:
+ assemblyaiOptions.languageDetectionOptions.fallbackLanguage ??
+ undefined,
+ code_switching:
+ assemblyaiOptions.languageDetectionOptions.codeSwitching ??
+ undefined,
+ code_switching_confidence_threshold:
+ assemblyaiOptions.languageDetectionOptions
+ .codeSwitchingConfidenceThreshold ?? undefined,
+ };
+ }
+
+ if (assemblyaiOptions.redactPiiAudioOptions) {
+ body.redact_pii_audio_options = {
+ return_redacted_no_speech_audio:
+ assemblyaiOptions.redactPiiAudioOptions
+ .returnRedactedNoSpeechAudio ?? undefined,
+ override_audio_redaction_method:
+ assemblyaiOptions.redactPiiAudioOptions
+ .overrideAudioRedactionMethod ?? undefined,
+ };
+ }
+
+ const deprecatedBoostOptions: string[] = [];
+ if (assemblyaiOptions.wordBoost != null) {
+ deprecatedBoostOptions.push('wordBoost');
+ }
+ if (assemblyaiOptions.boostParam != null) {
+ deprecatedBoostOptions.push('boostParam');
+ }
+ if (deprecatedBoostOptions.length > 0) {
+ warnings.push({
+ type: 'deprecated',
+ setting: deprecatedBoostOptions.join(', '),
+ message:
+ "'wordBoost' and 'boostParam' are deprecated and are rejected by 'universal-3-pro' / 'universal-3-5-pro' and 'slam-1'. Use 'keytermsPrompt' instead.",
+ });
+ }
+
+ // The following options only take effect alongside a prerequisite
+ // option; without it AssemblyAI either rejects the request (400) or
+ // silently ignores the option. Warn rather than mutate user input.
+ if (
+ (assemblyaiOptions.redactPiiReturnUnredacted != null ||
+ assemblyaiOptions.redactStaticEntities != null) &&
+ !assemblyaiOptions.redactPii
+ ) {
+ warnings.push({
+ type: 'other',
+ message:
+ "'redactPiiReturnUnredacted' and 'redactStaticEntities' require 'redactPii' to be enabled; AssemblyAI rejects the request otherwise.",
+ });
+ }
+ if (
+ assemblyaiOptions.redactPiiAudioOptions != null &&
+ !assemblyaiOptions.redactPiiAudio
+ ) {
+ warnings.push({
+ type: 'other',
+ message:
+ "'redactPiiAudioOptions' only applies when 'redactPiiAudio' is enabled; it is otherwise ignored.",
+ });
+ }
+ if (
+ assemblyaiOptions.languageCode != null &&
+ assemblyaiOptions.languageDetection
+ ) {
+ warnings.push({
+ type: 'other',
+ message:
+ "'languageDetection' cannot be combined with an explicit 'languageCode'; AssemblyAI rejects requests that set both.",
+ });
+ }
}
return {
@@ -137,17 +271,22 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV4 {
abortSignal?: AbortSignal,
): Promise<{
transcript: z.infer;
+ rawTranscript: unknown;
responseHeaders: Record;
}> {
const pollingInterval =
this.config.pollingInterval ?? this.POLLING_INTERVAL_MS;
+ // Honor a caller-provided fetch (proxy, auth injection, tests) for the
+ // polling GETs, matching the upload/submit calls that use config.fetch.
+ const fetchImpl = this.config.fetch ?? globalThis.fetch;
+
while (true) {
if (abortSignal?.aborted) {
throw new Error('Transcription request was aborted');
}
- const response = await fetch(
+ const response = await fetchImpl(
this.config.url({
path: `/v2/transcript/${transcriptId}`,
modelId: this.modelId,
@@ -173,13 +312,14 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV4 {
});
}
- const transcript = assemblyaiTranscriptionResponseSchema.parse(
- await response.json(),
- );
+ const rawTranscript = await response.json();
+ const transcript =
+ assemblyaiTranscriptionResponseSchema.parse(rawTranscript);
if (transcript.status === 'completed') {
return {
transcript,
+ rawTranscript,
responseHeaders: extractResponseHeaders(response),
};
}
@@ -240,29 +380,70 @@ export class AssemblyAITranscriptionModel implements TranscriptionModelV4 {
fetch: this.config.fetch,
});
- const { transcript, responseHeaders } = await this.waitForCompletion(
- submitResponse.id,
- options.headers,
- options.abortSignal,
- );
+ const { transcript, rawTranscript, responseHeaders } =
+ await this.waitForCompletion(
+ submitResponse.id,
+ options.headers,
+ options.abortSignal,
+ );
+
+ // Surface diarization and audio-intelligence results that the AI SDK's
+ // `segments` shape can't represent, keyed under `assemblyai`. Presence is
+ // gated on the parsed transcript, but values are taken from the raw
+ // response so no fields are stripped by the schema.
+ //
+ // NOTE: timings inside these objects (e.g. `utterances[].start`) are in
+ // milliseconds, matching the AssemblyAI API — unlike the top-level
+ // `segments`, whose `startSecond`/`endSecond` are in seconds.
+ const raw = (rawTranscript ?? {}) as Record;
+ const assemblyaiMetadata: Record = {};
+ if (transcript.utterances != null) {
+ assemblyaiMetadata.utterances = raw.utterances;
+ }
+ if (transcript.sentiment_analysis_results != null) {
+ assemblyaiMetadata.sentimentAnalysisResults =
+ raw.sentiment_analysis_results;
+ }
+ if (transcript.entities != null) {
+ assemblyaiMetadata.entities = raw.entities;
+ }
+ if (transcript.content_safety_labels != null) {
+ assemblyaiMetadata.contentSafetyLabels = raw.content_safety_labels;
+ }
+ if (transcript.iab_categories_result != null) {
+ assemblyaiMetadata.iabCategoriesResult = raw.iab_categories_result;
+ }
+ if (transcript.auto_highlights_result != null) {
+ assemblyaiMetadata.autoHighlightsResult = raw.auto_highlights_result;
+ }
+
+ const lastWordEndMs = transcript.words?.at(-1)?.end;
return {
text: transcript.text ?? '',
+ // AssemblyAI returns word timings in milliseconds; the AI SDK reports
+ // segment timings in seconds.
segments:
transcript.words?.map(word => ({
text: word.text,
- startSecond: word.start,
- endSecond: word.end,
+ startSecond: word.start / 1000,
+ endSecond: word.end / 1000,
})) ?? [],
language: transcript.language_code ?? undefined,
durationInSeconds:
- transcript.audio_duration ?? transcript.words?.at(-1)?.end ?? undefined,
+ transcript.audio_duration ??
+ (lastWordEndMs != null ? lastWordEndMs / 1000 : undefined),
warnings,
+ ...(Object.keys(assemblyaiMetadata).length > 0 && {
+ providerMetadata: {
+ assemblyai: assemblyaiMetadata,
+ } as SharedV4ProviderMetadata,
+ }),
response: {
timestamp: currentDate,
modelId: this.modelId,
headers: responseHeaders, // Headers from final GET request
- body: transcript, // Raw response from final GET request
+ body: rawTranscript, // Full raw response from final GET request
},
};
}
@@ -277,20 +458,65 @@ const assemblyaiSubmitResponseSchema = z.object({
status: z.enum(['queued', 'processing', 'completed', 'error']),
});
+const assemblyaiWordSchema = z.object({ // one transcribed word; reused for top-level `words` and for the words nested in each utterance
+ start: z.number(), // milliseconds (divided by 1000 when segments are built)
+ end: z.number(), // milliseconds
+ text: z.string(),
+ confidence: z.number().nullish(),
+ // Speaker label (e.g. 'A', 'B') when speaker diarization is enabled, else null.
+ speaker: z.string().nullish(),
+ channel: z.string().nullish(),
+});
+
const assemblyaiTranscriptionResponseSchema = z.object({
id: z.string(),
status: z.enum(['queued', 'processing', 'completed', 'error']),
text: z.string().nullish(),
language_code: z.string().nullish(),
- words: z
+ speech_model_used: z.string().nullish(),
+ words: z.array(assemblyaiWordSchema).nullish(), // mapped to `segments`; start/end are milliseconds
+ // Speaker-diarized utterances (present when `speaker_labels` is enabled); timings are in milliseconds.
+ utterances: z
.array(
z.object({
start: z.number(),
end: z.number(),
text: z.string(),
+ confidence: z.number().nullish(),
+ speaker: z.string().nullish(),
+ channel: z.string().nullish(),
+ words: z.array(assemblyaiWordSchema).nullish(),
+ }),
+ )
+ .nullish(),
+ // Audio-intelligence results, present only when the matching feature is
+ // enabled. Only the fields below are validated here; the complete structures
+ // remain available on the raw `response.body`.
+ sentiment_analysis_results: z
+ .array(
+ z.object({
+ text: z.string(),
+ start: z.number().nullish(),
+ end: z.number().nullish(),
+ sentiment: z.string(),
+ confidence: z.number().nullish(),
+ speaker: z.string().nullish(),
+ }),
+ )
+ .nullish(),
+ entities: z
+ .array(
+ z.object({
+ entity_type: z.string(),
+ text: z.string(),
+ start: z.number().nullish(),
+ end: z.number().nullish(),
}),
)
.nullish(),
+ content_safety_labels: z.record(z.string(), z.any()).nullish(), // only checked to be an object; contents are not validated
+ iab_categories_result: z.record(z.string(), z.any()).nullish(), // same: object with unvalidated contents
+ auto_highlights_result: z.record(z.string(), z.any()).nullish(), // same: object with unvalidated contents
audio_duration: z.number().nullish(),
error: z.string().nullish(),
});
diff --git a/packages/assemblyai/src/assemblyai-transcription-settings.ts b/packages/assemblyai/src/assemblyai-transcription-settings.ts
index 61ee8e09f29d..ef3ba0c1ba1a 100644
--- a/packages/assemblyai/src/assemblyai-transcription-settings.ts
+++ b/packages/assemblyai/src/assemblyai-transcription-settings.ts
@@ -1 +1,15 @@
-export type AssemblyAITranscriptionModelId = 'best' | 'nano' | (string & {});
+/**
+ * Legacy AssemblyAI speech model: the only id sent via the deprecated singular
+ * `speech_model` request parameter. Selecting it emits a `deprecated` warning.
+ *
+ * @deprecated Use `universal-3-5-pro` instead.
+ * @see https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model
+ */
+export type AssemblyAIDeprecatedTranscriptionModelId = 'best';
+
+export type AssemblyAITranscriptionModelId = // model ids accepted by the transcription model
+ | 'universal-2' // still works; selecting it adds an informational warning pointing to 'universal-3-5-pro'
+ | 'universal-3-pro' // still works; selecting it adds an informational warning pointing to 'universal-3-5-pro'
+ | 'universal-3-5-pro'
+ | AssemblyAIDeprecatedTranscriptionModelId // 'best': still accepted, emits a deprecation warning
+ | (string & {}); // any other id string is allowed while editors keep suggesting the literals above