transloadit · kvz · May 30, 2026 · May 30, 2026
diff --git a/packages/node/README.md b/packages/node/README.md
@@ -156,6 +156,7 @@ All intent commands also support the global CLI flags `--json`, `--log-level`, `
 | `document auto-rotate` | Auto-rotate documents to the correct orientation | file, dir, URL, base64 | file |
 | `document thumbs` | Extract thumbnail images from documents | file, dir, URL, base64 | directory |
 | `audio waveform` | Generate waveform images from audio | file, dir, URL, base64 | file |
+| `speech transcribe` | Transcribe speech in audio or video files | file, dir, URL, base64 | file |
 | `text speak` | Speak text | file, dir, URL, base64 | file |
 | `video thumbs` | Extract thumbnails from videos | file, dir, URL, base64 | directory |
 | `video encode-hls` | Run builtin/encode-hls-video@latest | file, dir, URL, base64 | directory |
@@ -473,7 +474,7 @@ npx transloadit image resize --input <path|dir|url|-> [options]
 
 | Flag | Type | Required | Example | Description |
 | --- | --- | --- | --- | --- |
-| `--format` | `string` | no | `value` | The output format for the modified image. Some of the most important available formats are "jpg", "png", "gif", and "tiff". For a complete lists of all formats that we can write… |
+| `--format` | `string` | no | `value` | The output format for the modified image. Some of the most important available formats are "jpg", "png", "gif", "tiff", and "jxl" for JPEG XL. For a complete list of all formats… |
 | `--width` | `number` | no | `1` | Width of the result in pixels. If not specified, will default to the width of the original. |
 | `--height` | `number` | no | `1` | Height of the new image, in pixels. If not specified, will default to the height of the input image. |
 | `--resize-strategy` | `string` | no | `crop` | See the list of available resize strategies. |
@@ -798,6 +799,48 @@ npx transloadit audio waveform --input <path|dir|url|-> [options]
 transloadit audio waveform --input input.mp3 --output output.png
 ```
 
+#### `speech transcribe`
+
+Transcribe speech in audio or video files
+
+Runs `/speech/transcribe` with a text-first default and writes the transcript to `--output`.
+
+**Usage**
+
+```bash
+npx transloadit speech transcribe --input <path|dir|url|-> [options]
+```
+
+**Quick facts**
+
+- Input: file, dir, URL, base64
+- Output: file
+- Execution: per-file; supports `--watch`
+- Backend: semantic alias `speech-transcribe`
+
+**Shared flags**
+
+- Uses the shared file input and output flags listed above.
+- Also supports the shared base processing flags and watch flags listed above.
+
+**Command options**
+
+| Flag | Type | Required | Example | Description |
+| --- | --- | --- | --- | --- |
+| `--provider` | `string` | no | `replicate` | Provider to use for transcription. Defaults to replicate. |
+| `--format` | `string` | no | `text` | Output format. Defaults to text. |
+| `--source-language` | `string` | no | `en-US` | Spoken language as a BCP-47 code, for providers that support explicit source languages. |
+| `--target-language` | `string` | no | `en-US` | Target written language for providers that support translation. |
+
+**Examples**
+
+```bash
+# Transcribe an audio file to text
+transloadit speech transcribe --input voice.opus --output voice.txt
+# Generate subtitles
+transloadit speech transcribe --input clip.mp4 --format webvtt --output captions.vtt
+```
+
 #### `text speak`
 
 Speak text
@@ -1896,11 +1939,3 @@ See [CONTRIBUTING](./CONTRIBUTING.md).
 
 
 
-
-
-
-
-
-
-
-
diff --git a/packages/node/docs/intent-commands.md b/packages/node/docs/intent-commands.md
@@ -24,6 +24,7 @@ All intent commands also support the global CLI flags `--json`, `--log-level`, `
 | `document auto-rotate` | Auto-rotate documents to the correct orientation | file, dir, URL, base64 | file |
 | `document thumbs` | Extract thumbnail images from documents | file, dir, URL, base64 | directory |
 | `audio waveform` | Generate waveform images from audio | file, dir, URL, base64 | file |
+| `speech transcribe` | Transcribe speech in audio or video files | file, dir, URL, base64 | file |
 | `text speak` | Speak text | file, dir, URL, base64 | file |
 | `video thumbs` | Extract thumbnails from videos | file, dir, URL, base64 | directory |
 | `video encode-hls` | Run builtin/encode-hls-video@latest | file, dir, URL, base64 | directory |
@@ -341,7 +342,7 @@ npx transloadit image resize --input <path|dir|url|-> [options]
 
 | Flag | Type | Required | Example | Description |
 | --- | --- | --- | --- | --- |
-| `--format` | `string` | no | `value` | The output format for the modified image. Some of the most important available formats are "jpg", "png", "gif", and "tiff". For a complete lists of all formats that we can write… |
+| `--format` | `string` | no | `value` | The output format for the modified image. Some of the most important available formats are "jpg", "png", "gif", "tiff", and "jxl" for JPEG XL. For a complete list of all formats… |
 | `--width` | `number` | no | `1` | Width of the result in pixels. If not specified, will default to the width of the original. |
 | `--height` | `number` | no | `1` | Height of the new image, in pixels. If not specified, will default to the height of the input image. |
 | `--resize-strategy` | `string` | no | `crop` | See the list of available resize strategies. |
@@ -666,6 +667,48 @@ npx transloadit audio waveform --input <path|dir|url|-> [options]
 transloadit audio waveform --input input.mp3 --output output.png
 ```
 
+## `speech transcribe`
+
+Transcribe speech in audio or video files
+
+Runs `/speech/transcribe` with a text-first default and writes the transcript to `--output`.
+
+**Usage**
+
+```bash
+npx transloadit speech transcribe --input <path|dir|url|-> [options]
+```
+
+**Quick facts**
+
+- Input: file, dir, URL, base64
+- Output: file
+- Execution: per-file; supports `--watch`
+- Backend: semantic alias `speech-transcribe`
+
+**Shared flags**
+
+- Uses the shared file input and output flags listed above.
+- Also supports the shared base processing flags and watch flags listed above.
+
+**Command options**
+
+| Flag | Type | Required | Example | Description |
+| --- | --- | --- | --- | --- |
+| `--provider` | `string` | no | `replicate` | Provider to use for transcription. Defaults to replicate. |
+| `--format` | `string` | no | `text` | Output format. Defaults to text. |
+| `--source-language` | `string` | no | `en-US` | Spoken language as a BCP-47 code, for providers that support explicit source languages. |
+| `--target-language` | `string` | no | `en-US` | Target written language for providers that support translation. |
+
+**Examples**
+
+```bash
+# Transcribe an audio file to text
+transloadit speech transcribe --input voice.opus --output voice.txt
+# Generate subtitles
+transloadit speech transcribe --input clip.mp4 --format webvtt --output captions.vtt
+```
+
 ## `text speak`
 
 Speak text

diff --git a/packages/node/src/cli/intentCommandSpecs.ts b/packages/node/src/cli/intentCommandSpecs.ts
@@ -247,6 +247,11 @@ export const intentCatalog = [
     meta: robotAudioWaveformMeta,
     schema: robotAudioWaveformInstructionsSchema,
   }),
+  defineSemanticIntent({
+    kind: 'semantic',
+    semantic: 'speech-transcribe',
+    paths: ['speech', 'transcribe'],
+  }),
   defineRobotIntent({
     kind: 'robot',
     robot: '/text/speak',

diff --git a/packages/node/src/cli/semanticIntents/index.ts b/packages/node/src/cli/semanticIntents/index.ts
@@ -10,6 +10,7 @@ import {
   markdownDocxSemanticIntentDescriptor,
   markdownPdfSemanticIntentDescriptor,
 } from './markdownPdf.ts'
+import { speechTranscribeSemanticIntentDescriptor } from './speechTranscribe.ts'
 
 export interface SemanticIntentPresentation {
   description: string
@@ -43,6 +44,7 @@ const semanticIntentDescriptors: Record<string, SemanticIntentDescriptor> = {
   'markdown-docx': {
     ...markdownDocxSemanticIntentDescriptor,
   },
+  'speech-transcribe': speechTranscribeSemanticIntentDescriptor,
 }
 
 export function getSemanticIntentDescriptor(name: string): SemanticIntentDescriptor {

diff --git a/packages/node/src/cli/semanticIntents/speechTranscribe.ts b/packages/node/src/cli/semanticIntents/speechTranscribe.ts
@@ -0,0 +1,139 @@
+import type {
+  IntentDynamicStepExecutionDefinition,
+  IntentOptionDefinition,
+} from '../intentRuntime.ts'
+import type { SemanticIntentDescriptor, SemanticIntentPresentation } from './index.ts'
+import { parseOptionalEnumValue } from './parsing.ts'
+
+// Candidate values handed to `parseOptionalEnumValue` as `supportedValues` for `--provider`.
+const speechTranscribeProviders = ['aws', 'gcp', 'replicate'] as const
+// Candidate values handed to `parseOptionalEnumValue` as `supportedValues` for `--format`.
+const speechTranscribeFormats = ['text', 'json', 'srt', 'webvtt'] as const
+
+// Literal unions derived from the tuples above, so the lists and the types cannot drift apart.
+type SpeechTranscribeProvider = (typeof speechTranscribeProviders)[number]
+type SpeechTranscribeFormat = (typeof speechTranscribeFormats)[number]
+
+// Fallbacks used when a flag yields no value; `satisfies` checks each is a listed member.
+const defaultSpeechTranscribeProvider = 'replicate' satisfies SpeechTranscribeProvider
+const defaultSpeechTranscribeFormat = 'text' satisfies SpeechTranscribeFormat
+
+/**
+ * Execution definition for `speech transcribe`: a `dynamic-step` intent whose handler is
+ * `speech-transcribe` and whose result step is named `transcribe`.
+ *
+ * `fields` declares the four optional string options. Each `name` matches the key that
+ * `createSpeechTranscribeStep` reads from its raw values (e.g. `source_language`), while
+ * `propertyName` is the camelCase counterpart (presumably the CLI class property — confirm
+ * in `intentRuntime.ts`).
+ *
+ * The `description` texts spell out the defaults literally ("Defaults to replicate.",
+ * "Defaults to text."), so they need a manual update if the default constants change.
+ */
+const speechTranscribeExecutionDefinition = {
+  kind: 'dynamic-step',
+  handler: 'speech-transcribe',
+  resultStepName: 'transcribe',
+  fields: [
+    {
+      name: 'provider',
+      kind: 'string',
+      propertyName: 'provider',
+      optionFlags: '--provider',
+      description: 'Provider to use for transcription. Defaults to replicate.',
+      required: false,
+      exampleValue: defaultSpeechTranscribeProvider,
+    },
+    {
+      name: 'format',
+      kind: 'string',
+      propertyName: 'format',
+      optionFlags: '--format',
+      description: 'Output format. Defaults to text.',
+      required: false,
+      exampleValue: defaultSpeechTranscribeFormat,
+    },
+    {
+      name: 'source_language',
+      kind: 'string',
+      propertyName: 'sourceLanguage',
+      optionFlags: '--source-language',
+      description:
+        'Spoken language as a BCP-47 code, for providers that support explicit source languages.',
+      required: false,
+      exampleValue: 'en-US',
+    },
+    {
+      name: 'target_language',
+      kind: 'string',
+      propertyName: 'targetLanguage',
+      optionFlags: '--target-language',
+      description: 'Target written language for providers that support translation.',
+      required: false,
+      exampleValue: 'en-US',
+    },
+  ] as const satisfies readonly IntentOptionDefinition[],
+} satisfies IntentDynamicStepExecutionDefinition
+
+/**
+ * Presentation text for the command: a one-line description, a details sentence, and
+ * `[label, command]` example pairs.
+ *
+ * NOTE(review): the inner `as Array<[string, string]>` keeps `examples` from being narrowed
+ * to readonly tuples by the outer `as const` — presumably because
+ * `SemanticIntentPresentation` expects a mutable array; confirm against that interface.
+ */
+const speechTranscribeCommandPresentation = {
+  description: 'Transcribe speech in audio or video files',
+  details:
+    'Runs `/speech/transcribe` with a text-first default and writes the transcript to `--output`.',
+  examples: [
+    [
+      'Transcribe an audio file to text',
+      'transloadit speech transcribe --input voice.opus --output voice.txt',
+    ],
+    [
+      'Generate subtitles',
+      'transloadit speech transcribe --input clip.mp4 --format webvtt --output captions.vtt',
+    ],
+  ] as Array<[string, string]>,
+} as const satisfies SemanticIntentPresentation
+
+/**
+ * Resolves the transcription provider from the raw `--provider` value.
+ *
+ * Delegates to `parseOptionalEnumValue` with the provider list as `supportedValues`; when
+ * that helper yields `null`/`undefined`, the default provider is used instead.
+ *
+ * @param value Raw option value.
+ * @returns The provider to put on the step.
+ */
+function parseProvider(value: unknown): SpeechTranscribeProvider {
+  const requestedProvider = parseOptionalEnumValue({
+    flagName: '--provider',
+    supportedValues: speechTranscribeProviders,
+    value,
+  })
+
+  return requestedProvider ?? defaultSpeechTranscribeProvider
+}
+
+/**
+ * Resolves the output format from the raw `--format` value.
+ *
+ * Delegates to `parseOptionalEnumValue` with the format list as `supportedValues`; when
+ * that helper yields `null`/`undefined`, the default format is used instead.
+ *
+ * @param value Raw option value.
+ * @returns The format to put on the step.
+ */
+function parseFormat(value: unknown): SpeechTranscribeFormat {
+  const requestedFormat = parseOptionalEnumValue({
+    flagName: '--format',
+    supportedValues: speechTranscribeFormats,
+    value,
+  })
+
+  return requestedFormat ?? defaultSpeechTranscribeFormat
+}
+
+/**
+ * Normalizes an optional string flag value.
+ *
+ * @param value Raw flag value.
+ * @param flagName Flag name used in the error message.
+ * @returns The trimmed string, or `null` when the value is null/undefined, empty, or
+ *   whitespace-only.
+ * @throws Error when the value is present but not a string.
+ */
+function parseOptionalString(value: unknown, flagName: string): string | null {
+  if (value != null && typeof value !== 'string') {
+    throw new Error(`${flagName} must be a string`)
+  }
+
+  // null/undefined collapse to '' here so every "no value" case shares one exit below.
+  const normalized = typeof value === 'string' ? value.trim() : ''
+  return normalized === '' ? null : normalized
+}
+
+/**
+ * Builds the `/speech/transcribe` step from the raw option values.
+ *
+ * `provider` and `format` are always set, because their parsers fall back to a default.
+ * `source_language` and `target_language` are only added when a non-blank string was given.
+ *
+ * @param rawValues Raw option values, read by the keys `provider`, `format`,
+ *   `source_language`, and `target_language`.
+ * @param _context Not used by this intent.
+ * @returns The step object, which uses `:original` and sets `result: true`.
+ * @throws Error when a language option is present but not a string. The enum parsing for
+ *   `provider`/`format` is delegated to `parseOptionalEnumValue` and may reject values too
+ *   (not visible here — confirm in `parsing.ts`).
+ */
+function createSpeechTranscribeStep(
+  rawValues: Record<string, unknown>,
+  _context: { hasInputs: boolean },
+): Record<string, unknown> {
+  // Parse in flag order so the first invalid option is the one reported.
+  const step: Record<string, unknown> = {
+    robot: '/speech/transcribe',
+    use: ':original',
+    result: true,
+    provider: parseProvider(rawValues.provider),
+    format: parseFormat(rawValues.format),
+  }
+
+  const spokenLanguage = parseOptionalString(rawValues.source_language, '--source-language')
+  const writtenLanguage = parseOptionalString(rawValues.target_language, '--target-language')
+
+  if (spokenLanguage !== null) {
+    step.source_language = spokenLanguage
+  }
+  if (writtenLanguage !== null) {
+    step.target_language = writtenLanguage
+  }
+
+  return step
+}
+
+/**
+ * Descriptor registered for the `speech-transcribe` semantic intent.
+ *
+ * Bundles the step builder, the execution/option definition, and the presentation text,
+ * requires input (`inputPolicy.kind: 'required'`), and uses the `watchable` runner kind.
+ *
+ * NOTE(review): `defaultOutputPath` is always `output.txt`, even when `--format` is `json`,
+ * `srt`, or `webvtt` — confirm the `.txt` extension is acceptable for those formats.
+ */
+export const speechTranscribeSemanticIntentDescriptor = {
+  createStep: createSpeechTranscribeStep,
+  defaultOutputPath: 'output.txt',
+  execution: speechTranscribeExecutionDefinition,
+  inputPolicy: { kind: 'required' },
+  outputDescription: 'Write the transcript to this path or directory',
+  presentation: speechTranscribeCommandPresentation,
+  runnerKind: 'watchable',
+} as const satisfies SemanticIntentDescriptor
diff --git a/packages/node/test/support/intentSmokeCases.ts b/packages/node/test/support/intentSmokeCases.ts
@@ -90,6 +90,11 @@ const intentSmokeOverrides: Record<string, Omit<IntentSmokeCase, 'key' | 'paths'
     outputPath: 'markdown-docx.docx',
     verifier: 'docx',
   },
+  'speech-transcribe:speech/transcribe': {
+    args: ['--input', '@fixture/input.mp3', '--format', 'json'],
+    outputPath: 'speech-transcribe.json',
+    verifier: 'json',
+  },
   '/image/optimize': {
     args: ['--input', '@fixture/input.jpg'],
     outputPath: 'image-optimize.jpg',