From 412449385c5246291268a3b80f4905049e458c3f Mon Sep 17 00:00:00 2001 From: bayger Date: Mon, 15 Jun 2026 15:12:44 +0200 Subject: [PATCH] feat: add barge-in silence timeout and placeholder to VAD settings --- src/api/generated/data-contracts.ts | 13 +- src/api/openapi.json | 15 +- src/api/websocket/websocket-contracts.json | 15 +- src/api/websocket/websocket-contracts.ts | 12 +- .../modals/ServerVadSettingsModal.vue | 554 ++++++++++-------- 5 files changed, 346 insertions(+), 263 deletions(-) diff --git a/src/api/generated/data-contracts.ts b/src/api/generated/data-contracts.ts index 3002054..985eb1b 100644 --- a/src/api/generated/data-contracts.ts +++ b/src/api/generated/data-contracts.ts @@ -964,6 +964,15 @@ export type ServerVadConfig = ( ) & { /** Optional Smart Turn endpoint detection configuration. Runs after VAD silence detection to verify turn completion. */ smartTurn?: SmartTurnConfig; + /** + * Duration in milliseconds to wait for the user to continue speaking after a barge-in interrupt. If silence is detected for this duration, ASR is stopped. Default: 3000. + * @min 500 + * @max 10000 + * @default 3000 + */ + bargeInSilenceTimeout?: number; + /** Optional placeholder text fed to the AI as user input when the user barge-ins but then stops speaking before the bargeInSilenceTimeout. The AI generates a response based on this prompt (e.g. "[you misheard something the user said]"). Default: [repeat after interruption]. */ + bargeInSilencePlaceholder?: string; }; export interface LegacyVadConfig { @@ -1064,12 +1073,12 @@ export interface FireRedVadConfig { */ minSpeechFrame?: number; /** - * Maximum consecutive speech frames before a forced speech_end (long-utterance cutoff). Default: 2000. + * Maximum consecutive speech frames before a forced speech_end (long-utterance cutoff). Default: 6000. * @min 1 */ maxSpeechFrame?: number; /** - * Minimum consecutive silence frames after speech before speech_end is emitted. Default: 20. + * Minimum consecutive silence frames after speech before speech_end is emitted. Default: 80. * @min 1 */ minSilenceFrame?: number; diff --git a/src/api/openapi.json b/src/api/openapi.json index 9e86d2c..c550744 100644 --- a/src/api/openapi.json +++ b/src/api/openapi.json @@ -1557,6 +1557,17 @@ "properties": { "smartTurn": { "$ref": "#/components/schemas/SmartTurnConfig" + }, + "bargeInSilenceTimeout": { + "type": "integer", + "minimum": 500, + "maximum": 10000, + "default": 3000, + "description": "Duration in milliseconds to wait for the user to continue speaking after a barge-in interrupt. If silence is detected for this duration, ASR is stopped. Default: 3000." + }, + "bargeInSilencePlaceholder": { + "type": "string", + "description": "Optional placeholder text fed to the AI as user input when the user barge-ins but then stops speaking before the bargeInSilenceTimeout. The AI generates a response based on this prompt (e.g. \"[you misheard something the user said]\"). Default: [repeat after interruption]." } } } @@ -1718,12 +1729,12 @@ "maxSpeechFrame": { "type": "integer", "minimum": 1, - "description": "Maximum consecutive speech frames before a forced speech_end (long-utterance cutoff). Default: 2000." + "description": "Maximum consecutive speech frames before a forced speech_end (long-utterance cutoff). Default: 6000." }, "minSilenceFrame": { "type": "integer", "minimum": 1, - "description": "Minimum consecutive silence frames after speech before speech_end is emitted. Default: 20." + "description": "Minimum consecutive silence frames after speech before speech_end is emitted. Default: 80." }, "padStartFrame": { "type": "integer", diff --git a/src/api/websocket/websocket-contracts.json b/src/api/websocket/websocket-contracts.json index 686fc3e..8d307f4 100644 --- a/src/api/websocket/websocket-contracts.json +++ b/src/api/websocket/websocket-contracts.json @@ -778,6 +778,17 @@ "properties": { "smartTurn": { "$ref": "#/definitions/SmartTurnConfig" + }, + "bargeInSilenceTimeout": { + "type": "integer", + "minimum": 500, + "maximum": 10000, + "default": 3000, + "description": "Duration in milliseconds to wait for the user to continue speaking after a barge-in interrupt. If silence is detected for this duration, ASR is stopped. Default: 3000." + }, + "bargeInSilencePlaceholder": { + "type": "string", + "description": "Optional placeholder text fed to the AI as user input when the user barge-ins but then stops speaking before the bargeInSilenceTimeout. The AI generates a response based on this prompt (e.g. \"[you misheard something the user said]\"). Default: [repeat after interruption]." } } } @@ -940,12 +951,12 @@ "maxSpeechFrame": { "type": "integer", "minimum": 1, - "description": "Maximum consecutive speech frames before a forced speech_end (long-utterance cutoff). Default: 2000." + "description": "Maximum consecutive speech frames before a forced speech_end (long-utterance cutoff). Default: 6000." }, "minSilenceFrame": { "type": "integer", "minimum": 1, - "description": "Minimum consecutive silence frames after speech before speech_end is emitted. Default: 20." + "description": "Minimum consecutive silence frames after speech before speech_end is emitted. Default: 80." }, "padStartFrame": { "type": "integer", diff --git a/src/api/websocket/websocket-contracts.ts b/src/api/websocket/websocket-contracts.ts index 8d24579..87d2e04 100644 --- a/src/api/websocket/websocket-contracts.ts +++ b/src/api/websocket/websocket-contracts.ts @@ -371,11 +371,11 @@ export interface FireRedVadConfig { */ minSpeechFrame?: number; /** - * Maximum consecutive speech frames before a forced speech_end (long-utterance cutoff). Default: 2000. + * Maximum consecutive speech frames before a forced speech_end (long-utterance cutoff). Default: 6000. */ maxSpeechFrame?: number; /** - * Minimum consecutive silence frames after speech before speech_end is emitted. Default: 20. + * Minimum consecutive silence frames after speech before speech_end is emitted. Default: 80. */ minSilenceFrame?: number; /** @@ -407,6 +407,14 @@ export interface SmartTurnConfig { */ export type ServerVadConfig = (LegacyVadConfig | SileroVadConfig | FireRedVadConfig) & { smartTurn?: SmartTurnConfig; + /** + * Duration in milliseconds to wait for the user to continue speaking after a barge-in interrupt. If silence is detected for this duration, ASR is stopped. Default: 3000. + */ + bargeInSilenceTimeout?: number; + /** + * Optional placeholder text fed to the AI as user input when the user barge-ins but then stops speaking before the bargeInSilenceTimeout. The AI generates a response based on this prompt (e.g. "[you misheard something the user said]"). Default: [repeat after interruption]. + */ + bargeInSilencePlaceholder?: string; }; diff --git a/src/components/modals/ServerVadSettingsModal.vue b/src/components/modals/ServerVadSettingsModal.vue index 965ffc3..86e90e7 100644 --- a/src/components/modals/ServerVadSettingsModal.vue +++ b/src/components/modals/ServerVadSettingsModal.vue @@ -3,76 +3,32 @@