From 77863c6921c7492aca8e360d703f86f9096d90d3 Mon Sep 17 00:00:00 2001 From: Hao Liu Date: Wed, 10 Jun 2026 10:40:08 +0800 Subject: [PATCH 01/10] Add Speech Recognition WCR API sample Adds a Speech Recognition sample under WCRAPIs that transcribes audio locally on device: live microphone streaming plus batch and streaming recognition from an audio file. Registers the Speech API in apis.json, WcrApiHelpers, and WcrApiCodeSnippet, adds the microphone capability to the app manifests, and upgrades Microsoft.WindowsAppSDK to 2.2.2-experimental9 (Microsoft.WindowsAppSDK.ML 2.1.75-experimental). --- AIDevGallery/Package.Store.appxmanifest | 1 + AIDevGallery/Package.appxmanifest | 1 + .../Definitions/WcrApis/WcrApiCodeSnippet.cs | 51 ++ .../Definitions/WcrApis/WcrApiHelpers.cs | 26 + .../Samples/Definitions/WcrApis/apis.json | 11 + .../Samples/WCRAPIs/SpeechRecognition.xaml | 98 +++ .../Samples/WCRAPIs/SpeechRecognition.xaml.cs | 645 ++++++++++++++++++ Directory.Packages.props | 4 +- 8 files changed, 835 insertions(+), 2 deletions(-) create mode 100644 AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml create mode 100644 AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs diff --git a/AIDevGallery/Package.Store.appxmanifest b/AIDevGallery/Package.Store.appxmanifest index 86a6319c..6a677c7e 100644 --- a/AIDevGallery/Package.Store.appxmanifest +++ b/AIDevGallery/Package.Store.appxmanifest @@ -62,5 +62,6 @@ + diff --git a/AIDevGallery/Package.appxmanifest b/AIDevGallery/Package.appxmanifest index 168a7315..fc2714af 100644 --- a/AIDevGallery/Package.appxmanifest +++ b/AIDevGallery/Package.appxmanifest @@ -62,5 +62,6 @@ + diff --git a/AIDevGallery/Samples/Definitions/WcrApis/WcrApiCodeSnippet.cs b/AIDevGallery/Samples/Definitions/WcrApis/WcrApiCodeSnippet.cs index b9f7ec1e..48bf2b3e 100644 --- a/AIDevGallery/Samples/Definitions/WcrApis/WcrApiCodeSnippet.cs +++ b/AIDevGallery/Samples/Definitions/WcrApis/WcrApiCodeSnippet.cs @@ -721,6 +721,57 @@ public async void IndexStatisticsSample() } } """"" + }, + { + ModelType.SpeechRecognition, """" + using Microsoft.Windows.AI; + using Microsoft.Windows.AI.MachineLearning; + using Microsoft.Windows.AI.Speech; + + // Speech recognition runs on the device's execution providers (CPU / GPU / NPU). + var catalog = ExecutionProviderCatalog.GetDefault(); + await catalog.EnsureAndRegisterCertifiedAsync(); + + var readyState = SpeechRecognitionModel.GetReadyState(); + if (readyState == AIFeatureReadyState.NotReady) + { + var ensureOp = await SpeechRecognitionModel.EnsureReadyAsync(); + if (ensureOp.Status != AIFeatureReadyResultState.Success) + { + throw new InvalidOperationException("Speech model could not be prepared."); + } + } + + var modelResult = await SpeechRecognitionModel.TryCreateAsync(); + if (modelResult.ExtendedError != null) + { + throw modelResult.ExtendedError; + } + + using SpeechRecognitionModel speechModel = modelResult.SpeechModel; + + // Stream audio from the default microphone. Pass an empty deviceId to use the system default. + var audioConfig = AudioConfiguration.FromAudioDevice(string.Empty); + + using var streaming = new StreamingRecognition(audioConfig, speechModel); + + streaming.Recognizing += (_, args) => + { + // Interim hypothesis (updates frequently as more audio arrives). + Console.WriteLine($"[interim] {args.Text}"); + }; + + streaming.Recognized += (_, args) => + { + // Final result for a stable utterance. + Console.WriteLine($"[final] offset={args.Offset:F2}s duration={args.Duration:F2}s: {args.Text}"); + }; + + await streaming.StartContinuousRecognitionAsync(); + + // ... let captions stream in. When done: + streaming.StopContinuousRecognition(); + """" } }; } \ No newline at end of file diff --git a/AIDevGallery/Samples/Definitions/WcrApis/WcrApiHelpers.cs b/AIDevGallery/Samples/Definitions/WcrApis/WcrApiHelpers.cs index 0becb093..d6b37a13 100644 --- a/AIDevGallery/Samples/Definitions/WcrApis/WcrApiHelpers.cs +++ b/AIDevGallery/Samples/Definitions/WcrApis/WcrApiHelpers.cs @@ -5,10 +5,13 @@ using AIDevGallery.Utils; using Microsoft.Windows.AI; using Microsoft.Windows.AI.Imaging; +using Microsoft.Windows.AI.MachineLearning; +using Microsoft.Windows.AI.Speech; using Microsoft.Windows.AI.Text; using Microsoft.Windows.AI.Video; using System; using System.Collections.Generic; +using System.Runtime.InteropServices.WindowsRuntime; using Windows.Foundation; namespace AIDevGallery.Samples; @@ -80,6 +83,9 @@ internal static class WcrApiHelpers }, { ModelType.VideoSuperRes, VideoScaler.GetReadyState + }, + { + ModelType.SpeechRecognition, SpeechRecognitionModel.GetReadyState } }; @@ -132,9 +138,29 @@ internal static class WcrApiHelpers }, { ModelType.VideoSuperRes, VideoScaler.EnsureReadyAsync + }, + { + ModelType.SpeechRecognition, EnsureSpeechRecognitionModelReadyAsync } }; + // SpeechRecognitionModel.EnsureReadyAsync reports progress as SpeechRecognitionModelProgress, + // so adapt it to the IAsyncOperationWithProgress shape the gallery expects. + private static IAsyncOperationWithProgress EnsureSpeechRecognitionModelReadyAsync() + { + return AsyncInfo.Run(async (cancellationToken, progress) => + { + progress.Report(0); + var catalog = ExecutionProviderCatalog.GetDefault(); + await catalog.EnsureAndRegisterCertifiedAsync().AsTask(cancellationToken); + + var inner = SpeechRecognitionModel.EnsureReadyAsync(); + inner.Progress = (_, p) => progress.Report(p.Progress); + using var registration = cancellationToken.Register(() => inner.Cancel()); + return await inner; + }); + } + // this is a workaround for GetReadyState not returning Ready after EnsureReadyAsync is called // for now, we will track when EnsureReadyAsync succeeds for each model to ensure we are not // blocking the samples from running until this bug is fixed diff --git a/AIDevGallery/Samples/Definitions/WcrApis/apis.json b/AIDevGallery/Samples/Definitions/WcrApis/apis.json index 61b79841..7311baee 100644 --- a/AIDevGallery/Samples/Definitions/WcrApis/apis.json +++ b/AIDevGallery/Samples/Definitions/WcrApis/apis.json @@ -218,6 +218,17 @@ "ReadmeUrl": "https://github.com/MicrosoftDocs/windows-ai-docs/blob/docs/docs/apis/video-super-resolution.md", "License": "ms-pl", "SampleIdToShowInDocs": "c3252e18-1d47-4689-adae-78fc66968650" + }, + "SpeechRecognition": { + "Id": "0d4f1c2a-7e3b-4c9e-9b8a-1f2d3c4b5a60", + "Name": "Speech Recognition", + "Icon": "WCRAPI.svg", + "IconGlyph": "\uE720", + "Description": "Continuously transcribe audio using local speech recognition.", + "ReadmeUrl": "https://github.com/MicrosoftDocs/windows-ai-docs/blob/docs/docs/apis/speech-recognition.md", + "License": "ms-pl", + "SampleIdToShowInDocs": "9c5b2e8a-1f7d-4d3c-9e6a-3b1c8e7f4d20", + "Category": "Speech" } } } diff --git a/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml b/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml new file mode 100644 index 00000000..07077fb9 --- /dev/null +++ b/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml @@ -0,0 +1,98 @@ + + + + + + + + + + + + + + + + + + + + + +