From 77863c6921c7492aca8e360d703f86f9096d90d3 Mon Sep 17 00:00:00 2001
From: Hao Liu <liuhao3418@gmail.com>
Date: Wed, 10 Jun 2026 10:40:08 +0800
Subject: [PATCH 01/10] Add Speech Recognition WCR API sample

Adds a Speech Recognition sample under WCRAPIs that transcribes audio locally on device: live microphone streaming plus batch and streaming recognition from an audio file. Registers the Speech API in apis.json, WcrApiHelpers, and WcrApiCodeSnippet, adds the microphone capability to the app manifests, and upgrades Microsoft.WindowsAppSDK to 2.2.2-experimental9 (Microsoft.WindowsAppSDK.ML 2.1.75-experimental).
---
 AIDevGallery/Package.Store.appxmanifest       |   1 +
 AIDevGallery/Package.appxmanifest             |   1 +
 .../Definitions/WcrApis/WcrApiCodeSnippet.cs  |  51 ++
 .../Definitions/WcrApis/WcrApiHelpers.cs      |  26 +
 .../Samples/Definitions/WcrApis/apis.json     |  11 +
 .../Samples/WCRAPIs/SpeechRecognition.xaml    |  98 +++
 .../Samples/WCRAPIs/SpeechRecognition.xaml.cs | 645 ++++++++++++++++++
 Directory.Packages.props                      |   4 +-
 8 files changed, 835 insertions(+), 2 deletions(-)
 create mode 100644 AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml
 create mode 100644 AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs
diff --git a/AIDevGallery/Package.Store.appxmanifest b/AIDevGallery/Package.Store.appxmanifest
index 86a6319c..6a677c7e 100644
--- a/AIDevGallery/Package.Store.appxmanifest
+++ b/AIDevGallery/Package.Store.appxmanifest
@@ -62,5 +62,6 @@
   <Capabilities>
     <rescap:Capability Name="runFullTrust" />
 	<systemai:Capability Name="systemAIModels"/>
+    <DeviceCapability Name="microphone" />
   </Capabilities>
 </Package>
diff --git a/AIDevGallery/Package.appxmanifest b/AIDevGallery/Package.appxmanifest
index 168a7315..fc2714af 100644
--- a/AIDevGallery/Package.appxmanifest
+++ b/AIDevGallery/Package.appxmanifest
@@ -62,5 +62,6 @@
   <Capabilities>
     <rescap:Capability Name="runFullTrust" />
     <systemai:Capability Name="systemAIModels"/>
+    <DeviceCapability Name="microphone" />
   </Capabilities>
 </Package>
diff --git a/AIDevGallery/Samples/Definitions/WcrApis/WcrApiCodeSnippet.cs b/AIDevGallery/Samples/Definitions/WcrApis/WcrApiCodeSnippet.cs
index b9f7ec1e..48bf2b3e 100644
--- a/AIDevGallery/Samples/Definitions/WcrApis/WcrApiCodeSnippet.cs
+++ b/AIDevGallery/Samples/Definitions/WcrApis/WcrApiCodeSnippet.cs
@@ -721,6 +721,57 @@ public async void IndexStatisticsSample()
                 }
             }
             """""
+        },
+        {
+            ModelType.SpeechRecognition, """"
+            using Microsoft.Windows.AI;
+            using Microsoft.Windows.AI.MachineLearning;
+            using Microsoft.Windows.AI.Speech;
+
+            // Speech recognition runs on the device's execution providers (CPU / GPU / NPU).
+            var catalog = ExecutionProviderCatalog.GetDefault();
+            await catalog.EnsureAndRegisterCertifiedAsync();
+
+            var readyState = SpeechRecognitionModel.GetReadyState();
+            if (readyState == AIFeatureReadyState.NotReady)
+            {
+                var ensureOp = await SpeechRecognitionModel.EnsureReadyAsync();
+                if (ensureOp.Status != AIFeatureReadyResultState.Success)
+                {
+                    throw new InvalidOperationException("Speech model could not be prepared.");
+                }
+            }
+
+            var modelResult = await SpeechRecognitionModel.TryCreateAsync();
+            if (modelResult.ExtendedError != null)
+            {
+                throw modelResult.ExtendedError;
+            }
+
+            using SpeechRecognitionModel speechModel = modelResult.SpeechModel;
+
+            // Stream audio from the default microphone. Pass an empty deviceId to use the system default.
+            var audioConfig = AudioConfiguration.FromAudioDevice(string.Empty);
+
+            using var streaming = new StreamingRecognition(audioConfig, speechModel);
+
+            streaming.Recognizing += (_, args) =>
+            {
+                // Interim hypothesis (updates frequently as more audio arrives).
+                Console.WriteLine($"[interim] {args.Text}");
+            };
+
+            streaming.Recognized += (_, args) =>
+            {
+                // Final result for a stable utterance.
+                Console.WriteLine($"[final] offset={args.Offset:F2}s duration={args.Duration:F2}s: {args.Text}");
+            };
+
+            var session = streaming.StartContinuousRecognitionAsync();
+            // ... let captions stream in (the start operation completes only when the session ends). When done:
+            streaming.StopContinuousRecognition();
+            await session;
+            """"
         }
     };
 }
\ No newline at end of file
diff --git a/AIDevGallery/Samples/Definitions/WcrApis/WcrApiHelpers.cs b/AIDevGallery/Samples/Definitions/WcrApis/WcrApiHelpers.cs
index 0becb093..d6b37a13 100644
--- a/AIDevGallery/Samples/Definitions/WcrApis/WcrApiHelpers.cs
+++ b/AIDevGallery/Samples/Definitions/WcrApis/WcrApiHelpers.cs
@@ -5,10 +5,13 @@
 using AIDevGallery.Utils;
 using Microsoft.Windows.AI;
 using Microsoft.Windows.AI.Imaging;
+using Microsoft.Windows.AI.MachineLearning;
+using Microsoft.Windows.AI.Speech;
 using Microsoft.Windows.AI.Text;
 using Microsoft.Windows.AI.Video;
 using System;
 using System.Collections.Generic;
+using System.Runtime.InteropServices.WindowsRuntime;
 using Windows.Foundation;
 
 namespace AIDevGallery.Samples;
@@ -80,6 +83,9 @@ internal static class WcrApiHelpers
         },
         {
             ModelType.VideoSuperRes, VideoScaler.GetReadyState
+        },
+        {
+            ModelType.SpeechRecognition, SpeechRecognitionModel.GetReadyState
         }
     };
 
@@ -132,9 +138,29 @@ internal static class WcrApiHelpers
         },
         {
             ModelType.VideoSuperRes, VideoScaler.EnsureReadyAsync
+        },
+        {
+            ModelType.SpeechRecognition, EnsureSpeechRecognitionModelReadyAsync
         }
     };
 
+    // Bridges SpeechRecognitionModel.EnsureReadyAsync, whose progress payload is SpeechRecognitionModelProgress,
+    // to the IAsyncOperationWithProgress<AIFeatureReadyResult, double> shape the gallery expects.
+    private static IAsyncOperationWithProgress<AIFeatureReadyResult, double> EnsureSpeechRecognitionModelReadyAsync()
+    {
+        return AsyncInfo.Run<AIFeatureReadyResult, double>(async (token, reporter) =>
+        {
+            reporter.Report(0);
+            await ExecutionProviderCatalog.GetDefault().EnsureAndRegisterCertifiedAsync().AsTask(token);
+
+            // Forward only the numeric Progress member and relay cancellation to the underlying operation.
+            var ensureOperation = SpeechRecognitionModel.EnsureReadyAsync();
+            ensureOperation.Progress = (_, modelProgress) => reporter.Report(modelProgress.Progress);
+            using var cancelHook = token.Register(() => ensureOperation.Cancel());
+            return await ensureOperation;
+        });
+    }
+
     // this is a workaround for GetReadyState not returning Ready after EnsureReadyAsync is called
     // for now, we will track when EnsureReadyAsync succeeds for each model to ensure we are not
     // blocking the samples from running until this bug is fixed
diff --git a/AIDevGallery/Samples/Definitions/WcrApis/apis.json b/AIDevGallery/Samples/Definitions/WcrApis/apis.json
index 61b79841..7311baee 100644
--- a/AIDevGallery/Samples/Definitions/WcrApis/apis.json
+++ b/AIDevGallery/Samples/Definitions/WcrApis/apis.json
@@ -218,6 +218,17 @@
         "ReadmeUrl": "https://github.com/MicrosoftDocs/windows-ai-docs/blob/docs/docs/apis/video-super-resolution.md",
         "License": "ms-pl",
         "SampleIdToShowInDocs": "c3252e18-1d47-4689-adae-78fc66968650"
+      },
+      "SpeechRecognition": {
+        "Id": "0d4f1c2a-7e3b-4c9e-9b8a-1f2d3c4b5a60",
+        "Name": "Speech Recognition",
+        "Icon": "WCRAPI.svg",
+        "IconGlyph": "\uE720",
+        "Description": "Continuously transcribe audio using local speech recognition.",
+        "ReadmeUrl": "https://github.com/MicrosoftDocs/windows-ai-docs/blob/docs/docs/apis/speech-recognition.md",
+        "License": "ms-pl",
+        "SampleIdToShowInDocs": "9c5b2e8a-1f7d-4d3c-9e6a-3b1c8e7f4d20",
+        "Category": "Speech"
       }
     }
   }
diff --git a/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml b/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml
new file mode 100644
index 00000000..07077fb9
--- /dev/null
+++ b/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml
@@ -0,0 +1,98 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<samples:BaseSamplePage
+    x:Class="AIDevGallery.Samples.WCRAPIs.SpeechRecognition"
+    xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
+    xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
+    xmlns:d="http://schemas.microsoft.com/expression/blend/2008"
+    xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
+    xmlns:samples="using:AIDevGallery.Samples"
+    mc:Ignorable="d">
+    <Grid RowSpacing="12">
+        <Grid.RowDefinitions>
+            <RowDefinition Height="*" />
+            <RowDefinition Height="Auto" />
+            <RowDefinition Height="Auto" />
+        </Grid.RowDefinitions>
+        <!--  Row 0: scrollable transcript with the final text first and the interim hypothesis after it  -->
+        <Border
+            Grid.Row="0"
+            Background="{ThemeResource CardBackgroundFillColorDefaultBrush}"
+            BorderBrush="{ThemeResource ControlStrongStrokeColorDefaultBrush}"
+            BorderThickness="1"
+            CornerRadius="{StaticResource OverlayCornerRadius}">
+            <ScrollViewer
+                x:Name="TranscriptionScrollViewer"
+                Padding="16"
+                HorizontalScrollBarVisibility="Disabled"
+                VerticalScrollBarVisibility="Auto">
+                <StackPanel Spacing="4">
+                    <TextBlock
+                        x:Name="FinalTranscriptionTextBlock"
+                        AutomationProperties.LiveSetting="Polite"
+                        AutomationProperties.Name="Final transcription"
+                        FontFamily="Cascadia Code"
+                        FontSize="16"
+                        IsTextSelectionEnabled="True"
+                        Text="Press Start to begin speech recognition..."
+                        TextWrapping="Wrap" />
+                    <TextBlock
+                        x:Name="InterimTranscriptionTextBlock"
+                        AutomationProperties.LiveSetting="Polite"
+                        AutomationProperties.Name="Interim transcription"
+                        FontFamily="Cascadia Code"
+                        FontSize="16"
+                        FontStyle="Italic"
+                        Foreground="{ThemeResource TextFillColorSecondaryBrush}"
+                        IsTextSelectionEnabled="True"
+                        TextWrapping="Wrap" />
+                </StackPanel>
+            </ScrollViewer>
+        </Border>
+        <!--  Row 1: status bar, closed initially; the code-behind opens it and sets its message  -->
+        <InfoBar
+            x:Name="StatusInfoBar"
+            Grid.Row="1"
+            IsClosable="False"
+            IsOpen="False"
+            Severity="Informational" />
+        <!--  Row 2: commands for microphone start/stop, file recognition (batch or streaming) and clear  -->
+        <StackPanel
+            Grid.Row="2"
+            HorizontalAlignment="Center"
+            Orientation="Horizontal"
+            Spacing="8">
+            <Button
+                x:Name="StartStopButton"
+                MinWidth="180"
+                AutomationProperties.Name="Start recognition"
+                Click="StartStopButton_Click"
+                Content="Start recognition"
+                Style="{StaticResource AccentButtonStyle}" />
+            <DropDownButton
+                x:Name="FromFileButton"
+                MinWidth="180"
+                AutomationProperties.Name="Recognize from audio file"
+                Content="Recognize from audio file..."
+                ToolTipService.ToolTip="Useful when the microphone isn't available (e.g. over Remote Desktop). Pick an audio file to transcribe.">
+                <DropDownButton.Flyout>
+                    <MenuFlyout Placement="Bottom">
+                        <MenuFlyoutItem
+                            Click="RecognizeFileBatch_Click"
+                            Text="Batch - full transcript"
+                            ToolTipService.ToolTip="BatchRecognition.RecognizeFromFile: one-shot call that returns the complete transcript." />
+                        <MenuFlyoutItem
+                            Click="RecognizeFileStreaming_Click"
+                            Text="Streaming - incremental"
+                            ToolTipService.ToolTip="StreamingRecognition over the file: raises incremental Recognizing/Recognized events, like the microphone." />
+                    </MenuFlyout>
+                </DropDownButton.Flyout>
+            </DropDownButton>
+            <Button
+                x:Name="ClearButton"
+                MinWidth="120"
+                AutomationProperties.Name="Clear transcription"
+                Click="ClearButton_Click"
+                Content="Clear" />
+        </StackPanel>
+    </Grid>
+</samples:BaseSamplePage>
diff --git a/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs b/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs
new file mode 100644
index 00000000..a1fd16d1
--- /dev/null
+++ b/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs
@@ -0,0 +1,645 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+using AIDevGallery.Models;
+using AIDevGallery.Samples.Attributes;
+using Microsoft.UI.Xaml;
+using Microsoft.UI.Xaml.Controls;
+using Microsoft.Windows.AI;
+using Microsoft.Windows.AI.MachineLearning;
+using Microsoft.Windows.AI.Speech;
+using System;
+using System.Diagnostics;
+using System.Globalization;
+using System.IO;
+using System.Text;
+using System.Threading.Tasks;
+using Windows.Media.MediaProperties;
+using Windows.Media.Transcoding;
+using Windows.Security.Authorization.AppCapabilityAccess;
+using Windows.Storage;
+using Windows.Storage.Pickers;
+using Windows.System;
+
+namespace AIDevGallery.Samples.WCRAPIs;
+
+[GallerySample(
+    Name = "Speech Recognition",
+    Model1Types = [ModelType.SpeechRecognition],
+    Scenario = ScenarioType.AudioAndVideoTranscribeLiveAudio,
+    Id = "9c5b2e8a-1f7d-4d3c-9e6a-3b1c8e7f4d20",
+    Icon = "\uE720")]
+internal sealed partial class SpeechRecognition : BaseSamplePage
+{
+    private SpeechRecognitionModel? _speechModel;
+    private StreamingRecognition? _streamingRecognition;
+    private Task? _streamingSessionTask;
+
+    private string _finalText = string.Empty;
+    private bool _isRecognizing;
+
+    public SpeechRecognition()
+    {
+        Unloaded += (sender, args) => CleanUp(); // run CleanUp whenever the page is unloaded
+        InitializeComponent();
+    }
+
+    protected override async Task LoadModelAsync(SampleNavigationParameters sampleParams)
+    {
+        // Registers execution providers and creates the speech model; every path ends by notifying sampleParams.
+        try
+        {
+            await ExecutionProviderCatalog.GetDefault().EnsureAndRegisterCertifiedAsync();
+
+            var state = SpeechRecognitionModel.GetReadyState();
+            if (state is not (AIFeatureReadyState.Ready or AIFeatureReadyState.NotReady))
+            {
+                var reason = state == AIFeatureReadyState.DisabledByUser
+                    ? "Disabled by user."
+                    : "Not supported on this system.";
+                ShowException(null, $"Speech Recognition is not available: {reason}");
+                return;
+            }
+
+            // NotReady: call EnsureReadyAsync first and give up unless it reports Success.
+            if (state == AIFeatureReadyState.NotReady)
+            {
+                var ensureResult = await SpeechRecognitionModel.EnsureReadyAsync();
+                if (ensureResult.Status != AIFeatureReadyResultState.Success)
+                {
+                    ShowException(ensureResult.ExtendedError, "Speech Recognition is not available.");
+                    return;
+                }
+            }
+
+            var creation = await SpeechRecognitionModel.TryCreateAsync();
+            if (creation.ExtendedError != null)
+            {
+                ShowException(creation.ExtendedError, "Failed to load the Speech Recognition model.");
+                return;
+            }
+
+            _speechModel = creation.SpeechModel;
+        }
+        catch (Exception ex)
+        {
+            ShowException(ex, "Failed to load the Speech Recognition model.");
+        }
+        finally
+        {
+            sampleParams.NotifyCompletion();
+        }
+    }
+
+    private async void StartStopButton_Click(object sender, RoutedEventArgs e)
+    {
+        // The same button toggles recognition:
+        // stop when _isRecognizing is set, otherwise start a new session.
+        // async void is used only because this is a UI event handler.
+        Task toggle = _isRecognizing
+            ? StopRecognitionAsync()
+            : StartRecognitionAsync();
+
+        await toggle;
+    }
+
+    private async Task StartRecognitionAsync()
+    {
+        // Starts a live session on the default microphone; the Start/Stop button is disabled while starting.
+        if (_speechModel is null)
+        {
+            ShowException(null, "Speech Recognition model is not loaded yet.");
+            return;
+        }
+
+        SendSampleInteractedEvent("StartSpeechRecognition");
+        StartStopButton.IsEnabled = false;
+
+        try
+        {
+            if (!await EnsureMicrophoneAccessAsync())
+            {
+                return;
+            }
+
+            // An empty device id selects the default microphone.
+            var recognizer = new StreamingRecognition(AudioConfiguration.FromAudioDevice(string.Empty), _speechModel);
+            _streamingRecognition = recognizer;
+            recognizer.Recognizing += OnRecognizing;
+            recognizer.Recognized += OnRecognized;
+
+            _finalText = string.Empty;
+            FinalTranscriptionTextBlock.Text = string.Empty;
+            InterimTranscriptionTextBlock.Text = string.Empty;
+            _isRecognizing = true;
+            UpdateUiState(running: true, status: "Listening on default microphone...");
+
+            // Not awaited here: the task is stored so Stop can await it, and the monitor reports any fault.
+            var runningSession = recognizer.StartContinuousRecognitionAsync().AsTask();
+            _streamingSessionTask = runningSession;
+            _ = MonitorStreamingSessionAsync(recognizer, runningSession);
+        }
+        catch (Exception ex)
+        {
+            await StopRecognitionAsync();
+            ShowException(ex, $"Failed to start speech recognition: {FormatError(ex)}");
+        }
+        finally
+        {
+            StartStopButton.IsEnabled = true;
+        }
+    }
+
+    private async Task StopRecognitionAsync()
+    {
+        SendSampleInteractedEvent("StopSpeechRecognition");
+
+        // Clear the fields up front so MonitorStreamingSessionAsync ignores faults raised while stopping.
+        var (recognizer, pendingSession) = (_streamingRecognition, _streamingSessionTask);
+        _streamingRecognition = null;
+        _streamingSessionTask = null;
+
+        if (recognizer is not null)
+        {
+            try
+            {
+                // Stop first, then wait for the Start operation so the on-disk model cache flushes before disposal.
+                recognizer.StopContinuousRecognition();
+
+                if (pendingSession is not null)
+                {
+                    // Faults were already surfaced by MonitorStreamingSessionAsync; only wait for completion.
+                    await pendingSession.ContinueWith(static _ => { }, TaskScheduler.Default);
+                }
+            }
+            catch (Exception ex)
+            {
+                ShowException(ex, "Failed to stop speech recognition cleanly.");
+            }
+            finally
+            {
+                DetachHandlers(recognizer);
+                recognizer.Dispose();
+            }
+        }
+
+        _isRecognizing = false;
+        UpdateUiState(running: false, status: null);
+    }
+
+    // Observes the streaming session task without capturing the UI context. If it faults (e.g., mic
+    // device not found or wrong audio format), the exception is posted to the dispatcher queue and
+    // shown to the user, so the failure is visible instead of being silently swallowed.
+    private async Task MonitorStreamingSessionAsync(StreamingRecognition session, Task sessionTask)
+    {
+        try
+        {
+            await sessionTask.ConfigureAwait(false);
+        }
+        catch (Exception fault)
+        {
+            DispatcherQueue.TryEnqueue(() =>
+            {
+                // Report only while this session is still the tracked one (not stopped, not replaced).
+                if (_streamingRecognition == session)
+                {
+                    HandleStreamingFailure(session, fault);
+                }
+            });
+        }
+    }
+
+    private void HandleStreamingFailure(StreamingRecognition session, Exception ex)
+    {
+        // Forget the failed session before tearing it down.
+        (_streamingRecognition, _streamingSessionTask, _isRecognizing) = (null, null, false);
+
+        DetachHandlers(session);
+        session.Dispose();
+
+        // Put the controls back in the idle state, then report the failure.
+        UpdateUiState(running: false, status: null);
+        ShowException(ex, $"Speech recognition failed: {FormatError(ex)}");
+    }
+
+    // Flyout item "Batch - full transcript": runs the file flow with streamMode off,
+    // which uses BatchRecognition and shows the whole transcript at once.
+    private async void RecognizeFileBatch_Click(object sender, RoutedEventArgs e) =>
+        await RecognizeFromFileAsync(streamMode: false);
+
+    // Flyout item "Streaming - incremental": runs the file flow with streamMode on,
+    // which feeds the file through StreamingRecognition and its Recognizing/Recognized events.
+    private async void RecognizeFileStreaming_Click(object sender, RoutedEventArgs e) =>
+        await RecognizeFromFileAsync(streamMode: true);
+
+    private async Task RecognizeFromFileAsync(bool streamMode)
+    {
+        // Transcribes a user-picked audio file: StreamingRecognition when streamMode is true, else BatchRecognition.
+        if (_speechModel == null)
+        {
+            ShowException(null, "Speech Recognition model is not loaded yet.");
+            return;
+        }
+
+        if (_isRecognizing)
+        {
+            await StopRecognitionAsync();
+        }
+
+        SendSampleInteractedEvent("RecognizeFromFile");
+
+        var picker = new FileOpenPicker();
+        WinRT.Interop.InitializeWithWindow.Initialize(picker, WinRT.Interop.WindowNative.GetWindowHandle(App.MainWindow));
+        picker.ViewMode = PickerViewMode.List;
+        picker.SuggestedStartLocation = PickerLocationId.MusicLibrary;
+        picker.FileTypeFilter.Add(".wav");
+        picker.FileTypeFilter.Add(".mp3");
+        picker.FileTypeFilter.Add(".m4a");
+
+        var file = await picker.PickSingleFileAsync();
+        if (file == null)
+        {
+            return;
+        }
+
+        StorageFile? transcodedFile = null;
+        StreamingRecognition? fileStreaming = null;
+        try
+        {
+            // The file session is not tracked in _streamingRecognition, so the Start/Stop button cannot stop it;
+            // keep it disabled until the finally block, or a click would reset the UI and let a mic session overlap.
+            StartStopButton.IsEnabled = false;
+            UpdateUiState(running: true, status: $"Transcoding \"{file.Name}\" to 16 kHz mono...");
+            FinalTranscriptionTextBlock.Text = $"Transcoding \"{file.Name}\" to 16 kHz mono...";
+            InterimTranscriptionTextBlock.Text = string.Empty;
+            _finalText = string.Empty;
+            _isRecognizing = true;
+
+            transcodedFile = await TranscodeTo16kMonoCanonicalWavAsync(file);
+
+            if (streamMode)
+            {
+                UpdateUiState(running: true, status: $"Streaming recognition of \"{file.Name}\"...");
+                FinalTranscriptionTextBlock.Text = string.Empty;
+
+                fileStreaming = new StreamingRecognition(AudioConfiguration.FromFile(transcodedFile.Path), _speechModel);
+                fileStreaming.Recognizing += OnRecognizing;
+                fileStreaming.Recognized += OnRecognized;
+
+                await fileStreaming.StartContinuousRecognitionAsync();
+
+                if (string.IsNullOrWhiteSpace(_finalText))
+                {
+                    FinalTranscriptionTextBlock.Text = "(no speech detected in file)";
+                }
+
+                UpdateUiState(running: false, status: $"Streaming recognition of \"{file.Name}\" completed.");
+            }
+            else
+            {
+                // BatchRecognition returns the full transcript in a single call.
+                UpdateUiState(running: true, status: $"Recognizing from \"{file.Name}\"...");
+                FinalTranscriptionTextBlock.Text = $"Recognizing from file: {file.Name}...";
+
+                using var batch = new BatchRecognition(_speechModel);
+                var transcript = await batch.RecognizeFromFile(transcodedFile.Path);
+
+                _finalText = transcript ?? string.Empty;
+                FinalTranscriptionTextBlock.Text = string.IsNullOrWhiteSpace(_finalText) ? "(no speech detected in file)" : _finalText;
+                UpdateUiState(running: false, status: $"Recognition of \"{file.Name}\" completed.");
+            }
+        }
+        catch (Exception ex)
+        {
+            ShowException(ex, $"Failed to recognize from \"{file.Name}\".\n\n{ex.Message}");
+            UpdateUiState(running: false, status: null);
+        }
+        finally
+        {
+            _isRecognizing = false;
+            StartStopButton.IsEnabled = true;
+
+            if (fileStreaming != null)
+            {
+                DetachHandlers(fileStreaming);
+                fileStreaming.Dispose();
+            }
+
+            await TryDeleteAsync(transcodedFile);
+        }
+    }
+
+    private static async Task<StorageFile> TranscodeTo16kMonoCanonicalWavAsync(StorageFile inputFile)
+    {
+        // Transcodes to a temp 16 kHz mono 16-bit WAV, then rewrites its header off the calling (UI) thread.
+        var mfFile = await ApplicationData.Current.TemporaryFolder.CreateFileAsync(
+            $"speech-recognition-mf-{Guid.NewGuid():N}.wav",
+            CreationCollisionOption.ReplaceExisting);
+        var canonicalFile = await ApplicationData.Current.TemporaryFolder.CreateFileAsync(
+            $"speech-recognition-{Guid.NewGuid():N}.wav",
+            CreationCollisionOption.ReplaceExisting);
+
+        try
+        {
+            var profile = MediaEncodingProfile.CreateWav(AudioEncodingQuality.Auto);
+            profile.Audio = AudioEncodingProperties.CreatePcm(16000, 1, 16);
+
+            var prepare = await new MediaTranscoder().PrepareFileTranscodeAsync(inputFile, mfFile, profile);
+            if (!prepare.CanTranscode)
+            {
+                throw new InvalidOperationException(
+                    $"MediaTranscoder cannot transcode \"{inputFile.Name}\" to 16 kHz mono PCM WAV: {prepare.FailureReason}");
+            }
+
+            await prepare.TranscodeAsync();
+            await Task.Run(() => RewriteWavAsCanonicalPcm(mfFile.Path, canonicalFile.Path));
+            return canonicalFile;
+        }
+        catch
+        {
+            await TryDeleteAsync(canonicalFile);
+            throw;
+        }
+        finally
+        {
+            await TryDeleteAsync(mfFile);
+        }
+    }
+
+    private static void RewriteWavAsCanonicalPcm(string sourcePath, string destPath)
+    {
+        // Copies the "fmt " fields and the first "data" chunk of a 16-bit PCM WAV into a file with a plain 44-byte header.
+        var src = File.ReadAllBytes(sourcePath);
+        if (src.Length < 12 || Encoding.ASCII.GetString(src, 0, 4) != "RIFF" || Encoding.ASCII.GetString(src, 8, 4) != "WAVE")
+        {
+            throw new InvalidOperationException("Source file is not a RIFF/WAVE.");
+        }
+
+        ushort audioFormat = 0, channels = 0, blockAlign = 0, bitsPerSample = 0;
+        uint sampleRate = 0, byteRate = 0;
+        int dataOffset = -1, dataSize = 0;
+
+        int offset = 12;
+        while (offset + 8 <= src.Length)
+        {
+            // Clamp the declared size to the bytes actually present: a truncated file or a size >= 2 GiB would
+            // otherwise give a negative or oversized value that can stall this loop or overrun the reads/copy below.
+            var chunkId = Encoding.ASCII.GetString(src, offset, 4);
+            var chunkSize = (int)Math.Min(BitConverter.ToUInt32(src, offset + 4), (uint)(src.Length - offset - 8));
+
+            if (chunkId == "fmt " && chunkSize >= 16)
+            {
+                audioFormat = BitConverter.ToUInt16(src, offset + 8);
+                channels = BitConverter.ToUInt16(src, offset + 10);
+                sampleRate = BitConverter.ToUInt32(src, offset + 12);
+                byteRate = BitConverter.ToUInt32(src, offset + 16);
+                blockAlign = BitConverter.ToUInt16(src, offset + 20);
+                bitsPerSample = BitConverter.ToUInt16(src, offset + 22);
+            }
+            else if (chunkId == "data")
+            {
+                dataOffset = offset + 8;
+                dataSize = chunkSize;
+                break;
+            }
+
+            // RIFF chunks are word-aligned: an odd-sized chunk is followed by one pad byte.
+            offset += 8 + chunkSize + (chunkSize & 1);
+        }
+
+        if (audioFormat != 1)
+        {
+            throw new InvalidOperationException($"Source WAV is not WAVE_FORMAT_PCM (got 0x{audioFormat:X4}).");
+        }
+
+        if (dataOffset < 0 || dataSize <= 0)
+        {
+            throw new InvalidOperationException("Source WAV has no data chunk.");
+        }
+
+        if (bitsPerSample != 16)
+        {
+            throw new InvalidOperationException($"Source WAV is not 16-bit PCM (got {bitsPerSample}).");
+        }
+
+        const int CanonicalFmtSize = 16;
+        int canonicalSize = 12 + 8 + CanonicalFmtSize + 8 + dataSize;
+        var dst = new byte[canonicalSize];
+
+        Encoding.ASCII.GetBytes("RIFF").CopyTo(dst, 0);
+        BitConverter.GetBytes((uint)(canonicalSize - 8)).CopyTo(dst, 4);
+        Encoding.ASCII.GetBytes("WAVE").CopyTo(dst, 8);
+
+        Encoding.ASCII.GetBytes("fmt ").CopyTo(dst, 12);
+        BitConverter.GetBytes((uint)CanonicalFmtSize).CopyTo(dst, 16);
+        BitConverter.GetBytes((ushort)1).CopyTo(dst, 20);
+        BitConverter.GetBytes(channels).CopyTo(dst, 22);
+        BitConverter.GetBytes(sampleRate).CopyTo(dst, 24);
+        BitConverter.GetBytes(byteRate).CopyTo(dst, 28);
+        BitConverter.GetBytes(blockAlign).CopyTo(dst, 32);
+        BitConverter.GetBytes(bitsPerSample).CopyTo(dst, 34);
+
+        Encoding.ASCII.GetBytes("data").CopyTo(dst, 36);
+        BitConverter.GetBytes((uint)dataSize).CopyTo(dst, 40);
+
+        Buffer.BlockCopy(src, dataOffset, dst, 44, dataSize);
+        File.WriteAllBytes(destPath, dst);
+    }
+
+    private async Task<bool> EnsureMicrophoneAccessAsync()
+    {
+        // Resolves to true when microphone access is allowed (or no capability object is returned).
+        // If access is not allowed, or the query throws UnauthorizedAccessException, the access-denied
+        // dialog is shown and the result is false.
+        var allowed = true;
+        try
+        {
+#pragma warning disable CA1416
+            // Request access only when CheckAccess does not already report Allowed.
+            if (AppCapability.Create("microphone") is { } microphone)
+            {
+                var access = microphone.CheckAccess();
+                allowed = access == AppCapabilityAccessStatus.Allowed
+                    || await microphone.RequestAccessAsync() == AppCapabilityAccessStatus.Allowed;
+            }
+#pragma warning restore CA1416
+        }
+        catch (UnauthorizedAccessException)
+        {
+            allowed = false;
+        }
+
+        if (!allowed)
+        {
+            await ShowMicrophoneAccessDeniedDialogAsync();
+        }
+
+        return allowed;
+    }
+
+    private async Task ShowMicrophoneAccessDeniedDialogAsync()
+    {
+        // Explains why the microphone is needed and offers to open the microphone privacy settings page.
+        const string Instructions =
+            "Speech recognition needs permission to use the microphone. " +
+            "Open Windows Settings, enable \u201CLet apps access your microphone\u201D, " +
+            "and allow access for AI Dev Gallery.";
+
+        var prompt = new ContentDialog { XamlRoot = this.XamlRoot, Title = "Microphone access required", Content = Instructions };
+        prompt.PrimaryButtonText = "Open Settings";
+        prompt.CloseButtonText = "Cancel";
+
+        if (await prompt.ShowAsync() != ContentDialogResult.Primary)
+        {
+            return;
+        }
+
+        await Launcher.LaunchUriAsync(new Uri("ms-settings:privacy-microphone"));
+    }
+
+    private void OnRecognizing(StreamingRecognition sender, StreamingRecognizingEventArgs args)
+    {
+        var hypothesis = args.Text ?? string.Empty; // read now; the controls are updated from the dispatcher queue
+        DispatcherQueue.TryEnqueue(() =>
+        {
+            InterimTranscriptionTextBlock.Text = hypothesis;
+            ScrollToEnd();
+        });
+    }
+
+    private void OnRecognized(StreamingRecognition sender, StreamingRecognizedEventArgs args)
+    {
+        // Read the final text now; the transcript field and the text blocks are only
+        // modified from the dispatcher queue callback below.
+        var utterance = args.Text ?? string.Empty;
+        DispatcherQueue.TryEnqueue(() =>
+        {
+            if (!string.IsNullOrWhiteSpace(utterance))
+            {
+                // Separate utterances with one space unless the transcript already ends with a space.
+                var needsSpace = _finalText.Length > 0 && !_finalText.EndsWith(' ');
+                _finalText += needsSpace ? " " + utterance : utterance;
+                FinalTranscriptionTextBlock.Text = _finalText;
+            }
+
+            // A final result replaces whatever interim hypothesis is currently displayed.
+            InterimTranscriptionTextBlock.Text = string.Empty;
+            ScrollToEnd();
+        });
+    }
+
+    private void ScrollToEnd()
+    {
+        TranscriptionScrollViewer.UpdateLayout();
+        TranscriptionScrollViewer.ChangeView(null, TranscriptionScrollViewer.ScrollableHeight, null, disableAnimation: true);
+    }
+
+    private void ClearButton_Click(object sender, RoutedEventArgs e)
+    {
+        _finalText = string.Empty;
+        FinalTranscriptionTextBlock.Text = string.Empty;
+        InterimTranscriptionTextBlock.Text = string.Empty;
+    }
+
+    private void UpdateUiState(bool running, string? status)
+    {
+        StartStopButton.Content = running ? "Stop recognition" : "Start recognition";
+        FromFileButton.IsEnabled = !running;
+
+        if (string.IsNullOrEmpty(status))
+        {
+            StatusInfoBar.IsOpen = false;
+            StatusInfoBar.Title = string.Empty;
+            StatusInfoBar.Message = string.Empty;
+        }
+        else
+        {
+            StatusInfoBar.Severity = InfoBarSeverity.Informational;
+            StatusInfoBar.Title = running ? "Listening" : string.Empty;
+            StatusInfoBar.Message = status;
+            StatusInfoBar.IsOpen = true;
+        }
+    }
+
+    private void DetachHandlers(StreamingRecognition session)
+    {
+        session.Recognizing -= OnRecognizing;
+        session.Recognized -= OnRecognized;
+    }
+
+    private static string FormatError(Exception ex)
+    {
+        var hresult = ((uint)ex.HResult).ToString("X8", CultureInfo.InvariantCulture);
+        return string.IsNullOrEmpty(ex.Message) ? $"HRESULT 0x{hresult}" : $"{ex.Message} (HRESULT 0x{hresult})";
+    }
+
+    private static async Task TryDeleteAsync(StorageFile? file)
+    {
+        if (file == null)
+        {
+            return;
+        }
+
+        try
+        {
+            await file.DeleteAsync(StorageDeleteOption.PermanentDelete);
+        }
+        catch (Exception ex)
+        {
+            Debug.WriteLine($"[SpeechRecognition] Failed to delete temporary file: {ex.Message}");
+        }
+    }
+
+    private void CleanUp()
+    {
+        var streaming = _streamingRecognition;
+        var sessionTask = _streamingSessionTask;
+        var model = _speechModel;
+
+        _streamingRecognition = null;
+        _streamingSessionTask = null;
+        _speechModel = null;
+        _isRecognizing = false;
+
+        if (streaming == null && model == null)
+        {
+            return;
+        }
+
+        // Tear down off the UI thread (a synchronous wait would deadlock the DispatcherQueue), stopping
+        // and awaiting the session before disposal to avoid corrupting the on-disk model cache.
+        _ = Task.Run(async () =>
+        {
+            if (streaming != null)
+            {
+                try
+                {
+                    DetachHandlers(streaming);
+                    streaming.StopContinuousRecognition();
+
+                    if (sessionTask != null)
+                    {
+                        await sessionTask.WaitAsync(TimeSpan.FromSeconds(5))
+                            .ContinueWith(static _ => { }, TaskScheduler.Default)
+                            .ConfigureAwait(false);
+                    }
+
+                    streaming.Dispose();
+                }
+                catch (Exception ex)
+                {
+                    Debug.WriteLine($"[SpeechRecognition] Streaming cleanup threw: {ex.Message}");
+                }
+            }
+
+            try
+            {
+                model?.Dispose();
+            }
+            catch (Exception ex)
+            {
+                Debug.WriteLine($"[SpeechRecognition] Model cleanup threw: {ex.Message}");
+            }
+        });
+    }
+}
\ No newline at end of file
diff --git a/Directory.Packages.props b/Directory.Packages.props
index 8dd94a65..35cad52a 100644
--- a/Directory.Packages.props
+++ b/Directory.Packages.props
@@ -7,7 +7,7 @@
     <PackageVersion Include="CommunityToolkit.WinUI.Media" Version="8.2.250402" />
     <PackageVersion Include="Microsoft.AI.Foundry.Local.WinML" Version="0.8.2.1" />
     <PackageVersion Include="Microsoft.Extensions.AI" Version="10.2.0" />
-    <PackageVersion Include="Microsoft.WindowsAppSDK.ML" Version="2.1.3-experimental" />
+    <PackageVersion Include="Microsoft.WindowsAppSDK.ML" Version="2.1.75-experimental" />
     <PackageVersion Include="OllamaSharp" Version="5.4.7" />
     <PackageVersion Include="Microsoft.Extensions.AI.OpenAI" Version="10.2.0-preview.1.26063.2" />
     <PackageVersion Include="Microsoft.SemanticKernel.Connectors.InMemory" Version="1.72.0-preview" />
@@ -31,7 +31,7 @@
     <PackageVersion Include="CommunityToolkit.WinUI.Extensions" Version="8.2.250402" />
     <PackageVersion Include="CommunityToolkit.WinUI.Controls.Sizers" Version="8.2.250402" />
     <PackageVersion Include="Microsoft.Graphics.Win2D" Version="1.3.2" />
-    <PackageVersion Include="Microsoft.WindowsAppSDK" Version="2.1.4-experimental8" />
+    <PackageVersion Include="Microsoft.WindowsAppSDK" Version="2.2.2-experimental9" />
     <PackageVersion Include="Microsoft.Windows.SDK.BuildTools" Version="10.0.26100.6584" />
     <PackageVersion Include="Microsoft.Windows.SDK.BuildTools.MSIX" Version="1.7.251221100" />
     <PackageVersion Include="Microsoft.Windows.CsWin32" Version="0.3.269" />

From 8e712134123d759ca02abae8425159fc4e20277c Mon Sep 17 00:00:00 2001
From: Hao Liu <liuhao3418@gmail.com>
Date: Wed, 10 Jun 2026 16:42:04 +0800
Subject: [PATCH 02/10] fix: Remove stale AutomationProperties.Name on
 StartStopButton

---
 AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml b/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml
index 07077fb9..abf33977 100644
--- a/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml
+++ b/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml
@@ -64,7 +64,6 @@
             <Button
                 x:Name="StartStopButton"
                 MinWidth="180"
-                AutomationProperties.Name="Start recognition"
                 Click="StartStopButton_Click"
                 Content="Start recognition"
                 Style="{StaticResource AccentButtonStyle}" />

From 8ad4d7c9efed1666fae02e7e1b9a6081fac51d8c Mon Sep 17 00:00:00 2001
From: Hao Liu <liuhao3418@gmail.com>
Date: Wed, 10 Jun 2026 17:09:28 +0800
Subject: [PATCH 03/10] Guard Speech snippet against unavailable states like
 other WCR snippets

---
 .../Definitions/WcrApis/WcrApiCodeSnippet.cs  | 62 +++++++++++--------
 1 file changed, 35 insertions(+), 27 deletions(-)

diff --git a/AIDevGallery/Samples/Definitions/WcrApis/WcrApiCodeSnippet.cs b/AIDevGallery/Samples/Definitions/WcrApis/WcrApiCodeSnippet.cs
index 48bf2b3e..69cfa2c6 100644
--- a/AIDevGallery/Samples/Definitions/WcrApis/WcrApiCodeSnippet.cs
+++ b/AIDevGallery/Samples/Definitions/WcrApis/WcrApiCodeSnippet.cs
@@ -733,44 +733,52 @@ public async void IndexStatisticsSample()
             await catalog.EnsureAndRegisterCertifiedAsync();
 
             var readyState = SpeechRecognitionModel.GetReadyState();
-            if (readyState == AIFeatureReadyState.NotReady)
+            if (readyState is AIFeatureReadyState.Ready or AIFeatureReadyState.NotReady)
             {
-                var ensureOp = await SpeechRecognitionModel.EnsureReadyAsync();
-                if (ensureOp.Status != AIFeatureReadyResultState.Success)
+                if (readyState == AIFeatureReadyState.NotReady)
                 {
-                    throw new InvalidOperationException("Speech model could not be prepared.");
+                    var ensureOp = await SpeechRecognitionModel.EnsureReadyAsync();
+                    if (ensureOp.Status != AIFeatureReadyResultState.Success)
+                    {
+                        throw new InvalidOperationException("Speech model could not be prepared.");
+                    }
                 }
-            }
 
-            var modelResult = await SpeechRecognitionModel.TryCreateAsync();
-            if (modelResult.ExtendedError != null)
-            {
-                throw modelResult.ExtendedError;
-            }
+                var modelResult = await SpeechRecognitionModel.TryCreateAsync();
+                if (modelResult.ExtendedError != null)
+                {
+                    throw modelResult.ExtendedError;
+                }
 
-            using SpeechRecognitionModel speechModel = modelResult.SpeechModel;
+                using SpeechRecognitionModel speechModel = modelResult.SpeechModel;
 
-            // Stream audio from the default microphone. Pass an empty deviceId to use the system default.
-            var audioConfig = AudioConfiguration.FromAudioDevice(string.Empty);
+                // Stream audio from the default microphone. Pass an empty deviceId to use the system default.
+                var audioConfig = AudioConfiguration.FromAudioDevice(string.Empty);
 
-            using var streaming = new StreamingRecognition(audioConfig, speechModel);
+                using var streaming = new StreamingRecognition(audioConfig, speechModel);
 
-            streaming.Recognizing += (_, args) =>
-            {
-                // Interim hypothesis (updates frequently as more audio arrives).
-                Console.WriteLine($"[interim] {args.Text}");
-            };
+                streaming.Recognizing += (_, args) =>
+                {
+                    // Interim hypothesis (updates frequently as more audio arrives).
+                    Console.WriteLine($"[interim] {args.Text}");
+                };
 
-            streaming.Recognized += (_, args) =>
-            {
-                // Final result for a stable utterance.
-                Console.WriteLine($"[final] offset={args.Offset:F2}s duration={args.Duration:F2}s: {args.Text}");
-            };
+                streaming.Recognized += (_, args) =>
+                {
+                    // Final result for a stable utterance.
+                    Console.WriteLine($"[final] offset={args.Offset:F2}s duration={args.Duration:F2}s: {args.Text}");
+                };
 
-            await streaming.StartContinuousRecognitionAsync();
+                await streaming.StartContinuousRecognitionAsync();
 
-            // ... let captions stream in. When done:
-            streaming.StopContinuousRecognition();
+                // ... let captions stream in. When done:
+                streaming.StopContinuousRecognition();
+            }
+            else
+            {
+                // DisabledByUser or NotSupportedOnCurrentSystem.
+                throw new InvalidOperationException($"Speech recognition is not available: {readyState}.");
+            }
             """"
         }
     };

From 61ec10d8bd62302455d992b9e5622157d2e4a4a6 Mon Sep 17 00:00:00 2001
From: Hao Liu <liuhao3418@gmail.com>
Date: Fri, 12 Jun 2026 11:37:27 +0800
Subject: [PATCH 04/10] Hide Windows Update download messaging for Speech
 Recognition

---
 AIDevGallery/Controls/WcrModelDownloader.xaml    | 12 ++++++++++--
 AIDevGallery/Controls/WcrModelDownloader.xaml.cs |  9 +++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/AIDevGallery/Controls/WcrModelDownloader.xaml b/AIDevGallery/Controls/WcrModelDownloader.xaml
index fcc0b7cc..0a6a20fb 100644
--- a/AIDevGallery/Controls/WcrModelDownloader.xaml
+++ b/AIDevGallery/Controls/WcrModelDownloader.xaml
@@ -31,7 +31,7 @@
                 IsTextSelectionEnabled="True"
                 TextAlignment="Center"
                 TextWrapping="Wrap">
-                <Run Text="This Windows AI API requires a one-time model download via Windows Update." /><LineBreak /> <LineBreak />
+                <Run x:Name="ModelDownloadInfoRun" Text="This Windows AI API requires a one-time model download via Windows Update." /><LineBreak /> <LineBreak />
                 <Hyperlink NavigateUri="https://learn.microsoft.com/windows/ai/apis/model-setup#prerequisites" UnderlineStyle="None">A Copilot+ PC with Windows 11 Build 26120.3073 or higher is required</Hyperlink>
             </TextBlock>
             <TextBlock
@@ -71,7 +71,15 @@
                 TextWrapping="WrapWholeWords">
                 <Run Text="Requesting model.." /> <LineBreak />
                 <LineBreak />
-                <Run FontSize="12" Text="Download progress can also be tracked in " /><Hyperlink Click="WindowsUpdateHyperlinkClicked" FontSize="12">Windows Update</Hyperlink><LineBreak />
+            </TextBlock>
+            <TextBlock
+                x:Name="WindowsUpdateTrackingText"
+                Margin="0,0,0,24"
+                HorizontalAlignment="Center"
+                Foreground="{ThemeResource TextFillColorSecondaryBrush}"
+                TextAlignment="Center"
+                TextWrapping="WrapWholeWords">
+                <Run FontSize="12" Text="Download progress can also be tracked in " /><Hyperlink Click="WindowsUpdateHyperlinkClicked" FontSize="12">Windows Update</Hyperlink>
             </TextBlock>
         </StackPanel>
 
diff --git a/AIDevGallery/Controls/WcrModelDownloader.xaml.cs b/AIDevGallery/Controls/WcrModelDownloader.xaml.cs
index c620b0ed..ff166008 100644
--- a/AIDevGallery/Controls/WcrModelDownloader.xaml.cs
+++ b/AIDevGallery/Controls/WcrModelDownloader.xaml.cs
@@ -166,6 +166,15 @@ public Task<bool> SetDownloadOperation(ModelType modelType, string sampleId, Fun
             ? Visibility.Visible
             : Visibility.Collapsed;
 
+        // TODO: Remove once the Speech Recognition model ships through Windows Update.
+        var isSpeechRecognition = modelType == ModelType.SpeechRecognition;
+        ModelDownloadInfoRun.Text = isSpeechRecognition
+            ? "This Windows AI API requires a one-time model download."
+            : "This Windows AI API requires a one-time model download via Windows Update.";
+        WindowsUpdateTrackingText.Visibility = isSpeechRecognition
+            ? Visibility.Collapsed
+            : Visibility.Visible;
+
         if (exisitingOperation != null && exisitingOperation.Status == AsyncStatus.Started)
         {
             // don't reuse same one because we can only have one Progress delegate

From 07894161789ee2daaa2fb1cc787358e4e8830da9 Mon Sep 17 00:00:00 2001
From: Hao Liu <liuhao3418@gmail.com>
Date: Fri, 12 Jun 2026 16:29:01 +0800
Subject: [PATCH 05/10] Play audio during file streaming recognition and fix
 stop handling

---
 .../Samples/WCRAPIs/SpeechRecognition.xaml.cs | 133 +++++++++++++++++-
 1 file changed, 129 insertions(+), 4 deletions(-)

diff --git a/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs b/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs
index a1fd16d1..606d637e 100644
--- a/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs
+++ b/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs
@@ -13,8 +13,11 @@
 using System.Globalization;
 using System.IO;
 using System.Text;
+using System.Threading;
 using System.Threading.Tasks;
+using Windows.Media.Core;
 using Windows.Media.MediaProperties;
+using Windows.Media.Playback;
 using Windows.Media.Transcoding;
 using Windows.Security.Authorization.AppCapabilityAccess;
 using Windows.Storage;
@@ -34,6 +37,10 @@ internal sealed partial class SpeechRecognition : BaseSamplePage
     private SpeechRecognitionModel? _speechModel;
     private StreamingRecognition? _streamingRecognition;
     private Task? _streamingSessionTask;
+    private StreamingRecognition? _fileStreamingRecognition;
+    private CancellationTokenSource? _fileStreamingCts;
+    private MediaPlayer? _filePlaybackPlayer;
+    private TaskCompletionSource<bool>? _filePlaybackCompletion;
 
     private string _finalText = string.Empty;
     private bool _isRecognizing;
@@ -154,6 +161,12 @@ private async Task StopRecognitionAsync()
     {
         SendSampleInteractedEvent("StopSpeechRecognition");
 
+        if (_fileStreamingRecognition is { } fileSession)
+        {
+            DetachHandlers(fileSession);
+            _fileStreamingCts?.Cancel();
+        }
+
         var streaming = _streamingRecognition;
         var sessionTask = _streamingSessionTask;
         _streamingRecognition = null;
@@ -184,6 +197,7 @@ private async Task StopRecognitionAsync()
         }
 
         _isRecognizing = false;
+        StopFilePlayback();
         UpdateUiState(running: false, status: null);
     }
 
@@ -264,6 +278,7 @@ private async Task RecognizeFromFileAsync(bool streamMode)
 
         StorageFile? transcodedFile = null;
         StreamingRecognition? fileStreaming = null;
+        CancellationTokenSource? fileCts = null;
         try
         {
             UpdateUiState(running: true, status: $"Transcoding \"{file.Name}\" to 16 kHz mono...");
@@ -284,15 +299,36 @@ private async Task RecognizeFromFileAsync(bool streamMode)
                     _speechModel);
                 fileStreaming.Recognizing += OnRecognizing;
                 fileStreaming.Recognized += OnRecognized;
+                _fileStreamingRecognition = fileStreaming;
+                fileCts = new CancellationTokenSource();
+                _fileStreamingCts = fileCts;
+
+                // Play the picked file so the transcript can be followed as it streams in,
+                // rather than appearing in silence.
+                var playbackTask = StartFilePlayback(file);
 
-                await fileStreaming.StartContinuousRecognitionAsync();
+                var fileSessionTask = fileStreaming.StartContinuousRecognitionAsync().AsTask(fileCts.Token);
 
-                if (string.IsNullOrWhiteSpace(_finalText))
+                try
+                {
+                    await fileSessionTask;
+                }
+                catch (OperationCanceledException)
                 {
-                    FinalTranscriptionTextBlock.Text = "(no speech detected in file)";
+                    // Expected when the user presses Stop or navigates away mid-file.
                 }
 
-                UpdateUiState(running: false, status: $"Streaming recognition of \"{file.Name}\" completed.");
+                if (_isRecognizing)
+                {
+                    if (string.IsNullOrWhiteSpace(_finalText))
+                    {
+                        FinalTranscriptionTextBlock.Text = "(no speech detected in file)";
+                    }
+
+                    await playbackTask;
+
+                    UpdateUiState(running: false, status: $"Streaming recognition of \"{file.Name}\" completed.");
+                }
             }
             else
             {
@@ -318,6 +354,7 @@ private async Task RecognizeFromFileAsync(bool streamMode)
         finally
         {
             _isRecognizing = false;
+            StopFilePlayback();
 
             if (fileStreaming != null)
             {
@@ -325,10 +362,93 @@ private async Task RecognizeFromFileAsync(bool streamMode)
                 fileStreaming.Dispose();
             }
 
+            fileCts?.Dispose();
+
+            // Clear shared state only if a newer run hasn't already replaced it.
+            if (ReferenceEquals(_fileStreamingRecognition, fileStreaming))
+            {
+                _fileStreamingRecognition = null;
+            }
+
+            if (ReferenceEquals(_fileStreamingCts, fileCts))
+            {
+                _fileStreamingCts = null;
+            }
+
             await TryDeleteAsync(transcodedFile);
         }
     }
 
+    private Task StartFilePlayback(StorageFile file)
+    {
+        StopFilePlayback();
+
+        var completion = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);
+        _filePlaybackCompletion = completion;
+
+        try
+        {
+            var player = new MediaPlayer();
+            player.MediaEnded += OnFilePlaybackEnded;
+            player.MediaFailed += OnFilePlaybackFailed;
+            player.Source = MediaSource.CreateFromStorageFile(file);
+            _filePlaybackPlayer = player;
+            player.Play();
+        }
+        catch (Exception ex)
+        {
+            Debug.WriteLine($"[SpeechRecognition] Failed to start file playback: {ex.Message}");
+            StopFilePlayback();
+        }
+
+        return completion.Task;
+    }
+
+    private void StopFilePlayback()
+    {
+        var player = _filePlaybackPlayer;
+        _filePlaybackPlayer = null;
+
+        // Unblock anyone awaiting playback completion (e.g. a Stop click before the clip ends).
+        _filePlaybackCompletion?.TrySetResult(false);
+        _filePlaybackCompletion = null;
+
+        if (player == null)
+        {
+            return;
+        }
+
+        player.MediaEnded -= OnFilePlaybackEnded;
+        player.MediaFailed -= OnFilePlaybackFailed;
+
+        try
+        {
+            player.Pause();
+            if (player.Source is IDisposable source)
+            {
+                player.Source = null;
+                source.Dispose();
+            }
+        }
+        catch (Exception ex)
+        {
+            Debug.WriteLine($"[SpeechRecognition] Failed to stop file playback: {ex.Message}");
+        }
+
+        player.Dispose();
+    }
+
+    private void OnFilePlaybackEnded(MediaPlayer sender, object args)
+    {
+        _filePlaybackCompletion?.TrySetResult(true);
+    }
+
+    private void OnFilePlaybackFailed(MediaPlayer sender, MediaPlayerFailedEventArgs args)
+    {
+        Debug.WriteLine($"[SpeechRecognition] File playback failed: {args.ErrorMessage}");
+        _filePlaybackCompletion?.TrySetResult(false);
+    }
+
     private static async Task<StorageFile> TranscodeTo16kMonoCanonicalWavAsync(StorageFile inputFile)
     {
         var mfFile = await ApplicationData.Current.TemporaryFolder.CreateFileAsync(
@@ -592,6 +712,11 @@ private static async Task TryDeleteAsync(StorageFile? file)
 
     private void CleanUp()
     {
+        StopFilePlayback();
+
+        // Cancel any in-flight file-streaming recognition so it doesn't keep running after navigation.
+        _fileStreamingCts?.Cancel();
+
         var streaming = _streamingRecognition;
         var sessionTask = _streamingSessionTask;
         var model = _speechModel;

From 4ba3c1f8c0d2e19b4d1b24fcb8349fa4b47ff265 Mon Sep 17 00:00:00 2001
From: Hao Liu <liuhao3418@gmail.com>
Date: Fri, 12 Jun 2026 16:41:07 +0800
Subject: [PATCH 06/10] Fix double spaces between recognized speech segments

---
 AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs b/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs
index 606d637e..7a63cf19 100644
--- a/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs
+++ b/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs
@@ -629,12 +629,14 @@ private void OnRecognizing(StreamingRecognition sender, StreamingRecognizingEven
 
     private void OnRecognized(StreamingRecognition sender, StreamingRecognizedEventArgs args)
     {
-        var text = args.Text ?? string.Empty;
+        // Recognized phrases often carry their own leading/trailing whitespace; trim it so the
+        // segments below are always joined by exactly one space instead of doubling up.
+        var text = (args.Text ?? string.Empty).Trim();
         DispatcherQueue.TryEnqueue(() =>
         {
-            if (!string.IsNullOrWhiteSpace(text))
+            if (text.Length > 0)
             {
-                if (_finalText.Length > 0 && !_finalText.EndsWith(' '))
+                if (_finalText.Length > 0)
                 {
                     _finalText += " ";
                 }

From 61a7d4e84852c0c24f83869d67a311e863577f17 Mon Sep 17 00:00:00 2001
From: Hao Liu <liuhao3418@gmail.com>
Date: Fri, 12 Jun 2026 17:44:26 +0800
Subject: [PATCH 07/10] Unify speech input selection into a single source
 dropdown

---
 .../Samples/WCRAPIs/SpeechRecognition.xaml    |  40 +++----
 .../Samples/WCRAPIs/SpeechRecognition.xaml.cs | 110 ++++++++++++------
 2 files changed, 94 insertions(+), 56 deletions(-)

diff --git a/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml b/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml
index abf33977..654b365b 100644
--- a/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml
+++ b/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml
@@ -61,31 +61,31 @@
             HorizontalAlignment="Center"
             Orientation="Horizontal"
             Spacing="8">
+            <ComboBox
+                x:Name="InputSourceComboBox"
+                MinWidth="240"
+                AutomationProperties.Name="Input source"
+                SelectedIndex="0"
+                ToolTipService.ToolTip="Choose what to transcribe, then press Start.">
+                <ComboBoxItem
+                    Content="Microphone (live)"
+                    Tag="Microphone"
+                    ToolTipService.ToolTip="Live transcription from the default microphone using StreamingRecognition." />
+                <ComboBoxItem
+                    Content="Audio file - full transcript"
+                    Tag="FileBatch"
+                    ToolTipService.ToolTip="BatchRecognition.RecognizeFromFile: one-shot call that returns the complete transcript." />
+                <ComboBoxItem
+                    Content="Audio file - incremental"
+                    Tag="FileStreaming"
+                    ToolTipService.ToolTip="StreamingRecognition over a file: raises incremental Recognizing/Recognized events, like the microphone." />
+            </ComboBox>
             <Button
                 x:Name="StartStopButton"
-                MinWidth="180"
+                MinWidth="160"
                 Click="StartStopButton_Click"
                 Content="Start recognition"
                 Style="{StaticResource AccentButtonStyle}" />
-            <DropDownButton
-                x:Name="FromFileButton"
-                MinWidth="180"
-                AutomationProperties.Name="Recognize from audio file"
-                Content="Recognize from audio file..."
-                ToolTipService.ToolTip="Useful when the microphone isn't available (e.g. over Remote Desktop). Pick an audio file to transcribe.">
-                <DropDownButton.Flyout>
-                    <MenuFlyout Placement="Bottom">
-                        <MenuFlyoutItem
-                            Click="RecognizeFileBatch_Click"
-                            Text="Batch - full transcript"
-                            ToolTipService.ToolTip="BatchRecognition.RecognizeFromFile: one-shot call that returns the complete transcript." />
-                        <MenuFlyoutItem
-                            Click="RecognizeFileStreaming_Click"
-                            Text="Streaming - incremental"
-                            ToolTipService.ToolTip="StreamingRecognition over the file: raises incremental Recognizing/Recognized events, like the microphone." />
-                    </MenuFlyout>
-                </DropDownButton.Flyout>
-            </DropDownButton>
             <Button
                 x:Name="ClearButton"
                 MinWidth="120"
diff --git a/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs b/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs
index 7a63cf19..398c44c6 100644
--- a/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs
+++ b/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs
@@ -45,6 +45,13 @@ internal sealed partial class SpeechRecognition : BaseSamplePage
     private string _finalText = string.Empty;
     private bool _isRecognizing;
 
+    private enum InputSource
+    {
+        Microphone,
+        FileBatch,
+        FileStreaming,
+    }
+
     public SpeechRecognition()
     {
         this.Unloaded += (_, _) => CleanUp();
@@ -103,14 +110,35 @@ private async void StartStopButton_Click(object sender, RoutedEventArgs e)
         if (_isRecognizing)
         {
             await StopRecognitionAsync();
+            return;
         }
-        else
+
+        switch (GetSelectedInputSource())
         {
-            await StartRecognitionAsync();
+            case InputSource.FileBatch:
+                await RecognizeFromFileAsync(streamMode: false);
+                break;
+            case InputSource.FileStreaming:
+                await RecognizeFromFileAsync(streamMode: true);
+                break;
+            default:
+                await StartMicrophoneRecognitionAsync();
+                break;
         }
     }
 
-    private async Task StartRecognitionAsync()
+    private InputSource GetSelectedInputSource()
+    {
+        var tag = (InputSourceComboBox.SelectedItem as FrameworkElement)?.Tag as string;
+        return tag switch
+        {
+            "FileBatch" => InputSource.FileBatch,
+            "FileStreaming" => InputSource.FileStreaming,
+            _ => InputSource.Microphone,
+        };
+    }
+
+    private async Task StartMicrophoneRecognitionAsync()
     {
         if (_speechModel == null)
         {
@@ -161,12 +189,16 @@ private async Task StopRecognitionAsync()
     {
         SendSampleInteractedEvent("StopSpeechRecognition");
 
+        // Cancel any in-flight file-streaming recognition (even during the pre-session transcode).
+        // Detaching handlers when a session exists freezes the transcript immediately;
+        // RecognizeFromFileAsync then drains and disposes the session safely.
         if (_fileStreamingRecognition is { } fileSession)
         {
             DetachHandlers(fileSession);
-            _fileStreamingCts?.Cancel();
         }
 
+        _fileStreamingCts?.Cancel();
+
         var streaming = _streamingRecognition;
         var sessionTask = _streamingSessionTask;
         _streamingRecognition = null;
@@ -236,16 +268,6 @@ private void HandleStreamingFailure(StreamingRecognition session, Exception ex)
         ShowException(ex, $"Speech recognition failed: {FormatError(ex)}");
     }
 
-    private async void RecognizeFileBatch_Click(object sender, RoutedEventArgs e)
-    {
-        await RecognizeFromFileAsync(streamMode: false);
-    }
-
-    private async void RecognizeFileStreaming_Click(object sender, RoutedEventArgs e)
-    {
-        await RecognizeFromFileAsync(streamMode: true);
-    }
-
     private async Task RecognizeFromFileAsync(bool streamMode)
     {
         if (_speechModel == null)
@@ -276,12 +298,22 @@ private async Task RecognizeFromFileAsync(bool streamMode)
             return;
         }
 
+        var canStop = streamMode;
+
         StorageFile? transcodedFile = null;
         StreamingRecognition? fileStreaming = null;
         CancellationTokenSource? fileCts = null;
         try
         {
-            UpdateUiState(running: true, status: $"Transcoding \"{file.Name}\" to 16 kHz mono...");
+            // For streaming, create the cancellation source up front so Stop works even during the
+            // pre-recognition transcode step.
+            if (streamMode)
+            {
+                fileCts = new CancellationTokenSource();
+                _fileStreamingCts = fileCts;
+            }
+
+            UpdateUiState(running: true, status: $"Transcoding \"{file.Name}\" to 16 kHz mono...", canStop: canStop);
             FinalTranscriptionTextBlock.Text = $"Transcoding \"{file.Name}\" to 16 kHz mono...";
             InterimTranscriptionTextBlock.Text = string.Empty;
             _finalText = string.Empty;
@@ -291,27 +323,27 @@ private async Task RecognizeFromFileAsync(bool streamMode)
 
             if (streamMode)
             {
-                UpdateUiState(running: true, status: $"Streaming recognition of \"{file.Name}\"...");
                 FinalTranscriptionTextBlock.Text = string.Empty;
 
-                fileStreaming = new StreamingRecognition(
-                    AudioConfiguration.FromFile(transcodedFile.Path),
-                    _speechModel);
-                fileStreaming.Recognizing += OnRecognizing;
-                fileStreaming.Recognized += OnRecognized;
-                _fileStreamingRecognition = fileStreaming;
-                fileCts = new CancellationTokenSource();
-                _fileStreamingCts = fileCts;
+                Task? playbackTask = null;
+                try
+                {
+                    fileCts!.Token.ThrowIfCancellationRequested();
 
-                // Play the picked file so the transcript can be followed as it streams in,
-                // rather than appearing in silence.
-                var playbackTask = StartFilePlayback(file);
+                    UpdateUiState(running: true, status: $"Streaming recognition of \"{file.Name}\"...", canStop: true);
 
-                var fileSessionTask = fileStreaming.StartContinuousRecognitionAsync().AsTask(fileCts.Token);
+                    fileStreaming = new StreamingRecognition(
+                        AudioConfiguration.FromFile(transcodedFile.Path),
+                        _speechModel);
+                    fileStreaming.Recognizing += OnRecognizing;
+                    fileStreaming.Recognized += OnRecognized;
+                    _fileStreamingRecognition = fileStreaming;
 
-                try
-                {
-                    await fileSessionTask;
+                    // Play the picked file so the transcript can be followed as it streams in,
+                    // rather than appearing in silence.
+                    playbackTask = StartFilePlayback(file);
+
+                    await fileStreaming.StartContinuousRecognitionAsync().AsTask(fileCts.Token);
                 }
                 catch (OperationCanceledException)
                 {
@@ -325,7 +357,10 @@ private async Task RecognizeFromFileAsync(bool streamMode)
                         FinalTranscriptionTextBlock.Text = "(no speech detected in file)";
                     }
 
-                    await playbackTask;
+                    if (playbackTask != null)
+                    {
+                        await playbackTask;
+                    }
 
                     UpdateUiState(running: false, status: $"Streaming recognition of \"{file.Name}\" completed.");
                 }
@@ -333,7 +368,7 @@ private async Task RecognizeFromFileAsync(bool streamMode)
             else
             {
                 // BatchRecognition returns the full transcript in a single call.
-                UpdateUiState(running: true, status: $"Recognizing from \"{file.Name}\"...");
+                UpdateUiState(running: true, status: $"Recognizing from \"{file.Name}\"...", canStop: false);
                 FinalTranscriptionTextBlock.Text = $"Recognizing from file: {file.Name}...";
 
                 using var batch = new BatchRecognition(_speechModel);
@@ -663,10 +698,13 @@ private void ClearButton_Click(object sender, RoutedEventArgs e)
         InterimTranscriptionTextBlock.Text = string.Empty;
     }
 
-    private void UpdateUiState(bool running, string? status)
+    private void UpdateUiState(bool running, string? status, bool canStop = true)
     {
-        StartStopButton.Content = running ? "Stop recognition" : "Start recognition";
-        FromFileButton.IsEnabled = !running;
+        InputSourceComboBox.IsEnabled = !running;
+        StartStopButton.IsEnabled = !running || canStop;
+        StartStopButton.Content = running
+            ? (canStop ? "Stop recognition" : "Recognizing...")
+            : "Start recognition";
 
         if (string.IsNullOrEmpty(status))
         {

From 8729b46423f6949b8c6548b7eb914464a7c6d60b Mon Sep 17 00:00:00 2001
From: Hao Liu <liuhao3418@gmail.com>
Date: Mon, 15 Jun 2026 11:14:20 +0800
Subject: [PATCH 08/10] Replace CA1416 pragma with runtime OS version check

---
 AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs b/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs
index 398c44c6..0fe0413c 100644
--- a/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs
+++ b/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs
@@ -603,9 +603,16 @@ private static void RewriteWavAsCanonicalPcm(string sourcePath, string destPath)
 
     private async Task<bool> EnsureMicrophoneAccessAsync()
     {
+        // The AppCapability microphone-permission API requires Windows 10 1903 (build 18362).
+        // On older builds it isn't available, so skip the explicit check and let recognition
+        // proceed; the start call will surface any genuine access failure.
+        if (!OperatingSystem.IsWindowsVersionAtLeast(10, 0, 18362))
+        {
+            return true;
+        }
+
         try
         {
-#pragma warning disable CA1416
             var capability = AppCapability.Create("microphone");
             if (capability != null)
             {
@@ -621,7 +628,6 @@ private async Task<bool> EnsureMicrophoneAccessAsync()
                     return false;
                 }
             }
-#pragma warning restore CA1416
         }
         catch (UnauthorizedAccessException)
         {

From 6f80bfb4d81edc354943e14b8ac9320683447723 Mon Sep 17 00:00:00 2001
From: Hao Liu <liuhao3418@gmail.com>
Date: Mon, 15 Jun 2026 13:33:36 +0800
Subject: [PATCH 09/10] Stream WAV data instead of buffering the whole file in
 memory

---
 .../Samples/WCRAPIs/SpeechRecognition.xaml.cs | 108 +++++++++++-------
 1 file changed, 68 insertions(+), 40 deletions(-)

diff --git a/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs b/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs
index 0fe0413c..9595c56e 100644
--- a/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs
+++ b/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs
@@ -523,42 +523,57 @@ private static async Task<StorageFile> TranscodeTo16kMonoCanonicalWavAsync(Stora
 
     private static void RewriteWavAsCanonicalPcm(string sourcePath, string destPath)
     {
-        var src = File.ReadAllBytes(sourcePath);
-        if (src.Length < 12 || Encoding.ASCII.GetString(src, 0, 4) != "RIFF" || Encoding.ASCII.GetString(src, 8, 4) != "WAVE")
+        // Stream the file rather than reading it fully into memory: a long recording can produce a
+        // large PCM data chunk, so we parse the header by seeking and copy the audio in bounded chunks.
+        using var source = new FileStream(sourcePath, FileMode.Open, FileAccess.Read, FileShare.Read);
+
+        var header = new byte[12];
+        if (source.ReadAtLeast(header, 12, throwOnEndOfStream: false) < 12
+            || Encoding.ASCII.GetString(header, 0, 4) != "RIFF"
+            || Encoding.ASCII.GetString(header, 8, 4) != "WAVE")
         {
             throw new InvalidOperationException("Source file is not a RIFF/WAVE.");
         }
 
         ushort audioFormat = 0, channels = 0, blockAlign = 0, bitsPerSample = 0;
         uint sampleRate = 0, byteRate = 0;
-        int dataOffset = -1, dataSize = 0;
+        long dataOffset = -1;
+        uint dataSize = 0;
 
-        int offset = 12;
-        while (offset + 8 <= src.Length)
+        var chunkHeader = new byte[8];
+        while (source.ReadAtLeast(chunkHeader, 8, throwOnEndOfStream: false) == 8)
         {
-            var chunkId = Encoding.ASCII.GetString(src, offset, 4);
-            var chunkSize = (int)BitConverter.ToUInt32(src, offset + 4);
+            var chunkId = Encoding.ASCII.GetString(chunkHeader, 0, 4);
+            var chunkSize = BitConverter.ToUInt32(chunkHeader, 4);
 
             if (chunkId == "fmt " && chunkSize >= 16)
             {
-                audioFormat = BitConverter.ToUInt16(src, offset + 8);
-                channels = BitConverter.ToUInt16(src, offset + 10);
-                sampleRate = BitConverter.ToUInt32(src, offset + 12);
-                byteRate = BitConverter.ToUInt32(src, offset + 16);
-                blockAlign = BitConverter.ToUInt16(src, offset + 20);
-                bitsPerSample = BitConverter.ToUInt16(src, offset + 22);
+                var fmt = new byte[16];
+                if (source.ReadAtLeast(fmt, 16, throwOnEndOfStream: false) < 16)
+                {
+                    break;
+                }
+
+                audioFormat = BitConverter.ToUInt16(fmt, 0);
+                channels = BitConverter.ToUInt16(fmt, 2);
+                sampleRate = BitConverter.ToUInt32(fmt, 4);
+                byteRate = BitConverter.ToUInt32(fmt, 8);
+                blockAlign = BitConverter.ToUInt16(fmt, 12);
+                bitsPerSample = BitConverter.ToUInt16(fmt, 14);
+
+                // Skip any remaining fmt bytes (e.g. extensible headers) plus the pad byte.
+                source.Seek((chunkSize - 16) + (chunkSize & 1), SeekOrigin.Current);
             }
             else if (chunkId == "data")
             {
-                dataOffset = offset + 8;
+                dataOffset = source.Position;
                 dataSize = chunkSize;
                 break;
             }
-
-            offset += 8 + chunkSize;
-            if ((chunkSize & 1) == 1)
+            else
             {
-                offset += 1;
+                // Skip this chunk's body plus its pad byte if the size is odd.
+                source.Seek(chunkSize + (chunkSize & 1), SeekOrigin.Current);
             }
         }
 
@@ -567,7 +582,7 @@ private static void RewriteWavAsCanonicalPcm(string sourcePath, string destPath)
             throw new InvalidOperationException($"Source WAV is not WAVE_FORMAT_PCM (got 0x{audioFormat:X4}).");
         }
 
-        if (dataOffset < 0 || dataSize <= 0)
+        if (dataOffset < 0 || dataSize == 0)
         {
             throw new InvalidOperationException("Source WAV has no data chunk.");
         }
@@ -577,28 +592,41 @@ private static void RewriteWavAsCanonicalPcm(string sourcePath, string destPath)
             throw new InvalidOperationException($"Source WAV is not 16-bit PCM (got {bitsPerSample}).");
         }
 
+        using var dest = new FileStream(destPath, FileMode.Create, FileAccess.Write, FileShare.None);
+
         const int CanonicalFmtSize = 16;
-        int canonicalSize = 12 + 8 + CanonicalFmtSize + 8 + dataSize;
-        var dst = new byte[canonicalSize];
-
-        Encoding.ASCII.GetBytes("RIFF").CopyTo(dst, 0);
-        BitConverter.GetBytes((uint)(canonicalSize - 8)).CopyTo(dst, 4);
-        Encoding.ASCII.GetBytes("WAVE").CopyTo(dst, 8);
-
-        Encoding.ASCII.GetBytes("fmt ").CopyTo(dst, 12);
-        BitConverter.GetBytes((uint)CanonicalFmtSize).CopyTo(dst, 16);
-        BitConverter.GetBytes((ushort)1).CopyTo(dst, 20);
-        BitConverter.GetBytes(channels).CopyTo(dst, 22);
-        BitConverter.GetBytes(sampleRate).CopyTo(dst, 24);
-        BitConverter.GetBytes(byteRate).CopyTo(dst, 28);
-        BitConverter.GetBytes(blockAlign).CopyTo(dst, 32);
-        BitConverter.GetBytes(bitsPerSample).CopyTo(dst, 34);
-
-        Encoding.ASCII.GetBytes("data").CopyTo(dst, 36);
-        BitConverter.GetBytes((uint)dataSize).CopyTo(dst, 40);
-
-        Buffer.BlockCopy(src, dataOffset, dst, 44, dataSize);
-        File.WriteAllBytes(destPath, dst);
+        var canonicalHeader = new byte[44];
+        Encoding.ASCII.GetBytes("RIFF").CopyTo(canonicalHeader, 0);
+        BitConverter.GetBytes(36u + dataSize).CopyTo(canonicalHeader, 4);
+        Encoding.ASCII.GetBytes("WAVE").CopyTo(canonicalHeader, 8);
+        Encoding.ASCII.GetBytes("fmt ").CopyTo(canonicalHeader, 12);
+        BitConverter.GetBytes((uint)CanonicalFmtSize).CopyTo(canonicalHeader, 16);
+        BitConverter.GetBytes((ushort)1).CopyTo(canonicalHeader, 20);
+        BitConverter.GetBytes(channels).CopyTo(canonicalHeader, 22);
+        BitConverter.GetBytes(sampleRate).CopyTo(canonicalHeader, 24);
+        BitConverter.GetBytes(byteRate).CopyTo(canonicalHeader, 28);
+        BitConverter.GetBytes(blockAlign).CopyTo(canonicalHeader, 32);
+        BitConverter.GetBytes(bitsPerSample).CopyTo(canonicalHeader, 34);
+        Encoding.ASCII.GetBytes("data").CopyTo(canonicalHeader, 36);
+        BitConverter.GetBytes(dataSize).CopyTo(canonicalHeader, 40);
+        dest.Write(canonicalHeader, 0, canonicalHeader.Length);
+
+        // Copy the PCM samples across in bounded chunks so the whole file is never held in memory.
+        source.Seek(dataOffset, SeekOrigin.Begin);
+        var buffer = new byte[81920];
+        long remaining = dataSize;
+        while (remaining > 0)
+        {
+            int toRead = (int)Math.Min(buffer.Length, remaining);
+            int read = source.Read(buffer, 0, toRead);
+            if (read == 0)
+            {
+                throw new InvalidOperationException("Source WAV data chunk is truncated.");
+            }
+
+            dest.Write(buffer, 0, read);
+            remaining -= read;
+        }
     }
 
     private async Task<bool> EnsureMicrophoneAccessAsync()

From 8b1ca700e30ff985456c68b9f23e0612464fea97 Mon Sep 17 00:00:00 2001
From: Hao Liu <liuhao3418@gmail.com>
Date: Mon, 15 Jun 2026 14:55:33 +0800
Subject: [PATCH 10/10] Defer speech model disposal to avoid crash when
 navigating away mid-recognition

---
 .../Samples/WCRAPIs/SpeechRecognition.xaml.cs | 34 ++++++++++++++-----
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs b/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs
index 9595c56e..4d51cad6 100644
--- a/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs
+++ b/AIDevGallery/Samples/WCRAPIs/SpeechRecognition.xaml.cs
@@ -39,6 +39,7 @@ internal sealed partial class SpeechRecognition : BaseSamplePage
     private Task? _streamingSessionTask;
     private StreamingRecognition? _fileStreamingRecognition;
     private CancellationTokenSource? _fileStreamingCts;
+    private Task? _fileRecognitionTask;
     private MediaPlayer? _filePlaybackPlayer;
     private TaskCompletionSource<bool>? _filePlaybackCompletion;
 
@@ -303,6 +304,11 @@ private async Task RecognizeFromFileAsync(bool streamMode)
         StorageFile? transcodedFile = null;
         StreamingRecognition? fileStreaming = null;
         CancellationTokenSource? fileCts = null;
+
+        // Mark that a file recognition is in flight so CleanUp can tell whether it's safe to dispose
+        // the shared model: disposing it while the native engine is still draining faults the engine.
+        var completion = new TaskCompletionSource();
+        _fileRecognitionTask = completion.Task;
         try
         {
             // For streaming, create the cancellation source up front so Stop works even during the
@@ -411,6 +417,13 @@ private async Task RecognizeFromFileAsync(bool streamMode)
             }
 
             await TryDeleteAsync(transcodedFile);
+
+            // Mark the operation complete so a later CleanUp knows no file recognition is in flight.
+            completion.TrySetResult();
+            if (ReferenceEquals(_fileRecognitionTask, completion.Task))
+            {
+                _fileRecognitionTask = null;
+            }
         }
     }
 
@@ -793,20 +806,22 @@ private void CleanUp()
 
         var streaming = _streamingRecognition;
         var sessionTask = _streamingSessionTask;
+        var fileRecognitionTask = _fileRecognitionTask;
         var model = _speechModel;
 
         _streamingRecognition = null;
         _streamingSessionTask = null;
+        _fileRecognitionTask = null;
         _speechModel = null;
         _isRecognizing = false;
 
-        if (streaming == null && model == null)
+        if (streaming == null && model == null && fileRecognitionTask == null)
         {
             return;
         }
 
         // Tear down off the UI thread (a synchronous wait would deadlock the DispatcherQueue), stopping
-        // and awaiting the session before disposal to avoid corrupting the on-disk model cache.
+        // and awaiting the streaming session before disposal to avoid corrupting the on-disk model cache; the model is left undisposed while a file recognition is still in flight.
         _ = Task.Run(async () =>
         {
             if (streaming != null)
@@ -831,13 +846,16 @@ await sessionTask.WaitAsync(TimeSpan.FromSeconds(5))
                 }
             }
 
-            try
-            {
-                model?.Dispose();
-            }
-            catch (Exception ex)
+            if (fileRecognitionTask == null)
             {
-                Debug.WriteLine($"[SpeechRecognition] Model cleanup threw: {ex.Message}");
+                try
+                {
+                    model?.Dispose();
+                }
+                catch (Exception ex)
+                {
+                    Debug.WriteLine($"[SpeechRecognition] Model cleanup threw: {ex.Message}");
+                }
             }
         });
     }