diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cd6a3fb..11eaad9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -163,6 +163,12 @@ jobs:
             examples/mnist_cnn/data/raw
           key: datasets-raw-${{ hashFiles('examples/har_classifier/prepare_data.py', 'examples/ecg_anomaly_ae/prepare_data.py', 'examples/mnist_mlp/prepare_data.py', 'examples/mnist_cnn/prepare_data.py') }}
 
+      - name: Cache SpeechCommands raw download (shared, ~2.3 GB)
+        uses: actions/cache@v4
+        with:
+          path: examples/_shared/data/speech_commands  # RAW_ROOT in examples/kws_mfcc/prepare_data.py
+          key: speechcommands-raw-${{ hashFiles('examples/_shared/speechcommands_data.py') }}  # new key when the loader changes
+
       - name: Prepare HAR data
         run: uv run examples/har_classifier/prepare_data.py
 
@@ -187,6 +193,20 @@ jobs:
       - name: Train PyTorch MNIST CNN (produces reference predictions + weights)
         run: uv run examples/mnist_cnn/train_pytorch.py
 
+      - name: Cache kws_mfcc processed data (6-class)
+        id: kws-mfcc-cache  # read by the "Prepare kws_mfcc data" step's if: condition
+        uses: actions/cache@v4
+        with:
+          path: examples/kws_mfcc/data/6class  # where prepare_data.py writes for the default KWS_CLASSES=6
+          key: kws-mfcc-6class-${{ hashFiles('examples/kws_mfcc/prepare_data.py', 'examples/_shared/speechcommands_data.py') }}
+
+      - name: Prepare kws_mfcc data (6-class; only on cache miss)
+        if: steps.kws-mfcc-cache.outputs.cache-hit != 'true'  # exact-key hit restored the .npy files, so skip the rebuild
+        run: uv run examples/kws_mfcc/prepare_data.py
+
+      - name: Train PyTorch kws_mfcc (produces reference predictions + weights)
+        run: uv run examples/kws_mfcc/train_pytorch.py  # always runs; its outputs feed the BIT_PARITY steps
+
       - name: Configure
         run: cmake --preset examples
 
@@ -238,6 +258,16 @@ jobs:
             --c examples/mnist_cnn/outputs/c_predictions.npy \
             --dtype int32
 
+      - name: Run kws_mfcc in BIT_PARITY mode
+        run: BIT_PARITY=1 build/examples/examples/kws_mfcc/train_c_kws_mfcc  # loads the PyTorch weights, skips training, writes c_predictions.npy
+
+      - name: Diff kws_mfcc predictions (int32, exact match required)
+        run: |
+          uv run examples/_shared/compare_predictions.py \
+            --pytorch examples/kws_mfcc/outputs/6class/pytorch_predictions.npy \
+            --c examples/kws_mfcc/outputs/6class/c_predictions.npy \
+            --dtype int32
+
   python-test:
     runs-on: ubuntu-latest
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 0eb6d73..d34dd41 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -3,3 +3,4 @@ add_subdirectory(har_classifier)
 add_subdirectory(ecg_anomaly_ae)
 add_subdirectory(mnist_mlp)
 add_subdirectory(mnist_cnn)
+add_subdirectory(kws_mfcc)
diff --git a/examples/_shared/speechcommands_data.py b/examples/_shared/speechcommands_data.py
new file mode 100644
index 0000000..4cb301e
--- /dev/null
+++ b/examples/_shared/speechcommands_data.py
@@ -0,0 +1,153 @@
+"""Shared SpeechCommands loader for the kws_mfcc and kws_raw examples.
+
+Wraps torchaudio.datasets.SPEECHCOMMANDS (v0.02) so both KWS examples download
+the ~2.3 GB corpus once into a shared raw root and deliver identical waveform
+arrays. Output is the native 16 kHz mono waveform (float32 in [-1, 1], the range
+torchaudio yields from the int16 PCM), zero-padded or truncated to exactly 16000 samples.
+Feature extraction (MFCC) and downsampling are the model's job, not the loader's,
+per the repo's data-shape convention.
+
+    load_speechcommands(root, num_classes) -> dict
+        num_classes in {6, 35}
+        returns {"train": (x, y), "val": (x, y), "test": (x, y)}
+            x: float32 [N, 1, 16000]
+            y: int32   [N]  (0..num_classes-1)
+
+6-class config (labels 0..5, fixed order):
+    0 yes  1 no  2 up  3 down
+    4 silence  -- synthetic low-amplitude Gaussian noise (fixed per-split seed)
+    5 unknown  -- random clips drawn from the other 31 keywords (fixed per-split seed)
+35-class config (labels 0..34): the 35 natural keywords, alphabetical. No synthetic classes.
+"""
+from __future__ import annotations
+
+import wave
+from pathlib import Path
+
+import numpy as np
+from torchaudio.datasets import SPEECHCOMMANDS
+
+SAMPLE_RATE = 16000  # Hz; _paths_by_label asserts every clip's metadata reports this rate
+CLIP_LEN = 16000  # samples per clip after _fix_length (1 s at SAMPLE_RATE)
+KEYWORDS_6 = ["yes", "no", "up", "down"]  # list order gives label ids 0..3 in the 6-class config
+SILENCE_STD = 0.05  # std of the Gaussian noise used for the synthetic silence class
+SHUFFLE_SEED = 42  # mirrors examples/_shared/seeds.py; kept local to avoid an import cycle
+_SUBSETS = {"train": "training", "val": "validation", "test": "testing"}  # split -> SPEECHCOMMANDS subset
+
+
+def _fix_length(wav: np.ndarray) -> np.ndarray:
+    """Pad with zeros / truncate a mono waveform to exactly CLIP_LEN samples."""
+    n = wav.shape[0]
+    if n == CLIP_LEN:
+        return wav
+    if n > CLIP_LEN:
+        return wav[:CLIP_LEN]
+    out = np.zeros(CLIP_LEN, dtype=np.float32)
+    out[:n] = wav
+    return out
+
+
+def _read_wav_int16(path) -> np.ndarray:
+    """Read a 16 kHz mono 16-bit PCM .wav as float32 in [-1, 1] (stdlib only).
+
+    torchaudio 2.11 (maintenance mode) routes its dataset decode through
+    torchcodec, which needs a system FFmpeg. We sidestep that with the stdlib
+    `wave` reader the spec blessed as the fallback: int16 PCM / 32768 reproduces
+    exactly what torchaudio/torchcodec would yield from these clips.
+    """
+    with wave.open(str(path), "rb") as w:
+        assert w.getnchannels() == 1 and w.getsampwidth() == 2, (
+            f"{path}: expected mono 16-bit PCM, got "
+            f"{w.getnchannels()}ch/{w.getsampwidth() * 8}bit (int16/32768 decode would be wrong)"
+        )
+        frames = w.readframes(w.getnframes())
+    return np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32768.0
+
+
+def _paths_by_label(ds) -> dict[str, list[Path]]:
+    """Map each label string to its list of absolute .wav paths for a subset.
+
+    Uses ds.get_metadata (which does NOT decode audio, so no torchcodec / FFmpeg
+    dependency); the metadata path is relative to ds._archive (pinned to
+    torchaudio 2.11's SPEECHCOMMANDS layout). Returning paths instead of decoded
+    waveforms lets the 6-class build decode only the clips it keeps, bounding
+    peak memory (the CI runner has ~7 GB; decoding all 35 words would exceed it).
+    """
+    by_label: dict[str, list[Path]] = {}
+    archive = Path(ds._archive)
+    for i in range(len(ds)):
+        relpath, sample_rate, label, *_ = ds.get_metadata(i)
+        assert sample_rate == SAMPLE_RATE, sample_rate
+        by_label.setdefault(label, []).append(archive / relpath)
+    return by_label
+
+
+def _decode(paths: list[Path]) -> list[np.ndarray]:
+    """Decode + length-fix a list of .wav paths to float32 [16000] waveforms."""
+    return [_fix_length(_read_wav_int16(p)) for p in paths]
+
+
+def _stack(clips: list[np.ndarray], label_id: int) -> tuple[np.ndarray, np.ndarray]:
+    x = np.stack(clips).astype(np.float32)[:, None, :]  # [N, 1, 16000]
+    y = np.full((x.shape[0],), label_id, dtype=np.int32)
+    return x, y
+
+
+def _build_split_6(paths_by_label, split_index: int) -> tuple[np.ndarray, np.ndarray]:
+    xs, ys = [], []
+    for label_id, kw in enumerate(KEYWORDS_6):
+        x, y = _stack(_decode(paths_by_label.get(kw, [])), label_id)
+        xs.append(x)
+        ys.append(y)
+    n_per = int(round(np.mean([len(paths_by_label.get(kw, [])) for kw in KEYWORDS_6])))
+
+    rng = np.random.default_rng(SHUFFLE_SEED + split_index)
+    # silence (label 4): synthetic low-amplitude Gaussian noise
+    silence = rng.normal(0.0, SILENCE_STD, size=(n_per, CLIP_LEN)).astype(np.float32)
+    silence = np.clip(silence, -1.0, 1.0)
+    xs.append(silence[:, None, :])
+    ys.append(np.full((n_per,), 4, dtype=np.int32))
+    # unknown (label 5): random draw of paths from the other 31 keywords in THIS
+    # split, decoding only the selected clips (memory-bounded).
+    pool = [p for lab, ps in paths_by_label.items() if lab not in KEYWORDS_6 for p in ps]
+    idx = rng.choice(len(pool), size=min(n_per, len(pool)), replace=False)
+    unknown = np.stack(_decode([pool[i] for i in idx])).astype(np.float32)
+    xs.append(unknown[:, None, :])
+    ys.append(np.full((unknown.shape[0],), 5, dtype=np.int32))
+
+    return np.concatenate(xs, axis=0), np.concatenate(ys, axis=0)
+
+
+def _build_split_35(paths_by_label, keywords_35) -> tuple[np.ndarray, np.ndarray]:
+    xs, ys = [], []
+    for label_id, kw in enumerate(keywords_35):
+        paths = paths_by_label.get(kw, [])
+        if not paths:
+            continue
+        x, y = _stack(_decode(paths), label_id)
+        xs.append(x)
+        ys.append(y)
+    return np.concatenate(xs, axis=0), np.concatenate(ys, axis=0)
+
+
+def load_speechcommands(root, num_classes: int) -> dict:
+    assert num_classes in (6, 35), num_classes
+    root = Path(root)
+    root.mkdir(parents=True, exist_ok=True)
+
+    grouped = {}
+    for split, subset in _SUBSETS.items():
+        ds = SPEECHCOMMANDS(root=str(root), download=True, subset=subset)
+        grouped[split] = _paths_by_label(ds)
+
+    if num_classes == 35:
+        keywords_35 = sorted({lab for g in grouped.values() for lab in g})
+        assert len(keywords_35) == 35, (len(keywords_35), keywords_35)
+
+    out = {}
+    for split_index, split in enumerate(("train", "val", "test")):
+        if num_classes == 6:
+            out[split] = _build_split_6(grouped[split], split_index)
+        else:
+            out[split] = _build_split_35(grouped[split], keywords_35)
+    return out
diff --git a/examples/kws_mfcc/CMakeLists.txt b/examples/kws_mfcc/CMakeLists.txt
new file mode 100644
index 0000000..42ce7b3
--- /dev/null
+++ b/examples/kws_mfcc/CMakeLists.txt
@@ -0,0 +1,65 @@
+add_executable(train_c_kws_mfcc train_c.c)  # binary run by CI's "Run kws_mfcc in BIT_PARITY mode" step
+
+target_link_libraries(train_c_kws_mfcc PRIVATE
+        DataLoaderApi
+        DataLoader
+        NPYLoaderApi
+        NPYLoader
+
+        Layer
+
+        Conv1dApi
+        Conv1d
+
+        LinearApi
+        Linear
+
+        ReluApi
+        Relu
+
+        FlattenApi
+        Flatten
+
+        Pool1dApi
+        MaxPool1d
+        AvgPool1d
+
+        AdaptivePool1dApi
+        AdaptiveAvgPool1d
+
+        QuantizationApi
+        Quantization
+
+        TensorApi
+        Tensor
+        Rounding
+
+        TrainingLoopApi
+        CalculateGradsSequential
+        TrainingBatchDefault
+        TrainingEpochDefault
+        Optimizer
+
+        LossFunction
+        CrossEntropy
+
+        SoftmaxApi
+        Softmax
+
+        Sgd
+        SgdApi
+
+        InferenceApi
+
+        StateDictApi
+        LayerWeightsApi
+        LayerQuant
+        LayerCommon
+        Distributions
+
+        Common
+        StorageApi
+        RNG
+
+        examples_shared
+)
diff --git a/examples/kws_mfcc/README.md b/examples/kws_mfcc/README.md
new file mode 100644
index 0000000..69bda41
--- /dev/null
+++ b/examples/kws_mfcc/README.md
@@ -0,0 +1,54 @@
+# KWS MFCC — PyTorch + C Parity Demo
+
+Trains a small 1D-CNN keyword-spotter on Google SpeechCommands MFCC features in
+both PyTorch (reference) and the ODT C framework. Stage 3 of the 1D-CNN example
+suite. Each 1 s clip → log-MFCC `[40, 32]` (40 mel-cepstra × 32 frames); MFCC is
+computed once in `prepare_data.py` so PyTorch and C read **identical** `.npy` —
+feature extraction sits outside the parity check.
+
+One binary, two verification modes — **bit-parity** (`BIT_PARITY=1`, the exact CI
+gate: loads PyTorch's trained weights and runs inference only; C predictions must
+be bit-identical) and a **train-from-scratch** informational demo (independent
+random init; `compare.py` checks convergence within tolerance + emits plots).
+
+## Class-count knob
+
+`KWS_CLASSES` (default **6**) selects the subset. CI runs **6-class only**; 35 is
+local-only. Per-config artifacts live under `<n>class/` subdirs.
+
+- **6-class** (labels 0..5): `yes`, `no`, `up`, `down`, `silence` (synthetic
+  low-amplitude Gaussian noise), `unknown` (random clips from the other 31 keywords).
+- **35-class**: the 35 natural keywords, alphabetical.
+
+## Run it (6-class)
+
+```bash
+uv run python examples/kws_mfcc/prepare_data.py        # downloads ~2.3 GB once (shared root)
+uv run python examples/kws_mfcc/train_pytorch.py
+cmake --preset examples
+cmake --build --preset examples --target train_c_kws_mfcc
+
+# Bit-parity (exact — the CI gate)
+BIT_PARITY=1 ./build/examples/examples/kws_mfcc/train_c_kws_mfcc
+uv run python examples/_shared/compare_predictions.py \
+  --pytorch examples/kws_mfcc/outputs/6class/pytorch_predictions.npy \
+  --c examples/kws_mfcc/outputs/6class/c_predictions.npy --dtype int32
+
+# …or the train-from-scratch demo + plots (SLOW — C trains one sample at a time)
+./build/examples/examples/kws_mfcc/train_c_kws_mfcc
+uv run python examples/kws_mfcc/compare.py
+```
+
+Run the full 35-class set with `KWS_CLASSES=35 …` on every command (local-only).
+
+## Model
+
+- Input: `[40, 32]` (40 MFCC channels, 32 frames) → `reshapeItemsAddBatchDim` → `[1, 40, 32]`
+- `Conv1d(40→32,K3,SAME) → ReLU → MaxPool(2) → Conv1d(32→64,K3,SAME) → ReLU →
+  MaxPool(2) → AdaptiveAvgPool1d(1) → Flatten → Linear(64→C) → Softmax → CE`
+- Lengths: 32 → 16 → 8 → 1; ~10.5 K params (6-class; ~12.4 K with 35 classes)
+- State-dict layers: `conv1`, `conv2`, `fc`
+
+The train-from-scratch tolerances (`test_acc ±2.5 pp`, `test_loss ±0.15 nats`) are
+informational; bit-parity mode requires exact equality. See
+`examples/_shared/DETERMINISM.md` for the determinism contract.
diff --git a/examples/kws_mfcc/compare.py b/examples/kws_mfcc/compare.py
new file mode 100644
index 0000000..aed9da3
--- /dev/null
+++ b/examples/kws_mfcc/compare.py
@@ -0,0 +1,88 @@
+"""Compare PyTorch and C runs of the kws_mfcc classifier.
+
+Reads logs/<n>class/{pytorch,c}.json and outputs/<n>class/{pytorch,c}_predictions.npy.
+Writes plots into plots/<n>class/. Prints a final-state parity report within tolerances.
+INFORMATIONAL only — the bit-parity check (compare_predictions.py) is the gate.
+"""
+from __future__ import annotations
+
+import os
+import sys
+from pathlib import Path
+
+import numpy as np
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(REPO_ROOT))
+
+from examples._shared.log_schema import load_log  # noqa: E402
+from examples._shared.parity import ParityCheck, run_parity_checks  # noqa: E402
+from examples._shared.plotting import (  # noqa: E402
+    plot_accuracy_curves,
+    plot_confusion_matrix,
+    plot_loss_curves,
+)
+
+HERE = Path(__file__).resolve().parent
+NUM_CLASSES = int(os.environ.get("KWS_CLASSES", "6"))  # class-count knob; 6 when unset
+assert NUM_CLASSES in (6, 35), NUM_CLASSES
+TAG = f"{NUM_CLASSES}class"  # per-config subdirectory name, e.g. "6class"
+LOGS = HERE / "logs" / TAG
+OUTPUTS = HERE / "outputs" / TAG
+PLOTS = HERE / "plots" / TAG
+DATA = HERE / "data" / TAG
+
+CLASS_NAMES = (  # confusion-matrix axis labels; the 35-class config uses the numeric ids "0".."34"
+    ["yes", "no", "up", "down", "silence", "unknown"]
+    if NUM_CLASSES == 6
+    else [str(i) for i in range(NUM_CLASSES)]
+)
+
+CHECKS = [  # tolerances applied by run_parity_checks in main()
+    ParityCheck("test_acc", abs_tol=0.025),   # ±2.5 pp
+    ParityCheck("test_loss", abs_tol=0.15),   # ±0.15 nats (informational)
+]
+
+
+def confusion_matrix(preds: np.ndarray, labels: np.ndarray, num_classes: int) -> np.ndarray:
+    cm = np.zeros((num_classes, num_classes), dtype=np.int64)
+    for p, a in zip(preds, labels):
+        cm[int(p), int(a)] += 1
+    return cm
+
+
+def main() -> int:
+    """Write the comparison plots and print the parity report; returns 0 on pass, 1 on fail."""
+    PLOTS.mkdir(parents=True, exist_ok=True)
+    pt = load_log(LOGS / "pytorch.json")
+    c = load_log(LOGS / "c.json")
+    plot_loss_curves(PLOTS / "loss_curves.png", pt, c)
+    plot_accuracy_curves(PLOTS / "accuracy_curves.png", pt, c)
+
+    test_y = np.load(DATA / "test_y.npy")  # ground-truth labels written by prepare_data.py
+    pt_pred = np.load(OUTPUTS / "pytorch_predictions.npy")
+    c_pred = np.load(OUTPUTS / "c_predictions.npy")
+    cm_pt = confusion_matrix(pt_pred, test_y, len(CLASS_NAMES))
+    cm_c = confusion_matrix(c_pred, test_y, len(CLASS_NAMES))
+    plot_confusion_matrix(PLOTS / "confusion_matrix_pt.png", cm_pt, CLASS_NAMES, "PyTorch KWS MFCC")
+    plot_confusion_matrix(PLOTS / "confusion_matrix_c.png", cm_c, CLASS_NAMES, "C KWS MFCC")
+
+    pt_finals = pt["final"]  # each log's "final" object holds test_loss / test_acc
+    c_finals = c["final"]
+    overall_pass, results = run_parity_checks(
+        CHECKS,
+        {"test_acc": pt_finals["test_acc"], "test_loss": pt_finals["test_loss"]},
+        {"test_acc": c_finals["test_acc"], "test_loss": c_finals["test_loss"]},
+    )
+
+    print("\nParity report (PyTorch vs C) — INFORMATIONAL:")
+    print(f"{'metric':<14} {'pt':>10} {'c':>10} {'diff':>10} {'tol':>8} {'type':>5} {'pass':>6}")
+    for r in results:
+        print(f"{r.metric:<14} {r.pt_value:>10.5f} {r.c_value:>10.5f} {r.diff:>10.5f} "
+              f"{r.tolerance:>8.4f} {r.tolerance_type:>5} {str(r.passed):>6}")
+    print(f"\nOverall: {'PASS' if overall_pass else 'FAIL'} (informational; not a CI gate)")
+    return 0 if overall_pass else 1  # becomes the process exit status via sys.exit(main())
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/examples/kws_mfcc/prepare_data.py b/examples/kws_mfcc/prepare_data.py
new file mode 100644
index 0000000..0549c6f
--- /dev/null
+++ b/examples/kws_mfcc/prepare_data.py
@@ -0,0 +1,68 @@
+"""Prepare SpeechCommands MFCC features for the kws_mfcc example.
+
+For each clip: log-MFCC via torchaudio (n_mfcc=40, n_fft=400, hop=512, n_mels=40)
+over the native 16 kHz waveform -> [40, 32] frames (T=32 exact, no trim).
+
+Output (under examples/kws_mfcc/data/<n>class/, n = KWS_CLASSES in {6,35}, default 6):
+  {train,val,test}_x.npy  [N,40,32] f32
+  {train,val,test}_y.npy  [N] i32  (0..n-1)
+"""
+from __future__ import annotations
+
+import os
+import sys
+from pathlib import Path
+
+import numpy as np
+import torch
+from torchaudio.transforms import MFCC
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(REPO_ROOT))
+from examples._shared.speechcommands_data import load_speechcommands  # noqa: E402
+
+HERE = Path(__file__).resolve().parent
+RAW_ROOT = REPO_ROOT / "examples" / "_shared" / "data" / "speech_commands"  # shared raw-download root
+N_MFCC = 40  # cepstral coefficients per frame; also passed as n_mels
+T_FRAMES = 32  # frames kept per clip (output time axis)
+
+
+def _mfcc_features(x: np.ndarray) -> np.ndarray:
+    """x: [N,1,16000] f32 waveform -> [N,40,32] f32 MFCC (frame axis fixed to 32)."""
+    mfcc = MFCC(
+        sample_rate=16000,
+        n_mfcc=N_MFCC,
+        melkwargs={"n_fft": 400, "hop_length": 512, "n_mels": N_MFCC},
+    )
+    feats = np.empty((x.shape[0], N_MFCC, T_FRAMES), dtype=np.float32)
+    with torch.no_grad():
+        for i in range(x.shape[0]):
+            m = mfcc(torch.from_numpy(x[i]))  # [1,40,frames]
+            m = m.squeeze(0).numpy().astype(np.float32)  # [40,frames]
+            if m.shape[1] >= T_FRAMES:
+                m = m[:, :T_FRAMES]
+            else:
+                pad = np.zeros((N_MFCC, T_FRAMES), dtype=np.float32)
+                pad[:, : m.shape[1]] = m
+                m = pad
+            feats[i] = m
+    return feats
+
+
+def main() -> None:
+    """Build MFCC features for KWS_CLASSES (default 6) and save {split}_{x,y}.npy files."""
+    num_classes = int(os.environ.get("KWS_CLASSES", "6"))
+    assert num_classes in (6, 35), num_classes
+    data_dir = HERE / "data" / f"{num_classes}class"
+    data_dir.mkdir(parents=True, exist_ok=True)
+    splits = load_speechcommands(RAW_ROOT, num_classes)  # {"train"|"val"|"test": (waveforms, labels)}
+    for split in ("train", "val", "test"):
+        x_wav, y = splits[split]
+        x = _mfcc_features(x_wav)
+        np.save(data_dir / f"{split}_x.npy", x)
+        np.save(data_dir / f"{split}_y.npy", y.astype(np.int32))
+        print(f"{split}: x={x.shape} y={y.shape} classes={num_classes}", flush=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/kws_mfcc/train_c.c b/examples/kws_mfcc/train_c.c
new file mode 100644
index 0000000..2b0c81c
--- /dev/null
+++ b/examples/kws_mfcc/train_c.c
@@ -0,0 +1,408 @@
+#define SOURCE_FILE "kws_mfcc_train_c"
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <time.h>
+
+#include "AdaptivePool1dApi.h"
+#include "CalculateGradsSequential.h"
+#include "Common.h"
+#include "Conv1dApi.h"
+#include "DataLoader.h"
+#include "DataLoaderApi.h"
+#include "FlattenApi.h"
+#include "InferenceApi.h"
+#include "Layer.h"
+#include "LayerCommon.h"
+#include "LayerQuant.h"
+#include "LinearApi.h"
+#include "LossFunction.h"
+#include "NPYLoaderApi.h"
+#include "Pool1dApi.h"
+#include "Quantization.h"
+#include "QuantizationApi.h"
+#include "ReluApi.h"
+#include "SgdApi.h"
+#include "SoftmaxApi.h"
+#include "StateDictApi.h"
+#include "StorageApi.h"
+#include "Tensor.h"
+#include "TensorApi.h"
+#include "TrainingLoopApi.h"
+
+#include "npy_writer.h"
+
+#define EPOCHS 15 /* EPOCHS..MOMENTUM carry the same values as train_pytorch.py */
+#define BATCH 32
+#define LR 0.001f
+#define MOMENTUM 0.9f
+#define SEED 42 /* in this file only written into the log's config object */
+#define SHUFFLE_SEED 42 /* seed passed to the training dataLoaderInit */
+#define NUM_CLASSES_DEFAULT 6 /* used when KWS_CLASSES is unset, empty, or invalid */
+
+#define IN_CHANNELS 40 /* MFCC coefficients act as Conv1d input channels */
+#define LEN_INPUT 32 /* frames per clip */
+#define C1_OUT 32
+#define C1_K 3
+#define C2_OUT 64
+#define C2_K 3
+
+/* 2x(Conv1d+ReLU+MaxPool) + AdaptiveAvgPool + Flatten + Linear + Softmax = 10 layers */
+#define MODEL_SIZE 10
+
+static dataset_t g_trainDataset;
+static dataset_t g_valDataset;
+static dataset_t g_testDataset;
+
+static size_t g_numClasses = NUM_CLASSES_DEFAULT;
+
+static size_t readNumClasses(void) {
+    const char *env = getenv("KWS_CLASSES"); /* class-count knob; unset or empty -> default */
+    if (env == NULL || env[0] == '\0') {
+        return NUM_CLASSES_DEFAULT;
+    }
+    char *end = NULL;
+    long v = strtol(env, &end, 10);
+    if (*end == '\0' && (v == 6 || v == 35)) { /* whole string must parse: "35abc" is rejected */
+        return (size_t)v;
+    }
+    fprintf(stderr, "KWS_CLASSES must be 6 or 35 (got '%s'); using %d\n", env, NUM_CLASSES_DEFAULT);
+    return NUM_CLASSES_DEFAULT;
+}
+
+static void reshapeItemsAddBatchDim(tensorArray_t *items) {
+    for (size_t i = 0; i < items->size; ++i) {
+        tensor_t *t = items->array[i];
+        size_t oldRank = t->shape->numberOfDimensions;
+        size_t newRank = oldRank + 1;
+        /* Rewrite only the shape metadata: [d0, ...] becomes [1, d0, ...]; t->data is untouched. */
+        size_t *newDims = reserveMemory(newRank * sizeof(size_t));
+        size_t *newOrder = reserveMemory(newRank * sizeof(size_t));
+        newDims[0] = 1;
+        for (size_t d = 0; d < oldRank; ++d) {
+            newDims[d + 1] = t->shape->dimensions[d];
+        }
+        for (size_t d = 0; d < newRank; ++d) {
+            newOrder[d] = d; /* identity order 0..newRank-1 */
+        }
+        /* Release the old arrays before installing the replacements on the same shape. */
+        freeReservedMemory(t->shape->dimensions);
+        freeReservedMemory(t->shape->orderOfDimensions);
+        t->shape->dimensions = newDims;
+        t->shape->orderOfDimensions = newOrder;
+        t->shape->numberOfDimensions = newRank;
+    }
+}
+
+static tensorArray_t *buildOneHotLabels(tensorArray_t *intLabels) {
+    tensorArray_t *out = reserveMemory(sizeof(tensorArray_t));
+    tensor_t **arr = reserveMemory(intLabels->size * sizeof(tensor_t *));
+    out->array = arr;
+    out->size = intLabels->size;
+    /* One float tensor of shape [g_numClasses] per label; intLabels itself is not freed here. */
+    for (size_t i = 0; i < intLabels->size; ++i) {
+        size_t *dims = reserveMemory(1 * sizeof(size_t));
+        size_t *order = reserveMemory(1 * sizeof(size_t));
+        dims[0] = g_numClasses;
+        order[0] = 0;
+        shape_t *shape = reserveMemory(sizeof(shape_t));
+        shape->dimensions = dims;
+        shape->orderOfDimensions = order;
+        shape->numberOfDimensions = 1;
+
+        quantization_t *q = quantizationInitFloat();
+        tensor_t *t = initTensor(shape, q, NULL); /* presumably allocates t->data -- TODO confirm */
+        /* NOTE(review): a class id outside 0..g_numClasses-1 yields an all-zero row, silently. */
+        int32_t cls = ((int32_t *)intLabels->array[i]->data)[0];
+        float *data = (float *)t->data;
+        for (size_t c = 0; c < g_numClasses; ++c) {
+            data[c] = (c == (size_t)cls) ? 1.0f : 0.0f;
+        }
+        arr[i] = t;
+    }
+    return out;
+}
+
+/* Load <dataDir>/<split>_x.npy and <split>_y.npy into ds (items gain a leading batch dim,
+ * labels become one-hot). Exits with an error if a file fails to load or the counts differ.
+ * Assumes npyLoad() signals failure by returning NULL, the way npyLoadFlat() is checked in
+ * loadStateDictFromDir -- TODO confirm. */
+static void loadSplit(const char *dataDir, const char *split, dataset_t *ds) {
+    char path[300];
+    snprintf(path, sizeof(path), "%s/%s_x.npy", dataDir, split);
+    tensorArray_t *items = npyLoad(path);
+    snprintf(path, sizeof(path), "%s/%s_y.npy", dataDir, split);
+    tensorArray_t *labelsRaw = npyLoad(path);
+    if (items == NULL || labelsRaw == NULL || items->size != labelsRaw->size) {
+        fprintf(stderr, "ERROR: cannot load %s split from %s (run prepare_data.py first?)\n",
+                split, dataDir);
+        exit(EXIT_FAILURE);
+    }
+    reshapeItemsAddBatchDim(items);
+    ds->items = items;
+    ds->labels = buildOneHotLabels(labelsRaw);
+}
+
+/* Populate the three global datasets from the train/val/test .npy files under dataDir. */
+static void initDataSets(const char *dataDir) {
+    loadSplit(dataDir, "train", &g_trainDataset);
+    loadSplit(dataDir, "val", &g_valDataset);
+    loadSplit(dataDir, "test", &g_testDataset);
+}
+
+static sample_t *getTrainSample(size_t id) { /* sample callback given to the training loader */
+    return npyGetSample(&g_trainDataset, id);
+}
+static sample_t *getValSample(size_t id) { /* sample callback given to the validation loader */
+    return npyGetSample(&g_valDataset, id);
+}
+static sample_t *getTestSample(size_t id) { /* also called directly by main's prediction loop */
+    return npyGetSample(&g_testDataset, id);
+}
+static size_t getTrainSize(void) { /* size callbacks: item count of the matching split */
+    return g_trainDataset.items->size;
+}
+static size_t getValSize(void) {
+    return g_valDataset.items->size;
+}
+static size_t getTestSize(void) {
+    return g_testDataset.items->size;
+}
+
+static void buildModel(layer_t **model, layerQuant_t *lq) {
+    /* Input reshaped to [1, 40, 32]. Fills model[0..9]; every layer except Flatten takes lq. */
+    model[0] = conv1dLayerInit(
+        &(conv1dInit_t){
+            .inChannels = IN_CHANNELS, .outChannels = C1_OUT, .kernelSize = C1_K, .padding = SAME},
+        lq);
+    model[1] = reluLayerInit(lq);
+    model[2] = maxPool1dLayerInit(
+        &(maxPool1dInit_t){
+            .kernelSize = 2, .stride = 2, .inputChannels = C1_OUT, .inputLength = LEN_INPUT},
+        lq);
+    /* Second conv block: its pool is configured with the halved length LEN_INPUT / 2. */
+    model[3] = conv1dLayerInit(
+        &(conv1dInit_t){
+            .inChannels = C1_OUT, .outChannels = C2_OUT, .kernelSize = C2_K, .padding = SAME},
+        lq);
+    model[4] = reluLayerInit(lq);
+    model[5] = maxPool1dLayerInit(
+        &(maxPool1dInit_t){
+            .kernelSize = 2, .stride = 2, .inputChannels = C2_OUT, .inputLength = LEN_INPUT / 2},
+        lq);
+
+    /* Rate-agnostic head: AdaptiveAvgPool1d(1) -> Flatten -> Linear -> Softmax. */
+    model[6] = adaptiveAvgPool1dLayerInit(&(adaptiveAvgPool1dInit_t){.outputSize = 1}, lq);
+    model[7] = flattenLayerInit();
+    model[8] = /* output width follows the runtime class count (6 or 35) */
+        linearLayerInit(&(linearInit_t){.inFeatures = C2_OUT, .outFeatures = g_numClasses}, lq);
+    model[9] = softmaxLayerInit(lq);
+}
+
+/* Load the PyTorch state_dict from per-layer <name>.weight.npy / <name>.bias.npy files in
+ * weightsDir (names: conv1, conv2, fc; written by examples/kws_mfcc/train_pytorch.py), pass
+ * them to modelLoadStateDict, then free the loaded tensors.
+ * Returns 0 on success, non-zero on first missing file. */
+static int loadStateDictFromDir(layer_t **model, const char *weightsDir) {
+    char wPath[256], bPath[256];
+    const char *names[3] = {"conv1", "conv2", "fc"};
+    tensor_t *w[3] = {0};
+    tensor_t *b[3] = {0};
+
+    for (int i = 0; i < 3; i++) {
+        snprintf(wPath, sizeof(wPath), "%s/%s.weight.npy", weightsDir, names[i]);
+        snprintf(bPath, sizeof(bPath), "%s/%s.bias.npy", weightsDir, names[i]);
+        w[i] = npyLoadFlat(wPath);
+        b[i] = npyLoadFlat(bPath);
+        if (w[i] == NULL || b[i] == NULL) {
+            fprintf(stderr, "loadStateDictFromDir: missing %s or %s\n", wPath, bPath);
+            return 1;
+        }
+    }
+    /* NOTE(review): the early return above does not free tensors loaded before the failure. */
+    modelLoadStateDict(
+        model, MODEL_SIZE,
+        (stateDictEntry_t[]){
+            {.name = names[0], .weightData = (float *)w[0]->data, .biasData = (float *)b[0]->data},
+            {.name = names[1], .weightData = (float *)w[1]->data, .biasData = (float *)b[1]->data},
+            {.name = names[2], .weightData = (float *)w[2]->data, .biasData = (float *)b[2]->data},
+        },
+        3);
+
+    for (int i = 0; i < 3; i++) {
+        freeTensor(w[i]);
+        freeTensor(b[i]);
+    }
+    return 0;
+}
+
+static FILE *g_log_file = NULL;
+static int g_first_epoch = 1;
+static struct timespec g_epoch_t0;
+
+static void epochCallback(size_t epoch, float trainLoss, epochStats_t evalStats) {
+    struct timespec t1;
+    clock_gettime(CLOCK_MONOTONIC, &t1);
+    double wall_s =
+        (double)(t1.tv_sec - g_epoch_t0.tv_sec) + (double)(t1.tv_nsec - g_epoch_t0.tv_nsec) * 1e-9;
+    /* Append this epoch as one JSON object to g_log_file, comma-separated after the first. */
+    if (!g_first_epoch) {
+        fprintf(g_log_file, ",\n");
+    }
+    fprintf(g_log_file,
+            "    {\"epoch\": %zu, \"step_losses\": [], \"train_loss\": %.6f, "
+            "\"val_loss\": %.6f, \"val_acc\": %.6f, \"wall_s\": %.4f}",
+            epoch, (double)trainLoss, (double)evalStats.loss, (double)evalStats.accuracy, wall_s);
+    fflush(g_log_file);
+    g_first_epoch = 0;
+
+    fprintf(stdout, "epoch %zu: train_loss=%.4f val_loss=%.4f val_acc=%.4f wall_s=%.2f\n", epoch,
+            (double)trainLoss, (double)evalStats.loss, (double)evalStats.accuracy, wall_s);
+    fflush(stdout);
+    /* Restart the timer so the next wall_s is measured from the end of this callback. */
+    clock_gettime(CLOCK_MONOTONIC, &g_epoch_t0);
+}
+
+static int ensureDir(const char *p) {
+    /* Create directory `p` (mode rwxrwxr-x before umask); an existing path counts as success. */
+    /* Single level only: main() creates each parent directory before its child. */
+    const int rc = mkdir(p, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+    if (rc != 0 && errno != EEXIST) {
+        fprintf(stderr, "ERROR: cannot create %s: %s\n", p, strerror(errno));
+        return 1;
+    }
+    return 0;
+}
+
+int main(void) {
+    g_numClasses = readNumClasses();
+    /* Per-class-count directories, relative to the current working directory. */
+    char dataDir[256], weightsDir[256], logsDir[256], outputsDir[256];
+    snprintf(dataDir, sizeof(dataDir), "examples/kws_mfcc/data/%zuclass", g_numClasses);
+    snprintf(weightsDir, sizeof(weightsDir), "examples/kws_mfcc/weights/%zuclass", g_numClasses);
+    snprintf(logsDir, sizeof(logsDir), "examples/kws_mfcc/logs/%zuclass", g_numClasses);
+    snprintf(outputsDir, sizeof(outputsDir), "examples/kws_mfcc/outputs/%zuclass", g_numClasses);
+    /* ensureDir() creates one level only, so each parent precedes its <n>class child. */
+    if (ensureDir("examples/kws_mfcc/logs") != 0 || ensureDir(logsDir) != 0) {
+        return 1;
+    }
+    if (ensureDir("examples/kws_mfcc/outputs") != 0 || ensureDir(outputsDir) != 0) {
+        return 1;
+    }
+
+    initDataSets(dataDir);
+    /* testLoader is consumed only by the post-training evaluation in the training branch. */
+    dataLoader_t *testLoader = dataLoaderInit(getTestSample, getTestSize, 1, NULL, NULL,
+                                              /*shuffle*/ false, /*shuffleSeed*/ 0,
+                                              /*dropLast*/ true);
+
+    layerQuant_t lq;
+    layerQuantInitUniform(&lq, quantizationInitFloat());
+
+    layer_t *model[MODEL_SIZE];
+    buildModel(model, &lq);
+
+    const char *bitParity = getenv("BIT_PARITY");
+    if (bitParity != NULL && bitParity[0] != '\0') {
+        /* Bit-parity mode (any non-empty value): load PyTorch state_dict, skip training. */
+        if (loadStateDictFromDir(model, weightsDir) != 0) {
+            fprintf(stderr, "BIT_PARITY: state_dict load failed\n");
+            return 1;
+        }
+        fprintf(stdout, "BIT_PARITY: loaded state_dict from %s\n", weightsDir);
+    } else {
+        dataLoader_t *trainLoader = dataLoaderInit(getTrainSample, getTrainSize, BATCH, NULL, NULL,
+                                                   /*shuffle*/ true, /*shuffleSeed*/ SHUFFLE_SEED,
+                                                   /*dropLast*/ true);
+        dataLoader_t *valLoader = dataLoaderInit(getValSample, getValSize, 1, NULL, NULL,
+                                                 /*shuffle*/ false, /*shuffleSeed*/ 0,
+                                                 /*dropLast*/ true);
+
+        optimizer_t *sgd =
+            sgdMCreateOptim(LR, MOMENTUM, /*weightDecay*/ 0.0f, model, MODEL_SIZE, FLOAT32);
+        /* The JSON log is written incrementally: header here, one element per epoch, footer last. */
+        char logPath[300];
+        snprintf(logPath, sizeof(logPath), "%s/c.json", logsDir);
+        g_log_file = fopen(logPath, "w");
+        if (!g_log_file) {
+            fprintf(stderr, "ERROR: cannot open %s for writing: %s\n", logPath, strerror(errno));
+            return 1;
+        }
+        fprintf(g_log_file,
+                "{\n"
+                "  \"impl\": \"c\",\n"
+                "  \"example\": \"kws_mfcc\",\n"
+                "  \"config\": {\"epochs\": %d, \"batch\": %d, \"lr\": %.6f, "
+                "\"momentum\": %.6f, \"seed\": %d, \"shuffle_seed\": %d},\n"
+                "  \"epochs\": [\n",
+                EPOCHS, BATCH, (double)LR, (double)MOMENTUM, SEED, SHUFFLE_SEED);
+        fflush(g_log_file);
+        /* Start the per-epoch wall clock; epochCallback reads it and then restarts it. */
+        clock_gettime(CLOCK_MONOTONIC, &g_epoch_t0);
+
+        trainingRunResult_t result =
+            trainingRun(model, MODEL_SIZE,
+                        (lossConfig_t){.funcType = CROSS_ENTROPY,
+                                       .backwardReduction = REDUCTION_MEAN,
+                                       .classWeights = NULL},
+                        trainLoader, valLoader, sgd, EPOCHS, calculateGradsSequential,
+                        inferenceWithLoss, epochCallback);
+        (void)result;
+
+        epochStats_t testStats = evaluationEpochWithMetrics(
+            model, MODEL_SIZE, CROSS_ENTROPY, testLoader, inferenceWithLoss, REDUCTION_MEAN);
+        /* Close the "epochs" array and record the final test metrics. */
+        fprintf(g_log_file,
+                "\n  ],\n"
+                "  \"final\": {\"test_loss\": %.6f, \"test_acc\": %.6f, "
+                "\"test_auc\": null}\n"
+                "}\n",
+                (double)testStats.loss, (double)testStats.accuracy);
+        fclose(g_log_file);
+
+        fprintf(stdout, "FINAL test_loss=%.4f test_acc=%.4f\n", (double)testStats.loss,
+                (double)testStats.accuracy);
+    }
+
+    /* Predictions on test set (both modes). */
+    size_t numTest = getTestSize();
+    int32_t *predictions = malloc(numTest * sizeof(int32_t));
+    if (!predictions) {
+        fprintf(stderr, "OOM allocating predictions\n");
+        return 1;
+    }
+    /* Argmax over the softmax output; strict '>' keeps the lowest index on ties. */
+    for (size_t i = 0; i < numTest; ++i) {
+        sample_t *s = getTestSample(i);
+        tensor_t *out = inference(model, MODEL_SIZE, s->item);
+        float *probs = (float *)out->data;
+        size_t argmax = 0;
+        float best = probs[0];
+        for (size_t c = 1; c < g_numClasses; ++c) {
+            if (probs[c] > best) {
+                best = probs[c];
+                argmax = c;
+            }
+        }
+        predictions[i] = (int32_t)argmax;
+        freeTensor(out);
+        freeSample(s);
+    }
+
+    char predPath[300];
+    snprintf(predPath, sizeof(predPath), "%s/c_predictions.npy", outputsDir);
+    size_t outShape[] = {numTest};
+    int status = 0;
+    int rc = npyWriteInt32(predPath, predictions, outShape, 1);
+    if (rc != 0) {
+        fprintf(stderr, "ERROR: npyWriteInt32 failed (rc=%d)\n", rc);
+        status = 1;
+    }
+    free(predictions);
+
+    return status;
+}
diff --git a/examples/kws_mfcc/train_pytorch.py b/examples/kws_mfcc/train_pytorch.py
new file mode 100644
index 0000000..cfc016c
--- /dev/null
+++ b/examples/kws_mfcc/train_pytorch.py
@@ -0,0 +1,165 @@
+"""PyTorch reference implementation of the kws_mfcc 1D-CNN classifier.
+
+Input: MFCC [40,32] from prepare_data.py. Outputs: logs/<n>class/pytorch.json,
+outputs/<n>class/pytorch_predictions.npy and weights/<n>class/{conv1,conv2,fc}.{weight,bias}.npy
+for the C-side BIT_PARITY mode. <n> comes from the KWS_CLASSES env var (6 or 35; default 6).
+"""
+from __future__ import annotations
+
+import os
+import sys
+import time
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(REPO_ROOT))
+from examples._shared.log_schema import RunLog, dump_log  # noqa: E402
+from examples._shared.seeds import SEED, SHUFFLE_SEED  # noqa: E402
+from examples._shared.xorshift32 import shuffle_indices  # noqa: E402
+
+HERE = Path(__file__).resolve().parent
+NUM_CLASSES = int(os.environ.get("KWS_CLASSES", "6"))
+assert NUM_CLASSES in (6, 35), NUM_CLASSES
+TAG = f"{NUM_CLASSES}class"
+DATA = HERE / "data" / TAG
+LOGS = HERE / "logs" / TAG
+OUTPUTS = HERE / "outputs" / TAG
+WEIGHTS = HERE / "weights" / TAG
+
+EPOCHS = 15
+BATCH = 32
+LR = 0.001
+MOMENTUM = 0.9
+
+
+class KwsDataset(torch.utils.data.Dataset):  # map-style dataset; all samples held as in-memory tensors
+    def __init__(self, x: np.ndarray, y: np.ndarray) -> None:
+        self.x = torch.from_numpy(x.astype(np.float32))  # features, cast to float32
+        self.y = torch.from_numpy(y.astype(np.int64))  # labels, cast to int64
+
+    def __len__(self) -> int:
+        return self.x.shape[0]  # sample count = size of the leading axis of x
+
+    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
+        return self.x[idx], self.y[idx]  # (features, label) pair at position idx
+
+
+class XorShift32Sampler(torch.utils.data.Sampler[int]):
+    """Single-shot shuffle, no per-epoch reshuffle, matching framework DataLoader.c."""
+    def __init__(self, n: int, seed: int) -> None:
+        self.indices = shuffle_indices(n, seed)  # index order computed once here, never recomputed
+
+    def __iter__(self):
+        return iter(self.indices)  # each pass (epoch) replays the same stored order
+
+    def __len__(self) -> int:
+        return len(self.indices)  # number of stored indices
+
+
+class KwsMfccCnn(nn.Module):  # conv -> pool -> conv -> pool -> global average pool -> linear head
+    def __init__(self, num_classes: int) -> None:
+        super().__init__()
+        self.conv1 = nn.Conv1d(40, 32, kernel_size=3, padding=1)  # SAME (K odd, stride 1)
+        self.conv2 = nn.Conv1d(32, 64, kernel_size=3, padding=1)  # same padding scheme as conv1
+        self.fc = nn.Linear(64, num_classes)  # 64 inputs = conv2 channels after global pooling
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: [B,40,32] -> logits [B,num_classes]
+        x = F.relu(self.conv1(x))              # [B,32,32]
+        x = F.max_pool1d(x, 2)                 # [B,32,16]
+        x = F.relu(self.conv2(x))              # [B,64,16]
+        x = F.max_pool1d(x, 2)                 # [B,64,8]
+        x = F.adaptive_avg_pool1d(x, 1)        # [B,64,1]
+        x = x.flatten(start_dim=1)             # [B,64]
+        return self.fc(x)                      # [B,num_classes] raw logits, no softmax applied
+
+
+def evaluate(model: nn.Module, x: np.ndarray, y: np.ndarray, batch: int) -> tuple[float, float]:
+    model.eval()  # left in eval mode on return; callers must call model.train() before training again
+    total_loss, total_correct, total = 0.0, 0, 0
+    with torch.no_grad():  # evaluation only, no gradients recorded
+        for i in range(0, len(x), batch):  # consecutive slices in array order; the last may be shorter
+            xb = torch.from_numpy(x[i : i + batch].astype(np.float32))
+            yb = torch.from_numpy(y[i : i + batch].astype(np.int64))
+            logits = model(xb)
+            loss = F.cross_entropy(logits, yb, reduction="sum")  # summed so the final mean is per sample
+            total_loss += loss.item()
+            total_correct += (logits.argmax(dim=1) == yb).sum().item()
+            total += yb.shape[0]
+    return total_loss / total, total_correct / total  # (mean loss, accuracy); ZeroDivisionError if x is empty
+
+
+def main() -> None:  # train, write the JSON log, then export test predictions and per-layer weights
+    torch.manual_seed(SEED)
+    np.random.seed(SEED)
+    torch.use_deterministic_algorithms(True, warn_only=True)  # non-deterministic ops warn instead of raising
+
+    train_x = np.load(DATA / "train_x.npy")
+    train_y = np.load(DATA / "train_y.npy")
+    val_x = np.load(DATA / "val_x.npy")
+    val_y = np.load(DATA / "val_y.npy")
+    test_x = np.load(DATA / "test_x.npy")
+    test_y = np.load(DATA / "test_y.npy")
+
+    train_ds = KwsDataset(train_x, train_y)
+    sampler = XorShift32Sampler(len(train_ds), SHUFFLE_SEED)  # fixed order, reused every epoch
+    loader = torch.utils.data.DataLoader(train_ds, batch_size=BATCH, sampler=sampler, drop_last=True)
+
+    model = KwsMfccCnn(NUM_CLASSES)
+    optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=MOMENTUM)
+
+    epoch_records = []  # one dict per epoch, stored under "epochs" in the log
+    for epoch in range(EPOCHS):
+        t0 = time.time()
+        model.train()  # evaluate() below leaves the model in eval mode
+        step_losses: list[float] = []
+        for xb, yb in loader:
+            optimizer.zero_grad()
+            loss = F.cross_entropy(model(xb), yb)
+            loss.backward()
+            optimizer.step()
+            step_losses.append(loss.item())
+        train_loss = float(np.mean(step_losses)) if step_losses else 0.0  # 0.0 if the loader yielded no batch
+        val_loss, val_acc = evaluate(model, val_x, val_y, BATCH)
+        epoch_records.append({
+            "epoch": epoch, "step_losses": step_losses, "train_loss": train_loss,
+            "val_loss": val_loss, "val_acc": val_acc, "wall_s": time.time() - t0,  # wall_s spans training + validation
+        })
+        print(f"epoch {epoch:2d}: train_loss={train_loss:.4f} val_loss={val_loss:.4f} val_acc={val_acc:.4f}", flush=True)
+
+    test_loss, test_acc = evaluate(model, test_x, test_y, BATCH)
+    log: RunLog = {
+        "impl": "pytorch", "example": "kws_mfcc",
+        "config": {"epochs": EPOCHS, "batch": BATCH, "lr": LR, "momentum": MOMENTUM,
+                   "seed": SEED, "shuffle_seed": SHUFFLE_SEED},
+        "epochs": epoch_records,  # type: ignore[typeddict-item]
+        "final": {"test_loss": test_loss, "test_acc": test_acc, "test_auc": None},
+    }
+    LOGS.mkdir(parents=True, exist_ok=True)
+    OUTPUTS.mkdir(parents=True, exist_ok=True)
+    dump_log(LOGS / "pytorch.json", log)
+
+    model.eval()
+    with torch.no_grad():  # the whole test set goes through the model in one forward pass
+        preds = model(torch.from_numpy(test_x.astype(np.float32))).argmax(dim=1).numpy().astype(np.int32)
+    np.save(OUTPUTS / "pytorch_predictions.npy", preds)  # int32 class indices, one per test sample
+    print(f"FINAL test_loss={test_loss:.4f} test_acc={test_acc:.4f}", flush=True)
+
+    WEIGHTS.mkdir(parents=True, exist_ok=True)
+    layer_map = {"conv1": model.conv1, "conv2": model.conv2, "fc": model.fc}  # file-name prefix -> layer
+    print("Saving per-layer weights:", flush=True)
+    for name, layer in layer_map.items():
+        w = layer.weight.detach().cpu().numpy().astype(np.float32)
+        np.save(WEIGHTS / f"{name}.weight.npy", w)
+        if layer.bias is not None:
+            b = layer.bias.detach().cpu().numpy().astype(np.float32)
+            np.save(WEIGHTS / f"{name}.bias.npy", b)
+        print(f"  wrote {name}.weight.npy shape={w.shape}", flush=True)  # bias file is written but not reported
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index f0602cb..69ae28f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,6 +7,7 @@ dependencies = [
     "elasticai-creator @ git+https://github.com/es-ude/elastic-ai.creator.git@training-implementation-provider",
     "matplotlib>=3.10.9",
     "torch>=2.11.0",
+    "torchaudio>=2.11.0",
     "torchvision>=0.26.0",
 ]
 
diff --git a/uv.lock b/uv.lock
index b1e2ff6..31444ed 100644
--- a/uv.lock
+++ b/uv.lock
@@ -731,6 +731,7 @@ dependencies = [
     { name = "elasticai-creator" },
     { name = "matplotlib" },
     { name = "torch" },
+    { name = "torchaudio" },
     { name = "torchvision" },
 ]
 
@@ -744,6 +745,7 @@ requires-dist = [
     { name = "elasticai-creator", git = "https://github.com/es-ude/elastic-ai.creator.git?rev=training-implementation-provider" },
     { name = "matplotlib", specifier = ">=3.10.9" },
     { name = "torch", specifier = ">=2.11.0" },
+    { name = "torchaudio", specifier = ">=2.11.0" },
     { name = "torchvision", specifier = ">=0.26.0" },
 ]
 
@@ -965,6 +967,33 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/cf/bf/c8d12a2c86dbfd7f40fb2f56fbf5a505ccf2d9ce131eb559dfc7c51e1a04/torch-2.11.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b2a43985ff5ef6ddd923bbcf99943e5f58059805787c5c9a2622bf05ca2965b0", size = 114792991, upload-time = "2026-03-23T18:08:19.216Z" },
 ]
 
+[[package]]
+name = "torchaudio"
+version = "2.11.0"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f1/b1/77658817acacd01a72b714440c62f419efc4d90170e704e8e7a2c0918988/torchaudio-2.11.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a1cf1acc883bee9cb906a933572fed6a8a933f86ef34e9ea7d803f72317e8c1b", size = 684226, upload-time = "2026-03-23T18:13:40.023Z" },
+    { url = "https://files.pythonhosted.org/packages/78/28/c7adc053039f286c2aca0038b766cbe3294e66fec6b29a820e95128f9ede/torchaudio-2.11.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:bc653defca1c16154398517a1adc98d0fb7f1dd08e58ced217558d213c2c6e29", size = 1626670, upload-time = "2026-03-23T18:13:42.162Z" },
+    { url = "https://files.pythonhosted.org/packages/88/d8/d6d0f896e064aa67377484efef4911cdcc07bce2929474e1417cc0af18c2/torchaudio-2.11.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6503c0bdb29daf2e6281bb70ea2dfe2c3553b782b619eb5d73bdadd8a3f7cecf", size = 1771992, upload-time = "2026-03-23T18:13:33.188Z" },
+    { url = "https://files.pythonhosted.org/packages/23/a8/941277ecc39f7a0a169d554302a1f1afd87c1d94a8aec828891916cea59a/torchaudio-2.11.0-cp312-cp312-win_amd64.whl", hash = "sha256:478110f981e5d40a8d82221732c57a56c85a1d5895fb8fe646e86ee15eded3bd", size = 328663, upload-time = "2026-03-23T18:13:19.218Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/9e/f76fcd9877c8c78f258ee34e0fb8291fdb91e6218d582d9ca66b1e4bd4ae/torchaudio-2.11.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:e3f9696a9ef1d49acc452159b052370c636406d072e9d8f10895fda87b591ea9", size = 679904, upload-time = "2026-03-23T18:13:28.329Z" },
+    { url = "https://files.pythonhosted.org/packages/85/70/249c1498ebdad3e7752866635ec0855fc0dcf898beccda5a9d2b9df8e4d0/torchaudio-2.11.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:b034d7672f1c415434f48ef17807f2cce47f29e8795338c751d4e596c9fbe8b5", size = 1618523, upload-time = "2026-03-23T18:13:15.703Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/98/be13fe35d9aa5c26381c0e453c828a789d15c007f8f7d08c95341d19974d/torchaudio-2.11.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:1c1101c1243ef0e4063ec63298977e2d3655c15cf88d9eb0a1bd4fe2db9f47ea", size = 1771992, upload-time = "2026-03-23T18:13:35.343Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/8b/2bbb3dca6ff28cba0de250874d5ef4fc2822c47a934b59b3974cff3219ef/torchaudio-2.11.0-cp313-cp313-win_amd64.whl", hash = "sha256:986f4df5ed17b003dc52489468601720090e65f964f8bebccf90eb45bba75744", size = 328662, upload-time = "2026-03-23T18:13:18.308Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/ce/52c652d30af7d6e96c8f1735d26131e94708e3f38d852b8fa97958804dd8/torchaudio-2.11.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:bda09ea630ae7207384fb0f28c35e4f8c0d82dd6eba020b6b335ad0caa9fed49", size = 680814, upload-time = "2026-03-23T18:13:17.08Z" },
+    { url = "https://files.pythonhosted.org/packages/06/95/1ad1507482e7263e556709a3f5f87fecd375a0742cdaf238806c8e72eaad/torchaudio-2.11.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:9fe3083c62e035646483a14e180d33561bdc2eed436c9ab1259c137fb7120b4a", size = 1618546, upload-time = "2026-03-23T18:13:29.686Z" },
+    { url = "https://files.pythonhosted.org/packages/98/4c/480328ba07487eb9890406720304d0d460dd7a6a64098614f5aa53b662ca/torchaudio-2.11.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:13cff988697ccbad539987599f9dc672f40c417bed67570b365e4e5002bbd096", size = 1771991, upload-time = "2026-03-23T18:13:30.843Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/98/5d4790e2d6548768999acd34999d5aeefce8bcc23a07afaa5f03e723f557/torchaudio-2.11.0-cp313-cp313t-win_amd64.whl", hash = "sha256:ed404c4399ad7f172c86a47c1b25293d322d1d58e26b10b0456a86cf67d37d84", size = 328661, upload-time = "2026-03-23T18:13:34.359Z" },
+    { url = "https://files.pythonhosted.org/packages/39/fe/ffa618b4f0d9732d7df7a2fa2bd48657d896599bc224e5af3c70d46c546b/torchaudio-2.11.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:cc09cd1f6015b8549e7fe255fb1be5346b57e7fee06541d3f3dbb012d8c4715f", size = 679901, upload-time = "2026-03-23T18:13:25.472Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/54/f414d7b92dd0b3094a2409c95a97bd6c49aa0620da722a0e55462f9bd9cb/torchaudio-2.11.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:79fb3cb99169fd41bd9719647261402a164da0d105a4d81f42a3260844ec5e79", size = 1618527, upload-time = "2026-03-23T18:13:26.68Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/a8/bf2e1f6ce24c990192400ae49b4acc1a0d0295b6c6a06bceecdc46ce08de/torchaudio-2.11.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:00e9f71ab9c656f0abdb40c515bd65d4658ab0ad380dee27a2efd7d51dabd3d6", size = 1771995, upload-time = "2026-03-23T18:13:23.373Z" },
+    { url = "https://files.pythonhosted.org/packages/83/6f/b0efb44e0bfe8dd4d78d76ae3be280354e1fb5c8631c782785d74cd8a7b1/torchaudio-2.11.0-cp314-cp314-win_amd64.whl", hash = "sha256:1424638adb8bb40087bc7b6eb103e8e4fe398210f09076f33b7b5e61501b5d66", size = 328662, upload-time = "2026-03-23T18:13:32.243Z" },
+    { url = "https://files.pythonhosted.org/packages/60/84/1c792b0b700eac9a96772cfd9f96c097b17bca3234a2fde3c64b8063660d/torchaudio-2.11.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:da2725e250866da42a12934c9a6552f65a18b7187fd7a6221387f0e605fb3b96", size = 679926, upload-time = "2026-03-23T18:13:24.452Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/a0/62a5842062f739239691f2e57523e0570dd06704ad987755f7644a3afa23/torchaudio-2.11.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:1be3767064364ae82705bdf2b15c1e8b41fea82c4cd04d47428a8684b634b6ed", size = 1618552, upload-time = "2026-03-23T18:13:21.09Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/89/c293d818f9f899db93bf291b42401c05ae29acfb2e53d5341c30ea703e62/torchaudio-2.11.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:67f6edac29ed004652c11db5c19d9debb5d835695930574f564efc8bdd061bba", size = 1771986, upload-time = "2026-03-23T18:13:22.153Z" },
+    { url = "https://files.pythonhosted.org/packages/93/f7/ee5da8c03f1a3c7662c6c6a119f24a4b3e646da94be56dce3201e3a6ee9b/torchaudio-2.11.0-cp314-cp314t-win_amd64.whl", hash = "sha256:88fb5e29f670a33d9bac6aabb1d2734460cf6e461bde5cdc352826035851b16d", size = 328661, upload-time = "2026-03-23T18:13:20.1Z" },
+]
+
 [[package]]
 name = "torchvision"
 version = "0.26.0"