diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 11eaad9..78a2d1c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -207,6 +207,20 @@ jobs:
       - name: Train PyTorch kws_mfcc (produces reference predictions + weights)
         run: uv run examples/kws_mfcc/train_pytorch.py
 
+      - name: Cache kws_raw processed data (6-class)
+        id: kws-raw-cache
+        uses: actions/cache@v4
+        with:
+          path: examples/kws_raw/data/6class
+          key: kws-raw-6class-${{ hashFiles('examples/kws_raw/prepare_data.py', 'examples/_shared/speechcommands_data.py') }}
+
+      - name: Prepare kws_raw data (6-class; only on cache miss)
+        if: steps.kws-raw-cache.outputs.cache-hit != 'true'
+        run: uv run examples/kws_raw/prepare_data.py
+
+      - name: Train PyTorch kws_raw (produces reference predictions + weights)
+        run: uv run examples/kws_raw/train_pytorch.py
+
       - name: Configure
         run: cmake --preset examples
 
@@ -268,6 +282,16 @@ jobs:
             --c examples/kws_mfcc/outputs/6class/c_predictions.npy \
             --dtype int32
 
+      - name: Run kws_raw in BIT_PARITY mode
+        run: BIT_PARITY=1 build/examples/examples/kws_raw/train_c_kws_raw
+
+      - name: Diff kws_raw predictions (int32, exact match required)
+        run: |
+          uv run examples/_shared/compare_predictions.py \
+            --pytorch examples/kws_raw/outputs/6class/pytorch_predictions.npy \
+            --c examples/kws_raw/outputs/6class/c_predictions.npy \
+            --dtype int32
+
   python-test:
     runs-on: ubuntu-latest
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index d34dd41..abc3fe6 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -4,3 +4,4 @@ add_subdirectory(ecg_anomaly_ae)
 add_subdirectory(mnist_mlp)
 add_subdirectory(mnist_cnn)
 add_subdirectory(kws_mfcc)
+add_subdirectory(kws_raw)
diff --git a/examples/README.md b/examples/README.md
index 5a796b9..143d406 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -12,7 +12,8 @@ checking and visualizations.
 | `mnist_cnn/` | MNIST 1D-CNN digit classification | ✅ |
 | `har_classifier/` | UCI HAR 6-class activity classification | Stage 1 |
 | `ecg_anomaly_ae/` | ECG5000 reconstruction-based anomaly detection | Stage 2 ✅ |
-| `kws_classifier/` | SpeechCommands 6-class keyword spotting | Stage 3 (planned) |
+| `kws_mfcc/` | SpeechCommands keyword spotting (MFCC features) | Stage 3 ✅ |
+| `kws_raw/` | SpeechCommands keyword spotting (raw waveform + in-model downsample) | Stage 3 ✅ |
 | `kws_denoising_ae/` | SpeechCommands additive-noise denoising | Stage 4 (planned) |
 
 ## Running an example
diff --git a/examples/kws_raw/CMakeLists.txt b/examples/kws_raw/CMakeLists.txt
new file mode 100644
index 0000000..a403373
--- /dev/null
+++ b/examples/kws_raw/CMakeLists.txt
@@ -0,0 +1,68 @@
+add_executable(train_c_kws_raw train_c.c)
+# Single binary for both kws_raw modes (BIT_PARITY inference and train-from-scratch).
+target_link_libraries(train_c_kws_raw PRIVATE
+        DataLoaderApi
+        DataLoader
+        NPYLoaderApi
+        NPYLoader
+
+        Layer
+
+        Conv1dApi
+        Conv1d
+
+        LinearApi
+        Linear
+
+        ReluApi
+        Relu
+
+        FlattenApi
+        Flatten
+
+        Pool1dApi
+        MaxPool1d
+        AvgPool1d
+
+        AdaptivePool1dApi
+        AdaptiveAvgPool1d
+
+        LayerNormApi
+        LayerNorm
+
+        QuantizationApi
+        Quantization
+
+        TensorApi
+        Tensor
+        Rounding
+
+        TrainingLoopApi
+        CalculateGradsSequential
+        TrainingBatchDefault
+        TrainingEpochDefault
+        Optimizer
+
+        LossFunction
+        CrossEntropy
+
+        SoftmaxApi
+        Softmax
+
+        Sgd
+        SgdApi
+
+        InferenceApi
+
+        StateDictApi
+        LayerWeightsApi
+        LayerQuant
+        LayerCommon
+        Distributions
+
+        Common
+        StorageApi
+        RNG
+
+        examples_shared
+)
diff --git a/examples/kws_raw/README.md b/examples/kws_raw/README.md
new file mode 100644
index 0000000..9ab716a
--- /dev/null
+++ b/examples/kws_raw/README.md
@@ -0,0 +1,61 @@
+# KWS Raw Waveform — PyTorch + C Parity Demo
+
+Trains a 1D-CNN keyword-spotter on **raw 16 kHz SpeechCommands waveforms** in both
+PyTorch (reference) and the ODT C framework. Companion to `kws_mfcc/`: same data
+and harness, but instead of pre-computing MFCC features, the model consumes the
+native `[1, 16000]` waveform and **downsamples in-framework** — its first layer is
+`AvgPool1d(K=16, S=16)`, a decimation-by-16 box filter that turns 16 kHz into
+1 kHz. Change `K` to change the effective rate (e.g. `K=8` → 2 kHz) with no re-prep;
+the `AdaptiveAvgPool1d(1)` head is length-agnostic, so the rest of the model is
+unchanged. In `train_c.c` only `DS_K` and `LEN_DS` need editing: the three MaxPool
+nominal `inputLength`s are derived from `LEN_DS` and so track the new lengths.
+
+One binary, two modes — **bit-parity** (`BIT_PARITY=1`, the exact CI gate) and a
+**train-from-scratch** informational demo. See `kws_mfcc/README.md` for the mode
+explanation and the `KWS_CLASSES` knob; commands are identical with `kws_raw`
+substituted.
+
+## Why LayerNorm + a longer schedule
+
+Raw waveforms are far harder to train than MFCC features: at the `kws_mfcc`
+settings (lr=0.001, 15 epochs) the raw model never escapes its random-init
+fixed point (flat loss, every clip predicted as one class), which would make the
+bit-parity gate degenerate. Two changes fix it without leaving the framework's
+bit-parity-covered layers:
+
+- a rate-agnostic **`LayerNorm(64)`** on the pooled features before the classifier
+  (the C framework has bit-parity LayerNorm; BatchNorm is not covered), and
+- **lr=0.005, 20 epochs** (the model breaks through around epoch 15).
+
+The reference then reaches ~0.59 test accuracy with predictions spread across all
+six classes, so the gate genuinely exercises the `AvgPool1d[1,16000]` + Conv +
+LayerNorm arithmetic (C reproduces PyTorch's predictions int32-exactly).
+
+## Run it (6-class)
+
+```bash
+uv run python examples/kws_raw/prepare_data.py
+uv run python examples/kws_raw/train_pytorch.py
+cmake --preset examples
+cmake --build --preset examples --target train_c_kws_raw
+
+BIT_PARITY=1 ./build/examples/examples/kws_raw/train_c_kws_raw
+uv run python examples/_shared/compare_predictions.py \
+  --pytorch examples/kws_raw/outputs/6class/pytorch_predictions.npy \
+  --c examples/kws_raw/outputs/6class/c_predictions.npy --dtype int32
+```
+
+## Model
+
+- Input: `[1, 16000]` → `reshapeItemsAddBatchDim` → `[1, 1, 16000]`
+- `AvgPool1d(16) → Conv1d(1→16,K3,SAME) → ReLU → MaxPool(4) → Conv1d(16→32,K3,SAME)
+  → ReLU → MaxPool(4) → Conv1d(32→64,K3,SAME) → ReLU → MaxPool(4) →
+  AdaptiveAvgPool1d(1) → Flatten → LayerNorm(64) → Linear(64→C) → Softmax → CE`
+- Lengths: 16000 → 1000 → 250 → 62 → 15 → 1; ~8.4 K params (6-class; ~10 K with 35 classes)
+- State-dict layers: `conv1`, `conv2`, `conv3`, `ln`, `fc`
+- Hyperparameters: SGD lr=0.005, momentum=0.9, batch=32, 20 epochs
+
+The train-from-scratch demo is the slowest in the suite (raw `[1,16000]` is the
+heaviest input even after the AvgPool downsample) — run it offline. Bit-parity
+mode requires exact equality; the train-from-scratch tolerances are informational
+and match `kws_mfcc/`.
diff --git a/examples/kws_raw/compare.py b/examples/kws_raw/compare.py
new file mode 100644
index 0000000..2247d6f
--- /dev/null
+++ b/examples/kws_raw/compare.py
@@ -0,0 +1,88 @@
+"""Compare PyTorch and C runs of the kws_raw classifier.
+
+Reads logs/<n>class/{pytorch,c}.json and outputs/<n>class/{pytorch,c}_predictions.npy.
+Writes plots into plots/<n>class/. Prints a final-state parity report within tolerances.
+INFORMATIONAL only — the bit-parity check (compare_predictions.py) is the gate.
+"""
+from __future__ import annotations
+
+import os
+import sys
+from pathlib import Path
+
+import numpy as np
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(REPO_ROOT))
+
+from examples._shared.log_schema import load_log  # noqa: E402
+from examples._shared.parity import ParityCheck, run_parity_checks  # noqa: E402
+from examples._shared.plotting import (  # noqa: E402
+    plot_accuracy_curves,
+    plot_confusion_matrix,
+    plot_loss_curves,
+)
+
+HERE = Path(__file__).resolve().parent
+NUM_CLASSES = int(os.environ.get("KWS_CLASSES", "6"))
+assert NUM_CLASSES in (6, 35), NUM_CLASSES
+TAG = f"{NUM_CLASSES}class"
+LOGS = HERE / "logs" / TAG
+OUTPUTS = HERE / "outputs" / TAG
+PLOTS = HERE / "plots" / TAG
+DATA = HERE / "data" / TAG
+
+CLASS_NAMES = (
+    ["yes", "no", "up", "down", "silence", "unknown"]
+    if NUM_CLASSES == 6
+    else [str(i) for i in range(NUM_CLASSES)]
+)
+
+CHECKS = [
+    ParityCheck("test_acc", abs_tol=0.025),   # ±2.5 pp
+    ParityCheck("test_loss", abs_tol=0.15),   # ±0.15 nats (informational)
+]
+
+
+def confusion_matrix(preds: np.ndarray, labels: np.ndarray, num_classes: int) -> np.ndarray:
+    cm = np.zeros((num_classes, num_classes), dtype=np.int64)
+    for p, a in zip(preds, labels):
+        cm[int(p), int(a)] += 1
+    return cm
+
+
+def main() -> int:
+    PLOTS.mkdir(parents=True, exist_ok=True)
+    pt = load_log(LOGS / "pytorch.json")
+    c = load_log(LOGS / "c.json")
+
+    plot_loss_curves(PLOTS / "loss_curves.png", pt, c)
+    plot_accuracy_curves(PLOTS / "accuracy_curves.png", pt, c)
+
+    test_y = np.load(DATA / "test_y.npy")
+    pt_pred = np.load(OUTPUTS / "pytorch_predictions.npy")
+    c_pred = np.load(OUTPUTS / "c_predictions.npy")
+    cm_pt = confusion_matrix(pt_pred, test_y, len(CLASS_NAMES))
+    cm_c = confusion_matrix(c_pred, test_y, len(CLASS_NAMES))
+    plot_confusion_matrix(PLOTS / "confusion_matrix_pt.png", cm_pt, CLASS_NAMES, "PyTorch KWS Raw")
+    plot_confusion_matrix(PLOTS / "confusion_matrix_c.png", cm_c, CLASS_NAMES, "C KWS Raw")
+
+    pt_finals = pt["final"]
+    c_finals = c["final"]
+    overall_pass, results = run_parity_checks(
+        CHECKS,
+        {"test_acc": pt_finals["test_acc"], "test_loss": pt_finals["test_loss"]},
+        {"test_acc": c_finals["test_acc"], "test_loss": c_finals["test_loss"]},
+    )
+
+    print("\nParity report (PyTorch vs C) — INFORMATIONAL:")
+    print(f"{'metric':<14} {'pt':>10} {'c':>10} {'diff':>10} {'tol':>8} {'type':>5} {'pass':>6}")
+    for r in results:
+        print(f"{r.metric:<14} {r.pt_value:>10.5f} {r.c_value:>10.5f} {r.diff:>10.5f} "
+              f"{r.tolerance:>8.4f} {r.tolerance_type:>5} {str(r.passed):>6}")
+    print(f"\nOverall: {'PASS' if overall_pass else 'FAIL'} (informational; not a CI gate)")
+    return 0 if overall_pass else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/examples/kws_raw/prepare_data.py b/examples/kws_raw/prepare_data.py
new file mode 100644
index 0000000..45ed74c
--- /dev/null
+++ b/examples/kws_raw/prepare_data.py
@@ -0,0 +1,42 @@
+"""Prepare raw SpeechCommands waveforms for the kws_raw example.
+
+Writes the native 16 kHz waveform directly — no resampling, no feature
+extraction. Downsampling (16 kHz → 1 kHz via AvgPool1d) is the model's first
+layer, so PyTorch and C read identical raw .npy.
+
+Output (under examples/kws_raw/data/<n>class/, n = KWS_CLASSES in {6,35}, default 6):
+  {train,val,test}_x.npy  [N,1,16000] f32
+  {train,val,test}_y.npy  [N] i32  (0..n-1)
+"""
+from __future__ import annotations
+
+import os
+import sys
+from pathlib import Path
+
+import numpy as np
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(REPO_ROOT))
+from examples._shared.speechcommands_data import load_speechcommands  # noqa: E402
+
+HERE = Path(__file__).resolve().parent
+RAW_ROOT = REPO_ROOT / "examples" / "_shared" / "data" / "speech_commands"
+
+
+def main() -> None:
+    num_classes = int(os.environ.get("KWS_CLASSES", "6"))
+    assert num_classes in (6, 35), num_classes
+    data_dir = HERE / "data" / f"{num_classes}class"
+    data_dir.mkdir(parents=True, exist_ok=True)
+
+    splits = load_speechcommands(RAW_ROOT, num_classes)
+    for split in ("train", "val", "test"):
+        x, y = splits[split]
+        np.save(data_dir / f"{split}_x.npy", x.astype(np.float32))
+        np.save(data_dir / f"{split}_y.npy", y.astype(np.int32))
+        print(f"{split}: x={x.shape} y={y.shape} classes={num_classes}", flush=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/kws_raw/train_c.c b/examples/kws_raw/train_c.c
new file mode 100644
index 0000000..0bddb70
--- /dev/null
+++ b/examples/kws_raw/train_c.c
@@ -0,0 +1,436 @@
+#define SOURCE_FILE "kws_raw_train_c"
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <time.h>
+
+#include "AdaptivePool1dApi.h"
+#include "CalculateGradsSequential.h"
+#include "Common.h"
+#include "Conv1dApi.h"
+#include "DataLoader.h"
+#include "DataLoaderApi.h"
+#include "FlattenApi.h"
+#include "InferenceApi.h"
+#include "Layer.h"
+#include "LayerCommon.h"
+#include "LayerNormApi.h"
+#include "LayerQuant.h"
+#include "LinearApi.h"
+#include "LossFunction.h"
+#include "NPYLoaderApi.h"
+#include "Pool1dApi.h"
+#include "Quantization.h"
+#include "QuantizationApi.h"
+#include "ReluApi.h"
+#include "SgdApi.h"
+#include "SoftmaxApi.h"
+#include "StateDictApi.h"
+#include "StorageApi.h"
+#include "Tensor.h"
+#include "TensorApi.h"
+#include "TrainingLoopApi.h"
+
+#include "npy_writer.h"
+
+#define EPOCHS 20             /* EPOCHS..MOMENTUM mirror the constants in train_pytorch.py */
+#define BATCH 32
+#define LR 0.005f
+#define MOMENTUM 0.9f
+#define SEED 42               /* only written to the c.json config in this file */
+#define SHUFFLE_SEED 42       /* seeds the training DataLoader shuffle; also logged */
+#define NUM_CLASSES_DEFAULT 6 /* used when KWS_CLASSES is unset, empty or invalid */
+
+#define IN_CHANNELS 1
+#define LEN_INPUT 16000
+#define DS_K 16     /* front AvgPool downsample: 16 kHz -> 1 kHz */
+#define LEN_DS 1000 /* LEN_INPUT / DS_K */
+#define C1_OUT 16
+#define C1_K 3
+#define C2_OUT 32
+#define C2_K 3
+#define C3_OUT 64
+#define C3_K 3
+
+/* AvgPool(ds) + 3x(Conv1d+ReLU+MaxPool) + AdaptiveAvgPool + Flatten + LayerNorm + Linear + Softmax
+ * = 15 layers */
+#define MODEL_SIZE 15
+
+static dataset_t g_trainDataset;
+static dataset_t g_valDataset;
+static dataset_t g_testDataset;
+
+static size_t g_numClasses = NUM_CLASSES_DEFAULT;
+
+/* KWS_CLASSES -> 6 or 35. Unset/empty -> default silently; anything else -> warn + default. */
+static size_t readNumClasses(void) {
+    const char *env = getenv("KWS_CLASSES");
+    char *end = NULL; /* stays NULL when there is nothing to parse */
+    long v = (env != NULL && env[0] != '\0') ? strtol(env, &end, 10) : NUM_CLASSES_DEFAULT;
+    /* Require the whole value (bar trailing blanks) to be numeric: "6abc" must not pass as 6. */
+    if (end != NULL && (end[strspn(end, " \t\r\n")] != '\0' || (v != 6 && v != 35))) {
+        fprintf(stderr, "KWS_CLASSES must be 6 or 35 (got '%s'); using %d\n", env,
+                NUM_CLASSES_DEFAULT);
+        v = NUM_CLASSES_DEFAULT;
+    }
+    return (size_t)v;
+}
+
+/* Prepend a batch dimension of 1 to every tensor's shape (e.g. [1, 16000] -> [1, 1, 16000]).
+ * Only the shape arrays are swapped out; the tensors' data pointers are left alone. */
+static void reshapeItemsAddBatchDim(tensorArray_t *items) {
+    for (size_t i = 0; i < items->size; ++i) {
+        tensor_t *tensor = items->array[i];
+        const size_t rank = tensor->shape->numberOfDimensions + 1;
+        size_t *dims = reserveMemory(rank * sizeof(size_t));
+        size_t *order = reserveMemory(rank * sizeof(size_t));
+        /* Leading 1 followed by the old extents; the dimension order is the identity. */
+        dims[0] = 1;
+        order[0] = 0;
+        for (size_t d = 1; d < rank; ++d) {
+            dims[d] = tensor->shape->dimensions[d - 1];
+            order[d] = d;
+        }
+
+        freeReservedMemory(tensor->shape->dimensions);
+        freeReservedMemory(tensor->shape->orderOfDimensions);
+        tensor->shape->dimensions = dims;
+        tensor->shape->orderOfDimensions = order;
+        tensor->shape->numberOfDimensions = rank;
+    }
+}
+
+/* Expand integer class labels into float one-hot tensors of length g_numClasses. A label
+ * outside [0, g_numClasses) still yields an all-zero target, but is now reported on stderr. */
+static tensorArray_t *buildOneHotLabels(tensorArray_t *intLabels) {
+    tensorArray_t *out = reserveMemory(sizeof(tensorArray_t));
+    out->array = reserveMemory(intLabels->size * sizeof(tensor_t *));
+    out->size = intLabels->size;
+    for (size_t i = 0; i < intLabels->size; ++i) {
+        size_t *dims = reserveMemory(sizeof(size_t));
+        size_t *order = reserveMemory(sizeof(size_t));
+        shape_t *shape = reserveMemory(sizeof(shape_t));
+        dims[0] = g_numClasses;
+        order[0] = 0;
+        shape->dimensions = dims;
+        shape->orderOfDimensions = order;
+        shape->numberOfDimensions = 1;
+        tensor_t *t = initTensor(shape, quantizationInitFloat(), NULL);
+        int32_t cls = ((int32_t *)intLabels->array[i]->data)[0]; /* element 0 read as int32 */
+        if (cls < 0 || (size_t)cls >= g_numClasses) {
+            fprintf(stderr, "buildOneHotLabels: label %d at index %zu out of range\n", (int)cls, i);
+        }
+        float *data = (float *)t->data;
+        for (size_t c = 0; c < g_numClasses; ++c) {
+            data[c] = (c == (size_t)cls) ? 1.0f : 0.0f;
+        }
+        out->array[i] = t;
+    }
+    return out;
+}
+
+/* Load <dataDir>/<split>_{x,y}.npy into *dst: items gain a leading batch dim, labels become
+ * one-hot. Exits with a message if npyLoad returns NULL for either file or the counts differ
+ * (previously the load results were used unchecked). */
+static void loadSplit(const char *dataDir, const char *split, dataset_t *dst) {
+    char path[300];
+    snprintf(path, sizeof(path), "%s/%s_x.npy", dataDir, split);
+    tensorArray_t *items = npyLoad(path);
+    snprintf(path, sizeof(path), "%s/%s_y.npy", dataDir, split);
+    tensorArray_t *labelsRaw = npyLoad(path);
+    /* Fail fast with context instead of crashing later on a NULL or mismatched array. */
+    if (items == NULL || labelsRaw == NULL || items->size != labelsRaw->size) {
+        fprintf(stderr, "ERROR: cannot load '%s' split from %s (run prepare_data.py?)\n", split,
+                dataDir);
+        exit(1);
+    }
+    reshapeItemsAddBatchDim(items);
+    dst->items = items;
+    dst->labels = buildOneHotLabels(labelsRaw);
+}
+
+/* Populate the global train/val/test datasets from the .npy files under dataDir. */
+static void initDataSets(const char *dataDir) {
+    loadSplit(dataDir, "train", &g_trainDataset);
+    loadSplit(dataDir, "val", &g_valDataset);
+    loadSplit(dataDir, "test", &g_testDataset);
+}
+
+static sample_t *getTrainSample(size_t id) { /* dataLoaderInit callback: train sample `id` */
+    return npyGetSample(&g_trainDataset, id);
+}
+static sample_t *getValSample(size_t id) { /* dataLoaderInit callback: validation sample `id` */
+    return npyGetSample(&g_valDataset, id);
+}
+static sample_t *getTestSample(size_t id) { /* also called directly by main's prediction loop */
+    return npyGetSample(&g_testDataset, id);
+}
+static size_t getTrainSize(void) { /* number of items in the training split */
+    return g_trainDataset.items->size;
+}
+static size_t getValSize(void) { /* number of items in the validation split */
+    return g_valDataset.items->size;
+}
+static size_t getTestSize(void) { /* number of items in the test split */
+    return g_testDataset.items->size;
+}
+
+static void buildModel(layer_t **model, layerQuant_t *lq) {
+    /* Fills model[0..14] (MODEL_SIZE = 15 layers); the input arrives as [1, 1, 16000]. */
+    /* Front downsample: AvgPool1d(K=16,S=16) -> length 1000 (16 kHz -> 1 kHz). */
+    model[0] = avgPool1dLayerInit(&(avgPool1dInit_t){.kernelSize = DS_K, .stride = DS_K}, lq);
+    /* Block 1: Conv1d(1->16, K3, SAME) -> ReLU -> MaxPool(4); length 1000 -> 250. */
+    model[1] = conv1dLayerInit(
+        &(conv1dInit_t){
+            .inChannels = IN_CHANNELS, .outChannels = C1_OUT, .kernelSize = C1_K, .padding = SAME},
+        lq);
+    model[2] = reluLayerInit(lq);
+    model[3] = maxPool1dLayerInit(
+        &(maxPool1dInit_t){
+            .kernelSize = 4, .stride = 4, .inputChannels = C1_OUT, .inputLength = LEN_DS},
+        lq);
+    /* Block 2: Conv1d(16->32, K3, SAME) -> ReLU -> MaxPool(4); length 250 -> 62. */
+    model[4] = conv1dLayerInit(
+        &(conv1dInit_t){
+            .inChannels = C1_OUT, .outChannels = C2_OUT, .kernelSize = C2_K, .padding = SAME},
+        lq);
+    model[5] = reluLayerInit(lq);
+    model[6] = maxPool1dLayerInit(
+        &(maxPool1dInit_t){
+            .kernelSize = 4, .stride = 4, .inputChannels = C2_OUT, .inputLength = LEN_DS / 4},
+        lq);
+    /* Block 3: Conv1d(32->64, K3, SAME) -> ReLU -> MaxPool(4); length 62 -> 15. */
+    model[7] = conv1dLayerInit(
+        &(conv1dInit_t){
+            .inChannels = C2_OUT, .outChannels = C3_OUT, .kernelSize = C3_K, .padding = SAME},
+        lq);
+    model[8] = reluLayerInit(lq);
+    model[9] = maxPool1dLayerInit(
+        &(maxPool1dInit_t){
+            .kernelSize = 4, .stride = 4, .inputChannels = C3_OUT, .inputLength = LEN_DS / 16},
+        lq);
+
+    /* Rate-agnostic head: AdaptiveAvgPool1d(1) -> Flatten -> LayerNorm -> Linear -> Softmax.
+     * LayerNorm(C3_OUT) over the 64 pooled features stabilises raw-model training (mirrors the
+     * PyTorch nn.LayerNorm(64)); eps 1e-5 matches PyTorch's default. */
+    model[10] = adaptiveAvgPool1dLayerInit(&(adaptiveAvgPool1dInit_t){.outputSize = 1}, lq);
+    model[11] = flattenLayerInit();
+    model[12] = layerNormLayerInit(
+        &(layerNormInit_t){.normalizedShape = (size_t[]){C3_OUT}, .numNormDims = 1, .eps = 1e-5f},
+        lq);
+    model[13] =
+        linearLayerInit(&(linearInit_t){.inFeatures = C3_OUT, .outFeatures = g_numClasses}, lq);
+    model[14] = softmaxLayerInit(lq);
+}
+
+/* Load the PyTorch state_dict from per-layer <name>.{weight,bias}.npy files in weightsDir
+ * (written by examples/kws_raw/train_pytorch.py) into the matching model layers.
+ * Returns 0 on success, non-zero on the first missing file. Loaded tensors are freed on both
+ * paths, so a failed load no longer leaks the buffers of the layers read before it. */
+static int loadStateDictFromDir(layer_t **model, const char *weightsDir) {
+    /* conv1=model[1], conv2=model[4], conv3=model[7], ln=model[12] (gamma/beta), fc=model[13]. */
+    enum { NUM_PARAM_LAYERS = 5 };
+    const char *names[NUM_PARAM_LAYERS] = {"conv1", "conv2", "conv3", "ln", "fc"};
+    tensor_t *w[NUM_PARAM_LAYERS] = {0};
+    tensor_t *b[NUM_PARAM_LAYERS] = {0};
+    stateDictEntry_t entries[NUM_PARAM_LAYERS];
+    char wPath[300], bPath[300];
+    int status = 0;
+    for (int i = 0; i < NUM_PARAM_LAYERS && status == 0; i++) {
+        snprintf(wPath, sizeof(wPath), "%s/%s.weight.npy", weightsDir, names[i]);
+        snprintf(bPath, sizeof(bPath), "%s/%s.bias.npy", weightsDir, names[i]);
+        w[i] = npyLoadFlat(wPath);
+        b[i] = npyLoadFlat(bPath);
+        if (w[i] == NULL || b[i] == NULL) {
+            fprintf(stderr, "loadStateDictFromDir: missing %s or %s\n", wPath, bPath);
+            status = 1;
+        } else {
+            entries[i] = (stateDictEntry_t){.name = names[i],
+                                            .weightData = (float *)w[i]->data,
+                                            .biasData = (float *)b[i]->data};
+        }
+    }
+    if (status == 0) {
+        modelLoadStateDict(model, MODEL_SIZE, entries, NUM_PARAM_LAYERS);
+    }
+    for (int i = 0; i < NUM_PARAM_LAYERS; i++) { /* NULL slots were never loaded */
+        if (w[i] != NULL) {
+            freeTensor(w[i]);
+        }
+        if (b[i] != NULL) {
+            freeTensor(b[i]);
+        }
+    }
+    return status;
+}
+
+static FILE *g_log_file = NULL;
+static int g_first_epoch = 1;
+static struct timespec g_epoch_t0;
+
+/* trainingRun() epoch hook: log one JSON record + stdout line, then restart the epoch timer. */
+static void epochCallback(size_t epoch, float trainLoss, epochStats_t evalStats) {
+    struct timespec now;
+    clock_gettime(CLOCK_MONOTONIC, &now);
+    const double wall_s = (double)(now.tv_sec - g_epoch_t0.tv_sec) +
+                          (double)(now.tv_nsec - g_epoch_t0.tv_nsec) * 1e-9;
+    const double train = (double)trainLoss, valLoss = (double)evalStats.loss,
+                 valAcc = (double)evalStats.accuracy;
+    if (!g_first_epoch) {
+        fprintf(g_log_file, ",\n");
+    }
+    fprintf(g_log_file,
+            "    {\"epoch\": %zu, \"step_losses\": [], \"train_loss\": %.6f, "
+            "\"val_loss\": %.6f, \"val_acc\": %.6f, \"wall_s\": %.4f}",
+            epoch, train, valLoss, valAcc, wall_s);
+    fflush(g_log_file);
+    g_first_epoch = 0;
+    fprintf(stdout, "epoch %zu: train_loss=%.4f val_loss=%.4f val_acc=%.4f wall_s=%.2f\n", epoch,
+            train, valLoss, valAcc, wall_s);
+    fflush(stdout);
+    clock_gettime(CLOCK_MONOTONIC, &g_epoch_t0);
+}
+
+/* Create directory p with requested mode 0775. EEXIST counts as success (no check that p is
+ * really a directory). Returns 0 on success, 1 after printing the error to stderr. */
+static int ensureDir(const char *p) {
+    const mode_t mode = S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH;
+    if (mkdir(p, mode) != 0 && errno != EEXIST) {
+        fprintf(stderr, "ERROR: cannot create %s: %s\n", p, strerror(errno));
+        return 1;
+    }
+    return 0;
+}
+
+int main(void) { /* BIT_PARITY set: load weights, infer. Otherwise: train, log, infer. */
+    g_numClasses = readNumClasses();
+    /* Paths below are relative: run the binary from the repository root. */
+    char dataDir[256], weightsDir[256], logsDir[256], outputsDir[256];
+    snprintf(dataDir, sizeof(dataDir), "examples/kws_raw/data/%zuclass", g_numClasses);
+    snprintf(weightsDir, sizeof(weightsDir), "examples/kws_raw/weights/%zuclass", g_numClasses);
+    snprintf(logsDir, sizeof(logsDir), "examples/kws_raw/logs/%zuclass", g_numClasses);
+    snprintf(outputsDir, sizeof(outputsDir), "examples/kws_raw/outputs/%zuclass", g_numClasses);
+    /* mkdir() is not recursive, so each parent is created before its <n>class child. */
+    if (ensureDir("examples/kws_raw/logs") != 0 || ensureDir(logsDir) != 0) {
+        return 1;
+    }
+    if (ensureDir("examples/kws_raw/outputs") != 0 || ensureDir(outputsDir) != 0) {
+        return 1;
+    }
+
+    initDataSets(dataDir);
+    /* Only the post-training test evaluation uses this loader. */
+    dataLoader_t *testLoader = dataLoaderInit(getTestSample, getTestSize, 1, NULL, NULL,
+                                              /*shuffle*/ false, /*shuffleSeed*/ 0,
+                                              /*dropLast*/ true);
+
+    layerQuant_t lq;
+    layerQuantInitUniform(&lq, quantizationInitFloat());
+
+    layer_t *model[MODEL_SIZE];
+    buildModel(model, &lq);
+    /* Any non-empty BIT_PARITY value selects bit-parity mode. */
+    const char *bitParity = getenv("BIT_PARITY");
+    if (bitParity != NULL && bitParity[0] != '\0') {
+        /* Bit-parity mode: load PyTorch state_dict, skip training, run inference. */
+        if (loadStateDictFromDir(model, weightsDir) != 0) {
+            fprintf(stderr, "BIT_PARITY: state_dict load failed\n");
+            return 1;
+        }
+        fprintf(stdout, "BIT_PARITY: loaded state_dict from %s\n", weightsDir);
+    } else {
+        dataLoader_t *trainLoader = dataLoaderInit(getTrainSample, getTrainSize, BATCH, NULL, NULL,
+                                                   /*shuffle*/ true, /*shuffleSeed*/ SHUFFLE_SEED,
+                                                   /*dropLast*/ true);
+        dataLoader_t *valLoader = dataLoaderInit(getValSample, getValSize, 1, NULL, NULL,
+                                                 /*shuffle*/ false, /*shuffleSeed*/ 0,
+                                                 /*dropLast*/ true);
+
+        optimizer_t *sgd =
+            sgdMCreateOptim(LR, MOMENTUM, /*weightDecay*/ 0.0f, model, MODEL_SIZE, FLOAT32);
+        /* c.json is streamed: header here, one record per epoch, footer after the test run. */
+        char logPath[300];
+        snprintf(logPath, sizeof(logPath), "%s/c.json", logsDir);
+        g_log_file = fopen(logPath, "w");
+        if (!g_log_file) {
+            fprintf(stderr, "ERROR: cannot open log file for writing\n");
+            return 1;
+        }
+        fprintf(g_log_file,
+                "{\n"
+                "  \"impl\": \"c\",\n"
+                "  \"example\": \"kws_raw\",\n"
+                "  \"config\": {\"epochs\": %d, \"batch\": %d, \"lr\": %.6f, "
+                "\"momentum\": %.6f, \"seed\": %d, \"shuffle_seed\": %d},\n"
+                "  \"epochs\": [\n",
+                EPOCHS, BATCH, (double)LR, (double)MOMENTUM, SEED, SHUFFLE_SEED);
+        fflush(g_log_file);
+        /* Timer origin for epoch 1; epochCallback re-arms it after every epoch. */
+        clock_gettime(CLOCK_MONOTONIC, &g_epoch_t0);
+
+        trainingRunResult_t result =
+            trainingRun(model, MODEL_SIZE,
+                        (lossConfig_t){.funcType = CROSS_ENTROPY,
+                                       .backwardReduction = REDUCTION_MEAN,
+                                       .classWeights = NULL},
+                        trainLoader, valLoader, sgd, EPOCHS, calculateGradsSequential,
+                        inferenceWithLoss, epochCallback);
+        (void)result;
+
+        epochStats_t testStats = evaluationEpochWithMetrics(
+            model, MODEL_SIZE, CROSS_ENTROPY, testLoader, inferenceWithLoss, REDUCTION_MEAN);
+
+        fprintf(g_log_file,
+                "\n  ],\n"
+                "  \"final\": {\"test_loss\": %.6f, \"test_acc\": %.6f, "
+                "\"test_auc\": null}\n"
+                "}\n",
+                (double)testStats.loss, (double)testStats.accuracy);
+        fclose(g_log_file);
+
+        fprintf(stdout, "FINAL test_loss=%.4f test_acc=%.4f\n", (double)testStats.loss,
+                (double)testStats.accuracy);
+    }
+
+    /* Predictions on test set (both modes). */
+    size_t numTest = getTestSize();
+    int32_t *predictions = malloc(numTest * sizeof(int32_t));
+    if (!predictions) {
+        fprintf(stderr, "OOM allocating predictions\n");
+        return 1;
+    }
+    /* Argmax of the model output; strict '>' keeps the lowest index on ties. */
+    for (size_t i = 0; i < numTest; ++i) {
+        sample_t *s = getTestSample(i);
+        tensor_t *out = inference(model, MODEL_SIZE, s->item);
+        float *probs = (float *)out->data;
+        size_t argmax = 0;
+        float best = probs[0];
+        for (size_t c = 1; c < g_numClasses; ++c) {
+            if (probs[c] > best) {
+                best = probs[c];
+                argmax = c;
+            }
+        }
+        predictions[i] = (int32_t)argmax;
+        freeTensor(out);
+        freeSample(s);
+    }
+    /* Saved as a 1-D int32 array of length numTest. */
+    char predPath[300];
+    snprintf(predPath, sizeof(predPath), "%s/c_predictions.npy", outputsDir);
+    size_t outShape[] = {numTest};
+    int status = 0;
+    int rc = npyWriteInt32(predPath, predictions, outShape, 1);
+    if (rc != 0) {
+        fprintf(stderr, "ERROR: npyWriteInt32 failed (rc=%d)\n", rc);
+        status = 1;
+    }
+    free(predictions);
+
+    return status;
+}
diff --git a/examples/kws_raw/train_pytorch.py b/examples/kws_raw/train_pytorch.py
new file mode 100644
index 0000000..b943a85
--- /dev/null
+++ b/examples/kws_raw/train_pytorch.py
@@ -0,0 +1,184 @@
+"""PyTorch reference implementation of the kws_raw 1D-CNN classifier.
+
+Input: raw [1,16000] waveform from prepare_data.py. The model downsamples
+16 kHz -> 1 kHz via a front AvgPool1d(K=16), then 3 Conv blocks + a rate-agnostic
+AdaptiveAvgPool1d(1) head + LayerNorm(64). Output: logs/<n>class/pytorch.json +
+outputs/<n>class/pytorch_predictions.npy +
+weights/<n>class/{conv1,conv2,conv3,ln,fc}.{weight,bias}.npy
+for the C-side BIT_PARITY mode. num_classes comes from KWS_CLASSES (6 or 35; default 6).
+"""
+from __future__ import annotations
+
+import os
+import sys
+import time
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(REPO_ROOT))
+from examples._shared.log_schema import RunLog, dump_log  # noqa: E402
+from examples._shared.seeds import SEED, SHUFFLE_SEED  # noqa: E402
+from examples._shared.xorshift32 import shuffle_indices  # noqa: E402
+
+HERE = Path(__file__).resolve().parent
+NUM_CLASSES = int(os.environ.get("KWS_CLASSES", "6"))  # class count, overridable via the environment
+if NUM_CLASSES not in (6, 35):  # explicit raise: an assert would be stripped under `python -O`
+    raise ValueError(f"KWS_CLASSES must be 6 or 35, got {NUM_CLASSES}")
+TAG = f"{NUM_CLASSES}class"  # sub-directory name shared by data, logs, outputs and weights
+DATA = HERE / "data" / TAG
+LOGS = HERE / "logs" / TAG
+OUTPUTS = HERE / "outputs" / TAG
+WEIGHTS = HERE / "weights" / TAG
+
+EPOCHS = 20
+BATCH = 32
+LR, MOMENTUM = 0.005, 0.9  # SGD learning rate and momentum
+
+
+class KwsDataset(torch.utils.data.Dataset):
+    """In-memory map-style dataset: float32 input tensors paired with int64 label tensors."""
+    def __init__(self, x: np.ndarray, y: np.ndarray) -> None:
+        self.x, self.y = (torch.from_numpy(a.astype(t)) for a, t in ((x, np.float32), (y, np.int64)))
+
+    def __len__(self) -> int:
+        return self.x.size(0)
+
+    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
+        return self.x[idx], self.y[idx]
+
+
+class XorShift32Sampler(torch.utils.data.Sampler[int]):
+    """Replays one fixed shuffle_indices(n, seed) order every epoch, matching framework DataLoader.c."""
+    def __init__(self, n: int, seed: int) -> None:
+        self.indices = shuffle_indices(n, seed)  # computed once; never reshuffled afterwards
+
+    def __len__(self) -> int:
+        return len(self.indices)
+
+    def __iter__(self):
+        yield from self.indices
+
+
+class KwsRawCnn(nn.Module):
+    """Raw-waveform 1D-CNN: AvgPool front end, three conv/ReLU/max-pool stages, LayerNorm, linear head."""
+    def __init__(self, num_classes: int) -> None:
+        super().__init__()
+        # Creation order (conv1, conv2, conv3, fc) determines how the seeded RNG is
+        # consumed, so it must stay fixed for reproducible initial weights.
+        self.pool0 = nn.AvgPool1d(kernel_size=16, stride=16)  # 16 kHz -> 1 kHz downsample
+        self.conv1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
+        self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
+        self.conv3 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
+        # LayerNorm over the 64 pooled features: without it the raw model stalls at
+        # random init. The C framework has a bit-parity LayerNorm, so the gate holds.
+        self.ln = nn.LayerNorm(64)
+        self.fc = nn.Linear(64, num_classes)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Map a [B,1,16000] waveform batch to [B,num_classes] logits."""
+        x = self.pool0(x)  # [B,1,16000] -> [B,1,1000]
+        # Each stage: SAME conv (K=3, pad=1) + ReLU, then max-pool by 4.
+        # Lengths go 1000 -> 250 -> 62 -> 15 while channels go 1 -> 16 -> 32 -> 64.
+        for conv in (self.conv1, self.conv2, self.conv3):
+            x = F.max_pool1d(F.relu(conv(x)), 4)
+        pooled = F.adaptive_avg_pool1d(x, 1)  # [B,64,15] -> [B,64,1]
+        features = torch.flatten(pooled, start_dim=1)  # [B,64]
+        return self.fc(self.ln(features))
+
+
+def evaluate(model: nn.Module, x: np.ndarray, y: np.ndarray, batch: int) -> tuple[float, float]:
+    """Return (mean cross-entropy, accuracy) over (x, y), scored `batch` rows at a time; sets eval mode."""
+    model.eval()
+    loss_sum, hits, seen = 0.0, 0, 0
+    with torch.no_grad():
+        for lo in range(0, len(x), batch):
+            inputs = torch.from_numpy(x[lo : lo + batch].astype(np.float32))
+            targets = torch.from_numpy(y[lo : lo + batch].astype(np.int64))
+            scores = model(inputs)
+            loss_sum += F.cross_entropy(scores, targets, reduction="sum").item()  # summed, averaged at the end
+            hits += scores.argmax(dim=1).eq(targets).sum().item()
+            seen += targets.size(0)
+    return loss_sum / seen, hits / seen
+
+
+def main() -> None:
+    """Train KwsRawCnn with SGD and write the run log, test predictions and per-layer weights."""
+    # Seed before the model is constructed so parameter initialisation is reproducible.
+    torch.manual_seed(SEED)
+    np.random.seed(SEED)
+    torch.use_deterministic_algorithms(True, warn_only=True)
+
+    names = ("train_x", "train_y", "val_x", "val_y", "test_x", "test_y")
+    train_x, train_y, val_x, val_y, test_x, test_y = (np.load(DATA / f"{n}.npy") for n in names)
+
+    train_ds = KwsDataset(train_x, train_y)
+    loader = torch.utils.data.DataLoader(
+        train_ds,
+        batch_size=BATCH,
+        sampler=XorShift32Sampler(len(train_ds), SHUFFLE_SEED),
+        drop_last=True,
+    )
+
+    model = KwsRawCnn(NUM_CLASSES)
+    optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=MOMENTUM)
+
+    epoch_records = []
+    for epoch in range(EPOCHS):
+        started = time.time()
+        model.train()  # evaluate() leaves the model in eval mode, so re-enable each epoch
+        step_losses: list[float] = []
+        for inputs, targets in loader:
+            optimizer.zero_grad()
+            batch_loss = F.cross_entropy(model(inputs), targets)
+            batch_loss.backward()
+            optimizer.step()
+            step_losses.append(batch_loss.item())
+        train_loss = float(np.mean(step_losses)) if step_losses else 0.0
+        val_loss, val_acc = evaluate(model, val_x, val_y, BATCH)
+        record = {"epoch": epoch, "step_losses": step_losses, "train_loss": train_loss}
+        # wall_s is taken after validation, so it covers training plus evaluation.
+        record.update(val_loss=val_loss, val_acc=val_acc, wall_s=time.time() - started)
+        epoch_records.append(record)
+        print(f"epoch {epoch:2d}: train_loss={train_loss:.4f} val_loss={val_loss:.4f} val_acc={val_acc:.4f}", flush=True)
+
+    test_loss, test_acc = evaluate(model, test_x, test_y, BATCH)
+    log: RunLog = {
+        "impl": "pytorch",
+        "example": "kws_raw",
+        "config": {"epochs": EPOCHS, "batch": BATCH, "lr": LR, "momentum": MOMENTUM,
+                   "seed": SEED, "shuffle_seed": SHUFFLE_SEED},
+        "epochs": epoch_records,  # type: ignore[typeddict-item]
+        "final": {"test_loss": test_loss, "test_acc": test_acc, "test_auc": None},
+    }
+    for out_dir in (LOGS, OUTPUTS):
+        out_dir.mkdir(parents=True, exist_ok=True)
+    dump_log(LOGS / "pytorch.json", log)
+
+    model.eval()
+    with torch.no_grad():
+        # Note: the whole test set goes through the model in one forward pass.
+        logits = model(torch.from_numpy(test_x.astype(np.float32)))
+    preds = logits.argmax(dim=1).numpy().astype(np.int32)
+    np.save(OUTPUTS / "pytorch_predictions.npy", preds)
+    print(f"FINAL test_loss={test_loss:.4f} test_acc={test_acc:.4f}", flush=True)
+
+    # Per-layer .npy dumps are what the C-side BIT_PARITY mode loads.
+    WEIGHTS.mkdir(parents=True, exist_ok=True)
+    print("Saving per-layer weights:", flush=True)
+    for name in ("conv1", "conv2", "conv3", "ln", "fc"):
+        layer = getattr(model, name)
+        weight = layer.weight.detach().cpu().numpy().astype(np.float32)
+        np.save(WEIGHTS / f"{name}.weight.npy", weight)
+        if layer.bias is not None:
+            bias = layer.bias.detach().cpu().numpy().astype(np.float32)
+            np.save(WEIGHTS / f"{name}.bias.npy", bias)
+        print(f"  wrote {name}.weight.npy shape={weight.shape}", flush=True)
+
+
+if __name__ == "__main__":
+    main()