diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 11eaad9..78a2d1c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -207,6 +207,20 @@ jobs: - name: Train PyTorch kws_mfcc (produces reference predictions + weights) run: uv run examples/kws_mfcc/train_pytorch.py + - name: Cache kws_raw processed data (6-class) + id: kws-raw-cache + uses: actions/cache@v4 + with: + path: examples/kws_raw/data/6class + key: kws-raw-6class-${{ hashFiles('examples/kws_raw/prepare_data.py', 'examples/_shared/speechcommands_data.py') }} + + - name: Prepare kws_raw data (6-class; only on cache miss) + if: steps.kws-raw-cache.outputs.cache-hit != 'true' + run: uv run examples/kws_raw/prepare_data.py + + - name: Train PyTorch kws_raw (produces reference predictions + weights) + run: uv run examples/kws_raw/train_pytorch.py + - name: Configure run: cmake --preset examples @@ -268,6 +282,16 @@ jobs: --c examples/kws_mfcc/outputs/6class/c_predictions.npy \ --dtype int32 + - name: Run kws_raw in BIT_PARITY mode + run: BIT_PARITY=1 build/examples/examples/kws_raw/train_c_kws_raw + + - name: Diff kws_raw predictions (int32, exact match required) + run: | + uv run examples/_shared/compare_predictions.py \ + --pytorch examples/kws_raw/outputs/6class/pytorch_predictions.npy \ + --c examples/kws_raw/outputs/6class/c_predictions.npy \ + --dtype int32 + python-test: runs-on: ubuntu-latest diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d34dd41..abc3fe6 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -4,3 +4,4 @@ add_subdirectory(ecg_anomaly_ae) add_subdirectory(mnist_mlp) add_subdirectory(mnist_cnn) add_subdirectory(kws_mfcc) +add_subdirectory(kws_raw) diff --git a/examples/README.md b/examples/README.md index 5a796b9..143d406 100644 --- a/examples/README.md +++ b/examples/README.md @@ -12,7 +12,8 @@ checking and visualizations. 
| `mnist_cnn/` | MNIST 1D-CNN digit classification | ✅ | | `har_classifier/` | UCI HAR 6-class activity classification | Stage 1 | | `ecg_anomaly_ae/` | ECG5000 reconstruction-based anomaly detection | Stage 2 ✅ | -| `kws_classifier/` | SpeechCommands 6-class keyword spotting | Stage 3 (planned) | +| `kws_mfcc/` | SpeechCommands keyword spotting (MFCC features) | Stage 3 ✅ | +| `kws_raw/` | SpeechCommands keyword spotting (raw waveform + in-model downsample) | Stage 3 ✅ | | `kws_denoising_ae/` | SpeechCommands additive-noise denoising | Stage 4 (planned) | ## Running an example diff --git a/examples/kws_raw/CMakeLists.txt b/examples/kws_raw/CMakeLists.txt new file mode 100644 index 0000000..a403373 --- /dev/null +++ b/examples/kws_raw/CMakeLists.txt @@ -0,0 +1,68 @@ +add_executable(train_c_kws_raw train_c.c) + +target_link_libraries(train_c_kws_raw PRIVATE + DataLoaderApi + DataLoader + NPYLoaderApi + NPYLoader + + Layer + + Conv1dApi + Conv1d + + LinearApi + Linear + + ReluApi + Relu + + FlattenApi + Flatten + + Pool1dApi + MaxPool1d + AvgPool1d + + AdaptivePool1dApi + AdaptiveAvgPool1d + + LayerNormApi + LayerNorm + + QuantizationApi + Quantization + + TensorApi + Tensor + Rounding + + TrainingLoopApi + CalculateGradsSequential + TrainingBatchDefault + TrainingEpochDefault + Optimizer + + LossFunction + CrossEntropy + + SoftmaxApi + Softmax + + Sgd + SgdApi + + InferenceApi + + StateDictApi + LayerWeightsApi + LayerQuant + LayerCommon + Distributions + + Common + StorageApi + RNG + + examples_shared +) diff --git a/examples/kws_raw/README.md b/examples/kws_raw/README.md new file mode 100644 index 0000000..9ab716a --- /dev/null +++ b/examples/kws_raw/README.md @@ -0,0 +1,61 @@ +# KWS Raw Waveform — PyTorch + C Parity Demo + +Trains a 1D-CNN keyword-spotter on **raw 16 kHz SpeechCommands waveforms** in both +PyTorch (reference) and the ODT C framework. 
Companion to `kws_mfcc/`: same data +and harness, but instead of pre-computing MFCC features, the model consumes the +native `[1, 16000]` waveform and **downsamples in-framework** — its first layer is +`AvgPool1d(K=16, S=16)`, a decimation-by-16 box filter that turns 16 kHz into +1 kHz. Change `K` to change the effective rate (8 → 2 kHz, …) with no re-prep; the +`AdaptiveAvgPool1d(1)` head is length-agnostic so the rest of the model is +unchanged (only the three MaxPool nominal `inputLength`s in `train_c.c` need to +track the new lengths). + +One binary, two modes — **bit-parity** (`BIT_PARITY=1`, the exact CI gate) and a +**train-from-scratch** informational demo. See `kws_mfcc/README.md` for the mode +explanation and the `KWS_CLASSES` knob; commands are identical with `kws_raw` +substituted. + +## Why LayerNorm + a longer schedule + +Raw waveforms are far harder to train than MFCC features: at the `kws_mfcc` +settings (lr=0.001, 15 epochs) the raw model never escapes its random-init +fixed point (flat loss, every clip predicted as one class), which would make the +bit-parity gate degenerate. Two changes fix it without leaving the framework's +bit-parity-covered layers: + +- a rate-agnostic **`LayerNorm(64)`** on the pooled features before the classifier + (the C framework has bit-parity LayerNorm; BatchNorm is not covered), and +- **lr=0.005, 20 epochs** (the model breaks through around epoch 15). + +The reference then reaches ~0.59 test accuracy with predictions spread across all +six classes, so the gate genuinely exercises the `AvgPool1d[1,16000]` + Conv + +LayerNorm arithmetic (C reproduces PyTorch's predictions int32-exactly). 
+ +## Run it (6-class) + +```bash +uv run python examples/kws_raw/prepare_data.py +uv run python examples/kws_raw/train_pytorch.py +cmake --preset examples +cmake --build --preset examples --target train_c_kws_raw + +BIT_PARITY=1 ./build/examples/examples/kws_raw/train_c_kws_raw +uv run python examples/_shared/compare_predictions.py \ + --pytorch examples/kws_raw/outputs/6class/pytorch_predictions.npy \ + --c examples/kws_raw/outputs/6class/c_predictions.npy --dtype int32 +``` + +## Model + +- Input: `[1, 16000]` → `reshapeItemsAddBatchDim` → `[1, 1, 16000]` +- `AvgPool1d(16) → Conv1d(1→16,K3,SAME) → ReLU → MaxPool(4) → Conv1d(16→32,K3,SAME) + → ReLU → MaxPool(4) → Conv1d(32→64,K3,SAME) → ReLU → MaxPool(4) → + AdaptiveAvgPool1d(1) → Flatten → LayerNorm(64) → Linear(64→C) → Softmax → CE` +- Lengths: 16000 → 1000 → 250 → 62 → 15 → 1; ~8.4 K params with 6 classes (~10.2 K with 35) +- State-dict layers: `conv1`, `conv2`, `conv3`, `ln`, `fc` +- Hyperparameters: SGD lr=0.005, momentum=0.9, batch=32, 20 epochs + +The train-from-scratch demo is the slowest in the suite (raw `[1,16000]` is the +heaviest input even after the AvgPool downsample) — run it offline. Bit-parity +mode requires exact equality; the train-from-scratch tolerances are informational +and match `kws_mfcc/`. diff --git a/examples/kws_raw/compare.py b/examples/kws_raw/compare.py new file mode 100644 index 0000000..2247d6f --- /dev/null +++ b/examples/kws_raw/compare.py @@ -0,0 +1,88 @@ +"""Compare PyTorch and C runs of the kws_raw classifier. + +Reads logs/{6,35}class/{pytorch,c}.json and outputs/{6,35}class/{pytorch,c}_predictions.npy +(class count selected by KWS_CLASSES, default 6). +Writes plots into plots/{6,35}class/. Prints a final-state parity report within tolerances. +INFORMATIONAL only — the bit-parity check (compare_predictions.py) is the gate.
def confusion_matrix(preds: np.ndarray, labels: np.ndarray, num_classes: int) -> np.ndarray:
    """Count (predicted, actual) class pairs in a square int64 matrix.

    Entry ``[p, a]`` is the number of positions where the prediction is ``p``
    and the label is ``a`` — rows are indexed by the predicted class, columns
    by the label.  If the two inputs differ in length, only the leading
    positions present in both are counted.
    """
    counts = np.zeros((num_classes, num_classes), dtype=np.int64)
    paired = min(len(preds), len(labels))
    row_idx = np.asarray(preds[:paired]).astype(np.intp)
    col_idx = np.asarray(labels[:paired]).astype(np.intp)
    # np.add.at accumulates repeated index pairs; plain fancy-index += would not.
    np.add.at(counts, (row_idx, col_idx), 1)
    return counts
= pt["final"] + c_finals = c["final"] + overall_pass, results = run_parity_checks( + CHECKS, + {"test_acc": pt_finals["test_acc"], "test_loss": pt_finals["test_loss"]}, + {"test_acc": c_finals["test_acc"], "test_loss": c_finals["test_loss"]}, + ) + + print("\nParity report (PyTorch vs C) — INFORMATIONAL:") + print(f"{'metric':<14} {'pt':>10} {'c':>10} {'diff':>10} {'tol':>8} {'type':>5} {'pass':>6}") + for r in results: + print(f"{r.metric:<14} {r.pt_value:>10.5f} {r.c_value:>10.5f} {r.diff:>10.5f} " + f"{r.tolerance:>8.4f} {r.tolerance_type:>5} {str(r.passed):>6}") + print(f"\nOverall: {'PASS' if overall_pass else 'FAIL'} (informational; not a CI gate)") + return 0 if overall_pass else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/kws_raw/prepare_data.py b/examples/kws_raw/prepare_data.py new file mode 100644 index 0000000..45ed74c --- /dev/null +++ b/examples/kws_raw/prepare_data.py @@ -0,0 +1,42 @@ +"""Prepare raw SpeechCommands waveforms for the kws_raw example. + +Writes the native 16 kHz waveform directly — no resampling, no feature +extraction. Downsampling (16 kHz → 1 kHz via AvgPool1d) is the model's first +layer, so PyTorch and C read identical raw .npy. 
def main() -> None:
    """Write the raw SpeechCommands splits as ``.npy`` files.

    The class count is read from the ``KWS_CLASSES`` environment variable
    (default ``"6"``) and selects the output directory
    ``data/{num_classes}class`` next to this script.  For each of the train,
    val and test splits the waveforms are saved as float32
    (``{split}_x.npy``) and the labels as int32 (``{split}_y.npy``).

    Raises:
        ValueError: if ``KWS_CLASSES`` is not an integer, or is not 6 or 35.
    """
    requested = os.environ.get("KWS_CLASSES", "6")
    # Validate with an explicit raise rather than ``assert``: asserts are
    # stripped under ``python -O``, which would let an unsupported value reach
    # the directory creation and the loader below unchecked.
    try:
        num_classes = int(requested)
    except ValueError:
        raise ValueError(f"KWS_CLASSES must be 6 or 35 (got {requested!r})") from None
    if num_classes not in (6, 35):
        raise ValueError(f"KWS_CLASSES must be 6 or 35 (got {requested!r})")

    data_dir = HERE / "data" / f"{num_classes}class"
    data_dir.mkdir(parents=True, exist_ok=True)

    # load_speechcommands is indexed by split name and yields an (x, y) pair.
    splits = load_speechcommands(RAW_ROOT, num_classes)
    for split in ("train", "val", "test"):
        x, y = splits[split]
        np.save(data_dir / f"{split}_x.npy", x.astype(np.float32))
        np.save(data_dir / f"{split}_y.npy", y.astype(np.int32))
        print(f"{split}: x={x.shape} y={y.shape} classes={num_classes}", flush=True)
+#include "StateDictApi.h" +#include "StorageApi.h" +#include "Tensor.h" +#include "TensorApi.h" +#include "TrainingLoopApi.h" + +#include "npy_writer.h" + +#define EPOCHS 20 +#define BATCH 32 +#define LR 0.005f +#define MOMENTUM 0.9f +#define SEED 42 +#define SHUFFLE_SEED 42 +#define NUM_CLASSES_DEFAULT 6 + +#define IN_CHANNELS 1 +#define LEN_INPUT 16000 +#define DS_K 16 /* front AvgPool downsample: 16 kHz -> 1 kHz */ +#define LEN_DS 1000 /* LEN_INPUT / DS_K */ +#define C1_OUT 16 +#define C1_K 3 +#define C2_OUT 32 +#define C2_K 3 +#define C3_OUT 64 +#define C3_K 3 + +/* AvgPool(ds) + 3x(Conv1d+ReLU+MaxPool) + AdaptiveAvgPool + Flatten + LayerNorm + Linear + Softmax + * = 15 layers */ +#define MODEL_SIZE 15 + +static dataset_t g_trainDataset; +static dataset_t g_valDataset; +static dataset_t g_testDataset; + +static size_t g_numClasses = NUM_CLASSES_DEFAULT; + +static size_t readNumClasses(void) { + const char *env = getenv("KWS_CLASSES"); + if (env == NULL || env[0] == '\0') { + return NUM_CLASSES_DEFAULT; + } + long v = strtol(env, NULL, 10); + if (v != 6 && v != 35) { + fprintf(stderr, "KWS_CLASSES must be 6 or 35 (got '%s'); using %d\n", env, + NUM_CLASSES_DEFAULT); + return NUM_CLASSES_DEFAULT; + } + return (size_t)v; +} + +static void reshapeItemsAddBatchDim(tensorArray_t *items) { + for (size_t i = 0; i < items->size; ++i) { + tensor_t *t = items->array[i]; + size_t oldRank = t->shape->numberOfDimensions; + size_t newRank = oldRank + 1; + + size_t *newDims = reserveMemory(newRank * sizeof(size_t)); + size_t *newOrder = reserveMemory(newRank * sizeof(size_t)); + newDims[0] = 1; + for (size_t d = 0; d < oldRank; ++d) { + newDims[d + 1] = t->shape->dimensions[d]; + } + for (size_t d = 0; d < newRank; ++d) { + newOrder[d] = d; + } + + freeReservedMemory(t->shape->dimensions); + freeReservedMemory(t->shape->orderOfDimensions); + t->shape->dimensions = newDims; + t->shape->orderOfDimensions = newOrder; + t->shape->numberOfDimensions = newRank; + } +} + +static 
tensorArray_t *buildOneHotLabels(tensorArray_t *intLabels) { + tensorArray_t *out = reserveMemory(sizeof(tensorArray_t)); + tensor_t **arr = reserveMemory(intLabels->size * sizeof(tensor_t *)); + out->array = arr; + out->size = intLabels->size; + + for (size_t i = 0; i < intLabels->size; ++i) { + size_t *dims = reserveMemory(1 * sizeof(size_t)); + size_t *order = reserveMemory(1 * sizeof(size_t)); + dims[0] = g_numClasses; + order[0] = 0; + shape_t *shape = reserveMemory(sizeof(shape_t)); + shape->dimensions = dims; + shape->orderOfDimensions = order; + shape->numberOfDimensions = 1; + + quantization_t *q = quantizationInitFloat(); + tensor_t *t = initTensor(shape, q, NULL); + + int32_t cls = ((int32_t *)intLabels->array[i]->data)[0]; + float *data = (float *)t->data; + for (size_t c = 0; c < g_numClasses; ++c) { + data[c] = (c == (size_t)cls) ? 1.0f : 0.0f; + } + arr[i] = t; + } + return out; +} + +static void initDataSets(const char *dataDir) { + char path[300]; + snprintf(path, sizeof(path), "%s/train_x.npy", dataDir); + tensorArray_t *trainItems = npyLoad(path); + snprintf(path, sizeof(path), "%s/train_y.npy", dataDir); + tensorArray_t *trainLabelsRaw = npyLoad(path); + reshapeItemsAddBatchDim(trainItems); + g_trainDataset.items = trainItems; + g_trainDataset.labels = buildOneHotLabels(trainLabelsRaw); + + snprintf(path, sizeof(path), "%s/val_x.npy", dataDir); + tensorArray_t *valItems = npyLoad(path); + snprintf(path, sizeof(path), "%s/val_y.npy", dataDir); + tensorArray_t *valLabelsRaw = npyLoad(path); + reshapeItemsAddBatchDim(valItems); + g_valDataset.items = valItems; + g_valDataset.labels = buildOneHotLabels(valLabelsRaw); + + snprintf(path, sizeof(path), "%s/test_x.npy", dataDir); + tensorArray_t *testItems = npyLoad(path); + snprintf(path, sizeof(path), "%s/test_y.npy", dataDir); + tensorArray_t *testLabelsRaw = npyLoad(path); + reshapeItemsAddBatchDim(testItems); + g_testDataset.items = testItems; + g_testDataset.labels = 
buildOneHotLabels(testLabelsRaw); +} + +static sample_t *getTrainSample(size_t id) { + return npyGetSample(&g_trainDataset, id); +} +static sample_t *getValSample(size_t id) { + return npyGetSample(&g_valDataset, id); +} +static sample_t *getTestSample(size_t id) { + return npyGetSample(&g_testDataset, id); +} +static size_t getTrainSize(void) { + return g_trainDataset.items->size; +} +static size_t getValSize(void) { + return g_valDataset.items->size; +} +static size_t getTestSize(void) { + return g_testDataset.items->size; +} + +static void buildModel(layer_t **model, layerQuant_t *lq) { + /* Input reshaped to [1, 1, 16000]. */ + /* Front downsample: AvgPool1d(K=16,S=16) -> length 1000 (16 kHz -> 1 kHz). */ + model[0] = avgPool1dLayerInit(&(avgPool1dInit_t){.kernelSize = DS_K, .stride = DS_K}, lq); + + model[1] = conv1dLayerInit( + &(conv1dInit_t){ + .inChannels = IN_CHANNELS, .outChannels = C1_OUT, .kernelSize = C1_K, .padding = SAME}, + lq); + model[2] = reluLayerInit(lq); + model[3] = maxPool1dLayerInit( + &(maxPool1dInit_t){ + .kernelSize = 4, .stride = 4, .inputChannels = C1_OUT, .inputLength = LEN_DS}, + lq); + + model[4] = conv1dLayerInit( + &(conv1dInit_t){ + .inChannels = C1_OUT, .outChannels = C2_OUT, .kernelSize = C2_K, .padding = SAME}, + lq); + model[5] = reluLayerInit(lq); + model[6] = maxPool1dLayerInit( + &(maxPool1dInit_t){ + .kernelSize = 4, .stride = 4, .inputChannels = C2_OUT, .inputLength = LEN_DS / 4}, + lq); + + model[7] = conv1dLayerInit( + &(conv1dInit_t){ + .inChannels = C2_OUT, .outChannels = C3_OUT, .kernelSize = C3_K, .padding = SAME}, + lq); + model[8] = reluLayerInit(lq); + model[9] = maxPool1dLayerInit( + &(maxPool1dInit_t){ + .kernelSize = 4, .stride = 4, .inputChannels = C3_OUT, .inputLength = LEN_DS / 16}, + lq); + + /* Rate-agnostic head: AdaptiveAvgPool1d(1) -> Flatten -> LayerNorm -> Linear -> Softmax. 
+ * LayerNorm(C3_OUT) over the 64 pooled features stabilises raw-model training (mirrors the + * PyTorch nn.LayerNorm(64)); eps 1e-5 matches PyTorch's default. */ + model[10] = adaptiveAvgPool1dLayerInit(&(adaptiveAvgPool1dInit_t){.outputSize = 1}, lq); + model[11] = flattenLayerInit(); + model[12] = layerNormLayerInit( + &(layerNormInit_t){.normalizedShape = (size_t[]){C3_OUT}, .numNormDims = 1, .eps = 1e-5f}, + lq); + model[13] = + linearLayerInit(&(linearInit_t){.inFeatures = C3_OUT, .outFeatures = g_numClasses}, lq); + model[14] = softmaxLayerInit(lq); +} + +/* Load PyTorch state_dict from per-layer .npy files written by + * examples/kws_raw/train_pytorch.py --save-weights. + * + * Returns 0 on success, non-zero on first missing file. */ +static int loadStateDictFromDir(layer_t **model, const char *weightsDir) { + char wPath[300], bPath[300]; + /* Param layers in order: conv1=model[1], conv2=model[4], conv3=model[7], + * ln=model[12] (gamma/beta), fc=model[13]. 5 entries. */ + const char *names[5] = {"conv1", "conv2", "conv3", "ln", "fc"}; + tensor_t *w[5] = {0}; + tensor_t *b[5] = {0}; + + for (int i = 0; i < 5; i++) { + snprintf(wPath, sizeof(wPath), "%s/%s.weight.npy", weightsDir, names[i]); + snprintf(bPath, sizeof(bPath), "%s/%s.bias.npy", weightsDir, names[i]); + w[i] = npyLoadFlat(wPath); + b[i] = npyLoadFlat(bPath); + if (w[i] == NULL || b[i] == NULL) { + fprintf(stderr, "loadStateDictFromDir: missing %s or %s\n", wPath, bPath); + return 1; + } + } + + modelLoadStateDict( + model, MODEL_SIZE, + (stateDictEntry_t[]){ + {.name = names[0], .weightData = (float *)w[0]->data, .biasData = (float *)b[0]->data}, + {.name = names[1], .weightData = (float *)w[1]->data, .biasData = (float *)b[1]->data}, + {.name = names[2], .weightData = (float *)w[2]->data, .biasData = (float *)b[2]->data}, + {.name = names[3], .weightData = (float *)w[3]->data, .biasData = (float *)b[3]->data}, + {.name = names[4], .weightData = (float *)w[4]->data, .biasData = (float 
*)b[4]->data}, + }, + 5); + + for (int i = 0; i < 5; i++) { + freeTensor(w[i]); + freeTensor(b[i]); + } + return 0; +} + +static FILE *g_log_file = NULL; +static int g_first_epoch = 1; +static struct timespec g_epoch_t0; + +static void epochCallback(size_t epoch, float trainLoss, epochStats_t evalStats) { + struct timespec t1; + clock_gettime(CLOCK_MONOTONIC, &t1); + double wall_s = + (double)(t1.tv_sec - g_epoch_t0.tv_sec) + (double)(t1.tv_nsec - g_epoch_t0.tv_nsec) * 1e-9; + + if (!g_first_epoch) { + fprintf(g_log_file, ",\n"); + } + fprintf(g_log_file, + " {\"epoch\": %zu, \"step_losses\": [], \"train_loss\": %.6f, " + "\"val_loss\": %.6f, \"val_acc\": %.6f, \"wall_s\": %.4f}", + epoch, (double)trainLoss, (double)evalStats.loss, (double)evalStats.accuracy, wall_s); + fflush(g_log_file); + g_first_epoch = 0; + + fprintf(stdout, "epoch %zu: train_loss=%.4f val_loss=%.4f val_acc=%.4f wall_s=%.2f\n", epoch, + (double)trainLoss, (double)evalStats.loss, (double)evalStats.accuracy, wall_s); + fflush(stdout); + + clock_gettime(CLOCK_MONOTONIC, &g_epoch_t0); +} + +static int ensureDir(const char *p) { + if (mkdir(p, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH) == 0) { + return 0; + } + if (errno == EEXIST) { + return 0; + } + fprintf(stderr, "ERROR: cannot create %s: %s\n", p, strerror(errno)); + return 1; +} + +int main(void) { + g_numClasses = readNumClasses(); + + char dataDir[256], weightsDir[256], logsDir[256], outputsDir[256]; + snprintf(dataDir, sizeof(dataDir), "examples/kws_raw/data/%zuclass", g_numClasses); + snprintf(weightsDir, sizeof(weightsDir), "examples/kws_raw/weights/%zuclass", g_numClasses); + snprintf(logsDir, sizeof(logsDir), "examples/kws_raw/logs/%zuclass", g_numClasses); + snprintf(outputsDir, sizeof(outputsDir), "examples/kws_raw/outputs/%zuclass", g_numClasses); + + if (ensureDir("examples/kws_raw/logs") != 0 || ensureDir(logsDir) != 0) { + return 1; + } + if (ensureDir("examples/kws_raw/outputs") != 0 || ensureDir(outputsDir) != 0) { + return 1; + } 
+ + initDataSets(dataDir); + + dataLoader_t *testLoader = dataLoaderInit(getTestSample, getTestSize, 1, NULL, NULL, + /*shuffle*/ false, /*shuffleSeed*/ 0, + /*dropLast*/ true); + + layerQuant_t lq; + layerQuantInitUniform(&lq, quantizationInitFloat()); + + layer_t *model[MODEL_SIZE]; + buildModel(model, &lq); + + const char *bitParity = getenv("BIT_PARITY"); + if (bitParity != NULL && bitParity[0] != '\0') { + /* Bit-parity mode: load PyTorch state_dict, skip training, run inference. */ + if (loadStateDictFromDir(model, weightsDir) != 0) { + fprintf(stderr, "BIT_PARITY: state_dict load failed\n"); + return 1; + } + fprintf(stdout, "BIT_PARITY: loaded state_dict from %s\n", weightsDir); + } else { + dataLoader_t *trainLoader = dataLoaderInit(getTrainSample, getTrainSize, BATCH, NULL, NULL, + /*shuffle*/ true, /*shuffleSeed*/ SHUFFLE_SEED, + /*dropLast*/ true); + dataLoader_t *valLoader = dataLoaderInit(getValSample, getValSize, 1, NULL, NULL, + /*shuffle*/ false, /*shuffleSeed*/ 0, + /*dropLast*/ true); + + optimizer_t *sgd = + sgdMCreateOptim(LR, MOMENTUM, /*weightDecay*/ 0.0f, model, MODEL_SIZE, FLOAT32); + + char logPath[300]; + snprintf(logPath, sizeof(logPath), "%s/c.json", logsDir); + g_log_file = fopen(logPath, "w"); + if (!g_log_file) { + fprintf(stderr, "ERROR: cannot open log file for writing\n"); + return 1; + } + fprintf(g_log_file, + "{\n" + " \"impl\": \"c\",\n" + " \"example\": \"kws_raw\",\n" + " \"config\": {\"epochs\": %d, \"batch\": %d, \"lr\": %.6f, " + "\"momentum\": %.6f, \"seed\": %d, \"shuffle_seed\": %d},\n" + " \"epochs\": [\n", + EPOCHS, BATCH, (double)LR, (double)MOMENTUM, SEED, SHUFFLE_SEED); + fflush(g_log_file); + + clock_gettime(CLOCK_MONOTONIC, &g_epoch_t0); + + trainingRunResult_t result = + trainingRun(model, MODEL_SIZE, + (lossConfig_t){.funcType = CROSS_ENTROPY, + .backwardReduction = REDUCTION_MEAN, + .classWeights = NULL}, + trainLoader, valLoader, sgd, EPOCHS, calculateGradsSequential, + inferenceWithLoss, epochCallback); + 
(void)result; + + epochStats_t testStats = evaluationEpochWithMetrics( + model, MODEL_SIZE, CROSS_ENTROPY, testLoader, inferenceWithLoss, REDUCTION_MEAN); + + fprintf(g_log_file, + "\n ],\n" + " \"final\": {\"test_loss\": %.6f, \"test_acc\": %.6f, " + "\"test_auc\": null}\n" + "}\n", + (double)testStats.loss, (double)testStats.accuracy); + fclose(g_log_file); + + fprintf(stdout, "FINAL test_loss=%.4f test_acc=%.4f\n", (double)testStats.loss, + (double)testStats.accuracy); + } + + /* Predictions on test set (both modes). */ + size_t numTest = getTestSize(); + int32_t *predictions = malloc(numTest * sizeof(int32_t)); + if (!predictions) { + fprintf(stderr, "OOM allocating predictions\n"); + return 1; + } + + for (size_t i = 0; i < numTest; ++i) { + sample_t *s = getTestSample(i); + tensor_t *out = inference(model, MODEL_SIZE, s->item); + float *probs = (float *)out->data; + size_t argmax = 0; + float best = probs[0]; + for (size_t c = 1; c < g_numClasses; ++c) { + if (probs[c] > best) { + best = probs[c]; + argmax = c; + } + } + predictions[i] = (int32_t)argmax; + freeTensor(out); + freeSample(s); + } + + char predPath[300]; + snprintf(predPath, sizeof(predPath), "%s/c_predictions.npy", outputsDir); + size_t outShape[] = {numTest}; + int status = 0; + int rc = npyWriteInt32(predPath, predictions, outShape, 1); + if (rc != 0) { + fprintf(stderr, "ERROR: npyWriteInt32 failed (rc=%d)\n", rc); + status = 1; + } + free(predictions); + + return status; +} diff --git a/examples/kws_raw/train_pytorch.py b/examples/kws_raw/train_pytorch.py new file mode 100644 index 0000000..b943a85 --- /dev/null +++ b/examples/kws_raw/train_pytorch.py @@ -0,0 +1,184 @@ +"""PyTorch reference implementation of the kws_raw 1D-CNN classifier. + +Input: raw [1,16000] waveform from prepare_data.py. The model downsamples +16 kHz -> 1 kHz via a front AvgPool1d(K=16), then 3 Conv blocks + a rate-agnostic +AdaptiveAvgPool1d(1) head + LayerNorm(64). 
class KwsDataset(torch.utils.data.Dataset):
    """Map-style dataset over in-memory feature and label arrays.

    Features are converted to float32 tensors and labels to int64 tensors once,
    at construction; indexing returns the matching ``(features, label)`` pair.
    """

    def __init__(self, x: np.ndarray, y: np.ndarray) -> None:
        features = x.astype(np.float32)
        targets = y.astype(np.int64)
        self.x = torch.from_numpy(features)
        self.y = torch.from_numpy(targets)

    def __len__(self) -> int:
        # Number of samples is the leading dimension of the feature tensor.
        return self.x.size(0)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        sample = self.x[idx]
        target = self.y[idx]
        return sample, target
def evaluate(model: nn.Module, x: np.ndarray, y: np.ndarray, batch: int) -> tuple[float, float]:
    """Score ``model`` on ``(x, y)`` and return ``(mean loss, accuracy)``.

    The model is switched to eval mode and run without gradients over
    consecutive slices of at most ``batch`` samples.  Cross-entropy is summed
    per slice and divided by the total sample count at the end, so the result
    is a per-sample mean regardless of how the data splits into slices.
    """
    model.eval()
    loss_sum = 0.0
    hits = 0
    seen = 0
    with torch.no_grad():
        for lo in range(0, len(x), batch):
            window = slice(lo, lo + batch)
            inputs = torch.from_numpy(x[window].astype(np.float32))
            targets = torch.from_numpy(y[window].astype(np.int64))
            scores = model(inputs)
            loss_sum += F.cross_entropy(scores, targets, reduction="sum").item()
            predicted = scores.argmax(dim=1)
            hits += predicted.eq(targets).sum().item()
            seen += targets.shape[0]
    return loss_sum / seen, hits / seen
torch.utils.data.DataLoader(train_ds, batch_size=BATCH, sampler=sampler, drop_last=True) + + model = KwsRawCnn(NUM_CLASSES) + optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=MOMENTUM) + + epoch_records = [] + for epoch in range(EPOCHS): + t0 = time.time() + model.train() + step_losses: list[float] = [] + for xb, yb in loader: + optimizer.zero_grad() + loss = F.cross_entropy(model(xb), yb) + loss.backward() + optimizer.step() + step_losses.append(loss.item()) + train_loss = float(np.mean(step_losses)) if step_losses else 0.0 + val_loss, val_acc = evaluate(model, val_x, val_y, BATCH) + epoch_records.append({ + "epoch": epoch, "step_losses": step_losses, "train_loss": train_loss, + "val_loss": val_loss, "val_acc": val_acc, "wall_s": time.time() - t0, + }) + print(f"epoch {epoch:2d}: train_loss={train_loss:.4f} val_loss={val_loss:.4f} val_acc={val_acc:.4f}", flush=True) + + test_loss, test_acc = evaluate(model, test_x, test_y, BATCH) + log: RunLog = { + "impl": "pytorch", "example": "kws_raw", + "config": {"epochs": EPOCHS, "batch": BATCH, "lr": LR, "momentum": MOMENTUM, + "seed": SEED, "shuffle_seed": SHUFFLE_SEED}, + "epochs": epoch_records, # type: ignore[typeddict-item] + "final": {"test_loss": test_loss, "test_acc": test_acc, "test_auc": None}, + } + LOGS.mkdir(parents=True, exist_ok=True) + OUTPUTS.mkdir(parents=True, exist_ok=True) + dump_log(LOGS / "pytorch.json", log) + + model.eval() + with torch.no_grad(): + preds = model(torch.from_numpy(test_x.astype(np.float32))).argmax(dim=1).numpy().astype(np.int32) + np.save(OUTPUTS / "pytorch_predictions.npy", preds) + print(f"FINAL test_loss={test_loss:.4f} test_acc={test_acc:.4f}", flush=True) + + WEIGHTS.mkdir(parents=True, exist_ok=True) + layer_map = { + "conv1": model.conv1, + "conv2": model.conv2, + "conv3": model.conv3, + "ln": model.ln, + "fc": model.fc, + } + print("Saving per-layer weights:", flush=True) + for name, layer in layer_map.items(): + w = 
layer.weight.detach().cpu().numpy().astype(np.float32) + np.save(WEIGHTS / f"{name}.weight.npy", w) + if layer.bias is not None: + b = layer.bias.detach().cpu().numpy().astype(np.float32) + np.save(WEIGHTS / f"{name}.bias.npy", b) + print(f" wrote {name}.weight.npy shape={w.shape}", flush=True) + + +if __name__ == "__main__": + main()