FluffyAIcode · FluffyAIcode · Jun 2, 2026 · Jun 2, 2026
diff --git a/.github/workflows/auto-label-mac.yaml b/.github/workflows/auto-label-mac.yaml
@@ -0,0 +1,89 @@
+name: Auto-label needs-mac-m4
+
+# Auto-applies the ``needs-mac-m4`` label to PRs that touch
+# verifier-dependent code paths so the integration workflow
+# (.github/workflows/integration.yaml) runs without contributors
+# remembering to apply the label by hand.
+#
+# A PR opts INTO Mac M4 review by editing files under any of:
+#   inference_engine/  — runtime, scheduler, session, server, etc.
+#   sdks/              — Python + TypeScript SDK
+#   proto/             — protobuf wire contract
+#   tests/integration/ — the integration suite itself
+#   kv_cache_proposer/ — verifier + decoder
+#
+# A doc-only PR (touching only docs/, README.md, etc.) does NOT
+# trigger Mac M4 review, saving runner time.
+#
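+# Once labelled, the integration workflow runs on the PR's next
+# event; a PR without the label is skipped by that workflow.
+# NOTE: label events created with the default GITHUB_TOKEN do not
+# start other workflow runs, so the integration run begins on the
+# next push unless the label is applied with a different token.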
+
+# pull_request_target runs in the base repository's context. This
+# workflow has no checkout step, so the PR's own code is never
+# executed here; only the changed-file list is inspected.
+on:
+  pull_request_target:
+    branches:
+      - main
+    types:
+      - opened
+      - synchronize
+      - reopened
+
+# The job only adds or removes a label on the pull request.
+permissions:
+  pull-requests: write
+
+jobs:
+  # Adds or removes the ``needs-mac-m4`` label depending on whether
+  # any file changed by the PR lives under a verifier-dependent path.
+  label:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Label PRs touching verifier-dependent paths
+        uses: actions/github-script@v7
+        with:
+          # Label events created with the default GITHUB_TOKEN do not
+          # start other workflow runs, so the integration workflow's
+          # ``labeled`` trigger would not fire for labels added here.
+          # When an AUTO_LABEL_TOKEN secret (PAT or app token with
+          # pull-request write access) is configured it is used
+          # instead; when it is absent the expression falls back to
+          # GITHUB_TOKEN and the step behaves exactly as before.
+          github-token: ${{ secrets.AUTO_LABEL_TOKEN || secrets.GITHUB_TOKEN }}
+          script: |
+            const pr = context.payload.pull_request;
+            const labelName = "needs-mac-m4";
+
+            // Fetch the COMPLETE changed-file list. listFiles returns
+            // at most 100 entries per page, so paginate: a trigger
+            // path that sorts onto a later page must still be seen.
+            const files = await github.paginate(
+              github.rest.pulls.listFiles,
+              {
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                pull_number: pr.number,
+                per_page: 100,
+              }
+            );
+
+            // Path prefixes that opt a PR into Mac M4 review.
+            const triggers = [
+              "inference_engine/",
+              "sdks/",
+              "proto/",
+              "tests/integration/",
+              "kv_cache_proposer/",
+            ];
+
+            const matched = files.some(f =>
+              triggers.some(t => f.filename.startsWith(t))
+            );
+
+            // Label state as carried by the triggering event payload.
+            const hasLabel = pr.labels.some(l => l.name === labelName);
+
+            if (matched && !hasLabel) {
+              core.info(`Adding ${labelName} (PR touches verifier-dependent paths).`);
+              await github.rest.issues.addLabels({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: pr.number,
+                labels: [labelName],
+              });
+            } else if (!matched && hasLabel) {
+              // PR was previously labelled; subsequent push removed
+              // all verifier-dependent file edits. Drop the label
+              // so the integration workflow doesn't burn runner
+              // time on doc-only updates.
+              core.info(`Removing ${labelName} (no verifier-dependent paths).`);
+              await github.rest.issues.removeLabel({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: pr.number,
+                name: labelName,
+              });
+            } else {
+              // Label already agrees with the file list.
+              core.info(
+                `No-op: matched=${matched} hasLabel=${hasLabel}.`
+              );
+            }
diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml
@@ -0,0 +1,136 @@
+name: Integration (Mac M4)
+
+# Self-hosted runner workflow that runs the integration suite under
+# tests/integration/ against real Qwen3-0.6B on Apple Silicon.
+#
+# Trigger model:
+#   - Pull-request events. Only fires when the PR carries the
+#     ``needs-mac-m4`` label (auto-applied by .github/workflows/
+#     auto-label-mac.yaml when a PR touches inference_engine/,
+#     sdks/, proto/, tests/integration/, or kv_cache_proposer/).
+#     PRs that don't touch verifier-dependent code skip this gate
+#     entirely so the runner pool isn't burned on doc-only or
+#     CI-only PRs.
+#   - Manual workflow_dispatch for re-runs from the Actions UI.
+#
+# Runner requirements (self-hosted):
+#   - macOS 14+ on Apple Silicon (M-series).
+#   - Labels: [self-hosted, macOS, ARM64, kakeya-mac-m4].
+#   - Pre-warmed HF cache containing Qwen/Qwen3-0.6B at
+#     ~/.cache/huggingface/hub/ (avoids 10-minute first-run download).
+#   - Python 3.12+ on PATH.
+#   - At least 24 GB unified memory and ~50 GB free disk.
+#
+# See docs/ops/mac-m4-runner-setup.md for the one-time runner setup.
+
+on:
+  # Manual re-runs from the Actions UI.
+  workflow_dispatch: {}
+  # PR events, restricted to pull requests whose base branch is main.
+  pull_request:
+    branches:
+      - main
+    types:
+      - opened
+      - synchronize
+      - reopened
+      - labeled
+
+# One active run per ref: a newer run in the same group cancels the
+# run still in progress, so a fresh push does not leave a stale run
+# occupying the runner.
+concurrency:
+  cancel-in-progress: true
+  group: integration-${{ github.ref }}
+
+jobs:
+  integration:
+    name: pytest -m integration on Mac M4
+    timeout-minutes: 90
+    runs-on:
+      - self-hosted
+      - macOS
+      - ARM64
+      - kakeya-mac-m4
+    # Gate: manual dispatches always run; PR events run only when the
+    # PR carries the 'needs-mac-m4' label, which keeps doc-only and
+    # CI-only PRs off the runner pool.
+    # NOTE(review): pull_request events from forks would execute PR
+    # code on this self-hosted machine; confirm the repository's
+    # fork-PR approval settings.
+    if: >-
+      github.event_name == 'workflow_dispatch' ||
+      contains(github.event.pull_request.labels.*.name, 'needs-mac-m4')
+    steps:
+      # Check out the repository at the ref that triggered this run.
+      - uses: actions/checkout@v4
+        with:
+          # fetch-depth 0 requests the complete history instead of a
+          # shallow clone. Full history so the runner can compare
+          # against base for any future rebase-based gating.
+          fetch-depth: 0
+
+      - name: Verify host shape
+        run: |
+          # Log hardware identifiers; a missing sysctl key is tolerated.
+          echo "=== sysctl ==="
+          for key in hw.model hw.memsize machdep.cpu.brand_string; do
+            sysctl -n "$key" || true
+          done
+          # Log the interpreter version and platform seen by python3.
+          echo "=== python ==="
+          python3 --version
+          python3 -c "import platform; print(platform.machine(), platform.platform())"
+
+      - name: Verify Qwen3-0.6B in HF cache
+        run: |
+          # Don't download here; the runner is expected to be
+          # pre-warmed. Surface a clear error early instead of a
+          # model-load failure deep inside the test run.
+          set -e
+          # Resolve the hub cache the way huggingface_hub does:
+          # HF_HUB_CACHE wins, then $HF_HOME/hub, then the default
+          # ~/.cache/huggingface/hub.
+          HUB_CACHE="${HF_HUB_CACHE:-${HF_HOME:-$HOME/.cache/huggingface}/hub}"
+          MODEL_DIR="$HUB_CACHE/models--Qwen--Qwen3-0.6B"
+          # Require at least one snapshot, not just the model directory.
+          if [ ! -d "$MODEL_DIR/snapshots" ] || [ -z "$(ls -A "$MODEL_DIR/snapshots")" ]; then
+            echo "::error::HF cache miss for Qwen/Qwen3-0.6B."
+            echo "::error::Pre-warm the runner: python3 -c 'from transformers import AutoModelForCausalLM, AutoTokenizer; AutoModelForCausalLM.from_pretrained(\"Qwen/Qwen3-0.6B\"); AutoTokenizer.from_pretrained(\"Qwen/Qwen3-0.6B\")'"
+            exit 1
+          fi
+          echo "Found $MODEL_DIR"
+
+      # Installs pip, the project (editable) and the test tooling into
+      # whichever environment ``python3`` on PATH belongs to.
+      # NOTE(review): the in-script comment mentions a long-lived venv,
+      # but this step neither creates nor activates one — confirm the
+      # runner's python3 is the intended install target.
+      - name: Install Python dependencies
+        run: |
+          # The runner is expected to have a long-lived venv.
+          # If a per-run venv is preferred, swap to ``python3 -m venv .venv``.
+          python3 -m pip install --upgrade pip
+          python3 -m pip install -e .
+          python3 -m pip install pytest pytest-asyncio pytest-timeout coverage
+
+      # Runs the integration-marked tests and writes a JUnit report
+      # under results/platform-tests/. Exposes ``artifact_stamp`` as a
+      # step output for the upload step.
+      - name: Run integration suite
+        id: pytest_run
+        env:
+          PYTHONPATH: .:sdks/python
+          # No HF download in tests; if we hit a cache miss it's a
+          # bug or a stale runner.
+          HF_HUB_OFFLINE: "1"
+        run: |
+          mkdir -p results/platform-tests
+          stamp=$(date +%s)
+          # Publish the stamp BEFORE running pytest. The default shell
+          # stops at the first failing command, so a line placed after
+          # a failing pytest never runs and the stamp would be missing
+          # exactly when the report matters most.
+          echo "artifact_stamp=${stamp}" >> "$GITHUB_OUTPUT"
+          python3 -m pytest \
+            -m integration \
+            tests/integration/ \
+            --junitxml="results/platform-tests/integration-mac-m4-${stamp}.junit.xml" \
+            -v
+
+      # Publish the JUnit report even when an earlier step failed.
+      # The artifact name uses the pytest step's stamp and falls back
+      # to the run id when that output is empty.
+      - name: Upload JUnit + log artifacts
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          retention-days: 30
+          path: results/platform-tests/integration-mac-m4-*.junit.xml
+          name: integration-mac-m4-${{ steps.pytest_run.outputs.artifact_stamp || github.run_id }}
+
+      - name: Surface failure summary
+        if: failure()
+        run: |
+          # Print every failing test and the first line of its message
+          # so the failure is visible in the action log, not just
+          # inside the artifact.
+          for f in results/platform-tests/integration-mac-m4-*.junit.xml; do
+            # An unmatched glob stays literal (no report was written,
+            # e.g. the job failed before pytest ran); skip it instead
+            # of raising a second, misleading error.
+            if [ ! -f "$f" ]; then
+              echo "No JUnit report found; nothing to summarise."
+              continue
+            fi
+            echo "=== $f ==="
+            python3 - "$f" <<'PY'
+          import sys, xml.etree.ElementTree as ET
+          try:
+              r = ET.parse(sys.argv[1]).getroot()
+          except ET.ParseError as exc:
+              # A truncated report (pytest killed mid-write) is not fatal here.
+              print(f"Unreadable JUnit report: {exc}")
+              sys.exit(0)
+          for tc in r.iter("testcase"):
+              for child in tc:
+                  if child.tag in ("failure", "error"):
+                      print(f"[{child.tag.upper()}] {tc.get('classname')}::{tc.get('name')}")
+                      msg = (child.get("message") or "").splitlines()
+                      if msg:
+                          print(f"    {msg[0][:180]}")
+          PY
+          done
diff --git a/docs/ops/mac-m4-runner-setup.md b/docs/ops/mac-m4-runner-setup.md
@@ -0,0 +1,137 @@
+# Mac M4 self-hosted runner setup
+
+This runner backs the **Integration (Mac M4)** GitHub Actions workflow
+(`.github/workflows/integration.yaml`). It runs `pytest -m integration`
+against real Qwen3-0.6B on every PR labelled `needs-mac-m4`
+(auto-applied by `.github/workflows/auto-label-mac.yaml` when a PR
+touches `inference_engine/`, `sdks/`, `proto/`, `tests/integration/`,
+or `kv_cache_proposer/`).
+
+## Hardware requirements
+
+| Resource | Minimum |
+| --- | --- |
+| Chip | Apple Silicon (M-series); M4 or newer recommended |
+| Unified memory | 24 GB (16 GB works for Qwen3-0.6B alone but no headroom for concurrent work) |
+| Free disk | ~50 GB (HF cache + venv + checkout history) |
+| Network | Reachable to github.com for runner registration; outbound to HF Hub for the one-time pre-warm |
+| OS | macOS 14 (Sonoma) or newer |
+
+## One-time setup
+
+### 1. Register the self-hosted runner
+
+Follow [GitHub's docs](https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/adding-self-hosted-runners) to add a runner to the repository:
+
+1. Repository → Settings → Actions → Runners → New self-hosted runner.
+2. Choose macOS / ARM64.
+3. Run the install + configure commands GitHub provides.
+4. **Important**: when prompted for labels, add `kakeya-mac-m4`
+   in addition to the default `self-hosted, macOS, ARM64`. The
+   workflow's `runs-on:` clause specifically requires that label.
+5. Run the runner as a launchd service (`./svc.sh install && ./svc.sh start`)
+   so it survives reboots.
+
+### 2. Pre-warm the HF cache
+
+The integration workflow runs with `HF_HUB_OFFLINE=1` so it never
+hits HuggingFace at test time (avoids 90-min runs blocking on a 4 GB
+download). Pre-warm the cache once per runner:
+
+```bash
+python3 -c "
+from transformers import AutoModelForCausalLM, AutoTokenizer
+AutoModelForCausalLM.from_pretrained('Qwen/Qwen3-0.6B')
+AutoTokenizer.from_pretrained('Qwen/Qwen3-0.6B')
+"
+```
+
+The model lands at `~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/`.
+The workflow's "Verify Qwen3-0.6B in HF cache" step fails fast with
+a clear error if that directory is missing.
+
+If a future test adds a new model id, update the pre-warm command
+(and the workflow's verify step) accordingly.
+
+### 3. Install Python toolchain
+
+The runner needs Python 3.12+. Use Homebrew or pyenv:
+
+```bash
+brew install python@3.12
+# or:
+pyenv install 3.12.7
+pyenv global 3.12.7
+```
+
+Confirm `python3 --version` returns 3.12.x and `python3 -c 'import platform; print(platform.machine())'` returns `arm64`.
+
+### 4. (Optional) long-lived venv
+
+The workflow currently does `pip install -e .` per run, which is
+~30 s on a warm pip cache. If you want to skip even that, create a
+venv at `~/kakeya-runner-venv` and add a step to the workflow that
+activates it before `pytest`. v0.3 keeps the per-run install for
+simplicity.
+
+## Runtime expectations
+
+| Phase | Wall time on M4 24 GB |
+| --- | --- |
+| Checkout + verify host | <5 s |
+| Verify HF cache | <1 s |
+| `pip install -e .` (warm pip) | 20-40 s |
+| `pytest -m integration` (80 tests, post-PR-N1..N4) | 60-120 s |
+| Artifact upload | <5 s |
+| **Total** | **~2-3 min** |
+
+The 90-minute timeout in the workflow is a safety margin. A run
+that exceeds 5 min should be investigated — likely a model-load
+regression or a runaway test.
+
+## Maintenance
+
+### Cache hygiene
+
+The runner's HF cache and pip cache grow over time. Recommend a
+monthly cron:
+
+```bash
+# ~/clean-kakeya-runner.sh
+# Remove cached models older than 60 days, but never the model the
+# integration workflow requires.
+find ~/.cache/huggingface/hub -mindepth 1 -maxdepth 1 -type d \
+  -name 'models--*' ! -name 'models--Qwen--Qwen3-0.6B' \
+  -mtime +60 -exec rm -rf {} +
+python3 -m pip cache purge
+```
+
+The Qwen3-0.6B cache is excluded by name: the workflow only reads it
+(`HF_HUB_OFFLINE=1`), and reading a cache does not reliably refresh
+the directory's modification time, so an age-based rule alone would
+eventually delete it. Only models added by future test additions
+that aren't currently exercised are pruned.
+
+### Runner upgrades
+
+GitHub publishes new runner versions ~monthly. The runner application normally updates itself automatically; if a manual re-registration is needed, use:
+
+```bash
+cd ~/actions-runner
+./svc.sh stop
+# Uninstall the service before unregistering the runner.
+./svc.sh uninstall
+./config.sh remove --token <repo-config-token>
+# download the new tarball per GitHub UI instructions
+# Re-apply the custom label; the workflow's runs-on requires it.
+./config.sh --url https://github.com/<owner>/<repo> --token <new-token> --labels kakeya-mac-m4
+./svc.sh install && ./svc.sh start
+```
+
+### Failure triage
+
+Workflow failures are visible at `Actions → Integration (Mac M4)`. The "Surface failure summary" step inlines the test names + first-line error messages so triage doesn't require downloading the JUnit XML.
+
+If the runner itself is offline (queue depth grows, no jobs pick up), check on the Mac:
+
+```bash
+cd ~/actions-runner
+sudo ./svc.sh status
+tail -200 ~/Library/Logs/actions-runner/Runner_*.log
+```
+
+Common causes:
+- macOS auto-update rebooted the host; service didn't auto-start (rare with `launchd` but possible).
+- HF cache was purged; the verify step fails. Re-warm.
+- Disk full from accumulated pip downloads; clear cache.