diff --git a/.dockerignore b/.dockerignore index 21507c8c..7f7eb200 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,9 +1,37 @@ +# Build artifacts & VCS .git target node_modules + +# Documentation (not needed in image) *.md docs/ tests/ .github/ helm/ + +# Codeiq workspace under src/codeiq/ (development scratchpad) src/codeiq/ + +# Secrets — explicit defense-in-depth; .dockerignore does NOT inherit +# .gitignore (Docker resolves COPY against the build context, which +# includes uncommitted/working-tree files). Audit RAN-46 §3. +.env +.env.* +*.pem +*.key +*.jks +*.p12 +*.pfx +*.keystore +id_rsa +id_ecdsa +id_ed25519 +id_dsa +credentials.json +credentials.yaml +secrets.json +secrets.yaml +*.serviceaccount.json +.aws/ +.codeiq/ diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml index 51da4ba2..a0d96850 100644 --- a/.github/workflows/security.yml +++ b/.github/workflows/security.yml @@ -90,8 +90,13 @@ jobs: - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: '3.12' - - name: Install semgrep - run: python -m pip install --quiet --upgrade pip semgrep + - name: Install semgrep (pinned for reproducibility) + # Pinned per OpenSSF Scorecard `Pinned-Dependencies` (RAN-46 §5). + # Bump via Dependabot pip ecosystem on a documented cadence; floating + # `semgrep` was previously flagged by Scorecard. pip is left unpinned + # — setup-python@v6 ships a current vendored pip, and the Scorecard + # rule fires only on user-installed packages. + run: python -m pip install --quiet 'semgrep==1.161.0' - name: Run semgrep (security-audit + owasp-top-ten + java) run: | semgrep scan \ diff --git a/.gitignore b/.gitignore index 27dc9fa8..f45b62ea 100644 --- a/.gitignore +++ b/.gitignore @@ -28,10 +28,34 @@ Thumbs.db *.mv.db # Environment & secrets +# Broad .env* glob catches .env, .env.local, .env.prod, .env.test, .env.* — all +# variants. Pre-PR-3 we only excluded the first two and several .env. +# variants would have committed silently. .env -.env.local +.env.* +# Java keystores & PKCS#12 archives — high-value secrets that have shown up in +# audits; never commit, even encrypted. +*.jks +*.p12 +*.pfx +*.keystore +# Generic credential / private-key patterns *.pem *.key +# SSH private keys (public *.pub keys are fine). +id_rsa +id_ecdsa +id_ed25519 +id_dsa +# AWS / cloud credentials +.aws/credentials +credentials.json +credentials.yaml +secrets.json +secrets.yaml +# Service-account JSON (GCP / Firebase) — typically named *.serviceaccount.json. +*-serviceaccount.json +*.serviceaccount.json # Logs *.log diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bc95370..2291d3bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -365,6 +365,54 @@ for that specific tag for the per-commit details. topology tool as a targeted Cypher query so the snapshot isn't needed. The cache is the bridge; the rewrite reduces peak memory. +- **Production-readiness PR 3 of 5 — supply chain & bundle integrity.** + Closes the air-gap drift, missing bundle integrity, and unpinned + scanner versions audit findings. + - **`codeiq bundle` SHA-256 manifest.** Every entry in `bundle.zip` + (manifest, scripts, graph DB files, H2 cache, source tree, flow.html, + optional CLI JAR) is now hashed as it streams through the + `ZipOutputStream`, and a `checksums.sha256` entry is written last in + standard GNU coreutils format. Receivers verify with + `sha256sum -c checksums.sha256`. The hash is computed by feeding each + chunk to both the SHA-256 digest and the ZIP stream — no double-read + even for multi-hundred-MB graph databases. Order is deterministic + (sorted dir walks + sorted git ls-files), so the resulting + `checksums.sha256` is byte-stable. + - **No public-internet calls in launcher scripts.** `serve.sh` and + `serve.bat` previously fell back to `curl -fL https://repo1.maven.org/...` + when the CLI JAR wasn't bundled — incompatible with the air-gapped + deploy model documented in `~/.claude/rules/build.md`. The Maven + Central download is removed; if the JAR is missing, the launcher + fails fast and tells the operator to either `--include-jar` when + bundling or stage from an internal artifact mirror. `serve.sh` also + runs `sha256sum -c --quiet checksums.sha256` automatically before + launching (skip with `CODEIQ_SKIP_VERIFY=1`). + - **Pinned Semgrep version.** `.github/workflows/security.yml` was + `pip install semgrep` (floating) — Scorecard's + `Pinned-Dependencies` flagged it. Now pinned to `semgrep==1.161.0` + (latest stable as of 2026-04-28). Bumps go through Dependabot's pip + ecosystem on a documented cadence. + - **Tightened secret-pattern exclusions.** `.gitignore` previously + only matched `.env` / `.env.local` — gaps for `.env.prod`, + `.env.test`, JKS / P12 keystores, SSH private keys, and + cloud-credential JSON. Broadened to `.env.*` plus explicit globs + for `*.jks`, `*.p12`, `*.pfx`, `*.keystore`, `id_{rsa,ecdsa,ed25519,dsa}`, + `credentials.{json,yaml}`, `secrets.{json,yaml}`, + `*.serviceaccount.json`. `.dockerignore` mirrors the same rules + (Docker resolves COPY against the build context, which includes + untracked working-tree files; .dockerignore does not inherit + .gitignore). + - **Bundle verification runbook.** `shared/runbooks/release.md` §4a + documents consumer-side `sha256sum -c` workflow, including the + deliberate exclusion of `checksums.sha256` from itself (would be + circular) and the Sigstore/GPG out-of-band signing that backs + `checksums.sha256` against tampering. + - **Tests:** `BundleCommandTest#bundleCreatesZipWithCorrectStructure` + extended with 4 new asserts: serve.sh contains no `curl`/`maven.org` + references (defense against re-introduction), `checksums.sha256` + exists, format-conforms to `<64-hex> `, and excludes itself. + Full suite: 3672 tests / 0 failures / 0 errors. + ## [0.1.0] - 2026-03-28 First general-availability cut. See the diff --git a/CLAUDE.md b/CLAUDE.md index 1f151ad8..3d1871af 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -447,6 +447,11 @@ bean for code paths that haven't been ported yet. - **`Files.probeContentType` is best-effort** — JDK 25 on Linux uses `/etc/mime.types` + magic-byte fallback. It returns `null` if the type can't be determined; treat that as "let it through" (the byte cap in `SafeFileReader` still bounds size). The allowlist for `/api/file` is `text/*` + `application/{json,xml,x-yaml,javascript}` — extending requires adding to the explicit list in `GraphController.readFile`. - **Sanitize user-controlled values before logging.** `BearerAuthFilter.sanitizeForLog(String)` strips `\p{Cntrl}` and truncates at 256 chars. Use it on anything tainted by `request.getRequestURI()`, `request.getMethod()`, headers, etc. before passing to a logger. CodeQL `java/log-injection` will flag direct `log.warn("... {} ...", request.getRequestURI())` as a vuln. - **`mcp.limits.max_depth` is a NEW field on `McpLimitsConfig`** (default 10). Audit #10 / C3 — the original audit assumed it existed but it didn't. When adding new MCP traversal tools, cap depth via `Math.min(callerSupplied, maxDepth)` before passing to Cypher. The REST endpoint already had this guard via `config.getMaxDepth()` from `CodeIqConfig`; the MCP path now mirrors it via `McpLimitsConfig.maxDepth()`. +- **`codeiq bundle` writes `checksums.sha256` LAST and excludes itself.** `BundleCommand#writeChecksumsManifest` runs after every other entry has been written, then the digests collected in `LinkedHashMap checksums` are emitted as ` \n` per line — exactly GNU coreutils `sha256sum` format, so receivers verify with `sha256sum -c checksums.sha256`. The manifest itself is intentionally NOT in the digest list (would be circular); to verify `checksums.sha256` against tampering, sign the bundle.zip out-of-band (Sigstore, GPG, or compare to the GitHub Release SHA-256). Don't try to "fix" the circular omission by hashing checksums.sha256 into the manifest — that turns into a cat-and-mouse loop. +- **`writeFileHashed` reads each file once, feeding both the SHA-256 and the ZIP stream.** Hundreds-of-MB graph DBs / CLI JARs can't be double-read for a separate hash pass. The 8KB chunk size in `BundleCommand` is small enough to keep memory flat regardless of file size; do NOT collect bytes into a `byte[]` and then split for "convenience". +- **`serve.sh` and `serve.bat` MUST NOT contain network calls.** Audit RAN-46 §3 — air-gapped deploy model. Pre-PR-3 these scripts had `curl -fL https://repo1.maven.org/...` to download the CLI JAR on first run; that's gone. Receivers must `--include-jar` when bundling or stage the JAR from an internal mirror. There's a regression test in `BundleCommandTest#bundleCreatesZipWithCorrectStructure` that asserts `serve.sh` contains neither `curl` nor `maven.org` — keep that test green. +- **`.dockerignore` does NOT inherit `.gitignore`.** Docker resolves COPY against the build context, which includes uncommitted/untracked working-tree files. `.gitignore` only stops things being staged; it has no effect on what `docker build` sees. Mirror the secret-pattern globs explicitly in `.dockerignore` (`.env*`, `*.jks`, `id_rsa`, `credentials.{json,yaml}`, etc.). Pre-PR-3 the `.dockerignore` was 9 lines and would have shipped a `.env.prod` straight into a published image. +- **Semgrep is pinned to `semgrep==1.161.0`** in `.github/workflows/security.yml`. Bumps go through Dependabot's pip ecosystem on a documented cadence — `pip install --upgrade semgrep` (floating) was previously flagged by Scorecard `Pinned-Dependencies`. Don't unpin to "always get latest"; a CI-time auto-bump on a security-scanner can break the build silently when the new release adds rules. ## Supply-chain observability (OpenSSF) diff --git a/shared/runbooks/release.md b/shared/runbooks/release.md index 759d9da3..ed0cda3e 100644 --- a/shared/runbooks/release.md +++ b/shared/runbooks/release.md @@ -80,6 +80,42 @@ Within 30 minutes of the release workflow finishing: If any of (1)–(4) fails, [`rollback.md`](rollback.md) applies. +### 4a. Consumer-side bundle integrity (`codeiq bundle` artifacts) + +When operators receive a `*-bundle.zip` produced by `codeiq bundle`, they +**must** verify integrity before launching the bundled `serve.sh` / +`serve.bat`. The bundle ships a `checksums.sha256` entry in standard GNU +coreutils format, generated as the last step of bundling +(`BundleCommand#writeChecksumsManifest`). + +```bash +# 1. Unzip into a clean directory. +unzip myrepo-v1.0-bundle.zip -d myrepo-bundle/ +cd myrepo-bundle + +# 2. Verify every file. Exits non-zero if any entry is missing or modified; +# `checksums.sha256` itself is intentionally not listed (would be circular). +sha256sum -c --quiet checksums.sha256 + +# 3. (Optional) Skip via env var only when the bundle is trusted source-internal: +# CODEIQ_SKIP_VERIFY=1 ./serve.sh +./serve.sh +``` + +`serve.sh` runs the same `sha256sum -c` automatically when the binary is +on `PATH`. **Do not set `CODEIQ_SKIP_VERIFY=1` in production**: it +disables the only consumer-side integrity gate when the bundle was +delivered out-of-band (USB, internal mirror, AKS sidecar artifact). For +verifying `checksums.sha256` itself against tampering, sign the +bundle.zip out-of-band (Sigstore, GPG, or compare to the GitHub Release +SHA-256 if the bundle was published to a release). + +If the consumer environment does not provide `sha256sum` (Windows without +WSL, locked-down build agents), distribute the bundle via Sigstore-signed +release and rely on the Sigstore client for integrity. `serve.bat` +intentionally does **not** include a Windows-native verification step +yet — tracked under follow-up. + --- ## 5. Hot-fix patch release (`X.Y.Z+1`) diff --git a/src/main/java/io/github/randomcodespace/iq/cli/BundleCommand.java b/src/main/java/io/github/randomcodespace/iq/cli/BundleCommand.java index 3cec0568..f250c6bc 100644 --- a/src/main/java/io/github/randomcodespace/iq/cli/BundleCommand.java +++ b/src/main/java/io/github/randomcodespace/iq/cli/BundleCommand.java @@ -191,6 +191,13 @@ public Integer call() { + "(SNAPSHOT JARs are not on Maven Central)"); } + // 9. checksums.sha256 — written LAST so it covers every preceding + // entry (and excludes itself, which would be circular). Receivers + // verify with `sha256sum -c checksums.sha256` post-unzip — the file + // format mirrors GNU coreutils sha256sum output exactly. + CliOutput.info(" Writing checksums.sha256 (" + checksums.size() + " entries)"); + writeChecksumsManifest(zos); + } catch (IOException e) { CliOutput.error("Failed to create bundle: " + e.getMessage()); return 1; @@ -253,24 +260,33 @@ private String createManifest(String projectName, String bundleTag, String versi private String generateServeShell(String version) { return """ #!/usr/bin/env bash + # codeiq bundle launcher (offline / air-gapped). + # + # No public-internet calls. The receiving environment must already + # have the CLI JAR present — either bundled via `codeiq bundle + # --include-jar` or staged from your internal artifact mirror. set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" cd "$SCRIPT_DIR" + # Optional but recommended: verify bundle integrity before launch. + # checksums.sha256 is generated by `codeiq bundle` in standard + # GNU sha256sum format. Skip with CODEIQ_SKIP_VERIFY=1. + if [ "${CODEIQ_SKIP_VERIFY:-0}" != "1" ] && [ -f checksums.sha256 ] \\ + && command -v sha256sum >/dev/null 2>&1; then + echo "Verifying bundle integrity (sha256sum -c)..." + sha256sum -c --quiet checksums.sha256 + fi + # Read version from manifest VERSION=$(grep -o '"extractor_version" *: *"[^"]*"' manifest.json | grep -o '"[^"]*"$' | tr -d '"') JAR="code-iq-${VERSION}-cli.jar" - # Download CLI JAR if not present if [ ! -f "$JAR" ]; then - if [[ "$VERSION" == *-SNAPSHOT ]]; then - echo "ERROR: CLI JAR not found and version is a SNAPSHOT." - echo " Re-bundle with --include-jar or place $JAR in this directory." - exit 1 - fi - echo "Downloading codeiq CLI v${VERSION}..." - curl -fL -o "$JAR" \\ - "https://repo1.maven.org/maven2/io/github/randomcodespace/iq/code-iq/${VERSION}/code-iq-${VERSION}-cli.jar" + echo "ERROR: $JAR not found in $SCRIPT_DIR." >&2 + echo " Re-bundle with: codeiq bundle --include-jar" >&2 + echo " Or place the JAR next to serve.sh (e.g., from your internal mirror)." >&2 + exit 1 fi # Start serve (read-only) @@ -285,6 +301,10 @@ private String generateServeShell(String version) { private String generateServeBat(String version) { return """ @echo off\r + rem codeiq bundle launcher (offline / air-gapped).\r + rem No public-internet calls. The CLI JAR must already be present\r + rem alongside this script — bundle with `codeiq bundle --include-jar`\r + rem or stage from your internal artifact mirror.\r setlocal enabledelayedexpansion\r cd /d "%~dp0"\r \r @@ -298,14 +318,10 @@ private String generateServeBat(String version) { set "JAR=code-iq-!VERSION!-cli.jar"\r \r if not exist "!JAR!" (\r - echo !VERSION! | findstr /C:"-SNAPSHOT" >nul\r - if !errorlevel! == 0 (\r - echo ERROR: CLI JAR not found and version is a SNAPSHOT.\r - echo Re-bundle with --include-jar or place !JAR! in this directory.\r - exit /b 1\r - )\r - echo Downloading codeiq CLI v!VERSION!...\r - curl -fL -o "!JAR!" "https://repo1.maven.org/maven2/io/github/randomcodespace/iq/code-iq/!VERSION!/code-iq-!VERSION!-cli.jar"\r + echo ERROR: !JAR! not found in %~dp0.\r + echo Re-bundle with: codeiq bundle ^ --include-jar\r + echo Or place the JAR next to serve.bat (e.g., from your internal mirror).\r + exit /b 1\r )\r \r if "%PORT%"=="" set PORT=8080\r @@ -336,9 +352,7 @@ private int bundleDirectory(Path dir, String zipPrefix, ZipOutputStream zos, try { String entryName = zipPrefix + "/" + dir.relativize(file).toString() .replace('\\', '/'); - zos.putNextEntry(new ZipEntry(entryName)); - Files.copy(file, zos); - zos.closeEntry(); + writeFileHashed(zos, entryName, file); count[0]++; } catch (IOException e) { // Skip files that can't be read (e.g., locked) @@ -373,9 +387,7 @@ private int bundleSourceFiles(Path root, ZipOutputStream zos) { Path absPath = root.resolve(relPath); if (Files.isRegularFile(absPath)) { try { - zos.putNextEntry(new ZipEntry("source/" + relPath)); - Files.copy(absPath, zos); - zos.closeEntry(); + writeFileHashed(zos, "source/" + relPath, absPath); count++; } catch (IOException e) { // Skip @@ -402,9 +414,7 @@ private int bundleSourceFiles(Path root, ZipOutputStream zos) { try { String entryName = "source/" + root.relativize(file).toString() .replace('\\', '/'); - zos.putNextEntry(new ZipEntry(entryName)); - Files.copy(file, zos); - zos.closeEntry(); + writeFileHashed(zos, entryName, file); count[0]++; } catch (IOException e) { // Skip @@ -423,16 +433,15 @@ private void bundleCliJar(String version, ZipOutputStream zos) { if (runningJar != null && Files.isRegularFile(runningJar)) { String jarName = "code-iq-" + version + "-cli.jar"; try { - zos.putNextEntry(new ZipEntry(jarName)); - Files.copy(runningJar, zos); - zos.closeEntry(); long sizeMb = Files.size(runningJar) / (1024 * 1024); + writeFileHashed(zos, jarName, runningJar); CliOutput.info(" Included CLI JAR: " + jarName + " (" + sizeMb + " MB)"); } catch (IOException e) { CliOutput.warn(" Could not include CLI JAR: " + e.getMessage()); } } else { - CliOutput.warn(" Could not locate CLI JAR. The bundle will download it on first run."); + CliOutput.warn(" Could not locate CLI JAR. Receivers must place the matching" + + " code-iq-" + version + "-cli.jar next to serve.sh before running."); } } @@ -460,9 +469,84 @@ private Path findRunningJar() { // --- Utilities --- + /** + * Per-entry SHA-256 accumulator. {@link LinkedHashMap} preserves write + * order — paired with the deterministic ZIP write order (sorted dir walks + * + sorted git ls-files), this gives a byte-stable {@code checksums.sha256}. + * Format mirrors {@code sha256sum} output exactly so receivers can run + * {@code sha256sum -c checksums.sha256} to verify the unpacked bundle. + */ + private final java.util.Map checksums = new java.util.LinkedHashMap<>(); + private void writeEntry(ZipOutputStream zos, String name, String content) throws IOException { + writeEntryHashed(zos, name, content.getBytes(StandardCharsets.UTF_8)); + } + + /** + * Write a string/byte entry to the ZIP and record its SHA-256 in + * {@link #checksums}. Used for in-memory content (manifest, serve scripts, + * flow.html). + */ + private void writeEntryHashed(ZipOutputStream zos, String name, byte[] content) throws IOException { zos.putNextEntry(new ZipEntry(name)); - zos.write(content.getBytes(StandardCharsets.UTF_8)); + zos.write(content); + zos.closeEntry(); + checksums.put(name, sha256Hex(content)); + } + + /** + * Stream a file into the ZIP and record its SHA-256 in {@link #checksums}. + * The file is read once: each chunk is fed both to the hash and to the + * ZIP output stream. No intermediate byte[] for large files (graph DB, + * cache files, CLI JAR can be hundreds of MB). + */ + private void writeFileHashed(ZipOutputStream zos, String entryName, java.nio.file.Path file) throws IOException { + zos.putNextEntry(new ZipEntry(entryName)); + java.security.MessageDigest md; + try { + md = java.security.MessageDigest.getInstance("SHA-256"); + } catch (java.security.NoSuchAlgorithmException e) { + throw new IllegalStateException("SHA-256 unavailable in JDK", e); + } + try (java.io.InputStream in = Files.newInputStream(file)) { + byte[] buf = new byte[8192]; + int n; + while ((n = in.read(buf)) > 0) { + md.update(buf, 0, n); + zos.write(buf, 0, n); + } + } + zos.closeEntry(); + checksums.put(entryName, java.util.HexFormat.of().formatHex(md.digest())); + } + + private static String sha256Hex(byte[] content) { + try { + return java.util.HexFormat.of().formatHex( + java.security.MessageDigest.getInstance("SHA-256").digest(content)); + } catch (java.security.NoSuchAlgorithmException e) { + throw new IllegalStateException("SHA-256 unavailable in JDK", e); + } + } + + /** + * Emit the {@code checksums.sha256} entry — the canonical integrity manifest + * for receivers. Format: {@code \n} per line, which + * matches GNU coreutils {@code sha256sum} output so verification is a + * straight {@code sha256sum -c checksums.sha256} on the unpacked bundle. + * + *

Note: this file is itself NOT in the checksums map (would be circular). + * Operators wanting to verify {@code checksums.sha256} authenticity should + * verify the bundle.zip's signature/digest out-of-band (Sigstore, GPG, or + * the GitHub release SHA-256). + */ + private void writeChecksumsManifest(ZipOutputStream zos) throws IOException { + StringBuilder sb = new StringBuilder(checksums.size() * 80); + for (var e : checksums.entrySet()) { + sb.append(e.getValue()).append(" ").append(e.getKey()).append('\n'); + } + zos.putNextEntry(new ZipEntry("checksums.sha256")); + zos.write(sb.toString().getBytes(StandardCharsets.UTF_8)); zos.closeEntry(); } diff --git a/src/test/java/io/github/randomcodespace/iq/cli/BundleCommandTest.java b/src/test/java/io/github/randomcodespace/iq/cli/BundleCommandTest.java index b6a16287..115b623d 100644 --- a/src/test/java/io/github/randomcodespace/iq/cli/BundleCommandTest.java +++ b/src/test/java/io/github/randomcodespace/iq/cli/BundleCommandTest.java @@ -99,14 +99,35 @@ void bundleCreatesZipWithCorrectStructure(@TempDir Path tempDir) throws IOExcept assertTrue(manifest.contains("\"backend\" : \"neo4j\"")); assertTrue(manifest.contains("\"includes_source\" : true")); - // Verify serve.sh content + // Verify serve.sh content — air-gapped (no public-internet calls) String serveShell = new String( zf.getInputStream(zf.getEntry("serve.sh")).readAllBytes(), StandardCharsets.UTF_8); assertTrue(serveShell.contains("#!/usr/bin/env bash")); assertTrue(serveShell.contains("serve ./source")); assertTrue(serveShell.contains("--graph ./graph.db")); - assertTrue(serveShell.contains("maven.org")); + // Defense against re-introduction of network calls in launchers + assertFalse(serveShell.contains("curl"), + "serve.sh must not include any curl/network call (RAN-46 air-gap rule)"); + assertFalse(serveShell.contains("maven.org"), + "serve.sh must not reference Maven Central (RAN-46 air-gap rule)"); + assertTrue(serveShell.contains("sha256sum -c"), + "serve.sh must verify checksums.sha256 by default"); + + // Verify checksums.sha256 entry exists, format-conforms to GNU + // sha256sum, and excludes itself (would be circular). + assertNotNull(zf.getEntry("checksums.sha256"), + "Bundle must include checksums.sha256"); + String checksums = new String( + zf.getInputStream(zf.getEntry("checksums.sha256")).readAllBytes(), + StandardCharsets.UTF_8); + assertFalse(checksums.contains("checksums.sha256"), + "checksums.sha256 must not list itself (circular)"); + assertTrue(checksums.matches("(?s)([0-9a-f]{64} \\S.*\n)+"), + "Each line must match GNU sha256sum format: <64-hex> "); + // Sanity: manifest.json should appear in the checksums file. + assertTrue(checksums.contains(" manifest.json\n"), + "Manifest entry must be checksummed"); } }