From 720b203d778d7831e27310f4685213f51112c85c Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Wed, 29 Apr 2026 13:15:28 +0500 Subject: [PATCH 01/43] add detector corpora test workflow and script --- .github/workflows/detector-corpora-test.yml | 68 +++++++++++++++++++++ scripts/detector_corpora_test.sh | 52 ++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 .github/workflows/detector-corpora-test.yml create mode 100755 scripts/detector_corpora_test.sh diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml new file mode 100644 index 000000000000..0fab6a4f10f3 --- /dev/null +++ b/.github/workflows/detector-corpora-test.yml @@ -0,0 +1,68 @@ +name: Corpora Test + +on: + workflow_dispatch: + pull_request: + paths: + - 'pkg/detectors/**' + - '.github/workflows/detector-corpora-test.yml' + - 'scripts/detector_corpora_test.sh' + +env: + DATASETS: | + s3://trufflehog-corpora-datasets/contents.2025-11-04.jsonl.zstd + +jobs: + corpora-test: + if: ${{ github.repository == 'trufflesecurity/trufflehog' && !github.event.pull_request.head.repo.fork }} + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install Go + uses: actions/setup-go@v5 + with: + go-version: "1.25" + + - name: Install dependencies + run: sudo apt-get install -y zstd jq + + - name: Install DuckDB + run: | + wget -q https://github.com/duckdb/duckdb/releases/latest/download/duckdb_cli-linux-amd64.zip + unzip duckdb_cli-linux-amd64.zip + sudo mv duckdb /usr/local/bin/ + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-1 + + - name: Run corpora test + run: | + files=() + while IFS= read -r dataset; do + [[ -z "$dataset" ]] && continue + files+=("$dataset") + done <<< "$DATASETS" + ./scripts/detector_corpora_test.sh "${files[@]}" | tee /tmp/corpora-results.txt + + - name: Post results to PR + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + const results = fs.readFileSync('/tmp/corpora-results.txt', 'utf8'); + const body = `## Corpora Test Results\n\n\`\`\`\n${results}\n\`\`\``; + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body, + }); diff --git a/scripts/detector_corpora_test.sh b/scripts/detector_corpora_test.sh new file mode 100755 index 000000000000..f7dae86017fe --- /dev/null +++ b/scripts/detector_corpora_test.sh @@ -0,0 +1,52 @@ +#!/bin/bash +set -euo pipefail + +if [[ $# -lt 1 ]]; then + echo "Usage: $0 [ ...]" + exit 1 +fi + +OUTPUT_JSONL="/tmp/corpora_results.jsonl" +> "$OUTPUT_JSONL" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(dirname "$SCRIPT_DIR")" +TRUFFLEHOG_BIN="${REPO_ROOT}/trufflehog" + +CGO_ENABLED=0 go build -o "$TRUFFLEHOG_BIN" "$REPO_ROOT" + +scan() { + local input="$1" + set +e + unzstd -c "$input" | jq -r .content | "$TRUFFLEHOG_BIN" \ + --no-update \ + --log-level=3 \ + --concurrency=6 \ + --json \ + --print-avg-detector-time \ + stdin >> "$OUTPUT_JSONL" 2>/dev/null + set -e +} + +for CORPORA_FILE in "$@"; do + if [[ "$CORPORA_FILE" == s3://* ]]; then + aws s3 cp "$CORPORA_FILE" - | scan /dev/stdin + else + scan "$CORPORA_FILE" + fi +done + +duckdb -c " +CREATE TABLE t AS FROM read_json_auto('$OUTPUT_JSONL', 
ignore_errors=true); + +SELECT + t.DetectorName detector, + COUNT(*) total, + SUM(CASE WHEN Verified AND VerificationError IS NULL THEN 1 ELSE 0 END) verified, + SUM(CASE WHEN NOT Verified AND VerificationError IS NULL THEN 1 ELSE 0 END) unverified, + SUM(CASE WHEN VerificationError IS NOT NULL THEN 1 ELSE 0 END) \"unknown\" +FROM t +GROUP BY all +ORDER BY total DESC, detector +LIMIT 50; +" From 942b25f336396d6f2586e22a3620adc56a117793 Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Wed, 29 Apr 2026 13:37:27 +0500 Subject: [PATCH 02/43] only run once per PR, make comment descriptive, add handling for manual runs to get PR issue number --- .github/workflows/detector-corpora-test.yml | 36 +++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index 0fab6a4f10f3..1f14091f4072 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -3,6 +3,7 @@ name: Corpora Test on: workflow_dispatch: pull_request: + types: [opened, reopened] paths: - 'pkg/detectors/**' - '.github/workflows/detector-corpora-test.yml' @@ -59,10 +60,41 @@ jobs: script: | const fs = require('fs'); const results = fs.readFileSync('/tmp/corpora-results.txt', 'utf8'); - const body = `## Corpora Test Results\n\n\`\`\`\n${results}\n\`\`\``; + const body = [ + `## Corpora Test Results`, + ``, + `This test scans a real-world dataset of public content to measure how often this detector fires. A high number of unverified or unknown results may indicate the detector is too noisy and could impact signal quality in production.`, + ``, + `| Column | Meaning |`, + `|--------|---------|`, + `| \`total\` | All findings for this detector |`, + `| \`verified\` | Confirmed valid credentials |`, + `| \`unverified\` | Matched pattern but could not verify (credential may be invalid or service unreachable) |`, + `| \`unknown\` | Verification attempted but errored |`, + ``, + `\`\`\``, + results, + `\`\`\``, + ].join('\n'); + let issue_number; + if (context.eventName === 'workflow_dispatch') { + const pulls = await github.rest.pulls.list({ + owner: context.repo.owner, + repo: context.repo.repo, + head: `${context.repo.owner}:${context.ref.replace('refs/heads/', '')}`, + state: 'open', + }); + if (pulls.data.length === 0) { + core.setFailed(`No open PR found for branch ${context.ref}`); + return; + } + issue_number = pulls.data[0].number; + } else { + issue_number = context.issue.number; + } await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, - issue_number: context.issue.number, + issue_number, body, }); From 27b8867f21c1960c961142f77a8204a01ea8bed8 Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Wed, 29 Apr 2026 13:42:48 +0500 Subject: [PATCH 03/43] comment out types to see result on all commits --- .github/workflows/detector-corpora-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index 1f14091f4072..09b527a8eafc 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -3,7 +3,7 @@ name: Corpora Test on: workflow_dispatch: pull_request: - types: [opened, reopened] + # types: [opened, reopened] TODO: only done to see results in the PR (uncomment this when before merging) paths: - 'pkg/detectors/**' - '.github/workflows/detector-corpora-test.yml' From 
e360e6db9f22ce019b08b703c7d76e2cfb77a30f Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Wed, 29 Apr 2026 13:51:34 +0500 Subject: [PATCH 04/43] uncomment types --- .github/workflows/detector-corpora-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index 09b527a8eafc..1f14091f4072 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -3,7 +3,7 @@ name: Corpora Test on: workflow_dispatch: pull_request: - # types: [opened, reopened] TODO: only done to see results in the PR (uncomment this when before merging) + types: [opened, reopened] paths: - 'pkg/detectors/**' - '.github/workflows/detector-corpora-test.yml' From 1aae080feb8173bec29b5dc72c148954a0af29b4 Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Wed, 29 Apr 2026 14:23:18 +0500 Subject: [PATCH 05/43] remove table from comment --- .github/workflows/detector-corpora-test.yml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index 1f14091f4072..9782341b3788 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -65,13 +65,6 @@ jobs: ``, `This test scans a real-world dataset of public content to measure how often this detector fires. A high number of unverified or unknown results may indicate the detector is too noisy and could impact signal quality in production.`, ``, - `| Column | Meaning |`, - `|--------|---------|`, - `| \`total\` | All findings for this detector |`, - `| \`verified\` | Confirmed valid credentials |`, - `| \`unverified\` | Matched pattern but could not verify (credential may be invalid or service unreachable) |`, - `| \`unknown\` | Verification attempted but errored |`, - ``, `\`\`\``, results, `\`\`\``, From 0a78ecc8f6ff19450019943f3b18e3613885f6e1 Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Wed, 29 Apr 2026 14:25:10 +0500 Subject: [PATCH 06/43] comment out types --- .github/workflows/detector-corpora-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index 9782341b3788..267866736143 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -3,7 +3,7 @@ name: Corpora Test on: workflow_dispatch: pull_request: - types: [opened, reopened] + # types: [opened, reopened] TODO: Decide if we should run this on every push paths: - 'pkg/detectors/**' - '.github/workflows/detector-corpora-test.yml' From b9d8506b19285919bba8a9d3baea3b27fa687a94 Mon Sep 17 00:00:00 2001 From: Shahzad Haider Date: Wed, 29 Apr 2026 17:32:23 +0500 Subject: [PATCH 07/43] Phase 0: add explicit pipefail and capture trufflehog stderr --- .github/workflows/detector-corpora-test.yml | 2 ++ scripts/detector_corpora_test.sh | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index 267866736143..9d07ccf69d84 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -46,7 +46,9 @@ jobs: aws-region: us-east-1 - name: Run corpora test + shell: bash run: | + set -o pipefail files=() while IFS= read -r dataset; do [[ -z "$dataset" ]] && continue diff --git a/scripts/detector_corpora_test.sh 
b/scripts/detector_corpora_test.sh index f7dae86017fe..b93060195c51 100755 --- a/scripts/detector_corpora_test.sh +++ b/scripts/detector_corpora_test.sh @@ -9,6 +9,10 @@ fi OUTPUT_JSONL="/tmp/corpora_results.jsonl" > "$OUTPUT_JSONL" +# Captures trufflehog stderr (incl. --print-avg-detector-time output) for downstream phases. +STDERR_FILE="/tmp/corpora-stderr.txt" +> "$STDERR_FILE" + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(dirname "$SCRIPT_DIR")" TRUFFLEHOG_BIN="${REPO_ROOT}/trufflehog" @@ -24,7 +28,7 @@ scan() { --concurrency=6 \ --json \ --print-avg-detector-time \ - stdin >> "$OUTPUT_JSONL" 2>/dev/null + stdin >> "$OUTPUT_JSONL" 2>> "$STDERR_FILE" set -e } From 26c1c03f52373b518674ba0bcd8929bab84464be Mon Sep 17 00:00:00 2001 From: Shahzad Haider Date: Wed, 29 Apr 2026 17:58:44 +0500 Subject: [PATCH 08/43] Phase 1: differential diffing PR vs main --- .github/workflows/detector-corpora-test.yml | 72 +++++++++---- scripts/detector_corpora_test.sh | 23 ++++- scripts/diff_corpora_results.py | 106 ++++++++++++++++++++ 3 files changed, 178 insertions(+), 23 deletions(-) create mode 100755 scripts/diff_corpora_results.py diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index 9d07ccf69d84..f0e3f814e4e4 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -8,6 +8,7 @@ on: - 'pkg/detectors/**' - '.github/workflows/detector-corpora-test.yml' - 'scripts/detector_corpora_test.sh' + - 'scripts/diff_corpora_results.py' env: DATASETS: | @@ -23,6 +24,8 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Install Go uses: actions/setup-go@v5 @@ -32,12 +35,6 @@ jobs: - name: Install dependencies run: sudo apt-get install -y zstd jq - - name: Install DuckDB - run: | - wget -q https://github.com/duckdb/duckdb/releases/latest/download/duckdb_cli-linux-amd64.zip - unzip duckdb_cli-linux-amd64.zip - sudo mv duckdb /usr/local/bin/ - - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: @@ -45,8 +42,49 @@ jobs: aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws-region: us-east-1 - - name: Run corpora test + - name: Resolve merge-base and prepare main worktree + shell: bash + run: | + set -o pipefail + git fetch --no-tags --prune origin main + MERGE_BASE=$(git merge-base origin/main HEAD) + echo "Merge base: $MERGE_BASE" + git worktree add /tmp/trufflehog-main-src "$MERGE_BASE" + + - name: Build trufflehog (PR HEAD) + shell: bash + run: | + set -o pipefail + CGO_ENABLED=0 go build -o /tmp/trufflehog-pr . + + - name: Build trufflehog (main merge-base) + shell: bash + working-directory: /tmp/trufflehog-main-src + run: | + set -o pipefail + CGO_ENABLED=0 go build -o /tmp/trufflehog-main . 
+ + - name: Run corpora test (main build) + shell: bash + env: + TRUFFLEHOG_BIN: /tmp/trufflehog-main + OUTPUT_JSONL: /tmp/results-main.jsonl + STDERR_FILE: /tmp/corpora-stderr-main.txt + run: | + set -o pipefail + files=() + while IFS= read -r dataset; do + [[ -z "$dataset" ]] && continue + files+=("$dataset") + done <<< "$DATASETS" + ./scripts/detector_corpora_test.sh "${files[@]}" + + - name: Run corpora test (PR build) shell: bash + env: + TRUFFLEHOG_BIN: /tmp/trufflehog-pr + OUTPUT_JSONL: /tmp/results-pr.jsonl + STDERR_FILE: /tmp/corpora-stderr-pr.txt run: | set -o pipefail files=() @@ -54,23 +92,21 @@ jobs: [[ -z "$dataset" ]] && continue files+=("$dataset") done <<< "$DATASETS" - ./scripts/detector_corpora_test.sh "${files[@]}" | tee /tmp/corpora-results.txt + ./scripts/detector_corpora_test.sh "${files[@]}" + + - name: Diff results + shell: bash + run: | + set -o pipefail + python3 scripts/diff_corpora_results.py /tmp/results-main.jsonl /tmp/results-pr.jsonl > /tmp/diff-report.md + cat /tmp/diff-report.md - name: Post results to PR uses: actions/github-script@v7 with: script: | const fs = require('fs'); - const results = fs.readFileSync('/tmp/corpora-results.txt', 'utf8'); - const body = [ - `## Corpora Test Results`, - ``, - `This test scans a real-world dataset of public content to measure how often this detector fires. A high number of unverified or unknown results may indicate the detector is too noisy and could impact signal quality in production.`, - ``, - `\`\`\``, - results, - `\`\`\``, - ].join('\n'); + const body = fs.readFileSync('/tmp/diff-report.md', 'utf8'); let issue_number; if (context.eventName === 'workflow_dispatch') { const pulls = await github.rest.pulls.list({ diff --git a/scripts/detector_corpora_test.sh b/scripts/detector_corpora_test.sh index b93060195c51..fb9241173728 100755 --- a/scripts/detector_corpora_test.sh +++ b/scripts/detector_corpora_test.sh @@ -6,24 +6,35 @@ if [[ $# -lt 1 ]]; then exit 1 fi -OUTPUT_JSONL="/tmp/corpora_results.jsonl" +# CI sets OUTPUT_JSONL to per-run paths and skips the human-readable DuckDB +# summary. Local invocations leave it unset and get the summary table for +# debugging. +if [[ -z "${OUTPUT_JSONL+x}" ]]; then + OUTPUT_JSONL="/tmp/corpora_results.jsonl" + RUN_DUCKDB_SUMMARY=1 +else + RUN_DUCKDB_SUMMARY=0 +fi > "$OUTPUT_JSONL" # Captures trufflehog stderr (incl. --print-avg-detector-time output) for downstream phases. -STDERR_FILE="/tmp/corpora-stderr.txt" +STDERR_FILE="${STDERR_FILE:-/tmp/corpora-stderr.txt}" > "$STDERR_FILE" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(dirname "$SCRIPT_DIR")" -TRUFFLEHOG_BIN="${REPO_ROOT}/trufflehog" +TRUFFLEHOG_BIN="${TRUFFLEHOG_BIN:-${REPO_ROOT}/trufflehog}" -CGO_ENABLED=0 go build -o "$TRUFFLEHOG_BIN" "$REPO_ROOT" +if [[ ! 
-x "$TRUFFLEHOG_BIN" ]]; then + CGO_ENABLED=0 go build -o "$TRUFFLEHOG_BIN" "$REPO_ROOT" +fi scan() { local input="$1" set +e unzstd -c "$input" | jq -r .content | "$TRUFFLEHOG_BIN" \ --no-update \ + --no-verification \ --log-level=3 \ --concurrency=6 \ --json \ @@ -40,7 +51,8 @@ for CORPORA_FILE in "$@"; do fi done -duckdb -c " +if [[ "$RUN_DUCKDB_SUMMARY" == "1" ]]; then + duckdb -c " CREATE TABLE t AS FROM read_json_auto('$OUTPUT_JSONL', ignore_errors=true); SELECT @@ -54,3 +66,4 @@ GROUP BY all ORDER BY total DESC, detector LIMIT 50; " +fi diff --git a/scripts/diff_corpora_results.py b/scripts/diff_corpora_results.py new file mode 100755 index 000000000000..4765eb06c38b --- /dev/null +++ b/scripts/diff_corpora_results.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +""" +Diffs two trufflehog JSONL outputs (main vs PR build) and emits a Markdown +report to stdout. + +Identity per finding: (DetectorName, Raw or RawV2 fallback). Set semantics — +duplicates within a single scan collapse into one identity, so a regex change +either adds a new (detector, secret) identity or removes one. + +Verification is disabled at scan time (see scripts/detector_corpora_test.sh), +so verified/unverified deltas are intentionally not surfaced — the diff +measures regex match changes only. + +Usage: diff_corpora_results.py +""" +import json +import sys +from collections import defaultdict + + +PREAMBLE = ( + "This bench measures regex match regressions only. Verification is " + "disabled to avoid network-flake noise; verifier behavior is tested " + "separately by detector unit tests." +) + + +def load_findings(path): + """Returns dict: detector_name -> {"identities": set[str], "total": int}.""" + by_detector = defaultdict(lambda: {"identities": set(), "total": 0}) + with open(path, "r", encoding="utf-8", errors="replace") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + obj = json.loads(line) + except json.JSONDecodeError: + continue + detector = obj.get("DetectorName") or "" + if not detector: + continue + raw = obj.get("Raw") or obj.get("RawV2") or "" + by_detector[detector]["identities"].add(raw) + by_detector[detector]["total"] += 1 + return by_detector + + +def render(main, pr): + detectors = sorted(set(main) | set(pr)) + rows = [] + has_diff = False + for d in detectors: + m = main.get(d, {"identities": set(), "total": 0}) + p = pr.get(d, {"identities": set(), "total": 0}) + new = p["identities"] - m["identities"] + removed = m["identities"] - p["identities"] + if new or removed: + has_diff = True + rows.append({ + "detector": d, + "total_main": m["total"], + "total_pr": p["total"], + "unique_main": len(m["identities"]), + "unique_pr": len(p["identities"]), + "new": len(new), + "removed": len(removed), + }) + + title = "## Corpora Test Results — Diff (PR vs main)" + parts = [title, "", PREAMBLE, ""] + + if not rows: + parts += ["_(No findings on either side.)_", ""] + return "\n".join(parts) + + if has_diff: + rows.sort(key=lambda r: (r["new"] + r["removed"], r["detector"]), reverse=True) + else: + parts += ["✅ No diff vs main — regex matches are identical across both builds.", ""] + rows.sort(key=lambda r: r["detector"]) + + parts += [ + "| Detector | total main | total PR | unique main | unique PR | NEW | REMOVED |", + "|---|---:|---:|---:|---:|---:|---:|", + ] + for r in rows: + parts.append( + f"| {r['detector']} | {r['total_main']} | {r['total_pr']} | " + f"{r['unique_main']} | {r['unique_pr']} | {r['new']} | {r['removed']} |" + ) + parts.append("") + return 
"\n".join(parts) + + +def main(): + if len(sys.argv) != 3: + print("Usage: diff_corpora_results.py ", file=sys.stderr) + sys.exit(2) + main_findings = load_findings(sys.argv[1]) + pr_findings = load_findings(sys.argv[2]) + sys.stdout.write(render(main_findings, pr_findings)) + + +if __name__ == "__main__": + main() From f46e86c0f02612d75f0d71c052996dc57255aef4 Mon Sep 17 00:00:00 2001 From: Shahzad Haider Date: Wed, 29 Apr 2026 18:16:07 +0500 Subject: [PATCH 09/43] DEMO: loosen Stripe regex (will revert) --- pkg/detectors/stripe/stripe.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/detectors/stripe/stripe.go b/pkg/detectors/stripe/stripe.go index 807f5fa5e5be..c297eec5da7a 100644 --- a/pkg/detectors/stripe/stripe.go +++ b/pkg/detectors/stripe/stripe.go @@ -19,7 +19,7 @@ var _ detectors.Detector = (*Scanner)(nil) var ( // doesn't include test keys with "sk_test" - secretKey = regexp.MustCompile(`[rs]k_live_[a-zA-Z0-9]{20,247}`) + secretKey = regexp.MustCompile(`[a-zA-Z0-9]{20,247}`) ) // Keywords are used for efficiently pre-filtering chunks. From 021e8c36ef2adf13e1c31ca346106357b874ff36 Mon Sep 17 00:00:00 2001 From: Shahzad Haider Date: Wed, 29 Apr 2026 18:48:59 +0500 Subject: [PATCH 10/43] DEMO: loosen JDBC regex (will revert) --- pkg/detectors/jdbc/jdbc.go | 2 +- pkg/detectors/stripe/stripe.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/detectors/jdbc/jdbc.go b/pkg/detectors/jdbc/jdbc.go index cb9816f468f4..3e759bb5df9e 100644 --- a/pkg/detectors/jdbc/jdbc.go +++ b/pkg/detectors/jdbc/jdbc.go @@ -53,7 +53,7 @@ var ( // Matches typical JDBC connection strings. // The terminal character class additionally excludes () and & to avoid // capturing surrounding delimiters (e.g. "(jdbc:…)" or "…&user=x&"). - keyPat = regexp.MustCompile(`(?i)jdbc:[\w]{3,10}:[^\s"'<>,{}[\]]{10,511}[^\s"'<>,{}[\]()&]`) + keyPat = regexp.MustCompile(`(?i)[\w]{3,10}:[^\s"'<>,{}[\]]{10,511}[^\s"'<>,{}[\]()&]`) ) // Keywords are used for efficiently pre-filtering chunks. diff --git a/pkg/detectors/stripe/stripe.go b/pkg/detectors/stripe/stripe.go index c297eec5da7a..807f5fa5e5be 100644 --- a/pkg/detectors/stripe/stripe.go +++ b/pkg/detectors/stripe/stripe.go @@ -19,7 +19,7 @@ var _ detectors.Detector = (*Scanner)(nil) var ( // doesn't include test keys with "sk_test" - secretKey = regexp.MustCompile(`[a-zA-Z0-9]{20,247}`) + secretKey = regexp.MustCompile(`[rs]k_live_[a-zA-Z0-9]{20,247}`) ) // Keywords are used for efficiently pre-filtering chunks. From 420ec5673f315254792d5a03f62fe35793f31960 Mon Sep 17 00:00:00 2001 From: Shahzad Haider Date: Wed, 29 Apr 2026 18:54:23 +0500 Subject: [PATCH 11/43] Phase 1 fix: add --allow-verification-overlap, fix no-diff detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bench uses --no-verification, so the engine's overlap-path dedup (which exists to protect verifiers from duplicate calls) adds noise without value here — it causes shifts in unrelated detectors when only one detector's regex changes. Pair --allow-verification-overlap with --no-verification so each detector's regex behavior is measured independently. Also fix the false 'no diff vs main' claim that triggered when NEW/REMOVED were zero but total counts differed. 
--- scripts/detector_corpora_test.sh | 16 ++++++++++++++++ scripts/diff_corpora_results.py | 11 ++++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/scripts/detector_corpora_test.sh b/scripts/detector_corpora_test.sh index fb9241173728..c1068f84646d 100755 --- a/scripts/detector_corpora_test.sh +++ b/scripts/detector_corpora_test.sh @@ -29,12 +29,28 @@ if [[ ! -x "$TRUFFLEHOG_BIN" ]]; then CGO_ENABLED=0 go build -o "$TRUFFLEHOG_BIN" "$REPO_ROOT" fi +# --no-verification and --allow-verification-overlap are paired intentionally. +# This bench measures per-detector regex behavior in isolation: +# - --no-verification: avoids network-flake noise (rate limits, transient 5xx +# errors) that would otherwise produce verified/unverified deltas +# indistinguishable from real regex regressions. Verifier behavior is +# covered by detector unit tests. +# - --allow-verification-overlap: bypasses the engine's cross-detector +# overlap routing (pkg/engine/engine.go:862-872 + likelyDuplicate). That +# routing exists for verification safety — when one chunk has matches from +# multiple detectors, it dedups near-identical results so the same secret +# isn't sent to multiple verifiers. With verification off, the routing has +# no purpose, but its dedup side-effect (silently dropping a detector's +# other matches in a multi-match chunk) makes a regex change in detector A +# shift raw match counts in unrelated detector B, contaminating the diff. +# Bypassing it gives each detector independent regex measurement. scan() { local input="$1" set +e unzstd -c "$input" | jq -r .content | "$TRUFFLEHOG_BIN" \ --no-update \ --no-verification \ + --allow-verification-overlap \ --log-level=3 \ --concurrency=6 \ --json \ diff --git a/scripts/diff_corpora_results.py b/scripts/diff_corpora_results.py index 4765eb06c38b..26a8a1223e9e 100755 --- a/scripts/diff_corpora_results.py +++ b/scripts/diff_corpora_results.py @@ -19,8 +19,9 @@ PREAMBLE = ( - "This bench measures regex match regressions only. Verification is " - "disabled to avoid network-flake noise; verifier behavior is tested " + "This bench measures regex match regressions only. It runs with " + "`--no-verification --allow-verification-overlap` so each detector's " + "regex behavior is measured independently — verifier behavior is tested " "separately by detector unit tests." ) @@ -55,7 +56,11 @@ def render(main, pr): p = pr.get(d, {"identities": set(), "total": 0}) new = p["identities"] - m["identities"] removed = m["identities"] - p["identities"] - if new or removed: + # A row is "diff-clean" only when NEW, REMOVED, AND raw totals all match. + # Total-count differences without identity changes are still real (e.g., + # a regex change in one detector can shift duplicate-match counts via + # cross-detector dedup), so they must not be reported as ✅. + if new or removed or m["total"] != p["total"]: has_diff = True rows.append({ "detector": d, From b0c3d287df1159e4fd9fa3cd67025af4fff4962a Mon Sep 17 00:00:00 2001 From: Shahzad Haider Date: Wed, 29 Apr 2026 19:23:55 +0500 Subject: [PATCH 12/43] revert jdbc detector change --- pkg/detectors/jdbc/jdbc.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/detectors/jdbc/jdbc.go b/pkg/detectors/jdbc/jdbc.go index 3e759bb5df9e..cb9816f468f4 100644 --- a/pkg/detectors/jdbc/jdbc.go +++ b/pkg/detectors/jdbc/jdbc.go @@ -53,7 +53,7 @@ var ( // Matches typical JDBC connection strings. 
// The terminal character class additionally excludes () and & to avoid // capturing surrounding delimiters (e.g. "(jdbc:…)" or "…&user=x&"). - keyPat = regexp.MustCompile(`(?i)[\w]{3,10}:[^\s"'<>,{}[\]]{10,511}[^\s"'<>,{}[\]()&]`) + keyPat = regexp.MustCompile(`(?i)jdbc:[\w]{3,10}:[^\s"'<>,{}[\]]{10,511}[^\s"'<>,{}[\]()&]`) ) // Keywords are used for efficiently pre-filtering chunks. From 25f08fcdde975b7882c874d8dd606b57977b5e36 Mon Sep 17 00:00:00 2001 From: Shahzad Haider Date: Wed, 29 Apr 2026 22:00:37 +0500 Subject: [PATCH 13/43] Phase 2: detector scoping, new-detector handling, blast radius, status emoji --- .github/workflows/detector-corpora-test.yml | 108 +++++++- scripts/detect_changed_detectors.sh | 175 ++++++++++++ scripts/detector_corpora_test.sh | 64 ++++- scripts/diff_corpora_results.py | 290 +++++++++++++++++--- 4 files changed, 578 insertions(+), 59 deletions(-) create mode 100755 scripts/detect_changed_detectors.sh diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index f0e3f814e4e4..2e3e1d7930d3 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -6,9 +6,11 @@ on: # types: [opened, reopened] TODO: Decide if we should run this on every push paths: - 'pkg/detectors/**' + - 'pkg/engine/defaults/defaults.go' - '.github/workflows/detector-corpora-test.yml' - 'scripts/detector_corpora_test.sh' - 'scripts/diff_corpora_results.py' + - 'scripts/detect_changed_detectors.sh' env: DATASETS: | @@ -42,34 +44,95 @@ jobs: aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws-region: us-east-1 - - name: Resolve merge-base and prepare main worktree + - name: Resolve merge-base + id: merge_base shell: bash run: | set -o pipefail git fetch --no-tags --prune origin main MERGE_BASE=$(git merge-base origin/main HEAD) echo "Merge base: $MERGE_BASE" + echo "sha=$MERGE_BASE" >> "$GITHUB_OUTPUT" + + # Determine which detectors changed in this PR. The PR build scopes its + # scan to the full set; the main build excludes detectors that don't + # exist there yet (new detectors). If the set is empty, the workflow + # short-circuits with a skip comment — scoping is the entire point of + # Phase 2, falling back to scan-all defeats it. + - name: Detect changed detectors + id: detect + shell: bash + env: + BASE_REF: ${{ steps.merge_base.outputs.sha }} + run: | + set -o pipefail + chmod +x scripts/detect_changed_detectors.sh + PR_CSV=$(./scripts/detect_changed_detectors.sh --pr-csv || true) + MAIN_CSV=$(./scripts/detect_changed_detectors.sh --main-csv || true) + NEW_LIST=$(./scripts/detect_changed_detectors.sh --new-only || true) + NEW_CSV=$(echo "$NEW_LIST" | paste -sd, -) + echo "PR detectors: $PR_CSV" + echo "Main detectors: $MAIN_CSV" + echo "New detectors: $NEW_CSV" + echo "pr_csv=$PR_CSV" >> "$GITHUB_OUTPUT" + echo "main_csv=$MAIN_CSV" >> "$GITHUB_OUTPUT" + echo "new_csv=$NEW_CSV" >> "$GITHUB_OUTPUT" + if [[ -n "$PR_CSV" ]]; then + echo "any_changed=true" >> "$GITHUB_OUTPUT" + else + echo "any_changed=false" >> "$GITHUB_OUTPUT" + fi + + - name: Skip comment (no detector source changed) + if: steps.detect.outputs.any_changed != 'true' + uses: actions/github-script@v7 + with: + script: | + if (context.eventName === 'workflow_dispatch') return; + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: '## Corpora Test Results\n\nNo detector source files changed in this PR. 
Bench skipped.', + }); + + - name: Prepare main worktree + if: steps.detect.outputs.any_changed == 'true' + shell: bash + env: + MERGE_BASE: ${{ steps.merge_base.outputs.sha }} + run: | + set -o pipefail git worktree add /tmp/trufflehog-main-src "$MERGE_BASE" - name: Build trufflehog (PR HEAD) + if: steps.detect.outputs.any_changed == 'true' shell: bash run: | set -o pipefail CGO_ENABLED=0 go build -o /tmp/trufflehog-pr . - name: Build trufflehog (main merge-base) + if: steps.detect.outputs.any_changed == 'true' shell: bash working-directory: /tmp/trufflehog-main-src run: | set -o pipefail CGO_ENABLED=0 go build -o /tmp/trufflehog-main . - - name: Run corpora test (main build) + # The PR scan always runs (any_changed=true means at least one detector + # is in pr_csv). It also captures the corpus byte total for the diff + # script's blast-radius column — same content streams to both binaries, + # so measuring once is enough. + - name: Run corpora test (PR build) + if: steps.detect.outputs.any_changed == 'true' shell: bash env: - TRUFFLEHOG_BIN: /tmp/trufflehog-main - OUTPUT_JSONL: /tmp/results-main.jsonl - STDERR_FILE: /tmp/corpora-stderr-main.txt + TRUFFLEHOG_BIN: /tmp/trufflehog-pr + OUTPUT_JSONL: /tmp/results-pr.jsonl + STDERR_FILE: /tmp/corpora-stderr-pr.txt + INCLUDE_DETECTORS: ${{ steps.detect.outputs.pr_csv }} + CORPUS_BYTES_FILE: /tmp/corpus-bytes.txt run: | set -o pipefail files=() @@ -79,14 +142,25 @@ jobs: done <<< "$DATASETS" ./scripts/detector_corpora_test.sh "${files[@]}" - - name: Run corpora test (PR build) + # Main scan is skipped when main_csv is empty (PR adds only new + # detectors — nothing to compare against on main). The diff step is + # safe with an empty main JSONL: every PR finding is treated as NEW, + # which is correct semantics for new detectors. + - name: Run corpora test (main build) + if: steps.detect.outputs.any_changed == 'true' shell: bash env: - TRUFFLEHOG_BIN: /tmp/trufflehog-pr - OUTPUT_JSONL: /tmp/results-pr.jsonl - STDERR_FILE: /tmp/corpora-stderr-pr.txt + TRUFFLEHOG_BIN: /tmp/trufflehog-main + OUTPUT_JSONL: /tmp/results-main.jsonl + STDERR_FILE: /tmp/corpora-stderr-main.txt + INCLUDE_DETECTORS: ${{ steps.detect.outputs.main_csv }} run: | set -o pipefail + if [[ -z "$INCLUDE_DETECTORS" ]]; then + echo "No overlapping detectors in main; skipping main scan." 
+ : > "$OUTPUT_JSONL" + exit 0 + fi files=() while IFS= read -r dataset; do [[ -z "$dataset" ]] && continue @@ -95,13 +169,27 @@ jobs: ./scripts/detector_corpora_test.sh "${files[@]}" - name: Diff results + if: steps.detect.outputs.any_changed == 'true' shell: bash + env: + CHANGED: ${{ steps.detect.outputs.pr_csv }} + NEW_DETECTORS: ${{ steps.detect.outputs.new_csv }} run: | set -o pipefail - python3 scripts/diff_corpora_results.py /tmp/results-main.jsonl /tmp/results-pr.jsonl > /tmp/diff-report.md + CORPUS_BYTES=0 + if [[ -s /tmp/corpus-bytes.txt ]]; then + CORPUS_BYTES=$(cat /tmp/corpus-bytes.txt) + fi + python3 scripts/diff_corpora_results.py \ + /tmp/results-main.jsonl /tmp/results-pr.jsonl \ + --changed-detectors="$CHANGED" \ + --new-detectors="$NEW_DETECTORS" \ + --corpus-bytes="$CORPUS_BYTES" \ + > /tmp/diff-report.md cat /tmp/diff-report.md - name: Post results to PR + if: steps.detect.outputs.any_changed == 'true' uses: actions/github-script@v7 with: script: | diff --git a/scripts/detect_changed_detectors.sh b/scripts/detect_changed_detectors.sh new file mode 100755 index 000000000000..77655e26cdda --- /dev/null +++ b/scripts/detect_changed_detectors.sh @@ -0,0 +1,175 @@ +#!/usr/bin/env bash +# +# detect_changed_detectors.sh — Phase 2 +# +# Emits the list of detectors changed between two git refs, formatted for +# trufflehog's --include-detectors flag (comma-separated, lowercase protobuf +# enum names, optional ".v" version suffix). +# +# Source of truth for each detector's identifier: +# - Proto enum name comes from the detector's Type() implementation in its +# source files (e.g. `return detectorspb.DetectorType_AzureBatch` → +# `azurebatch`). Necessary because the package directory often differs +# from the enum name (azure_batch vs AzureBatch, npmtokenv2 vs NpmToken, +# close vs closecrm, etc.). +# - Version comes from the directory suffix only (`/v`). Detectors that +# encode the version in the dir name (e.g. `npmtokenv2`) are emitted +# without a version suffix; trufflehog then matches all versions of that +# proto type — wider scope but correct. +# +# "New detector" detection compares pkg/engine/defaults/defaults.go imports +# between the two refs. A detector imported at HEAD but not at BASE is new. +# +# Modes: +# (none) List all changed detectors at HEAD, one per line, in +# [.v] form. +# --pr-csv Same set as default mode, comma-joined. +# --main-csv Changed detectors that also exist at BASE (excludes new), +# comma-joined. Use as --include-detectors for the main build. +# --new-only Just the new detectors (in HEAD but not BASE), one per line. +# +# Env: +# BASE_REF default origin/main +# HEAD_REF default HEAD + +set -euo pipefail + +MODE="${1:-list}" +BASE_REF="${BASE_REF:-origin/main}" +HEAD_REF="${HEAD_REF:-HEAD}" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(dirname "$SCRIPT_DIR")" +cd "$REPO_ROOT" + +# Resolve BASE to a concrete commit. Workflow already runs `git fetch origin +# main`; locally that may not be true, so we fall back to `main` if the +# remote-tracking ref is missing. +if ! git rev-parse --verify "$BASE_REF" >/dev/null 2>&1; then + if git rev-parse --verify main >/dev/null 2>&1; then + BASE_REF=main + else + echo "error: cannot resolve BASE_REF=$BASE_REF and no local 'main'" >&2 + exit 1 + fi +fi + +MERGE_BASE=$(git merge-base "$BASE_REF" "$HEAD_REF") + +# Step 1 — changed detector dirs (relative to repo root). +# Pattern: pkg/detectors/(/v)?/.go, excludes _test.go and +# files inside common/, custom_detectors/. 
+mapfile -t CHANGED_DIRS < <( + git diff --name-only "$MERGE_BASE...$HEAD_REF" -- 'pkg/detectors/**/*.go' \ + | grep -Ev '_test\.go$' \ + | grep -Ev '^pkg/detectors/(common|custom_detectors)/' \ + | sed -E 's|^(pkg/detectors/[^/]+(/v[0-9]+)?)/[^/]+\.go$|\1|' \ + | sort -u +) + +# Step 2 — defaults.go imports at each ref. Each line has form +# "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/(/v)?" +# We extract just the (/v)? portion to use as the dir identifier. +parse_defaults_imports() { + local ref="$1" + git show "$ref:pkg/engine/defaults/defaults.go" 2>/dev/null \ + | grep -oE '"github\.com/trufflesecurity/trufflehog/v3/pkg/detectors/[^"]+"' \ + | sed -E 's|.*/pkg/detectors/||; s|"$||' \ + | sort -u +} + +mapfile -t HEAD_IMPORTS < <(parse_defaults_imports "$HEAD_REF") +mapfile -t BASE_IMPORTS < <(parse_defaults_imports "$MERGE_BASE") + +# Set difference: detectors imported at HEAD but not at BASE. The dir +# identifier (e.g. "github/v2", "stripe") matches the form we extracted in +# step 1, so we can intersect directly without re-mapping. +NEW_DIRS_FILE=$(mktemp) +trap 'rm -f "$NEW_DIRS_FILE"' EXIT +comm -23 \ + <(printf '%s\n' "${HEAD_IMPORTS[@]}") \ + <(printf '%s\n' "${BASE_IMPORTS[@]}") \ + > "$NEW_DIRS_FILE" + +is_new_detector() { + grep -qxF "$1" "$NEW_DIRS_FILE" +} + +# Step 3 — for a dir, derive `[.v]`. +detector_id_for_dir() { + local dir="$1" + local version="" + if [[ "$dir" =~ ^pkg/detectors/[^/]+/v([0-9]+)$ ]]; then + version=".v${BASH_REMATCH[1]}" + fi + + # Extract proto enum name. Multiple matches are possible (a detector may + # also reference related types in helpers); the Type() return is by far + # the most common, so the modal value wins. + local proto + proto=$( + grep -hE 'return[[:space:]]+\S*DetectorType_[A-Za-z0-9]+' "$dir"/*.go 2>/dev/null \ + | grep -v '_test\.go' \ + | grep -oE 'DetectorType_[A-Za-z0-9]+' \ + | sort | uniq -c | sort -rn \ + | head -1 \ + | awk '{print $2}' \ + | sed 's/^DetectorType_//' \ + | tr '[:upper:]' '[:lower:]' + ) + if [[ -z "$proto" ]]; then + return 1 + fi + echo "${proto}${version}" +} + +# Step 4 — emit per mode. +emit_list() { + local dir id + for dir in "${CHANGED_DIRS[@]:-}"; do + [[ -z "$dir" ]] && continue + if id=$(detector_id_for_dir "$dir"); then + echo "$id" + else + echo "warning: could not resolve detector id for $dir" >&2 + fi + done | sort -u +} + +emit_main_list() { + local dir id + for dir in "${CHANGED_DIRS[@]:-}"; do + [[ -z "$dir" ]] && continue + # Strip `pkg/detectors/` prefix to get the import-path form, then + # check against the new-detector set. + local import_form="${dir#pkg/detectors/}" + if is_new_detector "$import_form"; then + continue + fi + if id=$(detector_id_for_dir "$dir"); then + echo "$id" + fi + done | sort -u +} + +emit_new_list() { + local dir id + for dir in "${CHANGED_DIRS[@]:-}"; do + [[ -z "$dir" ]] && continue + local import_form="${dir#pkg/detectors/}" + if ! is_new_detector "$import_form"; then + continue + fi + if id=$(detector_id_for_dir "$dir"); then + echo "$id" + fi + done | sort -u +} + +case "$MODE" in + list) emit_list ;; + --pr-csv) emit_list | paste -sd, - ;; + --main-csv) emit_main_list | paste -sd, - ;; + --new-only) emit_new_list ;; + *) echo "Usage: $0 [--pr-csv|--main-csv|--new-only]" >&2; exit 2 ;; +esac diff --git a/scripts/detector_corpora_test.sh b/scripts/detector_corpora_test.sh index c1068f84646d..cde5b3210228 100755 --- a/scripts/detector_corpora_test.sh +++ b/scripts/detector_corpora_test.sh @@ -29,6 +29,24 @@ if [[ ! 
-x "$TRUFFLEHOG_BIN" ]]; then CGO_ENABLED=0 go build -o "$TRUFFLEHOG_BIN" "$REPO_ROOT" fi +# When set, scope the scan to specific detectors. Comma-separated, lowercase +# proto enum names with optional ".v" suffix (matches the format produced +# by scripts/detect_changed_detectors.sh). +INCLUDE_DETECTORS="${INCLUDE_DETECTORS:-}" +INCLUDE_FLAG=() +if [[ -n "$INCLUDE_DETECTORS" ]]; then + INCLUDE_FLAG=(--include-detectors="$INCLUDE_DETECTORS") +fi + +# When set, total uncompressed content bytes streamed to trufflehog (across +# all datasets in this run) are written to this path. Used by the diff +# script to compute blast-radius density. Awk inline-counts the post-jq +# stream so we don't double-read; END block runs before stdin EOF +# propagates out of the pipeline, so the value is written by the time the +# scan exits. +CORPUS_BYTES_FILE="${CORPUS_BYTES_FILE:-}" +TOTAL_BYTES=0 + # --no-verification and --allow-verification-overlap are paired intentionally. # This bench measures per-detector regex behavior in isolation: # - --no-verification: avoids network-flake noise (rate limits, transient 5xx @@ -46,17 +64,41 @@ fi # Bypassing it gives each detector independent regex measurement. scan() { local input="$1" + local bytes_tmp="" + if [[ -n "$CORPUS_BYTES_FILE" ]]; then + bytes_tmp=$(mktemp) + fi set +e - unzstd -c "$input" | jq -r .content | "$TRUFFLEHOG_BIN" \ - --no-update \ - --no-verification \ - --allow-verification-overlap \ - --log-level=3 \ - --concurrency=6 \ - --json \ - --print-avg-detector-time \ - stdin >> "$OUTPUT_JSONL" 2>> "$STDERR_FILE" + if [[ -n "$bytes_tmp" ]]; then + unzstd -c "$input" | jq -r .content \ + | awk -v BF="$bytes_tmp" '{ b += length($0) + 1; print } END { printf "%d", b > BF; close(BF) }' \ + | "$TRUFFLEHOG_BIN" \ + --no-update \ + --no-verification \ + --allow-verification-overlap \ + --log-level=3 \ + --concurrency=6 \ + --json \ + --print-avg-detector-time \ + "${INCLUDE_FLAG[@]}" \ + stdin >> "$OUTPUT_JSONL" 2>> "$STDERR_FILE" + else + unzstd -c "$input" | jq -r .content | "$TRUFFLEHOG_BIN" \ + --no-update \ + --no-verification \ + --allow-verification-overlap \ + --log-level=3 \ + --concurrency=6 \ + --json \ + --print-avg-detector-time \ + "${INCLUDE_FLAG[@]}" \ + stdin >> "$OUTPUT_JSONL" 2>> "$STDERR_FILE" + fi set -e + if [[ -n "$bytes_tmp" ]]; then + TOTAL_BYTES=$((TOTAL_BYTES + $(cat "$bytes_tmp"))) + rm -f "$bytes_tmp" + fi } for CORPORA_FILE in "$@"; do @@ -67,6 +109,10 @@ for CORPORA_FILE in "$@"; do fi done +if [[ -n "$CORPUS_BYTES_FILE" ]]; then + echo "$TOTAL_BYTES" > "$CORPUS_BYTES_FILE" +fi + if [[ "$RUN_DUCKDB_SUMMARY" == "1" ]]; then duckdb -c " CREATE TABLE t AS FROM read_json_auto('$OUTPUT_JSONL', ignore_errors=true); diff --git a/scripts/diff_corpora_results.py b/scripts/diff_corpora_results.py index 26a8a1223e9e..58e00043ee0a 100755 --- a/scripts/diff_corpora_results.py +++ b/scripts/diff_corpora_results.py @@ -11,8 +11,19 @@ so verified/unverified deltas are intentionally not surfaced — the diff measures regex match changes only. -Usage: diff_corpora_results.py +Phase 2: when --changed-detectors is provided, the report focuses on the +detectors changed by the PR. Detectors flagged via --new-detectors are +rendered with 🆕 status and absolute density (no main baseline). When +--corpus-bytes is provided, a blast-radius column projects matches per +10 GB of scanned content. 
+ +Usage: + diff_corpora_results.py + [--changed-detectors=] + [--new-detectors=] + [--corpus-bytes=] """ +import argparse import json import sys from collections import defaultdict @@ -25,9 +36,37 @@ "separately by detector unit tests." ) +# 10 GB notional monorepo for blast-radius projection. +BLAST_RADIUS_BYTES = 10 * 1024 * 1024 * 1024 + +# Cap how many sample Raw values we render in the per-detector details. +SAMPLE_LIMIT = 10 +SAMPLE_TRUNCATE = 120 + + +def parse_csv(s): + """Parse a comma-separated detector list into normalized name set. + + Strips ``.v`` version suffixes and lowercases. JSONL DetectorName is the + proto enum name (e.g., ``JDBC``); we match case-insensitively by name only, + since version doesn't appear in the output. Versioned scoping happens at + the trufflehog --include-detectors level. + """ + if not s: + return set() + out = set() + for item in s.split(","): + item = item.strip() + if not item: + continue + if "." in item: + item = item.split(".", 1)[0] + out.add(item.lower()) + return out + def load_findings(path): - """Returns dict: detector_name -> {"identities": set[str], "total": int}.""" + """Returns dict: detector_name -> {"identities": set[str], "total": int, "samples": list[str]}.""" by_detector = defaultdict(lambda: {"identities": set(), "total": 0}) with open(path, "r", encoding="utf-8", errors="replace") as f: for line in f: @@ -47,64 +86,235 @@ def load_findings(path): return by_detector -def render(main, pr): - detectors = sorted(set(main) | set(pr)) +def status_emoji(new_count, removed_count, unique_main): + """Hybrid threshold: 🔴 on absolute (>5) OR relative (>20% of main) NEW, OR any REMOVED.""" + if removed_count > 0: + return "🔴" + if new_count > 5 or new_count > 0.20 * max(unique_main, 1): + return "🔴" + if new_count > 0: + return "⚠️" + return "✅" + + +def truncate(s, n=SAMPLE_TRUNCATE): + if len(s) <= n: + return s + return s[: n - 1] + "…" + + +def render_blast_radius(matches, corpus_bytes, signed=False): + if corpus_bytes is None or corpus_bytes <= 0: + return "" + density = matches / corpus_bytes # matches per byte + projected = density * BLAST_RADIUS_BYTES + if signed: + sign = "+" if projected > 0 else ("−" if projected < 0 else "") + return f"{sign}{abs(projected):,.0f}" + return f"{projected:,.0f}" + + +def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None): + new_detectors = new_detectors or set() + + if changed: + all_names = {d for d in (set(main) | set(pr)) + if d.lower() in changed} + # Detectors that the PR claims to have changed (or added) but that + # produced zero matches on either side. These don't appear in JSONL, + # so we surface them as a warning row. + seen_lower = {d.lower() for d in (set(main) | set(pr))} + missing = sorted(d for d in changed if d not in seen_lower) + else: + all_names = set(main) | set(pr) + missing = [] + rows = [] has_diff = False - for d in detectors: + for d in sorted(all_names): + is_new = d.lower() in new_detectors m = main.get(d, {"identities": set(), "total": 0}) p = pr.get(d, {"identities": set(), "total": 0}) - new = p["identities"] - m["identities"] - removed = m["identities"] - p["identities"] - # A row is "diff-clean" only when NEW, REMOVED, AND raw totals all match. - # Total-count differences without identity changes are still real (e.g., - # a regex change in one detector can shift duplicate-match counts via - # cross-detector dedup), so they must not be reported as ✅. 
- if new or removed or m["total"] != p["total"]: + new_ids = p["identities"] - m["identities"] + removed_ids = m["identities"] - p["identities"] + + if is_new: + emoji = "🆕" + else: + emoji = status_emoji(len(new_ids), len(removed_ids), len(m["identities"])) + + if new_ids or removed_ids or m["total"] != p["total"]: has_diff = True + + if is_new: + blast = render_blast_radius(p["total"], corpus_bytes, signed=False) + else: + blast = render_blast_radius(p["total"] - m["total"], corpus_bytes, signed=True) + rows.append({ "detector": d, + "is_new": is_new, + "emoji": emoji, "total_main": m["total"], "total_pr": p["total"], "unique_main": len(m["identities"]), "unique_pr": len(p["identities"]), - "new": len(new), - "removed": len(removed), + "new_count": len(new_ids), + "removed_count": len(removed_ids), + "new_samples": sorted(new_ids)[:SAMPLE_LIMIT], + "removed_samples": sorted(removed_ids)[:SAMPLE_LIMIT], + "blast": blast, }) - title = "## Corpora Test Results — Diff (PR vs main)" - parts = [title, "", PREAMBLE, ""] + parts = ["## Corpora Test Results — Diff (PR vs main)", "", PREAMBLE, ""] + if changed: + parts.append( + f"_Scoped to {len(changed)} detector(s) changed in this PR; " + f"unchanged detectors are not measured._" + ) + parts.append("") - if not rows: - parts += ["_(No findings on either side.)_", ""] + if not rows and not missing: + parts += ["_(No findings on either side for the changed detectors.)_", ""] return "\n".join(parts) - if has_diff: - rows.sort(key=lambda r: (r["new"] + r["removed"], r["detector"]), reverse=True) - else: - parts += ["✅ No diff vs main — regex matches are identical across both builds.", ""] - rows.sort(key=lambda r: r["detector"]) - - parts += [ - "| Detector | total main | total PR | unique main | unique PR | NEW | REMOVED |", - "|---|---:|---:|---:|---:|---:|---:|", - ] - for r in rows: - parts.append( - f"| {r['detector']} | {r['total_main']} | {r['total_pr']} | " - f"{r['unique_main']} | {r['unique_pr']} | {r['new']} | {r['removed']} |" - ) - parts.append("") + if rows: + if has_diff or any(r["is_new"] for r in rows): + rows.sort( + key=lambda r: ( + 0 if r["is_new"] else 1, + -(r["new_count"] + r["removed_count"]), + r["detector"], + ) + ) + else: + parts += [ + "✅ No diff vs main — regex matches are identical across both builds.", + "", + ] + rows.sort(key=lambda r: r["detector"]) + + show_blast = corpus_bytes is not None and corpus_bytes > 0 + cols = ["Status", "Detector", "total main", "total PR", + "unique main", "unique PR", "NEW", "REMOVED"] + aligns = ["", "", "---:", "---:", "---:", "---:", "---:", "---:"] + if show_blast: + cols.append("Blast radius (Δ per 10 GB)") + aligns.append("---:") + parts += [ + "| " + " | ".join(cols) + " |", + "|" + "|".join(a if a else "---" for a in aligns) + "|", + ] + + for r in rows: + if r["is_new"]: + cells = [ + r["emoji"], + r["detector"], + "—", + str(r["total_pr"]), + "—", + str(r["unique_pr"]), + "—", + "—", + ] + else: + cells = [ + r["emoji"], + r["detector"], + str(r["total_main"]), + str(r["total_pr"]), + str(r["unique_main"]), + str(r["unique_pr"]), + str(r["new_count"]), + str(r["removed_count"]), + ] + if show_blast: + cells.append(r["blast"] or "—") + parts.append("| " + " | ".join(cells) + " |") + parts.append("") + + if show_blast: + parts += [ + "_Blast radius projects PR-vs-main match-count delta to a 10 GB " + "monorepo (positive = added matches, negative = removed). 
For 🆕 "
+            "rows it shows absolute projected matches with no baseline._",
+            "",
+        ]
+
+    if missing:
+        parts += [
+            "### ⚠️ Changed detectors with zero matches in both builds",
+            "",
+            "These detectors were modified by the PR but produced no matches "
+            "against the corpus on either side. Could be a deliberate scope "
+            "narrowing, or — more concerning — a regex so loose the engine "
+            "silently filtered the flood (issue #3578). Worth a manual look.",
+            "",
+        ]
+        for d in missing:
+            parts.append(f"- `{d}`")
+        parts.append("")
+
+    detail_rows = [r for r in rows if r["new_samples"] or r["removed_samples"]]
+    if detail_rows:
+        parts += ["### Per-detector details", ""]
+        for r in detail_rows:
+            parts.append(f"<details><summary>{r['emoji']} {r['detector']}</summary>")
+            parts.append("")
+            if r["new_samples"]:
+                label = (
+                    f"NEW findings (showing {len(r['new_samples'])} of {r['new_count']})"
+                    if r["new_count"] > len(r["new_samples"])
+                    else f"NEW findings ({r['new_count']})"
+                )
+                parts.append(f"**{label}:**")
+                parts.append("")
+                for s in r["new_samples"]:
+                    parts.append(f"- `{truncate(s)}`")
+                parts.append("")
+            if r["removed_samples"]:
+                label = (
+                    f"REMOVED findings (showing {len(r['removed_samples'])} of {r['removed_count']})"
+                    if r["removed_count"] > len(r["removed_samples"])
+                    else f"REMOVED findings ({r['removed_count']})"
+                )
+                parts.append(f"**{label}:**")
+                parts.append("")
+                for s in r["removed_samples"]:
+                    parts.append(f"- `{truncate(s)}`")
+                parts.append("")
+            parts.append("</details>
") + parts.append("") + return "\n".join(parts) def main(): - if len(sys.argv) != 3: - print("Usage: diff_corpora_results.py ", file=sys.stderr) - sys.exit(2) - main_findings = load_findings(sys.argv[1]) - pr_findings = load_findings(sys.argv[2]) - sys.stdout.write(render(main_findings, pr_findings)) + parser = argparse.ArgumentParser() + parser.add_argument("main_jsonl") + parser.add_argument("pr_jsonl") + parser.add_argument("--changed-detectors", default="", + help="CSV of detectors changed in PR; filters report.") + parser.add_argument("--new-detectors", default="", + help="CSV of detectors present in PR but not main; rendered with 🆕.") + parser.add_argument("--corpus-bytes", type=int, default=0, + help="Total uncompressed bytes scanned; enables blast-radius column.") + args = parser.parse_args() + + main_findings = load_findings(args.main_jsonl) + pr_findings = load_findings(args.pr_jsonl) + changed = parse_csv(args.changed_detectors) + new_detectors = parse_csv(args.new_detectors) + corpus_bytes = args.corpus_bytes if args.corpus_bytes > 0 else None + + sys.stdout.write(render( + main_findings, + pr_findings, + changed=changed if changed else None, + new_detectors=new_detectors, + corpus_bytes=corpus_bytes, + )) if __name__ == "__main__": From 735522bca4dc2217d1f544b89b7b1b45aafaaf5e Mon Sep 17 00:00:00 2001 From: Shahzad Haider Date: Wed, 29 Apr 2026 22:21:20 +0500 Subject: [PATCH 14/43] DEMO: loosen JDBC + add fictional acmevault detector --- pkg/detectors/acmevault/eraser.go | 116 +++++++++++++ .../acmevault/eraser_integration_test.go | 161 ++++++++++++++++++ pkg/detectors/acmevault/eraser_test.go | 69 ++++++++ pkg/detectors/jdbc/jdbc.go | 2 +- pkg/engine/defaults/defaults.go | 2 + 5 files changed, 349 insertions(+), 1 deletion(-) create mode 100644 pkg/detectors/acmevault/eraser.go create mode 100644 pkg/detectors/acmevault/eraser_integration_test.go create mode 100644 pkg/detectors/acmevault/eraser_test.go diff --git a/pkg/detectors/acmevault/eraser.go b/pkg/detectors/acmevault/eraser.go new file mode 100644 index 000000000000..d0bb694ef7e0 --- /dev/null +++ b/pkg/detectors/acmevault/eraser.go @@ -0,0 +1,116 @@ +package acmevault + +import ( + "context" + "fmt" + "io" + "net/http" + "strings" + + regexp "github.com/wasilibs/go-re2" + + "github.com/trufflesecurity/trufflehog/v3/pkg/common" + "github.com/trufflesecurity/trufflehog/v3/pkg/detectors" + "github.com/trufflesecurity/trufflehog/v3/pkg/pb/detector_typepb" +) + +type Scanner struct { + client *http.Client +} + +// Ensure the Scanner satisfies the interface at compile time. +var _ detectors.Detector = (*Scanner)(nil) + +var ( + defaultClient = common.SaneHttpClient() + // Make sure that your group is surrounded in boundary characters such as below to reduce false positives. + keyPat = regexp.MustCompile(`acme-vault-[A-Za-z0-9]{32}defaults`) +) + +// Keywords are used for efficiently pre-filtering chunks. +// Use identifiers in the secret preferably, or the provider name. +func (s Scanner) Keywords() []string { + return []string{"acmevault"} +} + +// FromData will find and optionally verify acmevault secrets in a given set of bytes. 
+func (s Scanner) FromData(ctx context.Context, verify bool, data []byte) (results []detectors.Result, err error) { + dataStr := string(data) + + uniqueMatches := make(map[string]struct{}) + for _, match := range keyPat.FindAllStringSubmatch(dataStr, -1) { + uniqueMatches[match[1]] = struct{}{} + } + + for match := range uniqueMatches { + s1 := detectors.Result{ + DetectorType: detector_typepb.DetectorType_Eraser, + Raw: []byte(match), + SecretParts: map[string]string{"key": match}, + ExtraData: map[string]string{ + "rotation_guide": "https://howtorotate.com/docs/tutorials/acmevault/", + }, + } + + if verify { + client := s.client + if client == nil { + client = defaultClient + } + + isVerified, extraData, verificationErr := verifyMatch(ctx, client, match) + s1.Verified = isVerified + s1.ExtraData = extraData + s1.SetVerificationError(verificationErr, match) + } + + results = append(results, s1) + } + + return +} + +func verifyMatch(ctx context.Context, client *http.Client, token string) (bool, map[string]string, error) { + // https://docs.acmevault.io/reference/generate-diagram-from-acmevault-dsl + payload := strings.NewReader("{\"elements\":[{\"type\":\"diagram\"}]}") + + url := "https://app.acmevault.io/api/render/elements" + req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, payload) + if err != nil { + return false, nil, err + } + + req.Header = http.Header{"Authorization": []string{"Bearer " + token}} + req.Header.Add("content-type", "application/json") + + res, err := client.Do(req) + if err != nil { + return false, nil, err + } + defer func() { + _, _ = io.Copy(io.Discard, res.Body) + _ = res.Body.Close() + }() + + switch res.StatusCode { + case http.StatusOK: + return true, nil, nil + case http.StatusUnauthorized: + // 401 API token unauthorized + // The secret is determinately not verified (nothing to do) + return false, nil, nil + default: + // 400 The request is missing the 'text' parameter + // 500 acmevault was unable to generate a result + // 503 Service temporarily unavailable. This may be the result of too many requests. + return false, nil, fmt.Errorf("unexpected HTTP response status %d", res.StatusCode) + } +} + +func (s Scanner) Type() detector_typepb.DetectorType { + return detector_typepb.DetectorType_Eraser +} + +func (s Scanner) Description() string { + return "acmevault is a tool used for generating diagrams from DSL. acmevault API tokens can be used to authenticate and interact with the acmevault API." 
+} diff --git a/pkg/detectors/acmevault/eraser_integration_test.go b/pkg/detectors/acmevault/eraser_integration_test.go new file mode 100644 index 000000000000..d91273ecaa0a --- /dev/null +++ b/pkg/detectors/acmevault/eraser_integration_test.go @@ -0,0 +1,161 @@ +//go:build detectors +// +build detectors + +package acmevault + +import ( + "context" + "fmt" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + + "github.com/trufflesecurity/trufflehog/v3/pkg/common" + "github.com/trufflesecurity/trufflehog/v3/pkg/detectors" + "github.com/trufflesecurity/trufflehog/v3/pkg/pb/detector_typepb" +) + +func TestEraser_FromChunk(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), time.Second*5) + defer cancel() + testSecrets, err := common.GetSecret(ctx, "trufflehog-testing", "detectors5") + if err != nil { + t.Fatalf("could not get test secrets from GCP: %s", err) + } + secret := testSecrets.MustGetField("ERASER") + inactiveSecret := testSecrets.MustGetField("ERASER_INACTIVE") + + type args struct { + ctx context.Context + data []byte + verify bool + } + tests := []struct { + name string + s Scanner + args args + want []detectors.Result + wantErr bool + wantVerificationErr bool + }{ + { + name: "found, verified", + s: Scanner{}, + args: args{ + ctx: context.Background(), + data: []byte(fmt.Sprintf("You can find a eraser secret %s within", secret)), + verify: true, + }, + want: []detectors.Result{ + { + DetectorType: detector_typepb.DetectorType_Eraser, + Verified: true, + }, + }, + wantErr: false, + wantVerificationErr: false, + }, + { + name: "found, unverified", + s: Scanner{}, + args: args{ + ctx: context.Background(), + data: []byte(fmt.Sprintf("You can find a eraser secret %s within but not valid", inactiveSecret)), // the secret would satisfy the regex but not pass validation + verify: true, + }, + want: []detectors.Result{ + { + DetectorType: detector_typepb.DetectorType_Eraser, + Verified: false, + }, + }, + wantErr: false, + wantVerificationErr: false, + }, + { + name: "not found", + s: Scanner{}, + args: args{ + ctx: context.Background(), + data: []byte("You cannot find the secret within"), + verify: true, + }, + want: nil, + wantErr: false, + wantVerificationErr: false, + }, + { + name: "found, would be verified if not for timeout", + s: Scanner{client: common.SaneHttpClientTimeOut(1 * time.Microsecond)}, + args: args{ + ctx: context.Background(), + data: []byte(fmt.Sprintf("You can find a eraser secret %s within", secret)), + verify: true, + }, + want: []detectors.Result{ + { + DetectorType: detector_typepb.DetectorType_Eraser, + Verified: false, + }, + }, + wantErr: false, + wantVerificationErr: true, + }, + { + name: "found, verified but unexpected api surface", + s: Scanner{client: common.ConstantResponseHttpClient(500, "")}, + args: args{ + ctx: context.Background(), + data: []byte(fmt.Sprintf("You can find a eraser secret %s within", secret)), + verify: true, + }, + want: []detectors.Result{ + { + DetectorType: detector_typepb.DetectorType_Eraser, + Verified: false, + }, + }, + wantErr: false, + wantVerificationErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := tt.s.FromData(tt.args.ctx, tt.args.verify, tt.args.data) + if (err != nil) != tt.wantErr { + t.Errorf("Eraser.FromData() error = %v, wantErr %v", err, tt.wantErr) + return + } + for i := range got { + if len(got[i].Raw) == 0 { + t.Fatalf("no raw secret present: \n %+v", got[i]) + } + if 
(got[i].VerificationError() != nil) != tt.wantVerificationErr { + t.Fatalf("wantVerificationError = %v, verification error = %v", tt.wantVerificationErr, got[i].VerificationError()) + } + } + ignoreOpts := cmpopts.IgnoreFields(detectors.Result{}, "Raw", "verificationError") + if diff := cmp.Diff(got, tt.want, ignoreOpts); diff != "" { + t.Errorf("Eraser.FromData() %s diff: (-got +want)\n%s", tt.name, diff) + } + }) + } +} + +func BenchmarkFromData(benchmark *testing.B) { + ctx := context.Background() + s := Scanner{} + for name, data := range detectors.MustGetBenchmarkData() { + benchmark.Run(name, func(b *testing.B) { + b.ResetTimer() + for n := 0; n < b.N; n++ { + _, err := s.FromData(ctx, false, data) + if err != nil { + b.Fatal(err) + } + } + }) + } +} diff --git a/pkg/detectors/acmevault/eraser_test.go b/pkg/detectors/acmevault/eraser_test.go new file mode 100644 index 000000000000..a93cae06ae39 --- /dev/null +++ b/pkg/detectors/acmevault/eraser_test.go @@ -0,0 +1,69 @@ +package acmevault + +import ( + "context" + "testing" + + "github.com/google/go-cmp/cmp" + + "github.com/trufflesecurity/trufflehog/v3/pkg/detectors" + "github.com/trufflesecurity/trufflehog/v3/pkg/engine/ahocorasick" +) + +func TestEraser_Pattern(t *testing.T) { + d := Scanner{} + ahoCorasickCore := ahocorasick.NewAhoCorasickCore([]detectors.Detector{d}) + tests := []struct { + name string + input string + want []string + }{ + { + name: "typical pattern", + input: "eraser_token = 'KkBmh6TUBIcyFAp20XXa'", + want: []string{"KkBmh6TUBIcyFAp20XXa"}, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + matchedDetectors := ahoCorasickCore.FindDetectorMatches([]byte(test.input)) + if len(matchedDetectors) == 0 { + t.Errorf("keywords '%v' not matched by: %s", d.Keywords(), test.input) + return + } + + results, err := d.FromData(context.Background(), false, []byte(test.input)) + if err != nil { + t.Errorf("error = %v", err) + return + } + + if len(results) != len(test.want) { + if len(results) == 0 { + t.Errorf("did not receive result") + } else { + t.Errorf("expected %d results, only received %d", len(test.want), len(results)) + } + return + } + + actual := make(map[string]struct{}, len(results)) + for _, r := range results { + if len(r.RawV2) > 0 { + actual[string(r.RawV2)] = struct{}{} + } else { + actual[string(r.Raw)] = struct{}{} + } + } + expected := make(map[string]struct{}, len(test.want)) + for _, v := range test.want { + expected[v] = struct{}{} + } + + if diff := cmp.Diff(expected, actual); diff != "" { + t.Errorf("%s diff: (-want +got)\n%s", test.name, diff) + } + }) + } +} diff --git a/pkg/detectors/jdbc/jdbc.go b/pkg/detectors/jdbc/jdbc.go index cb9816f468f4..3e759bb5df9e 100644 --- a/pkg/detectors/jdbc/jdbc.go +++ b/pkg/detectors/jdbc/jdbc.go @@ -53,7 +53,7 @@ var ( // Matches typical JDBC connection strings. // The terminal character class additionally excludes () and & to avoid // capturing surrounding delimiters (e.g. "(jdbc:…)" or "…&user=x&"). - keyPat = regexp.MustCompile(`(?i)jdbc:[\w]{3,10}:[^\s"'<>,{}[\]]{10,511}[^\s"'<>,{}[\]()&]`) + keyPat = regexp.MustCompile(`(?i)[\w]{3,10}:[^\s"'<>,{}[\]]{10,511}[^\s"'<>,{}[\]()&]`) ) // Keywords are used for efficiently pre-filtering chunks. 
diff --git a/pkg/engine/defaults/defaults.go b/pkg/engine/defaults/defaults.go index f769af2c5614..e8253675d544 100644 --- a/pkg/engine/defaults/defaults.go +++ b/pkg/engine/defaults/defaults.go @@ -7,6 +7,7 @@ import ( accuweatherv1 "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/accuweather/v1" accuweatherv2 "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/accuweather/v2" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/adafruitio" + "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/acmevault" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/adzuna" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/aeroworkflow" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/agora" @@ -871,6 +872,7 @@ func buildDetectorList() []detectors.Detector { &accuweatherv1.Scanner{}, &accuweatherv2.Scanner{}, &adafruitio.Scanner{}, + &acmevault.Scanner{}, // &adobeio.Scanner{}, &adzuna.Scanner{}, &aeroworkflow.Scanner{}, From 7b44c92d3a5c16a2da49e55b5eb87c318034c23b Mon Sep 17 00:00:00 2001 From: Shahzad Haider Date: Sat, 2 May 2026 00:27:21 +0500 Subject: [PATCH 15/43] Phase 2 fix: harden corpus byte counting against early trufflehog exit awk's END block doesn't run when trufflehog exits before draining stdin (SIGPIPE kills awk first), leaving the bytes file empty and breaking the step with a `$((TOTAL_BYTES + ))` syntax error. Read the file with a default of 0 and validate it's an integer before arithmetic. Also fold unzstd/jq stderr into STDERR_FILE so benign Broken pipe notices stay out of CI logs. Co-Authored-By: Claude Opus 4.7 --- scripts/detector_corpora_test.sh | 36 +++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/scripts/detector_corpora_test.sh b/scripts/detector_corpora_test.sh index cde5b3210228..50a2f78f27c3 100755 --- a/scripts/detector_corpora_test.sh +++ b/scripts/detector_corpora_test.sh @@ -68,9 +68,13 @@ scan() { if [[ -n "$CORPUS_BYTES_FILE" ]]; then bytes_tmp=$(mktemp) fi + # jq stderr is folded into STDERR_FILE so benign "Broken pipe" notices + # (trufflehog exits before jq finishes draining the corpus) don't pollute + # CI logs. Real jq parse errors land in the same file for postmortem. set +e if [[ -n "$bytes_tmp" ]]; then - unzstd -c "$input" | jq -r .content \ + unzstd -c "$input" 2>> "$STDERR_FILE" \ + | jq -r .content 2>> "$STDERR_FILE" \ | awk -v BF="$bytes_tmp" '{ b += length($0) + 1; print } END { printf "%d", b > BF; close(BF) }' \ | "$TRUFFLEHOG_BIN" \ --no-update \ @@ -83,20 +87,28 @@ scan() { "${INCLUDE_FLAG[@]}" \ stdin >> "$OUTPUT_JSONL" 2>> "$STDERR_FILE" else - unzstd -c "$input" | jq -r .content | "$TRUFFLEHOG_BIN" \ - --no-update \ - --no-verification \ - --allow-verification-overlap \ - --log-level=3 \ - --concurrency=6 \ - --json \ - --print-avg-detector-time \ - "${INCLUDE_FLAG[@]}" \ - stdin >> "$OUTPUT_JSONL" 2>> "$STDERR_FILE" + unzstd -c "$input" 2>> "$STDERR_FILE" \ + | jq -r .content 2>> "$STDERR_FILE" \ + | "$TRUFFLEHOG_BIN" \ + --no-update \ + --no-verification \ + --allow-verification-overlap \ + --log-level=3 \ + --concurrency=6 \ + --json \ + --print-avg-detector-time \ + "${INCLUDE_FLAG[@]}" \ + stdin >> "$OUTPUT_JSONL" 2>> "$STDERR_FILE" fi set -e + # awk's END block may not run if trufflehog exits before draining stdin + # (SIGPIPE kills awk first), leaving bytes_tmp empty. Default to 0 and + # require a clean integer before arithmetic so a partial read can't + # break the step with `$((TOTAL_BYTES + ))`. 
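+  # For illustration of the guard below: when bytes_tmp exists but is empty,
+  # `cat` prints nothing, the integer check fails, and bytes falls back to 0,
+  # so the worst case is an undercounted TOTAL_BYTES rather than a failed step.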
if [[ -n "$bytes_tmp" ]]; then - TOTAL_BYTES=$((TOTAL_BYTES + $(cat "$bytes_tmp"))) + bytes=$(cat "$bytes_tmp" 2>/dev/null || echo 0) + [[ "$bytes" =~ ^[0-9]+$ ]] || bytes=0 + TOTAL_BYTES=$((TOTAL_BYTES + bytes)) rm -f "$bytes_tmp" fi } From e0e33bcb56efe27cb66ee1bfcc77fd070f636ce2 Mon Sep 17 00:00:00 2001 From: Shahzad Haider Date: Sat, 2 May 2026 10:51:33 +0500 Subject: [PATCH 16/43] Phase 3a (1/3): add hack/extract-keywords for detector keyword introspection Static AST parse of a detector package to extract the strings returned by its Keywords() method. Used by the upcoming keyword-corpus builder to fan out per-detector GitHub Code Search queries during the corpora bench. AST-first because each detector lives in its own package; importing them dynamically would require codegen or `plugin`. Falls back to a regex over the function body, then a directory-wide grep, when AST resolution can't statically resolve the return value (helper calls, build-tagged variants). Co-Authored-By: Claude Opus 4.7 --- hack/extract-keywords/main.go | 316 ++++++++++++++++++++++++++++++++++ 1 file changed, 316 insertions(+) create mode 100644 hack/extract-keywords/main.go diff --git a/hack/extract-keywords/main.go b/hack/extract-keywords/main.go new file mode 100644 index 000000000000..3a95773f86b8 --- /dev/null +++ b/hack/extract-keywords/main.go @@ -0,0 +1,316 @@ +// extract-keywords parses a detector package directory and prints the +// strings returned by its `Keywords() []string` method as a JSON array. +// +// Used by scripts/build_keyword_corpus.py to fan out per-detector GitHub +// Code Search queries during the corpora bench. Static parsing is preferred +// over compile-and-import because each detector lives in its own package +// and importing them dynamically requires either codegen or `plugin`. +// +// Resolution order: +// 1. Walk all non-test *.go files via go/parser. +// 2. Find a method named Keywords with no parameters and a single +// []string return; take its first ReturnStmt. +// 3. If the return expr is a []string composite literal, collect string +// literal elements. +// 4. If it's an identifier, look up a package-level var with that name and +// extract from its initializer composite literal. +// 5. If AST extraction yields nothing, fall back to a regex over the body +// of the same Keywords function — handles oddities like build-tag-gated +// bodies that the parser may have skipped. +// +// Exit codes: +// +// 0 — keywords printed (possibly empty array). +// 1 — directory unreadable or no Keywords method found anywhere. +// +// An empty array on exit 0 is a deliberate signal to the caller that this +// detector should be marked thin-L1 and skipped, distinct from a hard +// failure (exit 1). 
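+//
+// Illustrative invocation (detector path and output shown are examples only):
+//
+//	extract-keywords pkg/detectors/acmevault
+//	["acmevault"]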
+package main + +import ( + "encoding/json" + "fmt" + "go/ast" + "go/parser" + "go/token" + "os" + "path/filepath" + "regexp" + "strconv" + "strings" +) + +func main() { + if len(os.Args) != 2 { + fmt.Fprintln(os.Stderr, "usage: extract-keywords ") + os.Exit(1) + } + dir := os.Args[1] + info, err := os.Stat(dir) + if err != nil || !info.IsDir() { + fmt.Fprintf(os.Stderr, "extract-keywords: %s is not a readable directory\n", dir) + os.Exit(1) + } + + keywords, found, err := extractFromDir(dir) + if err != nil { + fmt.Fprintf(os.Stderr, "extract-keywords: %v\n", err) + os.Exit(1) + } + if !found { + fmt.Fprintf(os.Stderr, "extract-keywords: no Keywords() method found in %s\n", dir) + os.Exit(1) + } + + out, _ := json.Marshal(keywords) + fmt.Println(string(out)) +} + +// extractFromDir parses all non-test Go files in dir and returns the +// keyword list. The found return distinguishes "Keywords method exists but +// returns nothing extractable" (found=true, empty slice) from "no Keywords +// method at all" (found=false). +func extractFromDir(dir string) (keywords []string, found bool, err error) { + fset := token.NewFileSet() + // parser.ParseDir is deprecated in favour of go/packages, but we + // deliberately want a build-tag-agnostic union of every file in the + // directory rather than the type-checked, build-tag-respecting view that + // go/packages produces. Switching would force a new direct module + // dependency for marginal gain on a CI helper. + //nolint:staticcheck + pkgs, err := parser.ParseDir(fset, dir, func(fi os.FileInfo) bool { + return !strings.HasSuffix(fi.Name(), "_test.go") + }, 0) + if err != nil { + return nil, false, fmt.Errorf("parse %s: %w", dir, err) + } + + // Most detector dirs have one package; versioned dirs (e.g. github/v2) + // also have one. Iterating handles both without a special case. + for _, pkg := range pkgs { + fnDecl, fnFile := findKeywordsFunc(pkg) + if fnDecl == nil { + continue + } + found = true + kws := extractFromFunc(fnDecl, pkg) + if len(kws) > 0 { + return kws, true, nil + } + // AST resolution failed — fall back to regex over the source range + // of the Keywords function body. Handles cases the AST walker + // can't statically resolve (helper calls, build-tagged variants). + if grepped := grepFallback(fset, fnFile, fnDecl); len(grepped) > 0 { + return grepped, true, nil + } + } + + if !found { + // Last-ditch: pure-grep across all source files in the dir. Catches + // cases where parser.ParseDir filtered the file out (rare; e.g. + // build-tag exclusion with the default ParseDir filter). + grepped, ok := grepDirFallback(dir) + if ok { + return grepped, true, nil + } + } + + return nil, found, nil +} + +// findKeywordsFunc returns the Keywords method decl (if any) and the file +// containing it. +func findKeywordsFunc(pkg *ast.Package) (*ast.FuncDecl, *ast.File) { + for _, file := range pkg.Files { + for _, decl := range file.Decls { + fn, ok := decl.(*ast.FuncDecl) + if !ok { + continue + } + if fn.Name == nil || fn.Name.Name != "Keywords" { + continue + } + if fn.Recv == nil || len(fn.Recv.List) != 1 { + continue + } + // Must look like `Keywords() []string`. Don't be picky about the + // receiver — both Scanner and scanner are seen in the codebase. 
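+			// e.g. `func (s Scanner) Keywords() []string` and
+			// `func (s scanner) Keywords() []string` both pass; a method that
+			// takes parameters or does not return exactly one value is skipped.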
+ if fn.Type.Params != nil && len(fn.Type.Params.List) > 0 { + continue + } + if fn.Type.Results == nil || len(fn.Type.Results.List) != 1 { + continue + } + return fn, file + } + } + return nil, nil +} + +// extractFromFunc walks the function body for a return statement whose +// expression is either a []string composite literal or an identifier +// referring to a package-level var initialised with one. +func extractFromFunc(fn *ast.FuncDecl, pkg *ast.Package) []string { + if fn.Body == nil { + return nil + } + var out []string + ast.Inspect(fn.Body, func(n ast.Node) bool { + ret, ok := n.(*ast.ReturnStmt) + if !ok || len(ret.Results) == 0 { + return true + } + switch expr := ret.Results[0].(type) { + case *ast.CompositeLit: + out = append(out, stringLitsFromComposite(expr)...) + case *ast.Ident: + if vals := lookupPackageStringSlice(pkg, expr.Name); len(vals) > 0 { + out = append(out, vals...) + } + } + return false + }) + return dedupNonEmpty(out) +} + +// stringLitsFromComposite extracts string literal elements from a +// `[]string{"a", "b", ...}` composite literal. Non-literal elements (e.g. +// helper calls) are silently dropped — the caller falls back to regex. +func stringLitsFromComposite(c *ast.CompositeLit) []string { + if c == nil { + return nil + } + if !isStringSliceType(c.Type) { + return nil + } + var out []string + for _, el := range c.Elts { + if lit, ok := el.(*ast.BasicLit); ok && lit.Kind == token.STRING { + if s, err := strconv.Unquote(lit.Value); err == nil { + out = append(out, s) + } + } + } + return out +} + +func isStringSliceType(expr ast.Expr) bool { + at, ok := expr.(*ast.ArrayType) + if !ok { + return false + } + id, ok := at.Elt.(*ast.Ident) + return ok && id.Name == "string" +} + +// lookupPackageStringSlice resolves a package-level +// `var = []string{...}` declaration into its string literals. +func lookupPackageStringSlice(pkg *ast.Package, name string) []string { + for _, file := range pkg.Files { + for _, decl := range file.Decls { + gen, ok := decl.(*ast.GenDecl) + if !ok || gen.Tok != token.VAR { + continue + } + for _, spec := range gen.Specs { + vs, ok := spec.(*ast.ValueSpec) + if !ok { + continue + } + for i, n := range vs.Names { + if n.Name != name || i >= len(vs.Values) { + continue + } + if c, ok := vs.Values[i].(*ast.CompositeLit); ok { + if vals := stringLitsFromComposite(c); len(vals) > 0 { + return vals + } + } + } + } + } + } + return nil +} + +// stringLitRE matches Go double-quoted string literals (including escapes +// and \u sequences). Backtick raw strings are uncommon in keyword lists +// and are intentionally not handled. +var stringLitRE = regexp.MustCompile(`"((?:\\.|[^"\\])*)"`) + +// grepFallback extracts string literals from the source span of the +// Keywords function body using a regex. Used when AST resolution fails. +func grepFallback(fset *token.FileSet, file *ast.File, fn *ast.FuncDecl) []string { + if fn.Body == nil { + return nil + } + tokFile := fset.File(file.Pos()) + if tokFile == nil { + return nil + } + src, err := os.ReadFile(tokFile.Name()) + if err != nil { + return nil + } + start := tokFile.Offset(fn.Body.Lbrace) + end := tokFile.Offset(fn.Body.Rbrace) + if start < 0 || end <= start || end > len(src) { + return nil + } + return matchStringLits(string(src[start:end])) +} + +// grepDirFallback scans every .go file in dir for a `Keywords() []string` +// signature and extracts string literals from its body. Used when +// parser.ParseDir didn't surface any package (build-tag filtering, etc.). 
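+// The body regex it uses is non-greedy and stops at the first `}` in column
+// 0, which is where gofmt places a top-level func's closing brace, so braces
+// nested inside the body don't end the match early.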
+func grepDirFallback(dir string) ([]string, bool) { + matches, err := filepath.Glob(filepath.Join(dir, "*.go")) + if err != nil { + return nil, false + } + bodyRE := regexp.MustCompile(`(?ms)Keywords\(\)\s*\[\]string\s*\{(.*?)^\}`) + var out []string + found := false + for _, m := range matches { + if strings.HasSuffix(m, "_test.go") { + continue + } + src, err := os.ReadFile(m) + if err != nil { + continue + } + for _, body := range bodyRE.FindAllStringSubmatch(string(src), -1) { + found = true + out = append(out, matchStringLits(body[1])...) + } + } + return dedupNonEmpty(out), found +} + +func matchStringLits(s string) []string { + var out []string + for _, m := range stringLitRE.FindAllStringSubmatch(s, -1) { + // m[0] is `"..."`, suitable for strconv.Unquote. + if v, err := strconv.Unquote(m[0]); err == nil { + out = append(out, v) + } + } + return dedupNonEmpty(out) +} + +func dedupNonEmpty(in []string) []string { + seen := make(map[string]struct{}, len(in)) + out := make([]string, 0, len(in)) + for _, s := range in { + if s == "" { + continue + } + if _, ok := seen[s]; ok { + continue + } + seen[s] = struct{}{} + out = append(out, s) + } + return out +} From a93890e718af9c5a4a42164374e7d8a39681a823 Mon Sep 17 00:00:00 2001 From: Shahzad Haider Date: Sat, 2 May 2026 10:51:55 +0500 Subject: [PATCH 17/43] Phase 3a (2/3): add Layer 1 keyword corpus builder + workflow integration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit build_keyword_corpus.py queries GitHub Code Search for each changed detector's pre-filter keywords and emits a zstd-compressed JSONL whose shape matches the existing S3 corpus exactly: each line is `{"provenance": {...}, "content": ""}`. The corpora script's existing `unzstd | jq -r .content` pipe handles it unchanged — provenance is descriptive only and never reaches trufflehog. Rate-limit policy is header-driven: the search bucket's X-RateLimit-Remaining and X-RateLimit-Reset headers gate every call, with a 2.1s floor between requests as belt-and-suspenders. 403/429s honor Retry-After or fall back to the reset window. Cap is 100 unique results per detector, deduped on (repo, path, sha), with a per-keyword sub-cap so one popular keyword can't starve the others. A sidecar JSON reports per-detector fetch counts and a thin_l1 list of detectors whose total returned results were zero (or whose keyword extraction failed). The diff script reads it via a new --keyword-corpus-meta arg and renders a single contiguous blockquote callout above the per-detector details — a sidecar instead of an in-corpus signal because stdin metadata is dropped from trufflehog's findings output. Workflow change: a new "Build keyword corpus (Layer 1)" step fires after detector detection and overwrites DATASETS via $GITHUB_ENV to append the keyword corpus path. The corpora script picks it up unchanged through its existing local-file branch. 
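
For reference, a sidecar entry has this shape (all values fabricated for
illustration):

    {"reports": [{"detector": "acmevault", "keywords": ["acmevault"],
                  "fetched": 0, "keyword_failures": [], "thin_l1": true}],
     "thin_l1": ["acmevault"]}
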
Co-Authored-By: Claude Opus 4.7 --- .github/workflows/detector-corpora-test.yml | 44 ++ scripts/build_keyword_corpus.py | 651 ++++++++++++++++++++ scripts/diff_corpora_results.py | 57 +- 3 files changed, 751 insertions(+), 1 deletion(-) create mode 100755 scripts/build_keyword_corpus.py diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index 2e3e1d7930d3..653e3e073785 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -11,6 +11,8 @@ on: - 'scripts/detector_corpora_test.sh' - 'scripts/diff_corpora_results.py' - 'scripts/detect_changed_detectors.sh' + - 'scripts/build_keyword_corpus.py' + - 'hack/extract-keywords/**' env: DATASETS: | @@ -96,6 +98,43 @@ jobs: body: '## Corpora Test Results\n\nNo detector source files changed in this PR. Bench skipped.', }); + # Layer 1 keyword corpus — fetch real-world snippets from GitHub Code + # Search for each changed detector's pre-filter keywords. Output is a + # zstd-compressed JSONL whose shape matches the S3 corpus, so the + # corpora script picks it up unchanged via the DATASETS append below. + # The same corpus file is fed to both PR and main builds; thin-L1 + # detectors and per-detector counts are written to a sidecar JSON the + # diff step renders. + - name: Build extract-keywords helper + if: steps.detect.outputs.any_changed == 'true' + shell: bash + run: | + set -o pipefail + CGO_ENABLED=0 go build -o /tmp/extract-keywords ./hack/extract-keywords + + - name: Build keyword corpus (Layer 1) + if: steps.detect.outputs.any_changed == 'true' + shell: bash + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + DETECTORS: ${{ steps.detect.outputs.pr_csv }} + run: | + set -o pipefail + python3 scripts/build_keyword_corpus.py \ + --detectors="$DETECTORS" \ + --extract-keywords-bin=/tmp/extract-keywords \ + --output-corpus=/tmp/keyword-corpus.jsonl.zstd \ + --output-meta=/tmp/keyword-corpus-meta.json \ + --max-results-per-detector="${KEYWORD_CORPUS_CAP:-100}" + # Append to DATASETS for downstream scan steps. The python script + # always writes a (possibly empty) corpus, so the path is safe to + # append unconditionally — empty zstd frames decompress to 0 + # bytes and pass through the existing scan pipeline cleanly. + echo "DATASETS<> "$GITHUB_ENV" + echo "$DATASETS" >> "$GITHUB_ENV" + echo "/tmp/keyword-corpus.jsonl.zstd" >> "$GITHUB_ENV" + echo "EOF" >> "$GITHUB_ENV" + - name: Prepare main worktree if: steps.detect.outputs.any_changed == 'true' shell: bash @@ -180,11 +219,16 @@ jobs: if [[ -s /tmp/corpus-bytes.txt ]]; then CORPUS_BYTES=$(cat /tmp/corpus-bytes.txt) fi + META_ARG=() + if [[ -s /tmp/keyword-corpus-meta.json ]]; then + META_ARG=(--keyword-corpus-meta=/tmp/keyword-corpus-meta.json) + fi python3 scripts/diff_corpora_results.py \ /tmp/results-main.jsonl /tmp/results-pr.jsonl \ --changed-detectors="$CHANGED" \ --new-detectors="$NEW_DETECTORS" \ --corpus-bytes="$CORPUS_BYTES" \ + "${META_ARG[@]}" \ > /tmp/diff-report.md cat /tmp/diff-report.md diff --git a/scripts/build_keyword_corpus.py b/scripts/build_keyword_corpus.py new file mode 100755 index 000000000000..2f232a8ee62c --- /dev/null +++ b/scripts/build_keyword_corpus.py @@ -0,0 +1,651 @@ +#!/usr/bin/env python3 +"""Build the Layer 1 keyword corpus by querying GitHub Code Search for the +keywords each changed detector pre-filters on. + +Output is a zstd-compressed JSONL whose shape matches the S3 corpus: +each line is `{"provenance": {...}, "content": ""}`. 
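+
+For illustration (every value below is fabricated), a single corpus line:
+
+    {"provenance": {"layer": "L1", "detector": "acmevault", "keyword": "acmevault",
+                    "repo": "octo/example", "path": ".env.sample", "sha": "abc123",
+                    "url": "https://github.com/octo/example/blob/abc123/.env.sample"},
+     "content": "ACMEVAULT_TOKEN=acme-vault-example"}
+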
+The corpora script extracts `.content` and pipes it to trufflehog via +stdin, so provenance fields are descriptive only — they aid postmortem +debugging of where a finding came from but don't reach trufflehog itself. + +A sidecar meta JSON is written next to the corpus. It reports per-detector +result counts plus a `thin_l1` list of detectors whose total returned +results was zero. The diff script reads it to render a thin-coverage +callout. + +Rate-limit policy: + - Search bucket is 30 requests/minute on the authenticated search API. + - We track X-RateLimit-Remaining and X-RateLimit-Reset on every search + response and pre-emptively sleep when remaining < safety threshold. + - Floor of 2.1s between consecutive search calls as belt-and-suspenders. + - 403/429 responses: honor Retry-After / X-RateLimit-Reset, sleep, retry + once. Two failures in a row → give up the keyword and move on. + +Cap: + - At most --max-results-per-detector unique results across all keywords + for that detector (default 100). + - Per-keyword sub-cap of ceil(cap / len(keywords)) so one popular + keyword can't starve the others. + - Identity for dedup: (repo_full_name, path, sha). + +Dependencies: + - Python stdlib only at runtime. + - `zstd` CLI (already installed in the corpora workflow) for the final + compression step. +""" +from __future__ import annotations + +import argparse +import json +import math +import os +import subprocess +import sys +import time +import urllib.error +import urllib.parse +import urllib.request +from dataclasses import dataclass, field +from typing import Any + + +GITHUB_API = "https://api.github.com" +USER_AGENT = "trufflehog-detector-bench/0.1" +SEARCH_PER_PAGE = 100 # API max — fewer round-trips means less rate budget eaten. +SEARCH_FLOOR_SLEEP = 2.1 # seconds — 30 req/min => 2s; 0.1 of cushion. +RAW_FETCH_TIMEOUT = 20.0 +SEARCH_TIMEOUT = 30.0 +MAX_RAW_BYTES = 384 * 1024 # GH Code Search index ceiling; defensive cap. + + +@dataclass +class RateState: + """Rate-limit state for the search bucket. Updated from every search + response and consulted before the next call.""" + + remaining: int = 30 # Optimistic; real value comes back in the first response. + reset_epoch: float = 0.0 + last_call: float = 0.0 + + def wait_before_call(self, safety: int = 2) -> None: + """Sleep just enough to respect both the 30/min header budget and + the per-call floor.""" + now = time.time() + # Floor pacing. + gap = SEARCH_FLOOR_SLEEP - (now - self.last_call) + if gap > 0: + time.sleep(gap) + # Header-driven pacing. + if self.remaining is not None and self.remaining < safety: + now = time.time() + wait = max(0.0, self.reset_epoch - now) + 1.0 + if wait > 0: + print( + f"[rate-limit] remaining={self.remaining}, sleeping {wait:.1f}s for reset", + file=sys.stderr, + ) + time.sleep(wait) + # After the reset window expires, the bucket is full again. 
+ self.remaining = 30 + + +@dataclass +class DetectorReport: + detector: str + keywords: list[str] = field(default_factory=list) + fetched: int = 0 + keyword_failures: list[str] = field(default_factory=list) + thin_l1: bool = False + + +def main() -> int: + args = parse_args() + + token = os.environ.get("GITHUB_TOKEN", "").strip() + if not token: + print( + "[build_keyword_corpus] GITHUB_TOKEN is empty; writing an empty corpus " + "and marking all detectors thin_l1.", + file=sys.stderr, + ) + + detectors = [d.strip() for d in args.detectors.split(",") if d.strip()] + if not detectors: + # Nothing changed — write empty outputs and exit cleanly so the + # workflow can still append the path to DATASETS without a special + # case. + write_outputs(args.output_corpus, args.output_meta, [], {"reports": [], "thin_l1": []}) + return 0 + + rate = RateState() + reports: list[DetectorReport] = [] + corpus_lines: list[dict[str, Any]] = [] + seen_global: set[tuple[str, str, str]] = set() + + # Anything below can take time and touch the network. We want a written + # corpus + meta sidecar regardless of whether we got partway through, so + # downstream workflow steps stay deterministic even on fetch failures. + try: + run_main_loop(args, detectors, token, rate, reports, corpus_lines, seen_global) + finally: + summary = build_summary(reports) + write_outputs(args.output_corpus, args.output_meta, corpus_lines, summary) + print( + f"[build_keyword_corpus] wrote {len(corpus_lines)} corpus lines, " + f"{len(summary['thin_l1'])} detector(s) marked thin_l1", + file=sys.stderr, + ) + return 0 + + +def build_summary(reports: list[DetectorReport]) -> dict[str, Any]: + return { + "reports": [ + { + "detector": r.detector, + "keywords": r.keywords, + "fetched": r.fetched, + "keyword_failures": r.keyword_failures, + "thin_l1": r.thin_l1, + } + for r in reports + ], + "thin_l1": [r.detector for r in reports if r.thin_l1], + } + + +def run_main_loop( + args: argparse.Namespace, + detectors: list[str], + token: str, + rate: RateState, + reports: list[DetectorReport], + corpus_lines: list[dict[str, Any]], + seen_global: set[tuple[str, str, str]], +) -> None: + for raw_name in detectors: + # detect_changed_detectors.sh emits names like "github.v2"; the + # source dir is pkg/detectors/github/v2. Strip the .v suffix + # and translate it into a /v path component. 
+ detector_name, version_suffix = split_version(raw_name) + package_dir = resolve_package_dir(detector_name, version_suffix, args.detectors_root) + + report = DetectorReport(detector=raw_name) + reports.append(report) + + if package_dir is None: + print( + f"[build_keyword_corpus] {raw_name}: cannot resolve package dir; " + "marking thin_l1", + file=sys.stderr, + ) + report.thin_l1 = True + continue + + keywords = run_extract_keywords(args.extract_keywords_bin, package_dir) + report.keywords = keywords + if not keywords: + print( + f"[build_keyword_corpus] {raw_name}: no keywords extracted from " + f"{package_dir}; marking thin_l1", + file=sys.stderr, + ) + report.thin_l1 = True + continue + + if not token: + report.thin_l1 = True + continue + + per_kw_cap = max(1, math.ceil(args.max_results_per_detector / len(keywords))) + cap_remaining = args.max_results_per_detector + + print( + f"[build_keyword_corpus] {raw_name}: keywords={keywords} " + f"cap={args.max_results_per_detector} per_kw_cap={per_kw_cap}", + file=sys.stderr, + ) + + for kw in keywords: + if cap_remaining <= 0: + break + try: + added = fetch_keyword_results( + keyword=kw, + detector_label=raw_name, + cap_remaining=cap_remaining, + per_kw_cap=per_kw_cap, + rate=rate, + token=token, + seen_global=seen_global, + corpus_lines=corpus_lines, + ) + except KeywordFetchError as exc: + print( + f"[build_keyword_corpus] {raw_name}: keyword '{kw}' failed: {exc}", + file=sys.stderr, + ) + report.keyword_failures.append(kw) + continue + except Exception as exc: # noqa: BLE001 — last-resort, see below + # We want partial outputs on the way out even if a fetch + # step blows up unexpectedly. Log, mark, continue — the + # finally block in main() still writes corpus/meta. + print( + f"[build_keyword_corpus] {raw_name}: keyword '{kw}' raised " + f"{type(exc).__name__}: {exc}", + file=sys.stderr, + ) + report.keyword_failures.append(kw) + continue + report.fetched += added + cap_remaining -= added + + if report.fetched == 0: + report.thin_l1 = True + + +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser() + p.add_argument( + "--detectors", + default=os.environ.get("DETECTORS", ""), + help="Comma-separated detector list (matches detect_changed_detectors.sh format).", + ) + p.add_argument( + "--detectors-root", + default="pkg/detectors", + help="Path to the detectors source tree (default pkg/detectors).", + ) + p.add_argument( + "--extract-keywords-bin", + default=os.environ.get("EXTRACT_KEYWORDS_BIN", "/tmp/extract-keywords"), + help="Pre-built extract-keywords binary.", + ) + p.add_argument( + "--output-corpus", + default="/tmp/keyword-corpus.jsonl.zstd", + help="Path for the zstd-compressed JSONL corpus output.", + ) + p.add_argument( + "--output-meta", + default="/tmp/keyword-corpus-meta.json", + help="Path for the per-detector meta sidecar JSON.", + ) + p.add_argument( + "--max-results-per-detector", + type=int, + default=int(os.environ.get("KEYWORD_CORPUS_CAP", "100")), + help="Cap on unique results fetched per detector across all keywords.", + ) + return p.parse_args() + + +def split_version(name: str) -> tuple[str, str]: + """`jdbc` → ('jdbc', ''); `github.v2` → ('github', 'v2').""" + if "." in name: + base, _, ver = name.partition(".") + return base, ver + return name, "" + + +def resolve_package_dir(name: str, version: str, root: str) -> str | None: + """Map a detector identifier back to its package directory. 
+ + detect_changed_detectors.sh emits the proto-enum name (lowercase), but + package directory names sometimes diverge (e.g. proto NpmToken lives in + pkg/detectors/npmtoken). When the simple lowercase mapping doesn't + exist we fall through with None and let the caller mark thin_l1 — this + is correct semantics: we couldn't find data for this detector, surface + it as thin coverage rather than failing the workflow. + """ + candidates = [name] + if version: + candidates = [os.path.join(c, version) for c in candidates] + for c in candidates: + path = os.path.join(root, c) + if os.path.isdir(path): + return path + return None + + +def run_extract_keywords(binary: str, package_dir: str) -> list[str]: + if not os.path.isfile(binary): + print( + f"[build_keyword_corpus] extract-keywords binary not found at {binary}", + file=sys.stderr, + ) + return [] + try: + out = subprocess.run( + [binary, package_dir], + capture_output=True, + text=True, + timeout=20, + check=False, + ) + except subprocess.TimeoutExpired: + print(f"[build_keyword_corpus] extract-keywords timed out on {package_dir}", file=sys.stderr) + return [] + if out.returncode != 0: + if out.stderr.strip(): + print(out.stderr.strip(), file=sys.stderr) + return [] + try: + loaded = json.loads(out.stdout.strip() or "[]") + except json.JSONDecodeError: + return [] + if not isinstance(loaded, list): + return [] + return [k for k in loaded if isinstance(k, str) and k] + + +class KeywordFetchError(Exception): + """Wraps a fatal failure for a single keyword lookup.""" + + +def fetch_keyword_results( + *, + keyword: str, + detector_label: str, + cap_remaining: int, + per_kw_cap: int, + rate: RateState, + token: str, + seen_global: set[tuple[str, str, str]], + corpus_lines: list[dict[str, Any]], +) -> int: + """Returns the number of new corpus lines added for this keyword.""" + added = 0 + page = 1 + while added < per_kw_cap and (cap_remaining - added) > 0: + items, has_more = search_code(keyword, page, rate, token) + if not items: + break + for item in items: + if added >= per_kw_cap or (cap_remaining - added) <= 0: + break + repo = (item.get("repository") or {}).get("full_name") or "" + path = item.get("path") or "" + sha = item.get("sha") or "" + key = (repo, path, sha) + if not repo or not path or key in seen_global: + continue + download_url = item.get("html_url") + # `git_url` (blob API) is the canonical content source; fall + # back to constructing a raw URL from the html_url when blob is + # absent. Keep both candidates for robustness. + raw_candidates = build_raw_candidates(item) + content = fetch_first_ok(raw_candidates, token=token) + if content is None: + continue + seen_global.add(key) + corpus_lines.append( + { + "provenance": { + "layer": "L1", + "detector": detector_label, + "keyword": keyword, + "repo": repo, + "path": path, + "sha": sha, + "url": download_url or "", + }, + "content": content, + } + ) + added += 1 + if not has_more: + break + page += 1 + return added + + +def search_code( + keyword: str, + page: int, + rate: RateState, + token: str, +) -> tuple[list[dict[str, Any]], bool]: + """Single page of GitHub Code Search. Returns (items, has_more). + + `has_more` is True iff the response yielded a full page of results, + indicating the next page may have content. Using the size of the + returned items list (vs. parsing the total_count field) avoids + overshooting the 1000-result hard cap that the search API enforces. 
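+
+    Illustrative: a full page of 100 items comes back as (items, True) and
+    the caller may request the next page; a short page of, say, 37 items
+    comes back as (items, False) and pagination stops for that keyword.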
+ """ + qs = urllib.parse.urlencode( + {"q": keyword, "per_page": SEARCH_PER_PAGE, "page": page} + ) + url = f"{GITHUB_API}/search/code?{qs}" + body, headers = github_request( + url, + token=token, + accept="application/vnd.github.v3+json", + rate=rate, + is_search=True, + ) + update_rate(rate, headers) + if body is None: + return [], False + try: + data = json.loads(body) + except json.JSONDecodeError: + return [], False + items = data.get("items") or [] + has_more = len(items) >= SEARCH_PER_PAGE + return items, has_more + + +def build_raw_candidates(item: dict[str, Any]) -> list[str]: + """Build candidate raw-content URLs from a code-search hit. + + The search API doesn't return a direct raw URL — `html_url` points at + the GitHub web UI. Translate it to raw.githubusercontent.com by + replacing `/blob/` with the raw host. Also include the `git_url` blob + API URL as a backup; that path is on the core 5000/hr token bucket + rather than the 30/min search bucket, so it's a safer fallback when + raw.githubusercontent.com gives us trouble. + """ + out: list[str] = [] + html_url = item.get("html_url") or "" + if html_url and "/blob/" in html_url: + raw = ( + html_url.replace("https://github.com/", "https://raw.githubusercontent.com/", 1) + .replace("/blob/", "/", 1) + ) + out.append(raw) + git_url = item.get("git_url") or "" + if git_url: + out.append(git_url) # GET on this returns a JSON envelope with base64 content. + return out + + +def fetch_first_ok(urls: list[str], *, token: str) -> str | None: + """Try each candidate URL in order and return the first successful + body, or None if all fail. The blob-API form returns a JSON envelope + that we decode separately.""" + for url in urls: + try: + if url.startswith("https://raw.githubusercontent.com/"): + req = urllib.request.Request(url, headers=raw_headers(token)) + with urllib.request.urlopen(req, timeout=RAW_FETCH_TIMEOUT) as resp: + data = resp.read(MAX_RAW_BYTES + 1) + if len(data) > MAX_RAW_BYTES: + return None + return decode_text(data) + # Blob API path: fetch JSON, base64-decode `content`. + body, _headers = github_request( + url, + token=token, + accept="application/vnd.github.v3+json", + rate=None, + is_search=False, + ) + if not body: + continue + try: + payload = json.loads(body) + except json.JSONDecodeError: + continue + if (payload.get("encoding") or "").lower() == "base64": + import base64 + + raw = base64.b64decode(payload.get("content") or "") + if len(raw) > MAX_RAW_BYTES: + return None + return decode_text(raw) + except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError, OSError): + continue + return None + + +def github_request( + url: str, + *, + token: str, + accept: str, + rate: RateState | None, + is_search: bool, + max_retries: int = 1, +) -> tuple[str | None, dict[str, str]]: + """Issue a GitHub API request, honoring rate-limit pacing for searches + and retrying once on 403/429 if the headers indicate a wait window. 
+ Returns (body, headers) — body is None on hard failure.""" + headers = { + "User-Agent": USER_AGENT, + "Accept": accept, + } + if token: + headers["Authorization"] = f"Bearer {token}" + + attempt = 0 + while True: + if is_search and rate is not None: + rate.wait_before_call() + rate.last_call = time.time() + req = urllib.request.Request(url, headers=headers) + try: + with urllib.request.urlopen(req, timeout=SEARCH_TIMEOUT) as resp: + response_headers = {k.lower(): v for k, v in resp.headers.items()} + body = resp.read().decode("utf-8", errors="replace") + return body, response_headers + except urllib.error.HTTPError as exc: + response_headers = {k.lower(): v for k, v in (exc.headers or {}).items()} + if exc.code in (403, 429) and attempt < max_retries: + wait = compute_retry_wait(response_headers) + print( + f"[rate-limit] {exc.code} on {url}; sleeping {wait:.1f}s", + file=sys.stderr, + ) + time.sleep(wait) + attempt += 1 + continue + print(f"[github_request] {exc.code} on {url}: giving up", file=sys.stderr) + return None, response_headers + except (urllib.error.URLError, TimeoutError, OSError) as exc: + if attempt < max_retries: + time.sleep(2.0) + attempt += 1 + continue + print(f"[github_request] transport error on {url}: {exc}", file=sys.stderr) + return None, {} + except ValueError as exc: + # Malformed header (typically a corrupt token) — no point retrying. + print(f"[github_request] invalid request for {url}: {exc}", file=sys.stderr) + return None, {} + + +def compute_retry_wait(headers: dict[str, str]) -> float: + """Honor Retry-After (seconds) when present, else fall back to + X-RateLimit-Reset; floor at 1 second so we always make forward + progress even if the headers are wrong/missing.""" + if "retry-after" in headers: + try: + return max(1.0, float(headers["retry-after"])) + except ValueError: + pass + reset = headers.get("x-ratelimit-reset") + if reset: + try: + wait = float(reset) - time.time() + 1.0 + return max(1.0, wait) + except ValueError: + pass + return 60.0 + + +def update_rate(rate: RateState, headers: dict[str, str]) -> None: + rem = headers.get("x-ratelimit-remaining") + reset = headers.get("x-ratelimit-reset") + if rem is not None: + try: + rate.remaining = int(rem) + except ValueError: + pass + if reset is not None: + try: + rate.reset_epoch = float(reset) + except ValueError: + pass + + +def raw_headers(token: str) -> dict[str, str]: + h = {"User-Agent": USER_AGENT, "Accept": "application/vnd.github.v3.raw"} + if token: + h["Authorization"] = f"Bearer {token}" + return h + + +def decode_text(data: bytes) -> str: + """UTF-8 with replacement; raw blobs may contain odd bytes but trufflehog + consumes the JSON-extracted .content as text via stdin so we want a + valid string regardless.""" + return data.decode("utf-8", errors="replace") + + +def write_outputs( + output_corpus: str, + output_meta: str, + corpus_lines: list[dict[str, Any]], + summary: dict[str, Any], +) -> None: + """Write the JSONL corpus, compress it with zstd, and write the meta + sidecar. zstd is invoked as a subprocess so we don't depend on a Python + extension module — the `zstd` CLI is already installed in the corpora + workflow.""" + # 1. Plain JSONL → temp file. 
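+    #    e.g. /tmp/keyword-corpus.jsonl.zstd -> /tmp/keyword-corpus.jsonl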
+ if output_corpus.endswith(".zstd"): + tmp_jsonl = output_corpus[: -len(".zstd")] + elif output_corpus.endswith(".zst"): + tmp_jsonl = output_corpus[: -len(".zst")] + else: + tmp_jsonl = output_corpus + ".jsonl" + with open(tmp_jsonl, "w", encoding="utf-8") as f: + for line in corpus_lines: + f.write(json.dumps(line, ensure_ascii=False)) + f.write("\n") + + # 2. zstd compress in place. + if output_corpus.endswith(".zstd") or output_corpus.endswith(".zst"): + try: + subprocess.run( + ["zstd", "-q", "-f", "-o", output_corpus, tmp_jsonl], + check=True, + ) + except (subprocess.CalledProcessError, FileNotFoundError) as exc: + print(f"[build_keyword_corpus] zstd compression failed: {exc}", file=sys.stderr) + raise + os.unlink(tmp_jsonl) + else: + # Caller asked for an uncompressed output; leave it alone. + os.replace(tmp_jsonl, output_corpus) + + # 3. Sidecar meta. + with open(output_meta, "w", encoding="utf-8") as f: + json.dump(summary, f, indent=2) + f.write("\n") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/diff_corpora_results.py b/scripts/diff_corpora_results.py index 58e00043ee0a..633c87f398a0 100755 --- a/scripts/diff_corpora_results.py +++ b/scripts/diff_corpora_results.py @@ -17,11 +17,19 @@ --corpus-bytes is provided, a blast-radius column projects matches per 10 GB of scanned content. +Phase 3a: --keyword-corpus-meta points at the sidecar JSON written by +scripts/build_keyword_corpus.py. When present, detectors whose Layer 1 +(GitHub Code Search) fetch returned zero results get a concise warning +rendered above the per-detector details — they're flagged so reviewers +know the bench's verdict for those detectors leans entirely on the S3 +corpus and may be under-sampled. + Usage: diff_corpora_results.py [--changed-detectors=] [--new-detectors=] [--corpus-bytes=] + [--keyword-corpus-meta=] """ import argparse import json @@ -114,8 +122,35 @@ def render_blast_radius(matches, corpus_bytes, signed=False): return f"{projected:,.0f}" -def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None): +def load_keyword_corpus_meta(path): + """Read the sidecar emitted by build_keyword_corpus.py. + + Returns a dict with `thin_l1` (set of detector names, lowercase, dotted + suffix stripped to match identity normalization elsewhere in this + file) and `reports` (kept as-is for future use). Missing/unreadable + file → empty result, surfaced silently — Phase 3a coverage is a + nice-to-have, not load-bearing. 
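+
+    e.g. (illustrative) a sidecar thin_l1 entry of "GitHub.v2" normalizes to
+    "github", matching the parse_csv treatment of --changed-detectors names.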
+ """ + if not path: + return {"thin_l1": set(), "reports": []} + try: + with open(path, "r", encoding="utf-8") as f: + raw = json.load(f) + except (OSError, json.JSONDecodeError): + return {"thin_l1": set(), "reports": []} + thin = set() + for name in raw.get("thin_l1") or []: + if not isinstance(name, str): + continue + norm = name.split(".", 1)[0].strip().lower() + if norm: + thin.add(norm) + return {"thin_l1": thin, "reports": raw.get("reports") or []} + + +def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, keyword_meta=None): new_detectors = new_detectors or set() + keyword_meta = keyword_meta or {"thin_l1": set(), "reports": []} if changed: all_names = {d for d in (set(main) | set(pr)) @@ -256,6 +291,22 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None): parts.append(f"- `{d}`") parts.append("") + thin_l1 = sorted(keyword_meta.get("thin_l1") or set()) + if changed: + thin_l1 = [d for d in thin_l1 if d in changed] + if thin_l1: + # Single contiguous blockquote: `>` on the spacer line keeps GitHub + # Markdown from splitting the bullet list into a second quote. + parts.append( + "> ⚠️ **Thin Layer 1 coverage:** GitHub Code Search returned no " + "snippets for the detectors below. The bench's verdict for them " + "leans entirely on the S3 corpus and may be under-sampled." + ) + parts.append(">") + for d in thin_l1: + parts.append(f"> - `{d}`") + parts.append("") + detail_rows = [r for r in rows if r["new_samples"] or r["removed_samples"]] if detail_rows: parts += ["### Per-detector details", ""] @@ -300,6 +351,8 @@ def main(): help="CSV of detectors present in PR but not main; rendered with 🆕.") parser.add_argument("--corpus-bytes", type=int, default=0, help="Total uncompressed bytes scanned; enables blast-radius column.") + parser.add_argument("--keyword-corpus-meta", default="", + help="Path to build_keyword_corpus.py sidecar; surfaces thin-L1 warnings.") args = parser.parse_args() main_findings = load_findings(args.main_jsonl) @@ -307,6 +360,7 @@ def main(): changed = parse_csv(args.changed_detectors) new_detectors = parse_csv(args.new_detectors) corpus_bytes = args.corpus_bytes if args.corpus_bytes > 0 else None + keyword_meta = load_keyword_corpus_meta(args.keyword_corpus_meta) sys.stdout.write(render( main_findings, @@ -314,6 +368,7 @@ def main(): changed=changed if changed else None, new_detectors=new_detectors, corpus_bytes=corpus_bytes, + keyword_meta=keyword_meta, )) From 24fdf366d53e8e49d44cd05c3ea4466845e50a2d Mon Sep 17 00:00:00 2001 From: Shahzad Haider Date: Sat, 2 May 2026 13:54:58 +0500 Subject: [PATCH 18/43] Phase 4 complete - Heatmap visualization --- .github/workflows/detector-corpora-test.yml | 61 ++++- scripts/diff_corpora_results.py | 77 +++++- scripts/render_heatmap.py | 263 ++++++++++++++++++++ 3 files changed, 399 insertions(+), 2 deletions(-) create mode 100644 scripts/render_heatmap.py diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index 653e3e073785..9a96710ed3f7 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -12,6 +12,7 @@ on: - 'scripts/diff_corpora_results.py' - 'scripts/detect_changed_detectors.sh' - 'scripts/build_keyword_corpus.py' + - 'scripts/render_heatmap.py' - 'hack/extract-keywords/**' env: @@ -39,6 +40,11 @@ jobs: - name: Install dependencies run: sudo apt-get install -y zstd jq + # matplotlib drives the Phase 4 heatmap render. 
Pinned-major install + # to keep CI deterministic; ~30s cold install, fine at current scale. + - name: Install matplotlib + run: pip install --quiet 'matplotlib>=3.7,<4' + - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: @@ -207,12 +213,52 @@ jobs: done <<< "$DATASETS" ./scripts/detector_corpora_test.sh "${files[@]}" + # Render the per-(detector × decoder) Δ heatmap. Decoder columns are + # the only stable per-finding signal post-stdin (file metadata is lost + # — see render_heatmap.py docstring), and they're real diagnostic + # signal: regressions in the BASE64 path mean something different from + # regressions in PLAIN. The script silently no-ops (no PNG written) + # when the grid would be empty or all-zero, so the diff step's + # `[[ -s ... ]]` guard handles "nothing to embed" cleanly. + - name: Render heatmap + if: steps.detect.outputs.any_changed == 'true' + shell: bash + env: + CHANGED: ${{ steps.detect.outputs.pr_csv }} + run: | + set -o pipefail + python3 scripts/render_heatmap.py \ + /tmp/results-main.jsonl /tmp/results-pr.jsonl \ + --changed-detectors="$CHANGED" \ + --output=/tmp/heatmap.png || true + if [[ -s /tmp/heatmap.png ]]; then + ls -lh /tmp/heatmap.png + else + echo "No heatmap produced (empty grid or no diff)." + fi + + # Archive the heatmap regardless of whether it inlines into the + # comment — storage is cheap, comments can be pruned. Capturing + # artifact-url lets the diff step fall back to a clickable link if + # the inline base64 would push the comment past GitHub's 65 KB body + # cap. + - name: Upload heatmap artifact + id: upload_heatmap + if: steps.detect.outputs.any_changed == 'true' + uses: actions/upload-artifact@v4 + with: + name: detector-bench-heatmap + path: /tmp/heatmap.png + if-no-files-found: ignore + retention-days: 14 + - name: Diff results if: steps.detect.outputs.any_changed == 'true' shell: bash env: CHANGED: ${{ steps.detect.outputs.pr_csv }} NEW_DETECTORS: ${{ steps.detect.outputs.new_csv }} + HEATMAP_ARTIFACT_URL: ${{ steps.upload_heatmap.outputs.artifact-url }} run: | set -o pipefail CORPUS_BYTES=0 @@ -223,14 +269,27 @@ jobs: if [[ -s /tmp/keyword-corpus-meta.json ]]; then META_ARG=(--keyword-corpus-meta=/tmp/keyword-corpus-meta.json) fi + HEATMAP_ARG=() + if [[ -s /tmp/heatmap.png ]]; then + HEATMAP_ARG=(--heatmap-png=/tmp/heatmap.png) + if [[ -n "$HEATMAP_ARTIFACT_URL" ]]; then + HEATMAP_ARG+=(--heatmap-artifact-url="$HEATMAP_ARTIFACT_URL") + fi + fi python3 scripts/diff_corpora_results.py \ /tmp/results-main.jsonl /tmp/results-pr.jsonl \ --changed-detectors="$CHANGED" \ --new-detectors="$NEW_DETECTORS" \ --corpus-bytes="$CORPUS_BYTES" \ "${META_ARG[@]}" \ + "${HEATMAP_ARG[@]}" \ > /tmp/diff-report.md - cat /tmp/diff-report.md + # Skip dumping report to logs when the heatmap is inlined — a + # 60+ KB base64 blob makes CI logs unreadable. Print size + the + # non-image lines instead. + REPORT_BYTES=$(wc -c < /tmp/diff-report.md) + echo "diff-report.md: ${REPORT_BYTES} bytes" + grep -v '^!\[.*data:image' /tmp/diff-report.md || true - name: Post results to PR if: steps.detect.outputs.any_changed == 'true' diff --git a/scripts/diff_corpora_results.py b/scripts/diff_corpora_results.py index 633c87f398a0..c153e1555090 100755 --- a/scripts/diff_corpora_results.py +++ b/scripts/diff_corpora_results.py @@ -24,15 +24,26 @@ know the bench's verdict for those detectors leans entirely on the S3 corpus and may be under-sampled. 
+Phase 4: --heatmap-png and --heatmap-artifact-url embed the heatmap +rendered by scripts/render_heatmap.py at the top of the report. We try +inline base64 first (no auth, renders everywhere GitHub Markdown does); +if the encoded image would push the comment near GitHub's 65 KB body +limit, we fall back to a clickable artifact link instead. Missing PNG +or zero-length file → no heatmap section, no error. + Usage: diff_corpora_results.py [--changed-detectors=] [--new-detectors=] [--corpus-bytes=] [--keyword-corpus-meta=] + [--heatmap-png=] + [--heatmap-artifact-url=] """ import argparse +import base64 import json +import os import sys from collections import defaultdict @@ -51,6 +62,12 @@ SAMPLE_LIMIT = 10 SAMPLE_TRUNCATE = 120 +# GitHub issue/PR comment bodies are capped at 65536 chars. The summary table +# plus per-detector details routinely consume 5-15 KB; budgeting ~50 KB for the +# inline PNG keeps comfortable headroom while still permitting most heatmaps +# to embed directly. Larger images fall back to the artifact-URL link. +HEATMAP_INLINE_LIMIT_BYTES = 50 * 1024 + def parse_csv(s): """Parse a comma-separated detector list into normalized name set. @@ -148,7 +165,56 @@ def load_keyword_corpus_meta(path): return {"thin_l1": thin, "reports": raw.get("reports") or []} -def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, keyword_meta=None): +def build_heatmap_section(png_path, artifact_url): + """Return the Markdown lines that embed the heatmap, or [] if there's + nothing to embed. + + Strategy: + 1. If the PNG file is missing/empty, nothing to do. + 2. Try inline base64 if the encoded body fits the inline budget. + Inline survives comment pruning of artifacts and renders without + GitHub auth on mobile/desktop alike. + 3. Otherwise fall back to a plain Markdown link to the workflow + artifact URL with a one-line explainer. Artifact URLs require + GitHub login, so no embedded ``![](...)`` — that would render as + a broken image. 
+ """ + if not png_path or not os.path.isfile(png_path): + return [] + try: + size = os.path.getsize(png_path) + except OSError: + return [] + if size <= 0: + return [] + + if size <= HEATMAP_INLINE_LIMIT_BYTES: + try: + with open(png_path, "rb") as f: + encoded = base64.b64encode(f.read()).decode("ascii") + except OSError: + return [] + return [ + "### Δ heatmap (changed detectors × decoder)", + "", + f"![PR vs main — Δ unique findings per (detector, decoder)]" + f"(data:image/png;base64,{encoded})", + "", + ] + + if artifact_url: + return [ + "### Δ heatmap (changed detectors × decoder)", + "", + f"_Heatmap PNG too large to inline ({size // 1024} KB); " + f"[download from workflow artifacts]({artifact_url})._", + "", + ] + return [] + + +def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, + keyword_meta=None, heatmap_png=None, heatmap_artifact_url=None): new_detectors = new_detectors or set() keyword_meta = keyword_meta or {"thin_l1": set(), "reports": []} @@ -209,6 +275,8 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, keywor ) parts.append("") + parts += build_heatmap_section(heatmap_png, heatmap_artifact_url) + if not rows and not missing: parts += ["_(No findings on either side for the changed detectors.)_", ""] return "\n".join(parts) @@ -353,6 +421,11 @@ def main(): help="Total uncompressed bytes scanned; enables blast-radius column.") parser.add_argument("--keyword-corpus-meta", default="", help="Path to build_keyword_corpus.py sidecar; surfaces thin-L1 warnings.") + parser.add_argument("--heatmap-png", default="", + help="Path to heatmap PNG produced by render_heatmap.py.") + parser.add_argument("--heatmap-artifact-url", default="", + help="Workflow artifact URL for the heatmap; used as fallback link " + "when the PNG exceeds the inline budget.") args = parser.parse_args() main_findings = load_findings(args.main_jsonl) @@ -369,6 +442,8 @@ def main(): new_detectors=new_detectors, corpus_bytes=corpus_bytes, keyword_meta=keyword_meta, + heatmap_png=args.heatmap_png or None, + heatmap_artifact_url=args.heatmap_artifact_url or None, )) diff --git a/scripts/render_heatmap.py b/scripts/render_heatmap.py new file mode 100644 index 000000000000..bce62890ed94 --- /dev/null +++ b/scripts/render_heatmap.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python3 +"""Render a per-(detector, decoder) Δ heatmap of detector findings. + +Inputs are the same JSONL files produced by trufflehog stdin scans that +``diff_corpora_results.py`` consumes (main vs PR). The diff script identifies +findings by ``(DetectorName, Raw or RawV2)`` with set semantics; the heatmap +keeps that identity but adds ``DecoderName`` to the bucketing key, so each +cell answers "how many unique secrets did this (detector, decoder) cell gain +or lose?" + +Bucketing rationale (Phase 4 design decision): + + Stdin scans drop file metadata — both Layer 0 (S3 corpus) and Layer 1 + (keyword corpus) findings come back with empty ``SourceMetadata.Data.Stdin``, + so we can't bucket by file extension. ``DecoderName`` is the only stable + per-finding signal that always exists, and it carries real diagnostic + meaning: "the regression came in via the BASE64 decode path" or "the + ESCAPED_UNICODE path lit up new false positives" tells reviewers which + lane to investigate. Robust-by-construction beats heuristic-on-Raw or + reverse-correlation-on-L1. + +Visual choices: + + - Diverging RdBu_r colormap: red = increase (regression-likely), blue = + decrease (lost recall), white = 0. 
+ - SymLogNorm: cells with Δ=1 in a rare decoder remain visible even when a + sibling cell has Δ=200 in PLAIN. Linear band around 0 keeps the white + "no change" reading; log-ish outside it preserves the rare-decoder + diagnostic. Without this, common-decoder outliers wash out small but + important signals. + - Every cell is annotated with its integer Δ. Belt-and-suspenders against + color-only readings. + - Empty decoder columns (no findings on either side for any changed + detector) are dropped — no need to render dead space. + +Identity bucketing: + + identity := (DecoderName, Raw or RawV2) + per-cell Δ := |pr_only| - |main_only| + + Note this is a stricter identity than the summary table's + ``(DetectorName, Raw or RawV2)``: a single secret found via both PLAIN + and BASE64 contributes one identity to the table but two to the heatmap. + That's the desired behavior — the heatmap diagnoses *which decoder path + changed*, not *how many distinct secrets changed overall*. + +Skips rendering and exits with code 0 (no file written) when the grid would +be all-zero or empty. The diff script handles a missing file gracefully. + +Usage: + render_heatmap.py --changed-detectors= + [--output=/tmp/heatmap.png] +""" +from __future__ import annotations + +import argparse +import json +import sys +from collections import defaultdict + + +# Standard decoders emitted by trufflehog. Ordered by expected frequency so +# the heatmap reads left-to-right common→rare. Any decoder not in this list +# falls through to alphabetical ordering after the canonical ones. +DECODER_ORDER = ["PLAIN", "BASE64", "UTF8", "UTF16", "ESCAPED_UNICODE"] + + +def parse_csv(s): + """Lowercase + strip ``.v`` suffix, mirrors diff_corpora_results.parse_csv.""" + if not s: + return set() + out = set() + for item in s.split(","): + item = item.strip() + if not item: + continue + if "." in item: + item = item.split(".", 1)[0] + out.add(item.lower()) + return out + + +def load_findings(path): + """Returns dict: detector -> dict[decoder] -> set(raw identities).""" + by_dd = defaultdict(lambda: defaultdict(set)) + try: + f = open(path, "r", encoding="utf-8", errors="replace") + except OSError: + return by_dd + with f: + for line in f: + line = line.strip() + if not line: + continue + try: + obj = json.loads(line) + except json.JSONDecodeError: + continue + detector = obj.get("DetectorName") or "" + decoder = obj.get("DecoderName") or "UNKNOWN" + raw = obj.get("Raw") or obj.get("RawV2") or "" + if not detector or not raw: + continue + by_dd[detector][decoder].add(raw) + return by_dd + + +def order_decoders(present): + """PLAIN/BASE64/... first when present, then any extras alphabetically.""" + canonical = [d for d in DECODER_ORDER if d in present] + extras = sorted(d for d in present if d not in DECODER_ORDER) + return canonical + extras + + +def build_grid(main, pr, changed): + """Returns (rows, cols, deltas) where deltas[i][j] is the signed Δ count + for row detector i and column decoder j. 
Detectors and decoders that + never appear on either side are dropped.""" + detectors = sorted( + d for d in (set(main) | set(pr)) + if d.lower() in changed + ) + + decoders_present = set() + for d in detectors: + decoders_present.update(main.get(d, {}).keys()) + decoders_present.update(pr.get(d, {}).keys()) + decoders = order_decoders(decoders_present) + + deltas = [] + row_abs_totals = [] + for d in detectors: + row = [] + row_abs = 0 + for dec in decoders: + m_set = main.get(d, {}).get(dec, set()) + p_set = pr.get(d, {}).get(dec, set()) + delta = len(p_set - m_set) - len(m_set - p_set) + row.append(delta) + row_abs += abs(delta) + deltas.append(row) + row_abs_totals.append(row_abs) + + # Drop columns that are zero across every detector — they add no signal. + keep_cols = [j for j in range(len(decoders)) + if any(deltas[i][j] != 0 for i in range(len(detectors)))] + if not keep_cols: + return detectors, [], [] + decoders = [decoders[j] for j in keep_cols] + deltas = [[row[j] for j in keep_cols] for row in deltas] + + # Sort rows by total |Δ| desc, ties broken alphabetically. Detectors with + # no Δ in any kept column drop off the bottom of the figure. + order = sorted( + range(len(detectors)), + key=lambda i: (-row_abs_totals[i], detectors[i]), + ) + detectors = [detectors[i] for i in order if row_abs_totals[i] > 0] + deltas = [deltas[i] for i in order if row_abs_totals[i] > 0] + + return detectors, decoders, deltas + + +def render(detectors, decoders, deltas, output_path): + """Write the heatmap PNG. Caller has already verified the grid is non-empty.""" + # Lazy import: this script is only invoked from the workflow when + # matplotlib has been pip-installed in CI; importing at module top would + # break unit-test-style invocations that just want to assert grid shape. + import matplotlib + + matplotlib.use("Agg") # No display in CI. + import matplotlib.pyplot as plt + from matplotlib.colors import SymLogNorm + + n_rows = len(detectors) + n_cols = len(decoders) + + # Figure size: aim for a tight PNG well under the 50 KB inline budget. + # Width scales with column count; height scales with row count, with + # generous lower bounds so labels don't clip on tiny grids. + width = max(5.5, 1.4 * n_cols + 2.5) + height = max(2.5, 0.55 * n_rows + 1.6) + fig, ax = plt.subplots(figsize=(width, height), dpi=100) + + max_abs = max((abs(v) for row in deltas for v in row), default=1) + if max_abs < 1: + max_abs = 1 + # SymLogNorm linthresh=1 keeps integer Δ in [-1,1] in the linear band so + # zero stays white; outside that we go log-ish so a Δ=200 cell doesn't + # saturate everything else to faint pastel. + norm = SymLogNorm(linthresh=1.0, vmin=-max_abs, vmax=max_abs, base=10) + + im = ax.imshow(deltas, aspect="auto", cmap="RdBu_r", norm=norm) + + ax.set_xticks(range(n_cols)) + ax.set_xticklabels(decoders, rotation=30, ha="right", fontsize=9) + ax.set_yticks(range(n_rows)) + ax.set_yticklabels(detectors, fontsize=9) + ax.set_xlabel("Decoder", fontsize=10) + ax.set_ylabel("Detector", fontsize=10) + ax.set_title("PR vs main — Δ unique findings per (detector, decoder)", fontsize=11) + + # Annotate every cell with its integer Δ. Text color flips to white + # on saturated cells so the number stays readable. + for i in range(n_rows): + for j in range(n_cols): + v = deltas[i][j] + if v == 0: + label = "0" + color = "#888888" + else: + label = f"{v:+d}" + rgba = im.cmap(im.norm(v)) + # Perceived luminance — flip text color on dark cells. 
+ lum = 0.299 * rgba[0] + 0.587 * rgba[1] + 0.114 * rgba[2] + color = "white" if lum < 0.45 else "black" + ax.text(j, i, label, ha="center", va="center", + color=color, fontsize=9) + + cbar = fig.colorbar(im, ax=ax, shrink=0.85, pad=0.02) + cbar.set_label("Δ unique findings (PR − main)", fontsize=9) + cbar.ax.tick_params(labelsize=8) + + fig.tight_layout() + fig.savefig(output_path, dpi=100, format="png", + bbox_inches="tight", pad_inches=0.15) + plt.close(fig) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("main_jsonl") + parser.add_argument("pr_jsonl") + parser.add_argument("--changed-detectors", default="", + help="CSV of detectors changed in PR; restricts heatmap rows.") + parser.add_argument("--output", default="/tmp/heatmap.png", + help="PNG output path (default /tmp/heatmap.png).") + args = parser.parse_args() + + changed = parse_csv(args.changed_detectors) + if not changed: + print("[render_heatmap] no changed detectors supplied; nothing to render", + file=sys.stderr) + return 0 + + main_findings = load_findings(args.main_jsonl) + pr_findings = load_findings(args.pr_jsonl) + detectors, decoders, deltas = build_grid(main_findings, pr_findings, changed) + + if not detectors or not decoders: + print("[render_heatmap] grid is empty or all-zero; skipping render", + file=sys.stderr) + return 0 + + render(detectors, decoders, deltas, args.output) + print(f"[render_heatmap] wrote {args.output} " + f"({len(detectors)} rows × {len(decoders)} cols)", + file=sys.stderr) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From c20a5dde82e4529b1c8d6c43c89d584487cc4656 Mon Sep 17 00:00:00 2001 From: Shahzad Haider Date: Sat, 2 May 2026 22:36:25 +0500 Subject: [PATCH 19/43] Phase 4 rework (1/2): emit heatmap-grid.json sidecar from render_heatmap.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a JSON sidecar that captures the same Δ matrix the PNG renders. The diff script consumes this to render an emoji-bucketed Markdown table — GitHub's PR-comment Markdown sanitizer strips data: URLs and serves artifact zips behind auth, so neither inline base64 nor an artifact embed actually displays. The PNG stays for artifact archival and click-through. Sidecar shape: {detectors, decoders, deltas, _layout}, with _layout documenting the deltas[i][j] orientation inline so future readers don't have to reverse-engineer it. Emitted whenever the grid is non-empty, even if matplotlib import fails — the comment never depends on the PNG. Co-Authored-By: Claude Opus 4.7 --- scripts/render_heatmap.py | 60 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 4 deletions(-) diff --git a/scripts/render_heatmap.py b/scripts/render_heatmap.py index bce62890ed94..5b26d981b300 100644 --- a/scripts/render_heatmap.py +++ b/scripts/render_heatmap.py @@ -44,12 +44,28 @@ That's the desired behavior — the heatmap diagnoses *which decoder path changed*, not *how many distinct secrets changed overall*. -Skips rendering and exits with code 0 (no file written) when the grid would -be all-zero or empty. The diff script handles a missing file gracefully. +Outputs: + + - PNG (``--output``, default /tmp/heatmap.png): the matplotlib render. + Archived as a workflow artifact for reviewers who want the colored + version; not embedded inline in the comment because GitHub's + Markdown sanitizer strips ``data:`` URLs and artifact-zip URLs are + auth-gated, neither of which renders as ```` in PR comments. 
+ + - Grid JSON (``--grid-output``, default /tmp/heatmap-grid.json): same + Δ matrix as the PNG. The diff script reads this and renders an + emoji-bucketed Markdown table — that's what actually shows up in the + PR comment. Always emitted when a non-empty grid exists, even if + matplotlib isn't available, so the comment renders without the PNG + if needed. + +Skips both outputs (no files written) when the grid would be all-zero or +empty. The diff script handles missing files gracefully. Usage: render_heatmap.py --changed-detectors= [--output=/tmp/heatmap.png] + [--grid-output=/tmp/heatmap-grid.json] """ from __future__ import annotations @@ -227,6 +243,38 @@ def render(detectors, decoders, deltas, output_path): plt.close(fig) +def write_grid_json(path, detectors, decoders, deltas): + """Persist the grid the diff script renders the emoji table from. + + The ``_layout`` field is a human-readable note for future readers — it + has no behavioral effect. We emit it inline rather than relying solely + on this docstring because the JSON is the long-lived contract between + the renderer and the diff script. + """ + payload = { + "detectors": detectors, + "decoders": decoders, + "deltas": deltas, + "_layout": "deltas[i][j] = (PR - main) unique-finding count for detectors[i] / decoders[j]", + } + with open(path, "w", encoding="utf-8") as f: + json.dump(payload, f, indent=2) + f.write("\n") + + +def try_render_png(detectors, decoders, deltas, output_path): + """Attempt to render the PNG; on matplotlib import failure, log and + move on. The PNG is artifact-only — the comment doesn't need it — so a + missing matplotlib should not fail the workflow.""" + try: + render(detectors, decoders, deltas, output_path) + except ImportError as exc: + print(f"[render_heatmap] matplotlib unavailable, skipping PNG: {exc}", + file=sys.stderr) + return False + return True + + def main(): parser = argparse.ArgumentParser() parser.add_argument("main_jsonl") @@ -235,6 +283,8 @@ def main(): help="CSV of detectors changed in PR; restricts heatmap rows.") parser.add_argument("--output", default="/tmp/heatmap.png", help="PNG output path (default /tmp/heatmap.png).") + parser.add_argument("--grid-output", default="/tmp/heatmap-grid.json", + help="Grid JSON output path; consumed by diff_corpora_results.py.") args = parser.parse_args() changed = parse_csv(args.changed_detectors) @@ -252,8 +302,10 @@ def main(): file=sys.stderr) return 0 - render(detectors, decoders, deltas, args.output) - print(f"[render_heatmap] wrote {args.output} " + write_grid_json(args.grid_output, detectors, decoders, deltas) + png_ok = try_render_png(detectors, decoders, deltas, args.output) + suffix = f" + {args.output}" if png_ok else " (PNG skipped)" + print(f"[render_heatmap] wrote {args.grid_output}{suffix} " f"({len(detectors)} rows × {len(decoders)} cols)", file=sys.stderr) return 0 From 80f67478eb4b4e3ba6cfd3c43b4b038265e69933 Mon Sep 17 00:00:00 2001 From: Shahzad Haider Date: Sat, 2 May 2026 22:36:49 +0500 Subject: [PATCH 20/43] Phase 4 rework (2/2): replace data-URL embed with emoji-bucketed Markdown table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GitHub's PR-comment Markdown sanitizer strips data: URLs from user content, so the inline base64 PNG embed shipped in the prior commit rendered as a broken link in the comment DOM (no tag emitted). Artifact-zip URLs require auth to download, so a fallback ![](...) link is also a non-starter — it would render as a broken image. 
Switch to a per-(detector, decoder) Δ table built from the grid JSON sidecar render_heatmap.py now emits. Cells use emoji buckets aligned with the existing status-emoji thresholds so the visual weight matches the summary table: 🟥 Δ ≥ +6 (matches NEW > 5 → 🔴) 🟧 +1..+5 ⬜ 0 🟦 ≤ −1 Renders identically on web, mobile, email notifications, and CI log replay — every surface a PR comment lands on. The colored matplotlib PNG stays as a workflow artifact; when --heatmap-artifact-url is supplied, the table is followed by a click-through link for reviewers who want the rich version. Workflow YAML mirrors the rename (--heatmap-png → --heatmap-grid) and drops the base64-blob log filter — the report is back to plain human-readable text. Co-Authored-By: Claude Opus 4.7 --- .github/workflows/detector-corpora-test.yml | 44 +++--- scripts/diff_corpora_results.py | 166 +++++++++++++------- 2 files changed, 131 insertions(+), 79 deletions(-) diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index 9a96710ed3f7..21d3350ce71b 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -216,10 +216,20 @@ jobs: # Render the per-(detector × decoder) Δ heatmap. Decoder columns are # the only stable per-finding signal post-stdin (file metadata is lost # — see render_heatmap.py docstring), and they're real diagnostic - # signal: regressions in the BASE64 path mean something different from - # regressions in PLAIN. The script silently no-ops (no PNG written) - # when the grid would be empty or all-zero, so the diff step's - # `[[ -s ... ]]` guard handles "nothing to embed" cleanly. + # signal: regressions in the BASE64 path mean something different + # from regressions in PLAIN. Two outputs: + # + # - heatmap-grid.json: the actual data the comment renders from + # (via diff_corpora_results.py's emoji table). The comment never + # embeds the PNG — GitHub's PR-comment Markdown sanitizer strips + # ``data:`` URLs and serves artifact zips behind auth, so neither + # inline nor artifact-link images render. + # - heatmap.png: archived as a workflow artifact for reviewers + # who want the colored matplotlib version. The comment includes + # a click-through link to it. + # + # Both outputs are skipped when the grid would be empty/all-zero; + # the diff step's `[[ -s ... ]]` guards handle that cleanly. - name: Render heatmap if: steps.detect.outputs.any_changed == 'true' shell: bash @@ -230,18 +240,17 @@ jobs: python3 scripts/render_heatmap.py \ /tmp/results-main.jsonl /tmp/results-pr.jsonl \ --changed-detectors="$CHANGED" \ - --output=/tmp/heatmap.png || true - if [[ -s /tmp/heatmap.png ]]; then - ls -lh /tmp/heatmap.png + --output=/tmp/heatmap.png \ + --grid-output=/tmp/heatmap-grid.json || true + if [[ -s /tmp/heatmap-grid.json ]]; then + ls -lh /tmp/heatmap-grid.json /tmp/heatmap.png 2>/dev/null || true else echo "No heatmap produced (empty grid or no diff)." fi - # Archive the heatmap regardless of whether it inlines into the - # comment — storage is cheap, comments can be pruned. Capturing - # artifact-url lets the diff step fall back to a clickable link if - # the inline base64 would push the comment past GitHub's 65 KB body - # cap. + # Archive the colored PNG so reviewers who want the rich version can + # click through. The comment never embeds the image — see the render + # step comment for why — so this is artifact-only. 
- name: Upload heatmap artifact id: upload_heatmap if: steps.detect.outputs.any_changed == 'true' @@ -270,8 +279,8 @@ jobs: META_ARG=(--keyword-corpus-meta=/tmp/keyword-corpus-meta.json) fi HEATMAP_ARG=() - if [[ -s /tmp/heatmap.png ]]; then - HEATMAP_ARG=(--heatmap-png=/tmp/heatmap.png) + if [[ -s /tmp/heatmap-grid.json ]]; then + HEATMAP_ARG=(--heatmap-grid=/tmp/heatmap-grid.json) if [[ -n "$HEATMAP_ARTIFACT_URL" ]]; then HEATMAP_ARG+=(--heatmap-artifact-url="$HEATMAP_ARTIFACT_URL") fi @@ -284,12 +293,7 @@ jobs: "${META_ARG[@]}" \ "${HEATMAP_ARG[@]}" \ > /tmp/diff-report.md - # Skip dumping report to logs when the heatmap is inlined — a - # 60+ KB base64 blob makes CI logs unreadable. Print size + the - # non-image lines instead. - REPORT_BYTES=$(wc -c < /tmp/diff-report.md) - echo "diff-report.md: ${REPORT_BYTES} bytes" - grep -v '^!\[.*data:image' /tmp/diff-report.md || true + cat /tmp/diff-report.md - name: Post results to PR if: steps.detect.outputs.any_changed == 'true' diff --git a/scripts/diff_corpora_results.py b/scripts/diff_corpora_results.py index c153e1555090..ead2637138b3 100755 --- a/scripts/diff_corpora_results.py +++ b/scripts/diff_corpora_results.py @@ -24,12 +24,15 @@ know the bench's verdict for those detectors leans entirely on the S3 corpus and may be under-sampled. -Phase 4: --heatmap-png and --heatmap-artifact-url embed the heatmap -rendered by scripts/render_heatmap.py at the top of the report. We try -inline base64 first (no auth, renders everywhere GitHub Markdown does); -if the encoded image would push the comment near GitHub's 65 KB body -limit, we fall back to a clickable artifact link instead. Missing PNG -or zero-length file → no heatmap section, no error. +Phase 4: --heatmap-grid points at the JSON sidecar emitted by +scripts/render_heatmap.py — same Δ matrix as the PNG render. We turn that +into an emoji-bucketed Markdown table at the top of the report so reviewers +can see the per-(detector, decoder) breakdown at a glance. Inline image +embedding was tried and abandoned: GitHub strips ``data:`` URLs from PR +comments, and artifact-zip URLs are auth-gated rather than served as +images. The PNG still ships as a workflow artifact — when +--heatmap-artifact-url is supplied we link to it under the table for +reviewers who want the colored version. Usage: diff_corpora_results.py @@ -37,11 +40,10 @@ [--new-detectors=] [--corpus-bytes=] [--keyword-corpus-meta=] - [--heatmap-png=] + [--heatmap-grid=] [--heatmap-artifact-url=] """ import argparse -import base64 import json import os import sys @@ -62,11 +64,14 @@ SAMPLE_LIMIT = 10 SAMPLE_TRUNCATE = 120 -# GitHub issue/PR comment bodies are capped at 65536 chars. The summary table -# plus per-detector details routinely consume 5-15 KB; budgeting ~50 KB for the -# inline PNG keeps comfortable headroom while still permitting most heatmaps -# to embed directly. Larger images fall back to the artifact-URL link. -HEATMAP_INLINE_LIMIT_BYTES = 50 * 1024 +# Heatmap cell color buckets, applied to per-(detector, decoder) Δ unique +# findings. Thresholds line up with status_emoji's NEW threshold (>5 trips +# 🔴) so a 🟥 cell carries the same "this is real" weight as a 🔴 row in the +# summary table. 🟦 covers any decrease — recall regressions are rarer than +# FP regressions, and we haven't seen real data justifying a removal +# gradient yet. 
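+#
+# Illustrative mapping under these thresholds (not observed data):
+#   Δ=+9 → 🟥, Δ=+2 → 🟧, Δ=0 → ⬜, Δ=−2 → 🟦.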
+HEATMAP_BUCKET_HOT = 6 # Δ ≥ this → 🟥 +HEATMAP_BUCKET_WARM = 1 # +1..+5 → 🟧 def parse_csv(s): @@ -165,56 +170,98 @@ def load_keyword_corpus_meta(path): return {"thin_l1": thin, "reports": raw.get("reports") or []} -def build_heatmap_section(png_path, artifact_url): - """Return the Markdown lines that embed the heatmap, or [] if there's - nothing to embed. +def heatmap_cell(delta): + """Render one (emoji, signed-int) cell for the heatmap table.""" + if delta >= HEATMAP_BUCKET_HOT: + emoji = "🟥" + elif delta >= HEATMAP_BUCKET_WARM: + emoji = "🟧" + elif delta == 0: + emoji = "⬜" + else: + emoji = "🟦" + sign = "+" if delta > 0 else "" # Negative numbers carry their own minus. + return f"{emoji} {sign}{delta}" + - Strategy: - 1. If the PNG file is missing/empty, nothing to do. - 2. Try inline base64 if the encoded body fits the inline budget. - Inline survives comment pruning of artifacts and renders without - GitHub auth on mobile/desktop alike. - 3. Otherwise fall back to a plain Markdown link to the workflow - artifact URL with a one-line explainer. Artifact URLs require - GitHub login, so no embedded ``![](...)`` — that would render as - a broken image. +def load_heatmap_grid(path): + """Read the grid sidecar from render_heatmap.py. + + Returns ``None`` for any failure mode (missing file, malformed JSON, + schema mismatch). The heatmap is informational; if the grid isn't + readable the diff still posts without it. """ - if not png_path or not os.path.isfile(png_path): - return [] + if not path or not os.path.isfile(path): + return None try: - size = os.path.getsize(png_path) - except OSError: - return [] - if size <= 0: + with open(path, "r", encoding="utf-8") as f: + data = json.load(f) + except (OSError, json.JSONDecodeError): + return None + detectors = data.get("detectors") or [] + decoders = data.get("decoders") or [] + deltas = data.get("deltas") or [] + if not detectors or not decoders or not deltas: + return None + if len(deltas) != len(detectors): + return None + if any(len(row) != len(decoders) for row in deltas): + return None + return {"detectors": detectors, "decoders": decoders, "deltas": deltas} + + +def build_heatmap_section(grid_path, artifact_url): + """Return the Markdown lines for the per-(detector, decoder) Δ heatmap. + + Renders an emoji-bucketed table from the JSON grid emitted by + render_heatmap.py rather than an inline image, because GitHub's + PR-comment sanitizer strips ``data:`` URLs and artifact-zip URLs are + auth-gated. The table renders identically on web, mobile, email + notifications, and CI log replay — all surfaces a PR comment lands on. + + The colored PNG is still produced as a workflow artifact; when + ``artifact_url`` is supplied we tack a click-through link onto the + table for reviewers who want the rich version. + + Returns ``[]`` if no grid is available — the rest of the report + renders unchanged. + """ + grid = load_heatmap_grid(grid_path) + if grid is None: return [] - if size <= HEATMAP_INLINE_LIMIT_BYTES: - try: - with open(png_path, "rb") as f: - encoded = base64.b64encode(f.read()).decode("ascii") - except OSError: - return [] - return [ - "### Δ heatmap (changed detectors × decoder)", - "", - f"![PR vs main — Δ unique findings per (detector, decoder)]" - f"(data:image/png;base64,{encoded})", - "", - ] + detectors = grid["detectors"] + decoders = grid["decoders"] + deltas = grid["deltas"] + + lines = [ + "### Δ heatmap (changed detectors × decoder)", + "", + "_Per-cell Δ = unique findings (PR − main). 
🟥 ≥+6, 🟧 +1..+5, " + "⬜ 0, 🟦 ≤−1._", + "", + ] + + header = ["Detector"] + decoders + align = ["---"] + ["---:" for _ in decoders] + lines.append("| " + " | ".join(header) + " |") + lines.append("|" + "|".join(align) + "|") + for i, det in enumerate(detectors): + row = [f"**{det}**"] + [heatmap_cell(deltas[i][j]) + for j in range(len(decoders))] + lines.append("| " + " | ".join(row) + " |") + lines.append("") if artifact_url: - return [ - "### Δ heatmap (changed detectors × decoder)", - "", - f"_Heatmap PNG too large to inline ({size // 1024} KB); " - f"[download from workflow artifacts]({artifact_url})._", - "", - ] - return [] + lines.append( + f"_Colored PNG version: [download from workflow artifacts]({artifact_url})._" + ) + lines.append("") + return lines def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, - keyword_meta=None, heatmap_png=None, heatmap_artifact_url=None): + keyword_meta=None, heatmap_grid=None, heatmap_artifact_url=None): new_detectors = new_detectors or set() keyword_meta = keyword_meta or {"thin_l1": set(), "reports": []} @@ -275,7 +322,7 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, ) parts.append("") - parts += build_heatmap_section(heatmap_png, heatmap_artifact_url) + parts += build_heatmap_section(heatmap_grid, heatmap_artifact_url) if not rows and not missing: parts += ["_(No findings on either side for the changed detectors.)_", ""] @@ -421,11 +468,12 @@ def main(): help="Total uncompressed bytes scanned; enables blast-radius column.") parser.add_argument("--keyword-corpus-meta", default="", help="Path to build_keyword_corpus.py sidecar; surfaces thin-L1 warnings.") - parser.add_argument("--heatmap-png", default="", - help="Path to heatmap PNG produced by render_heatmap.py.") + parser.add_argument("--heatmap-grid", default="", + help="Path to grid JSON produced by render_heatmap.py; " + "drives the emoji-bucketed Δ table.") parser.add_argument("--heatmap-artifact-url", default="", - help="Workflow artifact URL for the heatmap; used as fallback link " - "when the PNG exceeds the inline budget.") + help="Workflow artifact URL for the colored PNG; " + "rendered as a click-through link below the table.") args = parser.parse_args() main_findings = load_findings(args.main_jsonl) @@ -442,7 +490,7 @@ def main(): new_detectors=new_detectors, corpus_bytes=corpus_bytes, keyword_meta=keyword_meta, - heatmap_png=args.heatmap_png or None, + heatmap_grid=args.heatmap_grid or None, heatmap_artifact_url=args.heatmap_artifact_url or None, )) From 02ae97ba65b87997ac10ce1c858f1864f82ff716 Mon Sep 17 00:00:00 2001 From: Shahzad Haider Date: Sat, 2 May 2026 23:22:10 +0500 Subject: [PATCH 21/43] Phase 5 complete - Polish --- .github/workflows/detector-corpora-test.yml | 71 +++++++++++++----- .../diff_corpora_results.cpython-310.pyc | Bin 0 -> 16606 bytes .../render_heatmap.cpython-310.pyc | Bin 0 -> 11852 bytes scripts/diff_corpora_results.py | 31 +++++++- 4 files changed, 81 insertions(+), 21 deletions(-) create mode 100644 scripts/__pycache__/diff_corpora_results.cpython-310.pyc create mode 100644 scripts/__pycache__/render_heatmap.cpython-310.pyc diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index 21d3350ce71b..03c2044e2e7f 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -91,18 +91,34 @@ jobs: echo "any_changed=false" >> "$GITHUB_OUTPUT" fi - - name: Skip comment (no detector source changed) - if: 
steps.detect.outputs.any_changed != 'true' - uses: actions/github-script@v7 + # Sticky comment: find any prior detector-bench comment on the PR by + # the marker substring and update it in place. The marker — kept in + # sync with STICKY_COMMENT_MARKER in scripts/diff_corpora_results.py — + # has to appear in BOTH the skip body and the diff body so the same + # comment flips between them as iterative pushes change which path + # fires. Skip body is only posted on pull_request events; the original + # workflow_dispatch early-return is preserved by the event-name guard. + - name: Find existing skip comment + if: steps.detect.outputs.any_changed != 'true' && github.event_name == 'pull_request' + id: find_skip_comment + uses: peter-evans/find-comment@v3 with: - script: | - if (context.eventName === 'workflow_dispatch') return; - await github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - body: '## Corpora Test Results\n\nNo detector source files changed in this PR. Bench skipped.', - }); + issue-number: ${{ github.event.pull_request.number }} + comment-author: 'github-actions[bot]' + body-includes: '' + + - name: Post or update skip comment + if: steps.detect.outputs.any_changed != 'true' && github.event_name == 'pull_request' + uses: peter-evans/create-or-update-comment@v4 + with: + comment-id: ${{ steps.find_skip_comment.outputs.comment-id }} + issue-number: ${{ github.event.pull_request.number }} + edit-mode: replace + body: | + + ## Corpora Test Results + + No detector source files changed in this PR. Bench skipped. # Layer 1 keyword corpus — fetch real-world snippets from GitHub Code # Search for each changed detector's pre-filter keywords. Output is a @@ -295,13 +311,15 @@ jobs: > /tmp/diff-report.md cat /tmp/diff-report.md - - name: Post results to PR + # workflow_dispatch runs don't carry an issue context, so resolve the + # PR number by branch lookup. pull_request events fall through to the + # event's issue number. Output feeds the find/update pair below. 
+ - name: Resolve PR number if: steps.detect.outputs.any_changed == 'true' + id: resolve_pr uses: actions/github-script@v7 with: script: | - const fs = require('fs'); - const body = fs.readFileSync('/tmp/diff-report.md', 'utf8'); let issue_number; if (context.eventName === 'workflow_dispatch') { const pulls = await github.rest.pulls.list({ @@ -318,9 +336,22 @@ jobs: } else { issue_number = context.issue.number; } - await github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number, - body, - }); + core.setOutput('issue_number', issue_number); + + - name: Find existing diff comment + if: steps.detect.outputs.any_changed == 'true' + id: find_diff_comment + uses: peter-evans/find-comment@v3 + with: + issue-number: ${{ steps.resolve_pr.outputs.issue_number }} + comment-author: 'github-actions[bot]' + body-includes: '' + + - name: Post or update diff comment + if: steps.detect.outputs.any_changed == 'true' + uses: peter-evans/create-or-update-comment@v4 + with: + comment-id: ${{ steps.find_diff_comment.outputs.comment-id }} + issue-number: ${{ steps.resolve_pr.outputs.issue_number }} + edit-mode: replace + body-path: /tmp/diff-report.md diff --git a/scripts/__pycache__/diff_corpora_results.cpython-310.pyc b/scripts/__pycache__/diff_corpora_results.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..025935b14addee84f5b0f58a0abc9f8228595440 GIT binary patch literal 16606 zcma)jYj7Obm0ovG&jXFYlOPF#6xAjz2}5FrqGVYz31LNolti0^XaJI8+Vr3?w+9$t zUcB8y2(yi>wkX?F62)9an`Bqg3RJD_Qq}%UB~G$`wp_KT{K)1}l|TI}rBtQjawY53 z##=5{yv%pb?Vbk+WdqdQzJ2@lz2}~L?s<1(Xvos=_xMYHUHP*GP5ZBO)BBge%}e-s ze}O`1H#DIOqpIJ~`D)xS_?ozp;H!DV#5GY()>1c8x+csUX_35<5vdykB7MUWnHz&* z;6_$hH-^NZ$iAc97#2ff7}pUoB1Ul?6=Px?*D-gntcwF;;vN0QxOhSw5{FScAST5T zJe?3z;wY{M#WC?1u7|{N@i?xB2aEwjJSlAPCG?&YC&Wp#9&x9{mp2XRx<~G4fXZ0n zU+$^ap^T|s%qP^Ox zR7K8q>cVzw6+Cq8YmVF$jjg&R-DX4jw%@QlU*P3}WnB?&-LLrDcGH!1xl$LE`r0{r z`jYFrCBGpToSHjhFFIRxL(=7qXY8_5t*$zy&0N7=a(&x#YYs*zdG_aTe_)ALvsx)R zzU$dr6@ML2Iktx(tFG;poVs0VRI5(YbL~nUP&DeOL;%d#UIU$_yXL-Sm)4#7nrpcj z0$@2p&@$@omOU*Zz|h?-N!QPH1|SHfTWj3L+!z>Zd&8BLGC@;m)a{CAi;Cy0R$XB` z0GpukD>c`i_FMu^R+_$dCLX?6YDnP9DPkpl!CTMGSOD)f59o?Bt$JLwg@A7}~b$4b0M#Wv7H0fg2lZ0SM)?RdXFY0GMjAHODWlM@uHqtJ{G6`nm%wJ#)_9 zT6gPqKCdt$@-cuOEvP9Qw}C$~!(&IgTW*wE9^-&ot?moa5ccXecfY<^urEc}^z3rg zSz803w<`{21FCyw!rDMOb>T{Mpl!8(`N{WxA2j6nEvy`r=B#>+YRkv^+`329G+l4l zOq8owMzvCRK`n1$B^Fi!%(lGz>NXKc-y$4)wd#0&UOJ+Jfk7*+TAdcT0b2Ah9`J;B zK&jT*r|nlR+Kn>rV;x|X8g(DY?_lm(=N#6t>27Z|q=@INxxQmJ8=yUajdOqv@qoXQ zBl$qtTe9L~6ky1T34)JPF&c-dqU9Bu+iEp{&Gj(r4A!mA3&?)GfsMQBY=dymTGOvo z{8wA6_IyLQI4+JXt>^5rOB)X2ZAk*X?aBu3yjB%pt>H*Hw3<={!)!USj?>e{zOxGY zX25}k&KA-1PNm96kREP71!|0U6m%=y+ZA`qm7cX(Z*1}Ft8Trt{**@~Eh;77F5~q0 z40?AdRoAI|Hj%V+LC`ovgt4V(EwzQb^EGGNUUlsjQAgf$YE7K4&L(_?_F&z0{F>9u zugQwogM8v*ogE-<2K`eg*8t&$la0)sMK6)AHBzYhcO3 zmJR@?edcAz+cc|ZF-Ts)UXHNn`ITzb_SR`$;%qjK&}O*`!f~PrcpHc@;W)?J8$Zjp zWR;G!*J?r(V1_N%#>zKI>QsD1D2zXachDWEJ0edyJitg@9Zj7725}sas0XGBs(Rf6 z^3GW{{)mxwm~?b_h(KnE>l8LTu+cetwke}3zeDeRdmmQ%;%v#gJ$JLWPagx^=Xoy^ zdvUhj|MEUu=f&A3aNpl8#vC1t|JU6F%ea1@e?YAR4=Z-0`y*m=)wI-pr%J`h^* z44@$p5_Rzk?ZdNQ%I7)NfMKwr{wNoNHUUw0&UJ8}71&B@1$gjN^{5u6v#R~k~iA!|;x(grf> z#P&RwL>v+FQp^|%w?Thk4AC*c!7G<5Z$VBHL&3hevIH57^R0BoOVK2x+0oX3Bk*_- zvpAt{y;V~)O}mA)!VDPC*Dqb1Us=hWr-K}+Di|+csYCKX)>m#v+r#jY_A;}~0Hwbc>^Ia2ryVafn#2~@sUv6-{F@d_{I*B?&13dbwI$q+ z1oZ5x3qT6o`++_a>IF%gEoq-~$uI%Lg-H(_i7te?FafnPOfrIUiBR{#1i%T+itpAw z)8rW5d$4q7y-{<|ceg60?I(84AaP&cG2hhcmZ|xPjpX)e9kz$AeM`GF zwxtE;Zc3!@>&<6GW+xdWy|3OHj8)<)Mk@X_d)=U1m&Y37OdFAKq-#!sp 
zdZt_XSP;TCk4~VV+NQevd9EEV&%T)E!7%}VrevfAuL%GuvvxW%=ys=CIy_*6 zrTHuKug+h5b3QfUjifnt#^*~zJm1s`VSlFiiwozdCpkt_wMhLClS7@Pb+|pVFQlT%W7U^k-#L`My=+QdOXGBg-Q1@Z7jBcX#~cMGw0mgOKuph#HnY8(6v$pF|S#%}B~0Ol;C;G6=%xpKJ-%$IPY*wtZ=qZuD0 zy$Ht#MmD@=Nd{2Rs=G@FN|2|uR^B!v+Ib7Voy0_YP^bYQuN44``i2JR!{}c#ZWs## zTy)P!jq;&X7x(lupA9;|S;b=k>^^ugtrK5Df6QBZw9bA63OrfKh#w6ySFiaa6C+YZ zG_idse3>RTL=%g(n>tPG4G7i^Lx41LHyS``jGjG5{`KX`#ml(L=TZCc4i6oxc}?$D zOP=^G>YdhZyz_+CdFq$Wuh$+j^%=BI?FbH+CrYM#fgXntun3z(Tkt;cDJvlxRw;xw%QxhG0<{de^0Mz?-}^zxR=vitEP2x!tB}`q#Ead zfLlG9NuEZ$Yjg{&3RRC=wL`I;3?+pgBl`C?P5A?Z#HLqhfn60&8MPJ)6Yv9*qgf?&MkRzMg zZCLzXzipu=&o$5!_`(1@QXg3Bx_;ZdokPzQU@eH^9iuy()eO@Y$uNImVti^fGEir1 zm>c?rL1UIi+kb&w>OuO9gk?(Zdw{EeN9`hD2EpVg1wFm(UweDynv=z=C@KZJ7p8O- z&YtqK*(-sY(!(mpXTL!C)u?4p)>$b#J$zMuLr$A-@~R3J-;M1$c9L3jmMQ}2<8=m1lofzZJ$dTeYM zJ!V-4T5M!#rCG~YaDj?4 zA(s!$uciyMR8KjnE?&k*SmY2z29dG|BKaEiMKI5Fr=D5kJ^AP=iO+bI zlPGKe#`kK&kSe+_lY%83#-m_2PoWI%`W$ZQ7#2`*m!e9H7#>FC#qo%wum2c79&u9Z z?0fubl<|gXn+Vwd@~w8z6BB;~TMeOS*0)G>wzeG{NE@c+@Di0S^&k60_%5n=tDWsc zpxy6p)&hO^djI2>P|EmPe!gGZ@A6Nm>I$_f%A-^!9hutg7J?{nedxi$;v!b2^0WMs z;IAnlL`#u8hnmtzYLUmA8Z9YLQj>`uuU2|y%%@N-=jmNOqk3whQJQO1*!gOy8;g3H zW4*4n%;p~DtMokyCvTm+NF`nltt2gO?aw-%BWgHe!Et(E=L-5Xn~;`av3W_?X|5~6 z$SeXf_nDSa)F<(8SoCavIAu-gMMGB6O_d1S`y;*>#t)dFN7EoXnV5kF!@zUM}pxO`*D z$jVS+h@E08856ruGoFlW7USWhunrF$M#IR;NTQ2rv0LmBd+%6OvpI_Q`TqMcIFWnA z6L*kvMeA|#5%k@Anh^U2o_2_jiYL`O)dS3&SGp5wGfxg=*X)6 zk@#45r*OJc%}=1_sSm8~&SZRZ62G0v-x$ zPNu~tK!<-4&op&0)1AUxr;;7XM6y$qL@AlV8V=k+PWZ+*ZoHoiuk1=d3Gu8rC}u&G z-O28iX;JR4bN9;bl|5ok9J)g^8s6M{mt4-_l_wGtJ91b&C!QCdyn~&F`SpqzdU&#V58Ok@gL~#Ur(kv zPU^uUN2uU>YF81923lKkje0e1x@nH1?L7C|^ZfT8Cu_es_1%|$+Ip;TCNqtOHB(rwIx@k+=$!Duh9Ep8l-|KpWBGi5hhtBaMviVn2+3>9wZanM&}Po zdz6;zE4_4B#kuh{`2MTZo7oCtUVmygE94>3oeU@NhA$AUvFje~N@}8CHGw>6MD~uF zuHiS>8d3(XPkrXpY~SV8$l8(%Y2p2r@+?FZP9F~4OXrum*!>8tVyyaI&pqZSmF0(jXtX$81U_2N9f zeVX2U0+-Yh?M4g2LwdVp%LDY{;htOW8pSc?L3+u`jW2A2j`Nr7aaSm?`ZKhN$Rja^ z48$)D-TuXdJpl>X|DpUdRBz*Ia@6_}e6suzwFf6F-d=|}25!wQKtIR&+A}%~PF?Hl zhz+49)(6@*jo4hVgh8g+n}~`dwq$+D5*8gyqtdFrs+H{a-=f=7NR8oW!9d=hHVKx= zQHo}4VXvc3Kj8d~AAkoB(m@Q`Q)Zb4(#&YC@@AA-CNA!!SottRFVRg$E+~a7LXT5_ zeC7bGN$+teNu43-W!VNs2X<4oj!lncOFDd>PkE?_cdiBg2Vh2P08<>NL(m=Vw)24eHmI1I-mU7x z`#%=TH5{^|<9`(>#UbEEMBHImNZRDae$j)p!aK(Q$-lu1@-F&+wE1L-uxGIk-^R$E zA^8F(*u*xojZP&)l~#~5%Mv;SY8sy5JBT49czVz97~m&l&z#87V=K{Davb_Lq9d^_ zM&tZSL5vaOh;iiYhVUMt!>-AtVU(eu?C2`KpLpndiFFGh&4QF-2R#wf!akpb*JF#F^wkGiGW3Bars5q5W{A=*lx!|d ze-qx);5;#Fh-rE zrGF|&{yCP;4oK(dBb{+t9ip%D|8)E)%d8)DqMqC-dFYr(+jhnP^`H>HLzKU>`;i`3 zVS{^4$8cuaBnu90fb)ajbviaje@mswwBX7bCrZjq-5uxfjS+gl~xq}S#nc5d!Tqo zsR#>IO~K;fI07=t?Ah}I(qV9D@8Kat!Pm`NC4?2P64?*oioA+3rU^_au2w+6qR9C$ z`#}M@v#o0p;SKt?A0U{);fw;;(DMW=tSRJ<5gA5wm}=|~Et7jr3zZ5_wNly%%~^pN zP^qNFN<|qTm5L 0 but below the + 🔴 threshold). ⚠️ folds in here because the bench's whole job is + flagging regex behavior changes; a separate "warned" bucket would + split the headline and dilute the reviewer signal. + - new: 🆕 detectors added by the PR. + - unchanged: ✅ no diff vs main. 
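+
+    Example headline (hypothetical counts): "**2 regressed, 1 new, 4 unchanged.**"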
+ """ + regressed = sum(1 for r in rows if not r["is_new"] and r["emoji"] in ("🔴", "⚠️")) + new_count = sum(1 for r in rows if r["is_new"]) + unchanged = sum(1 for r in rows if r["emoji"] == "✅") + return f"**{regressed} regressed, {new_count} new, {unchanged} unchanged.**" + + def truncate(s, n=SAMPLE_TRUNCATE): if len(s) <= n: return s @@ -314,7 +336,14 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, "blast": blast, }) - parts = ["## Corpora Test Results — Diff (PR vs main)", "", PREAMBLE, ""] + parts = [ + STICKY_COMMENT_MARKER, + "## Corpora Test Results — Diff (PR vs main)", + "", + ] + if rows: + parts += [build_top_line_summary(rows), ""] + parts += [PREAMBLE, ""] if changed: parts.append( f"_Scoped to {len(changed)} detector(s) changed in this PR; " From 5186d126f2049528181afe25cb61e2cb83dab7c3 Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Mon, 4 May 2026 12:35:13 +0500 Subject: [PATCH 22/43] cleanup, enable verification --- .github/workflows/detector-corpora-test.yml | 89 +---- scripts/render_heatmap.py | 315 ------------------ scripts/test-last-changed-detector.sh | 17 - scripts/{ => test}/build_keyword_corpus.py | 0 .../{ => test}/detect_changed_detectors.sh | 0 scripts/{ => test}/detector_corpora_test.sh | 23 +- scripts/{ => test}/diff_corpora_results.py | 219 +++--------- scripts/test_changed_detectors.sh | 17 - 8 files changed, 55 insertions(+), 625 deletions(-) delete mode 100644 scripts/render_heatmap.py delete mode 100755 scripts/test-last-changed-detector.sh rename scripts/{ => test}/build_keyword_corpus.py (100%) rename scripts/{ => test}/detect_changed_detectors.sh (100%) rename scripts/{ => test}/detector_corpora_test.sh (77%) rename scripts/{ => test}/diff_corpora_results.py (62%) delete mode 100755 scripts/test_changed_detectors.sh diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index 03c2044e2e7f..7edc8008cb4b 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -8,11 +8,10 @@ on: - 'pkg/detectors/**' - 'pkg/engine/defaults/defaults.go' - '.github/workflows/detector-corpora-test.yml' - - 'scripts/detector_corpora_test.sh' - - 'scripts/diff_corpora_results.py' - - 'scripts/detect_changed_detectors.sh' - - 'scripts/build_keyword_corpus.py' - - 'scripts/render_heatmap.py' + - 'scripts/test/detector_corpora_test.sh' + - 'scripts/test/diff_corpora_results.py' + - 'scripts/test/detect_changed_detectors.sh' + - 'scripts/test/build_keyword_corpus.py' - 'hack/extract-keywords/**' env: @@ -40,11 +39,6 @@ jobs: - name: Install dependencies run: sudo apt-get install -y zstd jq - # matplotlib drives the Phase 4 heatmap render. Pinned-major install - # to keep CI deterministic; ~30s cold install, fine at current scale. 
- - name: Install matplotlib - run: pip install --quiet 'matplotlib>=3.7,<4' - - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: @@ -74,10 +68,10 @@ jobs: BASE_REF: ${{ steps.merge_base.outputs.sha }} run: | set -o pipefail - chmod +x scripts/detect_changed_detectors.sh - PR_CSV=$(./scripts/detect_changed_detectors.sh --pr-csv || true) - MAIN_CSV=$(./scripts/detect_changed_detectors.sh --main-csv || true) - NEW_LIST=$(./scripts/detect_changed_detectors.sh --new-only || true) + chmod +x scripts/test/detect_changed_detectors.sh + PR_CSV=$(./scripts/test/detect_changed_detectors.sh --pr-csv || true) + MAIN_CSV=$(./scripts/test/detect_changed_detectors.sh --main-csv || true) + NEW_LIST=$(./scripts/test/detect_changed_detectors.sh --new-only || true) NEW_CSV=$(echo "$NEW_LIST" | paste -sd, -) echo "PR detectors: $PR_CSV" echo "Main detectors: $MAIN_CSV" @@ -93,7 +87,7 @@ jobs: # Sticky comment: find any prior detector-bench comment on the PR by # the marker substring and update it in place. The marker — kept in - # sync with STICKY_COMMENT_MARKER in scripts/diff_corpora_results.py — + # sync with STICKY_COMMENT_MARKER in scripts/test/diff_corpora_results.py — # has to appear in BOTH the skip body and the diff body so the same # comment flips between them as iterative pushes change which path # fires. Skip body is only posted on pull_request events; the original @@ -142,7 +136,7 @@ jobs: DETECTORS: ${{ steps.detect.outputs.pr_csv }} run: | set -o pipefail - python3 scripts/build_keyword_corpus.py \ + python3 scripts/test/build_keyword_corpus.py \ --detectors="$DETECTORS" \ --extract-keywords-bin=/tmp/extract-keywords \ --output-corpus=/tmp/keyword-corpus.jsonl.zstd \ @@ -201,7 +195,7 @@ jobs: [[ -z "$dataset" ]] && continue files+=("$dataset") done <<< "$DATASETS" - ./scripts/detector_corpora_test.sh "${files[@]}" + ./scripts/test/detector_corpora_test.sh "${files[@]}" # Main scan is skipped when main_csv is empty (PR adds only new # detectors — nothing to compare against on main). The diff step is @@ -227,55 +221,7 @@ jobs: [[ -z "$dataset" ]] && continue files+=("$dataset") done <<< "$DATASETS" - ./scripts/detector_corpora_test.sh "${files[@]}" - - # Render the per-(detector × decoder) Δ heatmap. Decoder columns are - # the only stable per-finding signal post-stdin (file metadata is lost - # — see render_heatmap.py docstring), and they're real diagnostic - # signal: regressions in the BASE64 path mean something different - # from regressions in PLAIN. Two outputs: - # - # - heatmap-grid.json: the actual data the comment renders from - # (via diff_corpora_results.py's emoji table). The comment never - # embeds the PNG — GitHub's PR-comment Markdown sanitizer strips - # ``data:`` URLs and serves artifact zips behind auth, so neither - # inline nor artifact-link images render. - # - heatmap.png: archived as a workflow artifact for reviewers - # who want the colored matplotlib version. The comment includes - # a click-through link to it. - # - # Both outputs are skipped when the grid would be empty/all-zero; - # the diff step's `[[ -s ... ]]` guards handle that cleanly. 
- - name: Render heatmap - if: steps.detect.outputs.any_changed == 'true' - shell: bash - env: - CHANGED: ${{ steps.detect.outputs.pr_csv }} - run: | - set -o pipefail - python3 scripts/render_heatmap.py \ - /tmp/results-main.jsonl /tmp/results-pr.jsonl \ - --changed-detectors="$CHANGED" \ - --output=/tmp/heatmap.png \ - --grid-output=/tmp/heatmap-grid.json || true - if [[ -s /tmp/heatmap-grid.json ]]; then - ls -lh /tmp/heatmap-grid.json /tmp/heatmap.png 2>/dev/null || true - else - echo "No heatmap produced (empty grid or no diff)." - fi - - # Archive the colored PNG so reviewers who want the rich version can - # click through. The comment never embeds the image — see the render - # step comment for why — so this is artifact-only. - - name: Upload heatmap artifact - id: upload_heatmap - if: steps.detect.outputs.any_changed == 'true' - uses: actions/upload-artifact@v4 - with: - name: detector-bench-heatmap - path: /tmp/heatmap.png - if-no-files-found: ignore - retention-days: 14 + ./scripts/test/detector_corpora_test.sh "${files[@]}" - name: Diff results if: steps.detect.outputs.any_changed == 'true' @@ -283,7 +229,6 @@ jobs: env: CHANGED: ${{ steps.detect.outputs.pr_csv }} NEW_DETECTORS: ${{ steps.detect.outputs.new_csv }} - HEATMAP_ARTIFACT_URL: ${{ steps.upload_heatmap.outputs.artifact-url }} run: | set -o pipefail CORPUS_BYTES=0 @@ -294,20 +239,12 @@ jobs: if [[ -s /tmp/keyword-corpus-meta.json ]]; then META_ARG=(--keyword-corpus-meta=/tmp/keyword-corpus-meta.json) fi - HEATMAP_ARG=() - if [[ -s /tmp/heatmap-grid.json ]]; then - HEATMAP_ARG=(--heatmap-grid=/tmp/heatmap-grid.json) - if [[ -n "$HEATMAP_ARTIFACT_URL" ]]; then - HEATMAP_ARG+=(--heatmap-artifact-url="$HEATMAP_ARTIFACT_URL") - fi - fi - python3 scripts/diff_corpora_results.py \ + python3 scripts/test/diff_corpora_results.py \ /tmp/results-main.jsonl /tmp/results-pr.jsonl \ --changed-detectors="$CHANGED" \ --new-detectors="$NEW_DETECTORS" \ --corpus-bytes="$CORPUS_BYTES" \ "${META_ARG[@]}" \ - "${HEATMAP_ARG[@]}" \ > /tmp/diff-report.md cat /tmp/diff-report.md diff --git a/scripts/render_heatmap.py b/scripts/render_heatmap.py deleted file mode 100644 index 5b26d981b300..000000000000 --- a/scripts/render_heatmap.py +++ /dev/null @@ -1,315 +0,0 @@ -#!/usr/bin/env python3 -"""Render a per-(detector, decoder) Δ heatmap of detector findings. - -Inputs are the same JSONL files produced by trufflehog stdin scans that -``diff_corpora_results.py`` consumes (main vs PR). The diff script identifies -findings by ``(DetectorName, Raw or RawV2)`` with set semantics; the heatmap -keeps that identity but adds ``DecoderName`` to the bucketing key, so each -cell answers "how many unique secrets did this (detector, decoder) cell gain -or lose?" - -Bucketing rationale (Phase 4 design decision): - - Stdin scans drop file metadata — both Layer 0 (S3 corpus) and Layer 1 - (keyword corpus) findings come back with empty ``SourceMetadata.Data.Stdin``, - so we can't bucket by file extension. ``DecoderName`` is the only stable - per-finding signal that always exists, and it carries real diagnostic - meaning: "the regression came in via the BASE64 decode path" or "the - ESCAPED_UNICODE path lit up new false positives" tells reviewers which - lane to investigate. Robust-by-construction beats heuristic-on-Raw or - reverse-correlation-on-L1. - -Visual choices: - - - Diverging RdBu_r colormap: red = increase (regression-likely), blue = - decrease (lost recall), white = 0. 
- - SymLogNorm: cells with Δ=1 in a rare decoder remain visible even when a - sibling cell has Δ=200 in PLAIN. Linear band around 0 keeps the white - "no change" reading; log-ish outside it preserves the rare-decoder - diagnostic. Without this, common-decoder outliers wash out small but - important signals. - - Every cell is annotated with its integer Δ. Belt-and-suspenders against - color-only readings. - - Empty decoder columns (no findings on either side for any changed - detector) are dropped — no need to render dead space. - -Identity bucketing: - - identity := (DecoderName, Raw or RawV2) - per-cell Δ := |pr_only| - |main_only| - - Note this is a stricter identity than the summary table's - ``(DetectorName, Raw or RawV2)``: a single secret found via both PLAIN - and BASE64 contributes one identity to the table but two to the heatmap. - That's the desired behavior — the heatmap diagnoses *which decoder path - changed*, not *how many distinct secrets changed overall*. - -Outputs: - - - PNG (``--output``, default /tmp/heatmap.png): the matplotlib render. - Archived as a workflow artifact for reviewers who want the colored - version; not embedded inline in the comment because GitHub's - Markdown sanitizer strips ``data:`` URLs and artifact-zip URLs are - auth-gated, neither of which renders as ```` in PR comments. - - - Grid JSON (``--grid-output``, default /tmp/heatmap-grid.json): same - Δ matrix as the PNG. The diff script reads this and renders an - emoji-bucketed Markdown table — that's what actually shows up in the - PR comment. Always emitted when a non-empty grid exists, even if - matplotlib isn't available, so the comment renders without the PNG - if needed. - -Skips both outputs (no files written) when the grid would be all-zero or -empty. The diff script handles missing files gracefully. - -Usage: - render_heatmap.py --changed-detectors= - [--output=/tmp/heatmap.png] - [--grid-output=/tmp/heatmap-grid.json] -""" -from __future__ import annotations - -import argparse -import json -import sys -from collections import defaultdict - - -# Standard decoders emitted by trufflehog. Ordered by expected frequency so -# the heatmap reads left-to-right common→rare. Any decoder not in this list -# falls through to alphabetical ordering after the canonical ones. -DECODER_ORDER = ["PLAIN", "BASE64", "UTF8", "UTF16", "ESCAPED_UNICODE"] - - -def parse_csv(s): - """Lowercase + strip ``.v`` suffix, mirrors diff_corpora_results.parse_csv.""" - if not s: - return set() - out = set() - for item in s.split(","): - item = item.strip() - if not item: - continue - if "." in item: - item = item.split(".", 1)[0] - out.add(item.lower()) - return out - - -def load_findings(path): - """Returns dict: detector -> dict[decoder] -> set(raw identities).""" - by_dd = defaultdict(lambda: defaultdict(set)) - try: - f = open(path, "r", encoding="utf-8", errors="replace") - except OSError: - return by_dd - with f: - for line in f: - line = line.strip() - if not line: - continue - try: - obj = json.loads(line) - except json.JSONDecodeError: - continue - detector = obj.get("DetectorName") or "" - decoder = obj.get("DecoderName") or "UNKNOWN" - raw = obj.get("Raw") or obj.get("RawV2") or "" - if not detector or not raw: - continue - by_dd[detector][decoder].add(raw) - return by_dd - - -def order_decoders(present): - """PLAIN/BASE64/... 
first when present, then any extras alphabetically.""" - canonical = [d for d in DECODER_ORDER if d in present] - extras = sorted(d for d in present if d not in DECODER_ORDER) - return canonical + extras - - -def build_grid(main, pr, changed): - """Returns (rows, cols, deltas) where deltas[i][j] is the signed Δ count - for row detector i and column decoder j. Detectors and decoders that - never appear on either side are dropped.""" - detectors = sorted( - d for d in (set(main) | set(pr)) - if d.lower() in changed - ) - - decoders_present = set() - for d in detectors: - decoders_present.update(main.get(d, {}).keys()) - decoders_present.update(pr.get(d, {}).keys()) - decoders = order_decoders(decoders_present) - - deltas = [] - row_abs_totals = [] - for d in detectors: - row = [] - row_abs = 0 - for dec in decoders: - m_set = main.get(d, {}).get(dec, set()) - p_set = pr.get(d, {}).get(dec, set()) - delta = len(p_set - m_set) - len(m_set - p_set) - row.append(delta) - row_abs += abs(delta) - deltas.append(row) - row_abs_totals.append(row_abs) - - # Drop columns that are zero across every detector — they add no signal. - keep_cols = [j for j in range(len(decoders)) - if any(deltas[i][j] != 0 for i in range(len(detectors)))] - if not keep_cols: - return detectors, [], [] - decoders = [decoders[j] for j in keep_cols] - deltas = [[row[j] for j in keep_cols] for row in deltas] - - # Sort rows by total |Δ| desc, ties broken alphabetically. Detectors with - # no Δ in any kept column drop off the bottom of the figure. - order = sorted( - range(len(detectors)), - key=lambda i: (-row_abs_totals[i], detectors[i]), - ) - detectors = [detectors[i] for i in order if row_abs_totals[i] > 0] - deltas = [deltas[i] for i in order if row_abs_totals[i] > 0] - - return detectors, decoders, deltas - - -def render(detectors, decoders, deltas, output_path): - """Write the heatmap PNG. Caller has already verified the grid is non-empty.""" - # Lazy import: this script is only invoked from the workflow when - # matplotlib has been pip-installed in CI; importing at module top would - # break unit-test-style invocations that just want to assert grid shape. - import matplotlib - - matplotlib.use("Agg") # No display in CI. - import matplotlib.pyplot as plt - from matplotlib.colors import SymLogNorm - - n_rows = len(detectors) - n_cols = len(decoders) - - # Figure size: aim for a tight PNG well under the 50 KB inline budget. - # Width scales with column count; height scales with row count, with - # generous lower bounds so labels don't clip on tiny grids. - width = max(5.5, 1.4 * n_cols + 2.5) - height = max(2.5, 0.55 * n_rows + 1.6) - fig, ax = plt.subplots(figsize=(width, height), dpi=100) - - max_abs = max((abs(v) for row in deltas for v in row), default=1) - if max_abs < 1: - max_abs = 1 - # SymLogNorm linthresh=1 keeps integer Δ in [-1,1] in the linear band so - # zero stays white; outside that we go log-ish so a Δ=200 cell doesn't - # saturate everything else to faint pastel. - norm = SymLogNorm(linthresh=1.0, vmin=-max_abs, vmax=max_abs, base=10) - - im = ax.imshow(deltas, aspect="auto", cmap="RdBu_r", norm=norm) - - ax.set_xticks(range(n_cols)) - ax.set_xticklabels(decoders, rotation=30, ha="right", fontsize=9) - ax.set_yticks(range(n_rows)) - ax.set_yticklabels(detectors, fontsize=9) - ax.set_xlabel("Decoder", fontsize=10) - ax.set_ylabel("Detector", fontsize=10) - ax.set_title("PR vs main — Δ unique findings per (detector, decoder)", fontsize=11) - - # Annotate every cell with its integer Δ. 
Text color flips to white - # on saturated cells so the number stays readable. - for i in range(n_rows): - for j in range(n_cols): - v = deltas[i][j] - if v == 0: - label = "0" - color = "#888888" - else: - label = f"{v:+d}" - rgba = im.cmap(im.norm(v)) - # Perceived luminance — flip text color on dark cells. - lum = 0.299 * rgba[0] + 0.587 * rgba[1] + 0.114 * rgba[2] - color = "white" if lum < 0.45 else "black" - ax.text(j, i, label, ha="center", va="center", - color=color, fontsize=9) - - cbar = fig.colorbar(im, ax=ax, shrink=0.85, pad=0.02) - cbar.set_label("Δ unique findings (PR − main)", fontsize=9) - cbar.ax.tick_params(labelsize=8) - - fig.tight_layout() - fig.savefig(output_path, dpi=100, format="png", - bbox_inches="tight", pad_inches=0.15) - plt.close(fig) - - -def write_grid_json(path, detectors, decoders, deltas): - """Persist the grid the diff script renders the emoji table from. - - The ``_layout`` field is a human-readable note for future readers — it - has no behavioral effect. We emit it inline rather than relying solely - on this docstring because the JSON is the long-lived contract between - the renderer and the diff script. - """ - payload = { - "detectors": detectors, - "decoders": decoders, - "deltas": deltas, - "_layout": "deltas[i][j] = (PR - main) unique-finding count for detectors[i] / decoders[j]", - } - with open(path, "w", encoding="utf-8") as f: - json.dump(payload, f, indent=2) - f.write("\n") - - -def try_render_png(detectors, decoders, deltas, output_path): - """Attempt to render the PNG; on matplotlib import failure, log and - move on. The PNG is artifact-only — the comment doesn't need it — so a - missing matplotlib should not fail the workflow.""" - try: - render(detectors, decoders, deltas, output_path) - except ImportError as exc: - print(f"[render_heatmap] matplotlib unavailable, skipping PNG: {exc}", - file=sys.stderr) - return False - return True - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("main_jsonl") - parser.add_argument("pr_jsonl") - parser.add_argument("--changed-detectors", default="", - help="CSV of detectors changed in PR; restricts heatmap rows.") - parser.add_argument("--output", default="/tmp/heatmap.png", - help="PNG output path (default /tmp/heatmap.png).") - parser.add_argument("--grid-output", default="/tmp/heatmap-grid.json", - help="Grid JSON output path; consumed by diff_corpora_results.py.") - args = parser.parse_args() - - changed = parse_csv(args.changed_detectors) - if not changed: - print("[render_heatmap] no changed detectors supplied; nothing to render", - file=sys.stderr) - return 0 - - main_findings = load_findings(args.main_jsonl) - pr_findings = load_findings(args.pr_jsonl) - detectors, decoders, deltas = build_grid(main_findings, pr_findings, changed) - - if not detectors or not decoders: - print("[render_heatmap] grid is empty or all-zero; skipping render", - file=sys.stderr) - return 0 - - write_grid_json(args.grid_output, detectors, decoders, deltas) - png_ok = try_render_png(detectors, decoders, deltas, args.output) - suffix = f" + {args.output}" if png_ok else " (PNG skipped)" - print(f"[render_heatmap] wrote {args.grid_output}{suffix} " - f"({len(detectors)} rows × {len(decoders)} cols)", - file=sys.stderr) - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/scripts/test-last-changed-detector.sh b/scripts/test-last-changed-detector.sh deleted file mode 100755 index 4ba03aec6991..000000000000 --- a/scripts/test-last-changed-detector.sh +++ /dev/null @@ -1,17 
+0,0 @@ -#!/bin/bash - -set -uo pipefail - -CHANGED=$(git diff --name-only --no-commit-id origin/main | grep pkg/detectors | grep -v test) -while IFS= read -r FILE; do - DIRECTORY=$(basename $FILE ".go") - if [ -d "pkg/detectors/$DIRECTORY" ] - then - echo $DIRECTORY - go test -v "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/$DIRECTORY" - retVal=$? - if [ $retVal -ne 0 ]; then - exit 1 - fi - fi -done <<< "$CHANGED" diff --git a/scripts/build_keyword_corpus.py b/scripts/test/build_keyword_corpus.py similarity index 100% rename from scripts/build_keyword_corpus.py rename to scripts/test/build_keyword_corpus.py diff --git a/scripts/detect_changed_detectors.sh b/scripts/test/detect_changed_detectors.sh similarity index 100% rename from scripts/detect_changed_detectors.sh rename to scripts/test/detect_changed_detectors.sh diff --git a/scripts/detector_corpora_test.sh b/scripts/test/detector_corpora_test.sh similarity index 77% rename from scripts/detector_corpora_test.sh rename to scripts/test/detector_corpora_test.sh index 50a2f78f27c3..9653633fb55e 100755 --- a/scripts/detector_corpora_test.sh +++ b/scripts/test/detector_corpora_test.sh @@ -47,21 +47,10 @@ fi CORPUS_BYTES_FILE="${CORPUS_BYTES_FILE:-}" TOTAL_BYTES=0 -# --no-verification and --allow-verification-overlap are paired intentionally. -# This bench measures per-detector regex behavior in isolation: -# - --no-verification: avoids network-flake noise (rate limits, transient 5xx -# errors) that would otherwise produce verified/unverified deltas -# indistinguishable from real regex regressions. Verifier behavior is -# covered by detector unit tests. -# - --allow-verification-overlap: bypasses the engine's cross-detector -# overlap routing (pkg/engine/engine.go:862-872 + likelyDuplicate). That -# routing exists for verification safety — when one chunk has matches from -# multiple detectors, it dedups near-identical results so the same secret -# isn't sent to multiple verifiers. With verification off, the routing has -# no purpose, but its dedup side-effect (silently dropping a detector's -# other matches in a multi-match chunk) makes a regex change in detector A -# shift raw match counts in unrelated detector B, contaminating the diff. -# Bypassing it gives each detector independent regex measurement. +# Verification is enabled intentionally. The scan is scoped via --include-detectors +# to only the detectors changed in the PR (typically 1-3), so the number of +# verification API calls is small and rate limiting is not a concern. This lets +# the bench surface verified/unknown deltas alongside regex match changes. 
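For reference, a minimal sketch (not part of any patch in this series) of what the downstream diff script does with the JSONL that scan() writes: it collapses duplicates into one (detector, Raw/RawV2) identity and, with verification enabled, buckets each finding as verified, unverified, or unknown. The field names are the trufflehog --json fields already used throughout this series; everything else below is illustrative only.

    #!/usr/bin/env python3
    import json
    from collections import defaultdict

    def summarize(jsonl_path):
        stats = defaultdict(lambda: {"identities": set(), "verified": 0,
                                     "unverified": 0, "unknown": 0})
        with open(jsonl_path, encoding="utf-8", errors="replace") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue
                det = obj.get("DetectorName") or ""
                raw = obj.get("Raw") or obj.get("RawV2") or ""
                if not det or not raw:
                    continue
                s = stats[det]
                s["identities"].add(raw)      # duplicates collapse into one identity
                if obj.get("VerificationError"):
                    s["unknown"] += 1         # verification attempted but errored
                elif obj.get("Verified"):
                    s["verified"] += 1
                else:
                    s["unverified"] += 1
        return stats

    # NEW/REMOVED in the report are then plain set differences per detector:
    #   new_ids     = pr[d]["identities"]   - main[d]["identities"]
    #   removed_ids = main[d]["identities"] - pr[d]["identities"]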
scan() { local input="$1" local bytes_tmp="" @@ -78,8 +67,6 @@ scan() { | awk -v BF="$bytes_tmp" '{ b += length($0) + 1; print } END { printf "%d", b > BF; close(BF) }' \ | "$TRUFFLEHOG_BIN" \ --no-update \ - --no-verification \ - --allow-verification-overlap \ --log-level=3 \ --concurrency=6 \ --json \ @@ -91,8 +78,6 @@ scan() { | jq -r .content 2>> "$STDERR_FILE" \ | "$TRUFFLEHOG_BIN" \ --no-update \ - --no-verification \ - --allow-verification-overlap \ --log-level=3 \ --concurrency=6 \ --json \ diff --git a/scripts/diff_corpora_results.py b/scripts/test/diff_corpora_results.py similarity index 62% rename from scripts/diff_corpora_results.py rename to scripts/test/diff_corpora_results.py index 470726f4a772..2864335467fb 100755 --- a/scripts/diff_corpora_results.py +++ b/scripts/test/diff_corpora_results.py @@ -7,9 +7,9 @@ duplicates within a single scan collapse into one identity, so a regex change either adds a new (detector, secret) identity or removes one. -Verification is disabled at scan time (see scripts/detector_corpora_test.sh), -so verified/unverified deltas are intentionally not surfaced — the diff -measures regex match changes only. +Verification is enabled at scan time and scoped to the changed detectors via +--include-detectors (typically 1-3 per PR). The report surfaces verified and +unknown (verification error) counts alongside regex match deltas. Phase 2: when --changed-detectors is provided, the report focuses on the detectors changed by the PR. Detectors flagged via --new-detectors are @@ -20,19 +20,9 @@ Phase 3a: --keyword-corpus-meta points at the sidecar JSON written by scripts/build_keyword_corpus.py. When present, detectors whose Layer 1 (GitHub Code Search) fetch returned zero results get a concise warning -rendered above the per-detector details — they're flagged so reviewers -know the bench's verdict for those detectors leans entirely on the S3 -corpus and may be under-sampled. - -Phase 4: --heatmap-grid points at the JSON sidecar emitted by -scripts/render_heatmap.py — same Δ matrix as the PNG render. We turn that -into an emoji-bucketed Markdown table at the top of the report so reviewers -can see the per-(detector, decoder) breakdown at a glance. Inline image -embedding was tried and abandoned: GitHub strips ``data:`` URLs from PR -comments, and artifact-zip URLs are auth-gated rather than served as -images. The PNG still ships as a workflow artifact — when ---heatmap-artifact-url is supplied we link to it under the table for -reviewers who want the colored version. +rendered above the summary table — they're flagged so reviewers know the +bench's verdict for those detectors leans entirely on the S3 corpus and +may be under-sampled. Usage: diff_corpora_results.py @@ -40,21 +30,17 @@ [--new-detectors=] [--corpus-bytes=] [--keyword-corpus-meta=] - [--heatmap-grid=] - [--heatmap-artifact-url=] """ import argparse import json -import os import sys from collections import defaultdict PREAMBLE = ( - "This bench measures regex match regressions only. It runs with " - "`--no-verification --allow-verification-overlap` so each detector's " - "regex behavior is measured independently — verifier behavior is tested " - "separately by detector unit tests." + "This bench measures regex match changes and verification behavior. " + "The scan is scoped to changed detectors only via `--include-detectors`, " + "so verification API calls are limited to the detectors touched by this PR." 
) # Marker on the very first line of the body so peter-evans/find-comment can @@ -65,19 +51,6 @@ # 10 GB notional monorepo for blast-radius projection. BLAST_RADIUS_BYTES = 10 * 1024 * 1024 * 1024 -# Cap how many sample Raw values we render in the per-detector details. -SAMPLE_LIMIT = 10 -SAMPLE_TRUNCATE = 120 - -# Heatmap cell color buckets, applied to per-(detector, decoder) Δ unique -# findings. Thresholds line up with status_emoji's NEW threshold (>5 trips -# 🔴) so a 🟥 cell carries the same "this is real" weight as a 🔴 row in the -# summary table. 🟦 covers any decrease — recall regressions are rarer than -# FP regressions, and we haven't seen real data justifying a removal -# gradient yet. -HEATMAP_BUCKET_HOT = 6 # Δ ≥ this → 🟥 -HEATMAP_BUCKET_WARM = 1 # +1..+5 → 🟧 - def parse_csv(s): """Parse a comma-separated detector list into normalized name set. @@ -101,8 +74,12 @@ def parse_csv(s): def load_findings(path): - """Returns dict: detector_name -> {"identities": set[str], "total": int, "samples": list[str]}.""" - by_detector = defaultdict(lambda: {"identities": set(), "total": 0}) + """Returns dict: detector_name -> {"identities": set[str], "total": int, + "verified": int, "unverified": int, "unknown": int}.""" + by_detector = defaultdict(lambda: { + "identities": set(), "total": 0, + "verified": 0, "unverified": 0, "unknown": 0, + }) with open(path, "r", encoding="utf-8", errors="replace") as f: for line in f: line = line.strip() @@ -118,6 +95,12 @@ def load_findings(path): raw = obj.get("Raw") or obj.get("RawV2") or "" by_detector[detector]["identities"].add(raw) by_detector[detector]["total"] += 1 + if obj.get("VerificationError"): + by_detector[detector]["unknown"] += 1 + elif obj.get("Verified"): + by_detector[detector]["verified"] += 1 + else: + by_detector[detector]["unverified"] += 1 return by_detector @@ -149,12 +132,6 @@ def build_top_line_summary(rows): return f"**{regressed} regressed, {new_count} new, {unchanged} unchanged.**" -def truncate(s, n=SAMPLE_TRUNCATE): - if len(s) <= n: - return s - return s[: n - 1] + "…" - - def render_blast_radius(matches, corpus_bytes, signed=False): if corpus_bytes is None or corpus_bytes <= 0: return "" @@ -192,98 +169,9 @@ def load_keyword_corpus_meta(path): return {"thin_l1": thin, "reports": raw.get("reports") or []} -def heatmap_cell(delta): - """Render one (emoji, signed-int) cell for the heatmap table.""" - if delta >= HEATMAP_BUCKET_HOT: - emoji = "🟥" - elif delta >= HEATMAP_BUCKET_WARM: - emoji = "🟧" - elif delta == 0: - emoji = "⬜" - else: - emoji = "🟦" - sign = "+" if delta > 0 else "" # Negative numbers carry their own minus. - return f"{emoji} {sign}{delta}" - - -def load_heatmap_grid(path): - """Read the grid sidecar from render_heatmap.py. - - Returns ``None`` for any failure mode (missing file, malformed JSON, - schema mismatch). The heatmap is informational; if the grid isn't - readable the diff still posts without it. 
- """ - if not path or not os.path.isfile(path): - return None - try: - with open(path, "r", encoding="utf-8") as f: - data = json.load(f) - except (OSError, json.JSONDecodeError): - return None - detectors = data.get("detectors") or [] - decoders = data.get("decoders") or [] - deltas = data.get("deltas") or [] - if not detectors or not decoders or not deltas: - return None - if len(deltas) != len(detectors): - return None - if any(len(row) != len(decoders) for row in deltas): - return None - return {"detectors": detectors, "decoders": decoders, "deltas": deltas} - - -def build_heatmap_section(grid_path, artifact_url): - """Return the Markdown lines for the per-(detector, decoder) Δ heatmap. - - Renders an emoji-bucketed table from the JSON grid emitted by - render_heatmap.py rather than an inline image, because GitHub's - PR-comment sanitizer strips ``data:`` URLs and artifact-zip URLs are - auth-gated. The table renders identically on web, mobile, email - notifications, and CI log replay — all surfaces a PR comment lands on. - - The colored PNG is still produced as a workflow artifact; when - ``artifact_url`` is supplied we tack a click-through link onto the - table for reviewers who want the rich version. - - Returns ``[]`` if no grid is available — the rest of the report - renders unchanged. - """ - grid = load_heatmap_grid(grid_path) - if grid is None: - return [] - - detectors = grid["detectors"] - decoders = grid["decoders"] - deltas = grid["deltas"] - - lines = [ - "### Δ heatmap (changed detectors × decoder)", - "", - "_Per-cell Δ = unique findings (PR − main). 🟥 ≥+6, 🟧 +1..+5, " - "⬜ 0, 🟦 ≤−1._", - "", - ] - - header = ["Detector"] + decoders - align = ["---"] + ["---:" for _ in decoders] - lines.append("| " + " | ".join(header) + " |") - lines.append("|" + "|".join(align) + "|") - for i, det in enumerate(detectors): - row = [f"**{det}**"] + [heatmap_cell(deltas[i][j]) - for j in range(len(decoders))] - lines.append("| " + " | ".join(row) + " |") - lines.append("") - - if artifact_url: - lines.append( - f"_Colored PNG version: [download from workflow artifacts]({artifact_url})._" - ) - lines.append("") - return lines - def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, - keyword_meta=None, heatmap_grid=None, heatmap_artifact_url=None): + keyword_meta=None): new_detectors = new_detectors or set() keyword_meta = keyword_meta or {"thin_l1": set(), "reports": []} @@ -299,12 +187,13 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, all_names = set(main) | set(pr) missing = [] + _empty = {"identities": set(), "total": 0, "verified": 0, "unverified": 0, "unknown": 0} rows = [] has_diff = False for d in sorted(all_names): is_new = d.lower() in new_detectors - m = main.get(d, {"identities": set(), "total": 0}) - p = pr.get(d, {"identities": set(), "total": 0}) + m = main.get(d, _empty) + p = pr.get(d, _empty) new_ids = p["identities"] - m["identities"] removed_ids = m["identities"] - p["identities"] @@ -331,8 +220,9 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, "unique_pr": len(p["identities"]), "new_count": len(new_ids), "removed_count": len(removed_ids), - "new_samples": sorted(new_ids)[:SAMPLE_LIMIT], - "removed_samples": sorted(removed_ids)[:SAMPLE_LIMIT], + "verified_main": m["verified"], + "verified_pr": p["verified"], + "unknown_pr": p["unknown"], "blast": blast, }) @@ -351,8 +241,6 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, ) parts.append("") - parts += 
build_heatmap_section(heatmap_grid, heatmap_artifact_url) - if not rows and not missing: parts += ["_(No findings on either side for the changed detectors.)_", ""] return "\n".join(parts) @@ -375,8 +263,10 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, show_blast = corpus_bytes is not None and corpus_bytes > 0 cols = ["Status", "Detector", "total main", "total PR", - "unique main", "unique PR", "NEW", "REMOVED"] - aligns = ["", "", "---:", "---:", "---:", "---:", "---:", "---:"] + "unique main", "unique PR", "NEW", "REMOVED", + "verified (main)", "verified (PR)", "unknown (PR)"] + aligns = ["", "", "---:", "---:", "---:", "---:", "---:", "---:", + "---:", "---:", "---:"] if show_blast: cols.append("Blast radius (Δ per 10 GB)") aligns.append("---:") @@ -396,6 +286,9 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, str(r["unique_pr"]), "—", "—", + "—", + str(r["verified_pr"]), + str(r["unknown_pr"]), ] else: cells = [ @@ -407,6 +300,9 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, str(r["unique_pr"]), str(r["new_count"]), str(r["removed_count"]), + str(r["verified_main"]), + str(r["verified_pr"]), + str(r["unknown_pr"]), ] if show_blast: cells.append(r["blast"] or "—") @@ -451,37 +347,6 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, parts.append(f"> - `{d}`") parts.append("") - detail_rows = [r for r in rows if r["new_samples"] or r["removed_samples"]] - if detail_rows: - parts += ["### Per-detector details", ""] - for r in detail_rows: - parts.append(f"
{r['emoji']} {r['detector']}") - parts.append("") - if r["new_samples"]: - label = ( - f"NEW findings (showing {len(r['new_samples'])} of {r['new_count']})" - if r["new_count"] > len(r["new_samples"]) - else f"NEW findings ({r['new_count']})" - ) - parts.append(f"**{label}:**") - parts.append("") - for s in r["new_samples"]: - parts.append(f"- `{truncate(s)}`") - parts.append("") - if r["removed_samples"]: - label = ( - f"REMOVED findings (showing {len(r['removed_samples'])} of {r['removed_count']})" - if r["removed_count"] > len(r["removed_samples"]) - else f"REMOVED findings ({r['removed_count']})" - ) - parts.append(f"**{label}:**") - parts.append("") - for s in r["removed_samples"]: - parts.append(f"- `{truncate(s)}`") - parts.append("") - parts.append("
") - parts.append("") - return "\n".join(parts) @@ -497,12 +362,6 @@ def main(): help="Total uncompressed bytes scanned; enables blast-radius column.") parser.add_argument("--keyword-corpus-meta", default="", help="Path to build_keyword_corpus.py sidecar; surfaces thin-L1 warnings.") - parser.add_argument("--heatmap-grid", default="", - help="Path to grid JSON produced by render_heatmap.py; " - "drives the emoji-bucketed Δ table.") - parser.add_argument("--heatmap-artifact-url", default="", - help="Workflow artifact URL for the colored PNG; " - "rendered as a click-through link below the table.") args = parser.parse_args() main_findings = load_findings(args.main_jsonl) @@ -519,8 +378,6 @@ def main(): new_detectors=new_detectors, corpus_bytes=corpus_bytes, keyword_meta=keyword_meta, - heatmap_grid=args.heatmap_grid or None, - heatmap_artifact_url=args.heatmap_artifact_url or None, )) diff --git a/scripts/test_changed_detectors.sh b/scripts/test_changed_detectors.sh deleted file mode 100755 index a6cd5fdcba5c..000000000000 --- a/scripts/test_changed_detectors.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -set -uo pipefail - -CHANGED=$(git diff --name-only --no-commit-id origin/master | grep pkg/detectors | grep -v test) -while IFS= read -r FILE; do - DIRECTORY=$(basename $FILE ".go") - if [ -d "pkg/detectors/$DIRECTORY" ] - then - echo $DIRECTORY - go test -v "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/$DIRECTORY" - retVal=$? - if [ $retVal -ne 0 ]; then - exit 1 - fi - fi -done <<< "$CHANGED" From bbaa4afc07b212783b08b92597ad826b825ab352 Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Mon, 4 May 2026 12:53:45 +0500 Subject: [PATCH 23/43] fix bug --- scripts/test/detect_changed_detectors.sh | 3 +-- scripts/test/detector_corpora_test.sh | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/test/detect_changed_detectors.sh b/scripts/test/detect_changed_detectors.sh index 77655e26cdda..6f6b3ad33df2 100755 --- a/scripts/test/detect_changed_detectors.sh +++ b/scripts/test/detect_changed_detectors.sh @@ -38,8 +38,7 @@ MODE="${1:-list}" BASE_REF="${BASE_REF:-origin/main}" HEAD_REF="${HEAD_REF:-HEAD}" -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(dirname "$SCRIPT_DIR")" +REPO_ROOT="$(git rev-parse --show-toplevel)" cd "$REPO_ROOT" # Resolve BASE to a concrete commit. Workflow already runs `git fetch origin diff --git a/scripts/test/detector_corpora_test.sh b/scripts/test/detector_corpora_test.sh index 9653633fb55e..c00ee2d57191 100755 --- a/scripts/test/detector_corpora_test.sh +++ b/scripts/test/detector_corpora_test.sh @@ -21,8 +21,7 @@ fi STDERR_FILE="${STDERR_FILE:-/tmp/corpora-stderr.txt}" > "$STDERR_FILE" -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(dirname "$SCRIPT_DIR")" +REPO_ROOT="$(git rev-parse --show-toplevel)" TRUFFLEHOG_BIN="${TRUFFLEHOG_BIN:-${REPO_ROOT}/trufflehog}" if [[ ! 
-x "$TRUFFLEHOG_BIN" ]]; then From 648ae6a4e05e84dc85c8865d4f5b57cef9820f23 Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Mon, 4 May 2026 13:20:59 +0500 Subject: [PATCH 24/43] optimizations --- .github/workflows/detector-corpora-test.yml | 155 +++++++++----------- scripts/test/detector_corpora_test.sh | 11 +- scripts/test/diff_corpora_results.py | 44 ++---- 3 files changed, 90 insertions(+), 120 deletions(-) diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index 7edc8008cb4b..ebce547f180d 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -114,80 +114,65 @@ jobs: No detector source files changed in this PR. Bench skipped. - # Layer 1 keyword corpus — fetch real-world snippets from GitHub Code - # Search for each changed detector's pre-filter keywords. Output is a - # zstd-compressed JSONL whose shape matches the S3 corpus, so the - # corpora script picks it up unchanged via the DATASETS append below. - # The same corpus file is fed to both PR and main builds; thin-L1 - # detectors and per-detector counts are written to a sidecar JSON the - # diff step renders. - - name: Build extract-keywords helper - if: steps.detect.outputs.any_changed == 'true' - shell: bash - run: | - set -o pipefail - CGO_ENABLED=0 go build -o /tmp/extract-keywords ./hack/extract-keywords - - - name: Build keyword corpus (Layer 1) + # Three independent chains run in parallel: + # A) extract-keywords → keyword corpus (network I/O via GitHub API) + # B) prepare main worktree → build main binary (git I/O then CPU) + # C) build PR binary (CPU, no dependencies) + # A and B+C are complementary workloads (network vs CPU), so they + # overlap efficiently. The GITHUB_ENV write for DATASETS happens inside + # chain A's subshell and is flushed before the step exits (after wait), + # so the scan step sees the updated value. + - name: Build binaries and keyword corpus if: steps.detect.outputs.any_changed == 'true' shell: bash env: + MERGE_BASE: ${{ steps.merge_base.outputs.sha }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} DETECTORS: ${{ steps.detect.outputs.pr_csv }} run: | set -o pipefail - python3 scripts/test/build_keyword_corpus.py \ - --detectors="$DETECTORS" \ - --extract-keywords-bin=/tmp/extract-keywords \ - --output-corpus=/tmp/keyword-corpus.jsonl.zstd \ - --output-meta=/tmp/keyword-corpus-meta.json \ - --max-results-per-detector="${KEYWORD_CORPUS_CAP:-100}" - # Append to DATASETS for downstream scan steps. The python script - # always writes a (possibly empty) corpus, so the path is safe to - # append unconditionally — empty zstd frames decompress to 0 - # bytes and pass through the existing scan pipeline cleanly. - echo "DATASETS<> "$GITHUB_ENV" - echo "$DATASETS" >> "$GITHUB_ENV" - echo "/tmp/keyword-corpus.jsonl.zstd" >> "$GITHUB_ENV" - echo "EOF" >> "$GITHUB_ENV" - - name: Prepare main worktree - if: steps.detect.outputs.any_changed == 'true' - shell: bash - env: - MERGE_BASE: ${{ steps.merge_base.outputs.sha }} - run: | - set -o pipefail - git worktree add /tmp/trufflehog-main-src "$MERGE_BASE" + # Chain A: build extract-keywords, then fetch keyword corpus from GitHub. 
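For context on what chain A produces, this is the shape of one keyword-corpus line as fetch_keyword_results() assembles it in scripts/test/build_keyword_corpus.py; the values below are placeholders, not data from any real fetch. The script writes one such JSON object per line and zstd-compresses the result, so the scan step consumes it exactly like the S3 datasets.

    #!/usr/bin/env python3
    import json

    line = {
        "provenance": {
            "layer": "L1",                     # GitHub Code Search layer
            "detector": "exampledetector.v2",  # placeholder detector label
            "keyword": "example_keyword",      # placeholder keyword
            "repo": "octo-org/example",        # placeholder repository
            "path": "config/settings.py",
            "sha": "0000000000000000000000000000000000000000",
            "url": "",
        },
        "content": "EXAMPLE_API_KEY = 'not-a-real-secret'",
    }

    print(json.dumps(line))  # one object per line in the JSONL corpus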
+ ( + CGO_ENABLED=0 go build -o /tmp/extract-keywords ./hack/extract-keywords + python3 scripts/test/build_keyword_corpus.py \ + --detectors="$DETECTORS" \ + --extract-keywords-bin=/tmp/extract-keywords \ + --output-corpus=/tmp/keyword-corpus.jsonl.zstd \ + --output-meta=/tmp/keyword-corpus-meta.json \ + --max-results-per-detector="${KEYWORD_CORPUS_CAP:-100}" + printf 'DATASETS<> "$GITHUB_ENV" + ) & + PID_CORPUS=$! - - name: Build trufflehog (PR HEAD) - if: steps.detect.outputs.any_changed == 'true' - shell: bash - run: | - set -o pipefail - CGO_ENABLED=0 go build -o /tmp/trufflehog-pr . + # Chain B: prepare worktree, then build main binary. + ( + git worktree add /tmp/trufflehog-main-src "$MERGE_BASE" + cd /tmp/trufflehog-main-src + CGO_ENABLED=0 go build -o /tmp/trufflehog-main . + ) & + PID_MAIN_BUILD=$! - - name: Build trufflehog (main merge-base) - if: steps.detect.outputs.any_changed == 'true' - shell: bash - working-directory: /tmp/trufflehog-main-src - run: | - set -o pipefail - CGO_ENABLED=0 go build -o /tmp/trufflehog-main . + # Chain C: build PR binary (no dependencies). + CGO_ENABLED=0 go build -o /tmp/trufflehog-pr . & + PID_PR_BUILD=$! - # The PR scan always runs (any_changed=true means at least one detector - # is in pr_csv). It also captures the corpus byte total for the diff - # script's blast-radius column — same content streams to both binaries, - # so measuring once is enough. - - name: Run corpora test (PR build) + wait $PID_CORPUS || { echo "Keyword corpus build failed" >&2; exit 1; } + wait $PID_MAIN_BUILD || { echo "Main binary build failed" >&2; exit 1; } + wait $PID_PR_BUILD || { echo "PR binary build failed" >&2; exit 1; } + + # PR and main scans run in parallel. Each streams the corpus files + # independently from S3 — no shared state, different output files, + # different binaries. The main scan is skipped when main_csv is empty + # (PR adds only new detectors). CORPUS_BYTES_FILE is only written by + # the PR scan (blast-radius needs one consistent byte count). + - name: Run corpora tests if: steps.detect.outputs.any_changed == 'true' shell: bash env: - TRUFFLEHOG_BIN: /tmp/trufflehog-pr - OUTPUT_JSONL: /tmp/results-pr.jsonl - STDERR_FILE: /tmp/corpora-stderr-pr.txt - INCLUDE_DETECTORS: ${{ steps.detect.outputs.pr_csv }} - CORPUS_BYTES_FILE: /tmp/corpus-bytes.txt + PR_CSV: ${{ steps.detect.outputs.pr_csv }} + MAIN_CSV: ${{ steps.detect.outputs.main_csv }} run: | set -o pipefail files=() @@ -195,33 +180,35 @@ jobs: [[ -z "$dataset" ]] && continue files+=("$dataset") done <<< "$DATASETS" - ./scripts/test/detector_corpora_test.sh "${files[@]}" - # Main scan is skipped when main_csv is empty (PR adds only new - # detectors — nothing to compare against on main). The diff step is - # safe with an empty main JSONL: every PR finding is treated as NEW, - # which is correct semantics for new detectors. - - name: Run corpora test (main build) - if: steps.detect.outputs.any_changed == 'true' - shell: bash - env: - TRUFFLEHOG_BIN: /tmp/trufflehog-main - OUTPUT_JSONL: /tmp/results-main.jsonl - STDERR_FILE: /tmp/corpora-stderr-main.txt - INCLUDE_DETECTORS: ${{ steps.detect.outputs.main_csv }} - run: | - set -o pipefail - if [[ -z "$INCLUDE_DETECTORS" ]]; then + # PR scan. + ( + export TRUFFLEHOG_BIN=/tmp/trufflehog-pr + export OUTPUT_JSONL=/tmp/results-pr.jsonl + export STDERR_FILE=/tmp/corpora-stderr-pr.txt + export INCLUDE_DETECTORS="$PR_CSV" + export CORPUS_BYTES_FILE=/tmp/corpus-bytes.txt + ./scripts/test/detector_corpora_test.sh "${files[@]}" + ) & + PID_PR=$! 
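An aside on the blast-radius column referenced above: render_blast_radius() itself is not visible in these hunks, so the sketch below only shows the stated idea (scaling a PR-vs-main match delta from the scanned corpus size up to the notional 10 GB monorepo), not the exact rounding or formatting the report uses.

    #!/usr/bin/env python3
    BLAST_RADIUS_BYTES = 10 * 1024 * 1024 * 1024  # 10 GiB notional monorepo

    def project(delta_matches: int, corpus_bytes: int) -> float:
        """Scale a match-count delta from the scanned corpus to 10 GiB."""
        if corpus_bytes <= 0:
            raise ValueError("corpus_bytes must be positive")
        return delta_matches * BLAST_RADIUS_BYTES / corpus_bytes

    # Example: +12 extra matches on a 512 MiB corpus projects to +240 per 10 GiB.
    print(round(project(12, 512 * 1024 * 1024)))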
+ + # Main scan (skipped when no detectors overlap with main). + if [[ -n "$MAIN_CSV" ]]; then + ( + export TRUFFLEHOG_BIN=/tmp/trufflehog-main + export OUTPUT_JSONL=/tmp/results-main.jsonl + export STDERR_FILE=/tmp/corpora-stderr-main.txt + export INCLUDE_DETECTORS="$MAIN_CSV" + ./scripts/test/detector_corpora_test.sh "${files[@]}" + ) & + PID_MAIN=$! + else echo "No overlapping detectors in main; skipping main scan." - : > "$OUTPUT_JSONL" - exit 0 + : > /tmp/results-main.jsonl fi - files=() - while IFS= read -r dataset; do - [[ -z "$dataset" ]] && continue - files+=("$dataset") - done <<< "$DATASETS" - ./scripts/test/detector_corpora_test.sh "${files[@]}" + + wait $PID_PR || { echo "PR scan failed" >&2; exit 1; } + [[ -n "${PID_MAIN:-}" ]] && { wait $PID_MAIN || { echo "Main scan failed" >&2; exit 1; }; } - name: Diff results if: steps.detect.outputs.any_changed == 'true' diff --git a/scripts/test/detector_corpora_test.sh b/scripts/test/detector_corpora_test.sh index c00ee2d57191..78fa1dd58624 100755 --- a/scripts/test/detector_corpora_test.sh +++ b/scripts/test/detector_corpora_test.sh @@ -46,10 +46,9 @@ fi CORPUS_BYTES_FILE="${CORPUS_BYTES_FILE:-}" TOTAL_BYTES=0 -# Verification is enabled intentionally. The scan is scoped via --include-detectors -# to only the detectors changed in the PR (typically 1-3), so the number of -# verification API calls is small and rate limiting is not a concern. This lets -# the bench surface verified/unknown deltas alongside regex match changes. +# --no-verification avoids network calls against a large corpus where thousands +# of matches could trigger API calls, dominating runtime. Verifier behavior is +# covered by detector unit and integration tests. scan() { local input="$1" local bytes_tmp="" @@ -66,6 +65,8 @@ scan() { | awk -v BF="$bytes_tmp" '{ b += length($0) + 1; print } END { printf "%d", b > BF; close(BF) }' \ | "$TRUFFLEHOG_BIN" \ --no-update \ + --no-verification \ + --allow-verification-overlap \ --log-level=3 \ --concurrency=6 \ --json \ @@ -77,6 +78,8 @@ scan() { | jq -r .content 2>> "$STDERR_FILE" \ | "$TRUFFLEHOG_BIN" \ --no-update \ + --no-verification \ + --allow-verification-overlap \ --log-level=3 \ --concurrency=6 \ --json \ diff --git a/scripts/test/diff_corpora_results.py b/scripts/test/diff_corpora_results.py index 2864335467fb..c5ab47faa406 100755 --- a/scripts/test/diff_corpora_results.py +++ b/scripts/test/diff_corpora_results.py @@ -7,9 +7,9 @@ duplicates within a single scan collapse into one identity, so a regex change either adds a new (detector, secret) identity or removes one. -Verification is enabled at scan time and scoped to the changed detectors via ---include-detectors (typically 1-3 per PR). The report surfaces verified and -unknown (verification error) counts alongside regex match deltas. +Verification is disabled at scan time (--no-verification) to avoid network +calls against a large corpus where thousands of matches could dominate runtime. +The diff measures regex match changes only. Phase 2: when --changed-detectors is provided, the report focuses on the detectors changed by the PR. Detectors flagged via --new-detectors are @@ -38,9 +38,10 @@ PREAMBLE = ( - "This bench measures regex match changes and verification behavior. " - "The scan is scoped to changed detectors only via `--include-detectors`, " - "so verification API calls are limited to the detectors touched by this PR." + "This bench measures regex match regressions only. 
It runs with " + "`--no-verification --allow-verification-overlap` so each detector's " + "regex behavior is measured independently — verifier behavior is tested " + "separately by detector unit tests." ) # Marker on the very first line of the body so peter-evans/find-comment can @@ -74,12 +75,8 @@ def parse_csv(s): def load_findings(path): - """Returns dict: detector_name -> {"identities": set[str], "total": int, - "verified": int, "unverified": int, "unknown": int}.""" - by_detector = defaultdict(lambda: { - "identities": set(), "total": 0, - "verified": 0, "unverified": 0, "unknown": 0, - }) + """Returns dict: detector_name -> {"identities": set[str], "total": int}.""" + by_detector = defaultdict(lambda: {"identities": set(), "total": 0}) with open(path, "r", encoding="utf-8", errors="replace") as f: for line in f: line = line.strip() @@ -95,12 +92,6 @@ def load_findings(path): raw = obj.get("Raw") or obj.get("RawV2") or "" by_detector[detector]["identities"].add(raw) by_detector[detector]["total"] += 1 - if obj.get("VerificationError"): - by_detector[detector]["unknown"] += 1 - elif obj.get("Verified"): - by_detector[detector]["verified"] += 1 - else: - by_detector[detector]["unverified"] += 1 return by_detector @@ -187,7 +178,7 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, all_names = set(main) | set(pr) missing = [] - _empty = {"identities": set(), "total": 0, "verified": 0, "unverified": 0, "unknown": 0} + _empty = {"identities": set(), "total": 0} rows = [] has_diff = False for d in sorted(all_names): @@ -220,9 +211,6 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, "unique_pr": len(p["identities"]), "new_count": len(new_ids), "removed_count": len(removed_ids), - "verified_main": m["verified"], - "verified_pr": p["verified"], - "unknown_pr": p["unknown"], "blast": blast, }) @@ -263,10 +251,8 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, show_blast = corpus_bytes is not None and corpus_bytes > 0 cols = ["Status", "Detector", "total main", "total PR", - "unique main", "unique PR", "NEW", "REMOVED", - "verified (main)", "verified (PR)", "unknown (PR)"] - aligns = ["", "", "---:", "---:", "---:", "---:", "---:", "---:", - "---:", "---:", "---:"] + "unique main", "unique PR", "NEW", "REMOVED"] + aligns = ["", "", "---:", "---:", "---:", "---:", "---:", "---:"] if show_blast: cols.append("Blast radius (Δ per 10 GB)") aligns.append("---:") @@ -286,9 +272,6 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, str(r["unique_pr"]), "—", "—", - "—", - str(r["verified_pr"]), - str(r["unknown_pr"]), ] else: cells = [ @@ -300,9 +283,6 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, str(r["unique_pr"]), str(r["new_count"]), str(r["removed_count"]), - str(r["verified_main"]), - str(r["verified_pr"]), - str(r["unknown_pr"]), ] if show_blast: cells.append(r["blast"] or "—") From b56b46be5447d4ad771f8a40cc8305966231ce3d Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Mon, 4 May 2026 13:30:39 +0500 Subject: [PATCH 25/43] cache keywords corpus --- .github/workflows/detector-corpora-test.yml | 92 ++++++++++++++++----- 1 file changed, 72 insertions(+), 20 deletions(-) diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index ebce547f180d..51068899fada 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -114,14 +114,60 @@ jobs: No detector source 
files changed in this PR. Bench skipped. + # Build extract-keywords first — it's a fast go build (~5s) and is + # needed to extract keywords for the cache key before we can restore + # the corpus cache. + - name: Build extract-keywords helper + if: steps.detect.outputs.any_changed == 'true' + shell: bash + run: CGO_ENABLED=0 go build -o /tmp/extract-keywords ./hack/extract-keywords + + # Compute a cache key from the actual Keywords() output of each changed + # detector. This is more precise than hashing detector source files — + # the cache only invalidates when keywords change, not when regex or + # verification logic changes. + - name: Compute keyword corpus cache key + id: corpus_key + if: steps.detect.outputs.any_changed == 'true' + shell: bash + env: + MERGE_BASE: ${{ steps.merge_base.outputs.sha }} + run: | + set -o pipefail + KEYWORDS="" + while IFS= read -r dir; do + [[ -z "$dir" ]] && continue + kws=$(/tmp/extract-keywords "$dir" 2>/dev/null || echo "[]") + KEYWORDS+="$dir:$kws"$'\n' + done < <( + git diff --name-only "$MERGE_BASE"...HEAD -- 'pkg/detectors/**/*.go' \ + | grep -Ev '_test\.go$' \ + | grep -Ev '^pkg/detectors/(common|custom_detectors)/' \ + | sed -E 's|^(pkg/detectors/[^/]+(/v[0-9]+)?)/[^/]+\.go$|\1|' \ + | sort -u + ) + HASH=$(echo "$KEYWORDS" | sha256sum | cut -d' ' -f1) + echo "Keyword corpus cache key: keyword-corpus-v1-${HASH}" + echo "key=keyword-corpus-v1-${HASH}" >> "$GITHUB_OUTPUT" + + - name: Restore keyword corpus cache + id: corpus_cache + if: steps.detect.outputs.any_changed == 'true' + uses: actions/cache@v4 + with: + path: | + /tmp/keyword-corpus.jsonl.zstd + /tmp/keyword-corpus-meta.json + key: ${{ steps.corpus_key.outputs.key }} + # Three independent chains run in parallel: - # A) extract-keywords → keyword corpus (network I/O via GitHub API) + # A) fetch keyword corpus from GitHub API (skipped on cache hit) # B) prepare main worktree → build main binary (git I/O then CPU) # C) build PR binary (CPU, no dependencies) # A and B+C are complementary workloads (network vs CPU), so they - # overlap efficiently. The GITHUB_ENV write for DATASETS happens inside - # chain A's subshell and is flushed before the step exits (after wait), - # so the scan step sees the updated value. + # overlap efficiently on a cache miss. On a cache hit, only B and C + # run. The GITHUB_ENV write for DATASETS is always performed so the + # scan step picks up the corpus regardless of cache hit/miss. - name: Build binaries and keyword corpus if: steps.detect.outputs.any_changed == 'true' shell: bash @@ -129,22 +175,24 @@ jobs: MERGE_BASE: ${{ steps.merge_base.outputs.sha }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} DETECTORS: ${{ steps.detect.outputs.pr_csv }} + CORPUS_CACHE_HIT: ${{ steps.corpus_cache.outputs.cache-hit }} run: | set -o pipefail - # Chain A: build extract-keywords, then fetch keyword corpus from GitHub. - ( - CGO_ENABLED=0 go build -o /tmp/extract-keywords ./hack/extract-keywords - python3 scripts/test/build_keyword_corpus.py \ - --detectors="$DETECTORS" \ - --extract-keywords-bin=/tmp/extract-keywords \ - --output-corpus=/tmp/keyword-corpus.jsonl.zstd \ - --output-meta=/tmp/keyword-corpus-meta.json \ - --max-results-per-detector="${KEYWORD_CORPUS_CAP:-100}" - printf 'DATASETS<> "$GITHUB_ENV" - ) & - PID_CORPUS=$! + # Chain A: fetch keyword corpus from GitHub (skipped on cache hit). 
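The cache key computed a few steps up can be read as the following Python, which mirrors the bash loop; extract-keywords is assumed to print a JSON array of keywords for a detector directory, which is what the loop's fallback to "[]" implies. Byte-for-byte parity with sha256sum is not the point, only that the key changes exactly when any changed detector's keyword set changes.

    #!/usr/bin/env python3
    import hashlib
    import subprocess

    def corpus_cache_key(detector_dirs):
        acc = ""
        for d in sorted(detector_dirs):
            try:
                out = subprocess.run(["/tmp/extract-keywords", d],
                                     capture_output=True, text=True, check=True)
                kws = out.stdout.strip()
            except (OSError, subprocess.CalledProcessError):
                kws = "[]"  # same fallback as the bash loop
            acc += f"{d}:{kws}\n"
        digest = hashlib.sha256(acc.encode()).hexdigest()
        return f"keyword-corpus-v1-{digest}"

    # Example, using the repo's detector directory layout:
    print(corpus_cache_key(["pkg/detectors/github/v2"]))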
+ if [[ "$CORPUS_CACHE_HIT" != 'true' ]]; then + ( + python3 scripts/test/build_keyword_corpus.py \ + --detectors="$DETECTORS" \ + --extract-keywords-bin=/tmp/extract-keywords \ + --output-corpus=/tmp/keyword-corpus.jsonl.zstd \ + --output-meta=/tmp/keyword-corpus-meta.json \ + --max-results-per-detector="${KEYWORD_CORPUS_CAP:-100}" + ) & + PID_CORPUS=$! + else + echo "Keyword corpus cache hit; skipping GitHub fetch." + fi # Chain B: prepare worktree, then build main binary. ( @@ -158,9 +206,13 @@ jobs: CGO_ENABLED=0 go build -o /tmp/trufflehog-pr . & PID_PR_BUILD=$! - wait $PID_CORPUS || { echo "Keyword corpus build failed" >&2; exit 1; } - wait $PID_MAIN_BUILD || { echo "Main binary build failed" >&2; exit 1; } - wait $PID_PR_BUILD || { echo "PR binary build failed" >&2; exit 1; } + [[ -n "${PID_CORPUS:-}" ]] && { wait $PID_CORPUS || { echo "Keyword corpus build failed" >&2; exit 1; }; } + wait $PID_MAIN_BUILD || { echo "Main binary build failed" >&2; exit 1; } + wait $PID_PR_BUILD || { echo "PR binary build failed" >&2; exit 1; } + + # Always append keyword corpus to DATASETS for the scan step. + printf 'DATASETS<> "$GITHUB_ENV" # PR and main scans run in parallel. Each streams the corpus files # independently from S3 — no shared state, different output files, From 32846021ccd98be666e23f3798549e87e44f985c Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Mon, 4 May 2026 14:36:27 +0500 Subject: [PATCH 26/43] rewrite comment message --- scripts/test/diff_corpora_results.py | 77 +++++++++++----------------- 1 file changed, 31 insertions(+), 46 deletions(-) diff --git a/scripts/test/diff_corpora_results.py b/scripts/test/diff_corpora_results.py index c5ab47faa406..eab856e0b15f 100755 --- a/scripts/test/diff_corpora_results.py +++ b/scripts/test/diff_corpora_results.py @@ -38,10 +38,20 @@ PREAMBLE = ( - "This bench measures regex match regressions only. It runs with " - "`--no-verification --allow-verification-overlap` so each detector's " - "regex behavior is measured independently — verifier behavior is tested " - "separately by detector unit tests." + "Scans a corpus of real-world public code against only the detectors " + "changed in this PR, then compares unique match counts between the PR " + "build and the main baseline to catch regex regressions. Verification " + "is disabled — each detector's regex is measured independently." +) + +STATUS_KEY = ( + "🔴 regression: >5 new, >20% increase over main, or any removed" + " \u00a0·\u00a0 " + "⚠️ warning: 1–5 new" + " \u00a0·\u00a0 " + "✅ clean" + " \u00a0·\u00a0 " + "🆕 new detector (no baseline)" ) # Marker on the very first line of the body so peter-evans/find-comment can @@ -106,21 +116,15 @@ def status_emoji(new_count, removed_count, unique_main): return "✅" -def build_top_line_summary(rows): - """One-line bold verdict rendered above the preamble. - - Three buckets keyed off the locked emoji semantics: - - regressed: 🔴 (severe) + ⚠️ (soft warning, NEW > 0 but below the - 🔴 threshold). ⚠️ folds in here because the bench's whole job is - flagging regex behavior changes; a separate "warned" bucket would - split the headline and dilute the reviewer signal. - - new: 🆕 detectors added by the PR. - - unchanged: ✅ no diff vs main. 
- """ +def build_top_line_summary(rows, changed): regressed = sum(1 for r in rows if not r["is_new"] and r["emoji"] in ("🔴", "⚠️")) new_count = sum(1 for r in rows if r["is_new"]) - unchanged = sum(1 for r in rows if r["emoji"] == "✅") - return f"**{regressed} regressed, {new_count} new, {unchanged} unchanged.**" + clean = sum(1 for r in rows if r["emoji"] == "✅") + scoped = ", ".join(f"`{d}`" for d in sorted(changed)) if changed else "" + summary = f"**{regressed} regressed · {new_count} new · {clean} clean**" + if scoped: + summary += f" \u00a0|\u00a0 Scoped to: {scoped}" + return summary def render_blast_radius(matches, corpus_bytes, signed=False): @@ -216,18 +220,13 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, parts = [ STICKY_COMMENT_MARKER, - "## Corpora Test Results — Diff (PR vs main)", + "## Corpora Test Results", + "", + PREAMBLE, "", ] if rows: - parts += [build_top_line_summary(rows), ""] - parts += [PREAMBLE, ""] - if changed: - parts.append( - f"_Scoped to {len(changed)} detector(s) changed in this PR; " - f"unchanged detectors are not measured._" - ) - parts.append("") + parts += [build_top_line_summary(rows, changed), ""] if not rows and not missing: parts += ["_(No findings on either side for the changed detectors.)_", ""] @@ -243,18 +242,14 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, ) ) else: - parts += [ - "✅ No diff vs main — regex matches are identical across both builds.", - "", - ] rows.sort(key=lambda r: r["detector"]) show_blast = corpus_bytes is not None and corpus_bytes > 0 - cols = ["Status", "Detector", "total main", "total PR", - "unique main", "unique PR", "NEW", "REMOVED"] - aligns = ["", "", "---:", "---:", "---:", "---:", "---:", "---:"] + cols = ["Status", "Detector", "Unique matches (main)", "Unique matches (PR)", + "New", "Removed"] + aligns = ["", "", "---:", "---:", "---:", "---:"] if show_blast: - cols.append("Blast radius (Δ per 10 GB)") + cols.append("Δ per 10 GB") aligns.append("---:") parts += [ "| " + " | ".join(cols) + " |", @@ -267,8 +262,6 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, r["emoji"], r["detector"], "—", - str(r["total_pr"]), - "—", str(r["unique_pr"]), "—", "—", @@ -277,8 +270,6 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, cells = [ r["emoji"], r["detector"], - str(r["total_main"]), - str(r["total_pr"]), str(r["unique_main"]), str(r["unique_pr"]), str(r["new_count"]), @@ -288,14 +279,8 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, cells.append(r["blast"] or "—") parts.append("| " + " | ".join(cells) + " |") parts.append("") - - if show_blast: - parts += [ - "_Blast radius projects PR-vs-main match-count delta to a 10 GB " - "monorepo (positive = added matches, negative = removed). 
For 🆕 " - "rows it shows absolute projected matches with no baseline._", - "", - ] + parts.append(STATUS_KEY) + parts.append("") if missing: parts += [ From 24212123423adddb563b1deb3ff348fd5f3692e8 Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Mon, 4 May 2026 14:52:12 +0500 Subject: [PATCH 27/43] cache github api corpus per keyword --- scripts/test/build_keyword_corpus.py | 111 ++++++++++++++++++++++----- 1 file changed, 91 insertions(+), 20 deletions(-) diff --git a/scripts/test/build_keyword_corpus.py b/scripts/test/build_keyword_corpus.py index 2f232a8ee62c..be9d0e13de69 100755 --- a/scripts/test/build_keyword_corpus.py +++ b/scripts/test/build_keyword_corpus.py @@ -36,6 +36,7 @@ from __future__ import annotations import argparse +import hashlib import json import math import os @@ -94,6 +95,7 @@ class DetectorReport: detector: str keywords: list[str] = field(default_factory=list) fetched: int = 0 + cache_hits: int = 0 keyword_failures: list[str] = field(default_factory=list) thin_l1: bool = False @@ -125,8 +127,12 @@ def main() -> int: # Anything below can take time and touch the network. We want a written # corpus + meta sidecar regardless of whether we got partway through, so # downstream workflow steps stay deterministic even on fetch failures. + cache_dir = getattr(args, "cache_dir", None) or None + if cache_dir: + os.makedirs(cache_dir, exist_ok=True) + try: - run_main_loop(args, detectors, token, rate, reports, corpus_lines, seen_global) + run_main_loop(args, detectors, token, rate, reports, corpus_lines, seen_global, cache_dir) finally: summary = build_summary(reports) write_outputs(args.output_corpus, args.output_meta, corpus_lines, summary) @@ -145,6 +151,7 @@ def build_summary(reports: list[DetectorReport]) -> dict[str, Any]: "detector": r.detector, "keywords": r.keywords, "fetched": r.fetched, + "cache_hits": r.cache_hits, "keyword_failures": r.keyword_failures, "thin_l1": r.thin_l1, } @@ -162,6 +169,7 @@ def run_main_loop( reports: list[DetectorReport], corpus_lines: list[dict[str, Any]], seen_global: set[tuple[str, str, str]], + cache_dir: str | None = None, ) -> None: for raw_name in detectors: # detect_changed_detectors.sh emits names like "github.v2"; the @@ -210,7 +218,7 @@ def run_main_loop( if cap_remaining <= 0: break try: - added = fetch_keyword_results( + added, from_cache = fetch_keyword_results( keyword=kw, detector_label=raw_name, cap_remaining=cap_remaining, @@ -219,6 +227,7 @@ def run_main_loop( token=token, seen_global=seen_global, corpus_lines=corpus_lines, + cache_dir=cache_dir, ) except KeywordFetchError as exc: print( @@ -238,7 +247,10 @@ def run_main_loop( ) report.keyword_failures.append(kw) continue - report.fetched += added + if from_cache: + report.cache_hits += added + else: + report.fetched += added cap_remaining -= added if report.fetched == 0: @@ -272,6 +284,11 @@ def parse_args() -> argparse.Namespace: default="/tmp/keyword-corpus-meta.json", help="Path for the per-detector meta sidecar JSON.", ) + p.add_argument( + "--cache-dir", + default="", + help="Directory for per-keyword result cache. 
Populated on fetch, read on subsequent runs.", + ) p.add_argument( "--max-results-per-detector", type=int, @@ -344,6 +361,11 @@ class KeywordFetchError(Exception): """Wraps a fatal failure for a single keyword lookup.""" +def _keyword_cache_key(keyword: str) -> str: + """Stable filename-safe key for a keyword's cache file.""" + return hashlib.sha256(keyword.encode()).hexdigest() + + def fetch_keyword_results( *, keyword: str, @@ -354,9 +376,45 @@ def fetch_keyword_results( token: str, seen_global: set[tuple[str, str, str]], corpus_lines: list[dict[str, Any]], -) -> int: - """Returns the number of new corpus lines added for this keyword.""" + cache_dir: str | None = None, +) -> tuple[int, bool]: + """Returns (added, from_cache). + + added: number of corpus lines added for this keyword. + from_cache: True if results came from the on-disk keyword cache. + """ + # --- cache read --- + if cache_dir: + cache_file = os.path.join(cache_dir, _keyword_cache_key(keyword) + ".json") + if os.path.isfile(cache_file): + try: + with open(cache_file, encoding="utf-8") as f: + cached_lines: list[dict[str, Any]] = json.load(f) + added = 0 + for line in cached_lines: + if added >= per_kw_cap or (cap_remaining - added) <= 0: + break + prov = line.get("provenance") or {} + key = (prov.get("repo", ""), prov.get("path", ""), prov.get("sha", "")) + if not key[0] or key in seen_global: + continue + seen_global.add(key) + corpus_lines.append(line) + added += 1 + print( + f"[build_keyword_corpus] cache hit: keyword='{keyword}' loaded {added} lines", + file=sys.stderr, + ) + return added, True + except (OSError, json.JSONDecodeError) as exc: + print( + f"[build_keyword_corpus] cache read failed for '{keyword}': {exc}; fetching fresh", + file=sys.stderr, + ) + + # --- fresh fetch --- added = 0 + fetched_lines: list[dict[str, Any]] = [] page = 1 while added < per_kw_cap and (cap_remaining - added) > 0: items, has_more = search_code(keyword, page, rate, token) @@ -380,25 +438,38 @@ def fetch_keyword_results( if content is None: continue seen_global.add(key) - corpus_lines.append( - { - "provenance": { - "layer": "L1", - "detector": detector_label, - "keyword": keyword, - "repo": repo, - "path": path, - "sha": sha, - "url": download_url or "", - }, - "content": content, - } - ) + line = { + "provenance": { + "layer": "L1", + "detector": detector_label, + "keyword": keyword, + "repo": repo, + "path": path, + "sha": sha, + "url": download_url or "", + }, + "content": content, + } + corpus_lines.append(line) + fetched_lines.append(line) added += 1 if not has_more: break page += 1 - return added + + # --- cache write --- + if cache_dir and fetched_lines: + cache_file = os.path.join(cache_dir, _keyword_cache_key(keyword) + ".json") + try: + with open(cache_file, "w", encoding="utf-8") as f: + json.dump(fetched_lines, f) + except OSError as exc: + print( + f"[build_keyword_corpus] cache write failed for '{keyword}': {exc}", + file=sys.stderr, + ) + + return added, False def search_code( From 6ba26616aa5a831f84ea6be1f99985aeaf302f60 Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Mon, 4 May 2026 15:06:43 +0500 Subject: [PATCH 28/43] cleanup --- .github/workflows/detector-corpora-test.yml | 76 +++++++++++------- .gitignore | 4 + .../diff_corpora_results.cpython-310.pyc | Bin 16606 -> 0 bytes .../render_heatmap.cpython-310.pyc | Bin 11852 -> 0 bytes scripts/test-last-changed-detector.sh | 17 ++++ scripts/test/detector_corpora_test.sh | 8 +- scripts/test/diff_corpora_results.py | 2 +- scripts/test_changed_detectors.sh | 17 
++++ 8 files changed, 88 insertions(+), 36 deletions(-) delete mode 100644 scripts/__pycache__/diff_corpora_results.cpython-310.pyc delete mode 100644 scripts/__pycache__/render_heatmap.cpython-310.pyc create mode 100755 scripts/test-last-changed-detector.sh create mode 100755 scripts/test_changed_detectors.sh diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index 51068899fada..c5aa69d979ce 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -3,7 +3,6 @@ name: Corpora Test on: workflow_dispatch: pull_request: - # types: [opened, reopened] TODO: Decide if we should run this on every push paths: - 'pkg/detectors/**' - 'pkg/engine/defaults/defaults.go' @@ -90,8 +89,8 @@ jobs: # sync with STICKY_COMMENT_MARKER in scripts/test/diff_corpora_results.py — # has to appear in BOTH the skip body and the diff body so the same # comment flips between them as iterative pushes change which path - # fires. Skip body is only posted on pull_request events; the original - # workflow_dispatch early-return is preserved by the event-name guard. + # fires. Skip body is only posted on pull_request events; workflow_dispatch + # runs with no changed detectors silently finish without posting. - name: Find existing skip comment if: steps.detect.outputs.any_changed != 'true' && github.event_name == 'pull_request' id: find_skip_comment @@ -153,46 +152,42 @@ jobs: - name: Restore keyword corpus cache id: corpus_cache if: steps.detect.outputs.any_changed == 'true' - uses: actions/cache@v4 + uses: actions/cache/restore@v4 with: - path: | - /tmp/keyword-corpus.jsonl.zstd - /tmp/keyword-corpus-meta.json + path: /tmp/keyword-corpus-cache key: ${{ steps.corpus_key.outputs.key }} + # Partial hit: fetch only the keywords not already in the cache dir. + restore-keys: keyword-corpus-v1- # Three independent chains run in parallel: - # A) fetch keyword corpus from GitHub API (skipped on cache hit) + # A) build keyword corpus — loads per-keyword cache files, fetches + # only keywords not already cached. Always runs (fast on hit). # B) prepare main worktree → build main binary (git I/O then CPU) # C) build PR binary (CPU, no dependencies) - # A and B+C are complementary workloads (network vs CPU), so they - # overlap efficiently on a cache miss. On a cache hit, only B and C - # run. The GITHUB_ENV write for DATASETS is always performed so the - # scan step picks up the corpus regardless of cache hit/miss. - name: Build binaries and keyword corpus + id: build if: steps.detect.outputs.any_changed == 'true' shell: bash env: MERGE_BASE: ${{ steps.merge_base.outputs.sha }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} DETECTORS: ${{ steps.detect.outputs.pr_csv }} - CORPUS_CACHE_HIT: ${{ steps.corpus_cache.outputs.cache-hit }} run: | set -o pipefail - # Chain A: fetch keyword corpus from GitHub (skipped on cache hit). - if [[ "$CORPUS_CACHE_HIT" != 'true' ]]; then - ( - python3 scripts/test/build_keyword_corpus.py \ - --detectors="$DETECTORS" \ - --extract-keywords-bin=/tmp/extract-keywords \ - --output-corpus=/tmp/keyword-corpus.jsonl.zstd \ - --output-meta=/tmp/keyword-corpus-meta.json \ - --max-results-per-detector="${KEYWORD_CORPUS_CAP:-100}" - ) & - PID_CORPUS=$! - else - echo "Keyword corpus cache hit; skipping GitHub fetch." - fi + # Chain A: build keyword corpus. 
Loads per-keyword cache files from + # /tmp/keyword-corpus-cache/ and only calls the GitHub API for keywords + # not present there, so it's fast when most keywords are cached. + ( + python3 scripts/test/build_keyword_corpus.py \ + --detectors="$DETECTORS" \ + --extract-keywords-bin=/tmp/extract-keywords \ + --output-corpus=/tmp/keyword-corpus.jsonl.zstd \ + --output-meta=/tmp/keyword-corpus-meta.json \ + --cache-dir=/tmp/keyword-corpus-cache \ + --max-results-per-detector="${KEYWORD_CORPUS_CAP:-100}" + ) & + PID_CORPUS=$! # Chain B: prepare worktree, then build main binary. ( @@ -206,14 +201,35 @@ jobs: CGO_ENABLED=0 go build -o /tmp/trufflehog-pr . & PID_PR_BUILD=$! - [[ -n "${PID_CORPUS:-}" ]] && { wait $PID_CORPUS || { echo "Keyword corpus build failed" >&2; exit 1; }; } - wait $PID_MAIN_BUILD || { echo "Main binary build failed" >&2; exit 1; } - wait $PID_PR_BUILD || { echo "PR binary build failed" >&2; exit 1; } + wait $PID_CORPUS || { echo "Keyword corpus build failed" >&2; exit 1; } + wait $PID_MAIN_BUILD || { echo "Main binary build failed" >&2; exit 1; } + wait $PID_PR_BUILD || { echo "PR binary build failed" >&2; exit 1; } + + # Save the per-keyword cache dir only when there were no keyword + # failures. A partial run leaves the cache dir incomplete — skipping + # the save lets the next push retry the missing keywords. + CORPUS_USEFUL=false + if [[ -s /tmp/keyword-corpus-meta.json ]]; then + failures=$(jq '[.reports[].keyword_failures | length] | add // 0' /tmp/keyword-corpus-meta.json) + [[ "$failures" -eq 0 ]] && CORPUS_USEFUL=true + fi + echo "corpus_useful=$CORPUS_USEFUL" >> "$GITHUB_OUTPUT" # Always append keyword corpus to DATASETS for the scan step. printf 'DATASETS<> "$GITHUB_ENV" + # Save the per-keyword cache dir when the run completed without failures. + # Uses the exact key so subsequent pushes with the same keyword set get + # an exact hit and the Python script runs in cache-only mode (no API calls). + # Skipped on exact cache hit (cache-hit == 'true') since the dir is unchanged. + - name: Save keyword corpus cache + if: steps.detect.outputs.any_changed == 'true' && steps.corpus_cache.outputs.cache-hit != 'true' && steps.build.outputs.corpus_useful == 'true' + uses: actions/cache/save@v4 + with: + path: /tmp/keyword-corpus-cache + key: ${{ steps.corpus_key.outputs.key }} + # PR and main scans run in parallel. Each streams the corpus files # independently from S3 — no shared state, different output files, # different binaries. 
The main scan is skipped when main_csv is empty diff --git a/.gitignore b/.gitignore index 6abf4e766574..48d76df3f2df 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,7 @@ tmp/go-test.json .captain/detectors/quarantines.yaml .captain/detectors/flakes.yaml .vscode + +# Python +__pycache__/ +*.pyc diff --git a/scripts/__pycache__/diff_corpora_results.cpython-310.pyc b/scripts/__pycache__/diff_corpora_results.cpython-310.pyc deleted file mode 100644 index 025935b14addee84f5b0f58a0abc9f8228595440..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16606 zcma)jYj7Obm0ovG&jXFYlOPF#6xAjz2}5FrqGVYz31LNolti0^XaJI8+Vr3?w+9$t zUcB8y2(yi>wkX?F62)9an`Bqg3RJD_Qq}%UB~G$`wp_KT{K)1}l|TI}rBtQjawY53 z##=5{yv%pb?Vbk+WdqdQzJ2@lz2}~L?s<1(Xvos=_xMYHUHP*GP5ZBO)BBge%}e-s ze}O`1H#DIOqpIJ~`D)xS_?ozp;H!DV#5GY()>1c8x+csUX_35<5vdykB7MUWnHz&* z;6_$hH-^NZ$iAc97#2ff7}pUoB1Ul?6=Px?*D-gntcwF;;vN0QxOhSw5{FScAST5T zJe?3z;wY{M#WC?1u7|{N@i?xB2aEwjJSlAPCG?&YC&Wp#9&x9{mp2XRx<~G4fXZ0n zU+$^ap^T|s%qP^Ox zR7K8q>cVzw6+Cq8YmVF$jjg&R-DX4jw%@QlU*P3}WnB?&-LLrDcGH!1xl$LE`r0{r z`jYFrCBGpToSHjhFFIRxL(=7qXY8_5t*$zy&0N7=a(&x#YYs*zdG_aTe_)ALvsx)R zzU$dr6@ML2Iktx(tFG;poVs0VRI5(YbL~nUP&DeOL;%d#UIU$_yXL-Sm)4#7nrpcj z0$@2p&@$@omOU*Zz|h?-N!QPH1|SHfTWj3L+!z>Zd&8BLGC@;m)a{CAi;Cy0R$XB` z0GpukD>c`i_FMu^R+_$dCLX?6YDnP9DPkpl!CTMGSOD)f59o?Bt$JLwg@A7}~b$4b0M#Wv7H0fg2lZ0SM)?RdXFY0GMjAHODWlM@uHqtJ{G6`nm%wJ#)_9 zT6gPqKCdt$@-cuOEvP9Qw}C$~!(&IgTW*wE9^-&ot?moa5ccXecfY<^urEc}^z3rg zSz803w<`{21FCyw!rDMOb>T{Mpl!8(`N{WxA2j6nEvy`r=B#>+YRkv^+`329G+l4l zOq8owMzvCRK`n1$B^Fi!%(lGz>NXKc-y$4)wd#0&UOJ+Jfk7*+TAdcT0b2Ah9`J;B zK&jT*r|nlR+Kn>rV;x|X8g(DY?_lm(=N#6t>27Z|q=@INxxQmJ8=yUajdOqv@qoXQ zBl$qtTe9L~6ky1T34)JPF&c-dqU9Bu+iEp{&Gj(r4A!mA3&?)GfsMQBY=dymTGOvo z{8wA6_IyLQI4+JXt>^5rOB)X2ZAk*X?aBu3yjB%pt>H*Hw3<={!)!USj?>e{zOxGY zX25}k&KA-1PNm96kREP71!|0U6m%=y+ZA`qm7cX(Z*1}Ft8Trt{**@~Eh;77F5~q0 z40?AdRoAI|Hj%V+LC`ovgt4V(EwzQb^EGGNUUlsjQAgf$YE7K4&L(_?_F&z0{F>9u zugQwogM8v*ogE-<2K`eg*8t&$la0)sMK6)AHBzYhcO3 zmJR@?edcAz+cc|ZF-Ts)UXHNn`ITzb_SR`$;%qjK&}O*`!f~PrcpHc@;W)?J8$Zjp zWR;G!*J?r(V1_N%#>zKI>QsD1D2zXachDWEJ0edyJitg@9Zj7725}sas0XGBs(Rf6 z^3GW{{)mxwm~?b_h(KnE>l8LTu+cetwke}3zeDeRdmmQ%;%v#gJ$JLWPagx^=Xoy^ zdvUhj|MEUu=f&A3aNpl8#vC1t|JU6F%ea1@e?YAR4=Z-0`y*m=)wI-pr%J`h^* z44@$p5_Rzk?ZdNQ%I7)NfMKwr{wNoNHUUw0&UJ8}71&B@1$gjN^{5u6v#R~k~iA!|;x(grf> z#P&RwL>v+FQp^|%w?Thk4AC*c!7G<5Z$VBHL&3hevIH57^R0BoOVK2x+0oX3Bk*_- zvpAt{y;V~)O}mA)!VDPC*Dqb1Us=hWr-K}+Di|+csYCKX)>m#v+r#jY_A;}~0Hwbc>^Ia2ryVafn#2~@sUv6-{F@d_{I*B?&13dbwI$q+ z1oZ5x3qT6o`++_a>IF%gEoq-~$uI%Lg-H(_i7te?FafnPOfrIUiBR{#1i%T+itpAw z)8rW5d$4q7y-{<|ceg60?I(84AaP&cG2hhcmZ|xPjpX)e9kz$AeM`GF zwxtE;Zc3!@>&<6GW+xdWy|3OHj8)<)Mk@X_d)=U1m&Y37OdFAKq-#!sp z3>DWqagGkex3Ql$XY3Px!*{A^g22a}(nY8XohI(HQz!vveSZnuE#c>7QP5#0JMkk} zV7bHs)6Qo`XmuwHlRQaImjmd}MAB1?qb2SGQM5C-v7M5Z4=3C|$q&(&!ALx19zPH6l@31%v`gSA%vFe|81DeV zQHuedWA?QqL^9An)I=&UP^NjpugL5{h z6}}Btz{%yr`Zo9LBQ4@RMB5(yrcwMZNa*GWbSYgxEO^k5gbwjQx)ZdfHy{RXL31J< zDNI$oBA^SCyuCR?7Vy4&nhH8O3rzfA%tqm9IZ3S~tX07*cwlzTt-EhE<=p#t;1QJ@ zZ$GKEbIO>9jaMX06}=YpFS*^rj^p>fcqzv)7%ycYUJAh1Bk);n$ma=~E}-&jbl(r6 zJdd`2gx_B&;HLV|WgqfvbmJKZ;~ByB49sdovnGHvQ@((1@+=kqA5(vTe$mv&wFkzj zQ|)Z`$jsR76j2ARL*0!poH|8J3OLJ|p(z_%UO3P>W%5}J7!Gu(WY2QLL;GX?2YAba zk5d~T90&iO)FpAE5-mUeqdMV;S=5(N0e1&$3iM4~W&+4=aD0{oliK$TqNl@XBjHV0 zXb}Z|{`QYQ1K?G+4oV`f80wF(&kVG>I=Lq8Ctn|VhU z$q&tsNuWWG`;d-9O6ei;IHr1_pH@q{`+nP;DLh?V>7{;^Ku%S6*K#Ui{Yb zXuGK!yx+~a#FyBl3a0t4r z0pke<*Ny?LF&S%(^$c`IyH{s~{yGl*)%1;xWZBq$L1~lSu~X1zm$WU`?64aKv|4DW z&5LW0W;c)Sm_ByG&)hM*Z`_)PR*NWr$iRRbfG%s@*LDYC=3#H-ccDRNgCrYr^MM+N 
z^xty4g4|&1ZAiLB@L*=fu~)I{(^7u>g%sOSaMS3i`rj=ExJmYTFbK%T^0AsT(l2heM z72=YF$%^uXd4OZnZNi50@~|7pVOVa#%?9V6NABHg6?hfuYw(Rjj0!-4_T-Vk5jLd+`Qk&!>NFX1Uc79$%Mi0 zNX)9l8e~E`LK^GQ2zGc%LL33veWuH!xGYFg_$9F&W)iHf$z|N;hLl?WC3Z?#tVgHhlT;_^CrSSeO#&`7$&^8f zP1t+VCRuUr_cj;FYyBBs_h>Y8APa4IRL|7&mQ_!m|)}c+hz74*06J7fb zA+f9uAH9y70Ec2TDboU`V|^5NI_`$SjP&{dYtsgFXA^hOrO5+A`n7yaq`s5Df(hhv+Tw`Gp~WYjz~?(g=st;*FuY6`{gH} z{3x#cgttX;yFd9TZvC-*75yS!XcPB)_40+~YZtB;FTOtijmyi$S6^EW5A11s^TOga zDAq}i)F?Sg2-u7JYy?q}!2B6Hc%)2Zj1-uf`kBD?xGq=G+=pAz@QJ|@j=leaA^>^$ z+J?Ss1n~ZNmVmhnCYJyfdzqZM3&lBNa(W+=JF$n!4S~s}d~+iWCih%yGSjFq)s5YZ z0BdT_u93i@J7RJ3YudZ&8JY24$1|Ax@|$-x`OU!emgrr7fCLW8#2tMzA#ZY<^#cuo z&5Rg$Cli=T<(C`uBuLyPp=5nT>c(LEmmOZme*Ku?v12+C7ZE-wTGG$tR#p~3A}cFQ z3uHN!5xgKqQlluzbWJf5L?f#>>2QC6WNP4JWUxlQCB%Vhj*X7Y*uZ?2q3OmH1(s2% z(>u)Goa%YIqKpQHv_gI;<|r|nVvbGT8n8_Ad^a2LEHVp*LkK==#akA-j6}YUJq`zA zZtRgY%+7x=OsQ~SA9=(o4CzRojUv|Opvu6;^0m8AXf|UN81m{feL-GAKj$jdqWxv&95D&gO7~G~a|v0ZH+Uo$wcKE6G(2Wgq;(UfWe%3a1MFXI;aMSVM$>~gozr09ein}$_Tu2YRb4aXhBjw@{epy}C@J63zNF0}pS3Y$nq_5Hp$?YZdmL{z=pb~Xxg>)`N0#_(vH-Z)ga3~Z-h9(E|_=t72p6Hy>2)$9f-Z&J-t@MP(W;#~13V=POtmf;B zh~aXq%k~_&|KSmrO2c_1HbyE-CuGta31eWo(J;j{)FKqe0^i_d1o-AuxO2;01p{iV zG9v=VsZ^CJ(mlFz9K#(8=^(Y1N#Tz;E<;X91XFu5-+%j@DLa)jnNZ-^O3Fr7)nUZ*v!L?(P*u$ ztDqdB=Ey6FLI9nJ9M-XKL{3%^=z(Guh2v=ISaDrh`SwkkxJ<9t8xAV z)IZi_iC~mk*+U{;kG09o3wzm>*&?^#$&)83{u}fA={T{W!;tNt{S)ZQDjv(qLhCzt zt#};~Th>AF@{1Hjf;TPBc|j5lw&?S_?<>b%0iKaheq?|C?vGA?b%yFcu}_~Z6iz=k zV}Jf9AK6cH+mG@5kMYXcLXl?)Gnb;^%7a86aYpc@&L0T@;#8!2vfZ=>E#W5gE1~_K zaf=)W`U4X$onzh^bAG+JS4**G$(;>SYR+XbK0BC4JFbnG|06es@=$t2KZ86?$RIBf#_?&u0^1+G$xI#6$XhHNIh80r-T zKMO30mQjM448oLV=o1eaZSAiW=c7}&fSZKM zo_RQBSh@*sEy{;h0>0Ys;gu5EDDfZJ|735oxSIwFgG3XqZiFWjL9&_c+PphSD0$Db z8V8{VVFN-VErWE8i?Dm))y=?zn|LP@n7vQlMHC5Q3SJ12dM9(2d!zqA-Rd7@Cj*6Y z06z=Hz#y#m42!)FcCta1VogEfwze~buxTP-$Vj35$hdFp40BktzilKK5f-Rwn950Q zXH?ZxEAGdFA=ws#JLC0X7M!D4O)?nclHY+KP3(*ZquvL>7=FVdi_u1?gj)NL#n8?m zM)=fEsWvFFX)*j>V&_18sME&pOp=0&d_Q^?Scqn2gP~x&cRb7+&4?O9%}{h9(gW0m z=4Z$m1-t;t_%1jgMn1~258;YGAdpG8me`pHEcriz3Ev7V^c-6=p4ZSe;Sa|3yV^Zn zrh5lXvvH#qz2m_EmlI+l9!nhD9SSDylQNGU z@)yA%dKCO&WG|-R;kT5!k0;Lu*2ai9EGFO4#F6)OU8@fVl+zf)M^BG-p8{)IOTVW# zuA!FMGJYAv;d(eYh*3+yA;5c_;XRDn;|g-hh6G%PSnW9h)#s!*xjQ7j z%rG4$nE1)b!&-zLzNEr&L``uVr?Q z)Sv9_&3#YP^X^+*o9VRi+uWA#v=yjEqw%`_eQ*S{GAYgwG|hh}rq#NzLbb9Z@ye+E zo__1^@@}eqBfR_qxP7{V+ktxq9Gers+}TfcriKBmH;a=+yZua%#r{95_8(*Yb$2XK zTXHuczH(o`XEeu%AK>Ipfqo46ICF#N#8+X|O)&?U3Z~Hh62JX?2i_OBh8~^yzScLN zajVdsPn^Se5!LJ*4UXPR$PujMd}k#`G0vZ2p9t<*g>}IGB5K$t)aE{Bk6L#`Jozo) z(2)NeHDDWOj9XwGnP|N$Uh1~s{R^rWY8O9F*3)9X=RUD>jK{qc9K-X=kDM1C7w`@5 zJjVT=@yCM4_-w_vHEw;{2iePjEFlw^1G~>~pe2IrKZ{qoEfHks)q(6)^%md%?T}&I zzXh_t?17B1d!@63OjY-cTZ`SZp8NWOT!i3wM8=cLr6c3lNgaWV2Y=S?L(WrJCeYt! 
z)r2J-3{diHL9(+=stY*P(~;)Y74qT0EPR!$o}466PZBHxXF8lzdi#h{vwO6XxbMEO zv8ptI0T{Ma3|kLO^80uyDT<&V;2;a@8h;;c)3JUu@#xw~iav`U>o;tVb36y@)@j{Q zWChxqEO$nyc@P4zkN^m`EG5=-*Ui>=4im~*^x9{oab)Rb9al}6IY<>UtIds2_ z%z|q)O;(Wg5w;oa<~alXAWoC8CnXeu>lOO2O-mr_hLJ!ZDI^ef7rJYaAK=M;E4fW8q4zUL%Lr3ToK!CBXlN^%XM?P* zXdd*`j?y4hb42@S4-*LChv_H@1KBU6Fsq_TwJ$1K6ce~edFM!v<-GH0n1^tW*qmdY zQ!jcngil3_irXmqkU$HdANAYAQy|TCP=nS-b`A7XWY;m8oW0ng81e4{JIqqLtblPE zMK(W>{}7$z52<*M3StOdE+F5>ZR<}_6#G`#NvVfh<+hin5I)d}q;xo0ka&V~if|tTto|tNVm{(B9VVdO;zatGh_R#|l4U|@ zA;|#wu}InP;Y48L?0AMp2OO>QZ3HKR06OJ?0X*@z&Kj@stVwzNYQ{F z3+MwTE30gUa%v-(o`YnoEkvwl=zA4nvNd29BO#K+x&GYd0EP4&43cZB4f>vdW^?OE zICK$Fi2`Qu*7^WmMI4cz-4OOPT<7p^q!A>J7)1wDqpB3Tx|@1n1|WdSU4`_uSnPipO{N7+;C$7ua) z^qq_M_SC6m`fMorfC7k7fxvU8PQ@wl^WASLfTrjp3ha zvDq_RnLQIfP#_k(VE|6{9hS{YH*REf^T)cYa+Lh(-;L`Gy z`EPuyIRDzUYnK<6i`OnJe&g~Yr;~)4>x-8!T)TMnGCNh|Z_{9#G=xbL$#2m;3CG;= z&?HtE8qlQv9(5QZB`U_NC$CalHjd6xST^hg4as*DPQ=ciKEKicT)~{a>LDXT#kS;2 zG*kxPcoj(&%e%N`4j<~ZP;Z85nz({2&B0i6!;K1E5=#%0$RYMUsG``OBI|OY2_G_? zRqUivLGLh0&Q6cdIDIETR8i!_og|#HZk3Z^n9v}DXua|uP(jBxOsV7g7!exj(H26V zrg2=KO2TIaKNb0^viP6CKW8f=OnS_i^={!c2M)iVQ!&~^y#j}r_DK=jZ`W)N<&^kKB3 zhi;>eK8fQPf$=wWZ%PZ++zqmhI@L9;C{5{ehl~H@jZQ# z0ADUb&l@Rt2n?>JQA_yHR5k`sLrx58@JJ#1Mc)_{7DE3;s*xVElY*~F-x$Kvv_HHt zvO5af*+5o|DVqW1Ga)2Jz9{q=Z@xvzSsMqMfq>WQy$r2UKvT?3EQA)FwIYA16K0xH z)!GO4`94R9tTXdVZ}cX?-~+*~u|al*l85y934P^SKz0B!ocQCI&|G(`&GyKi4;x!A z(Z1{*HTtHHOc+=`ln=^=3V=P2p(&IFk=ywSvd|vx|D=)ayY}2NC8Lw+15drmM1^ia zUp&%>r^voK4{=VB5h&J=e)CvBx`OF%LkaztfKdJ`6zzlie$CjPzm808Qq`3AIc82u zGocTnqR9ODZZL|HQ1VyL#<{Qfhz3iDI2V7V*ILGe@wgnz;n-eB8-kLg5YZFUJ8ugK z|@HxXtWxyGMmgu@r)8sY|ZpFh%(VHR3$5l^`-OdyXX zwA2S2X!JNsUH%Ii>8Dg2KoO2bblD?0x>HkPPI#byn>fP_ z0EKlH_~Oevye$8v<_aPM$;sM(#> zomo-b&ZZ9NKy(2iNMWRH+@Mr|1Q1%JK#^a4q(xH{4bY|q`ft!bNzordfucpw01Z;B z-#K@7xs;TI#LnEgbLT$JJ?A^G8;*_^HT-?&`Jee;Kcs2@Mh}C3CLUhWG(Gw~6ohtF z6S^=0y#4g+?yd7ftx*=cja+{Oh#JYA@$N1&~ z|4O^tzdfR9m%XO&q~kg*PtMK=FZQZ&C=WWqtA=<{a(?u_Q}^7s;kKM`8P%$Rv+Os8 z->gODV)1;l)s7>_m7WvVJtuM-p7YAW#rX?p6nK%-lA&le{^Ik9XnF9%*dtT|DP z4xFg!HY0rJ#>K@&;V&;&s-bL!(yd4@Y6o#tZmlgYI@PclwHxScrs3kNn~`(ra;faR zj`8RtbR+#%?D)cK#{RPJMaA9(Y1GBVnX}o%=P|p3&SiJi2_@a$c&>!*SN*u|L|%+v z!$l9(=qRrxTY2%i=e5)@Kt`HG=KdXpEeWSBD!F~h$(!5PJFDlNAZCz;Vx>RLlJ8T z!pJ+hzgRroA5OZlA2!{|** z&tOCkA4us1%sKS_!eI#Z8-COV9jf)vuX+*l%&c=3^O7}U>&xPFyCSjYK_~$aN6?pW zj$=k(VPd9#qq>BKM1 zSwE^fArKK@CnaFnz+qI_F3juMP%`gV92t`C6eBn1^uLc4C(feiR zv=_v)Sov(!j#{i4@ChMJq{iUA27f@pvL*LMVZrWA70ucW!1@eG)Q18vqKEZiB&N(V zgo1!ikx(dFWFQTEp)Ca_wJ=ixB6QUBPzJ|KrLhD^?nEuO>X8a!+-FeEK$p2ysRL9W zIqpCMuts}~0>Ib=6U-A(cW9inWQA6lfPe`xlzJl6oDadA49!GN0FCskF&5Py3-Hq9 zt%kyCxS%~l(K8Xg{dIci2z3ZP0^=%0;4Cu(61FfDGmoMpV$KX`AnBL_m;ekG0u#38 zq0|tM&dMrYMUAS=^cr!<>p;V26mOE2AQ@ir>h4V+`$Lo`xa%1J86sk&1B^p`sUw&} zU)f$BIEejZ)ZY;p4p%N;APFVhQL~%v_^qr~lJ>yH zx8wROfxQ5YGjW1hrglR~2D+e5kNJ(-33N)xcA2pRb0c$&K;dNx`+{sVMdTVDe+{K) zsZDt$BIAKgJf04u1$3AGEgF+(fPH(^DkG?%*Ni_R+a6je_u)0d6@ON#dC;^^D8;yn z1(`)xS#MWk=p@qc;I4?&CKia1xvQ;(mYq|Xf^YaS!yW4YY+rL$86mWuzWQf1;V*M1 z1B~QHWP!LhT|Xdmf{nPro$e8w%twXWOabo3UuMMe1W3Mcoi>^!Sn+cY8p%9fl{CX< zNvSF7o~O1NwgVDHSSrCR*d>G&QRJyVN?;(4pspG|Y!YZUWp~#kz;U?^VqpqbBDdxp z;hCvfSNbBf<{Tr4O}HXy4$ual z%}n(uHNk1A1xu=NPm@#l?&q(~)nSd#MRm8{am5^A;aujA&4tp3P3H!)(rbmiT6k%5 z%CU;(9$}hvsMVX|>*zh2#>Fyh-PXtSf^O*st`qpTbV;waZAAeUv~u36k}|KFoh*@!oFj5jW@OCu%*Su+DCMa{uABS-om!6YKigHi+DD$WoNo(Vs4t) zGTfsBH=gR`V{0>)n0NKAg&vCN!CJA`j;cQ1(r!G3@2$L4y27OqDxpHE(B@)HMec*z6ezhc(me@J~wRDu}l2 z&%o0HaW>I5)?$4fgWuEVONJ!ImvHqJ8!#`Z%P!QVg=2x+SQ73D4^I(wQ_wme3qn^^ zdZt_XSP;TCk4~VV+NQevd9EEV&%T)E!7%}VrevfAuL%GuvvxW%=ys=CIy_*6 
zrTHuKug+h5b3QfUjifnt#^*~zJm1s`VSlFiiwozdCpkt_wMhLClS7@Pb+|pVFQlT%W7U^k-#L`My=+QdOXGBg-Q1@Z7jBcX#~cMGw0mgOKuph#HnY8(6v$pF|S#%}B~0Ol;C;G6=%xpKJ-%$IPY*wtZ=qZuD0 zy$Ht#MmD@=Nd{2Rs=G@FN|2|uR^B!v+Ib7Voy0_YP^bYQuN44``i2JR!{}c#ZWs## zTy)P!jq;&X7x(lupA9;|S;b=k>^^ugtrK5Df6QBZw9bA63OrfKh#w6ySFiaa6C+YZ zG_idse3>RTL=%g(n>tPG4G7i^Lx41LHyS``jGjG5{`KX`#ml(L=TZCc4i6oxc}?$D zOP=^G>YdhZyz_+CdFq$Wuh$+j^%=BI?FbH+CrYM#fgXntun3z(Tkt;cDJvlxRw;xw%QxhG0<{de^0Mz?-}^zxR=vitEP2x!tB}`q#Ead zfLlG9NuEZ$Yjg{&3RRC=wL`I;3?+pgBl`C?P5A?Z#HLqhfn60&8MPJ)6Yv9*qgf?&MkRzMg zZCLzXzipu=&o$5!_`(1@QXg3Bx_;ZdokPzQU@eH^9iuy()eO@Y$uNImVti^fGEir1 zm>c?rL1UIi+kb&w>OuO9gk?(Zdw{EeN9`hD2EpVg1wFm(UweDynv=z=C@KZJ7p8O- z&YtqK*(-sY(!(mpXTL!C)u?4p)>$b#J$zMuLr$A-@~R3J-;M1$c9L3jmMQ}2<8=m1lofzZJ$dTeYM zJ!V-4T5M!#rCG~YaDj?4 zA(s!$uciyMR8KjnE?&k*SmY2z29dG|BKaEiMKI5Fr=D5kJ^AP=iO+bI zlPGKe#`kK&kSe+_lY%83#-m_2PoWI%`W$ZQ7#2`*m!e9H7#>FC#qo%wum2c79&u9Z z?0fubl<|gXn+Vwd@~w8z6BB;~TMeOS*0)G>wzeG{NE@c+@Di0S^&k60_%5n=tDWsc zpxy6p)&hO^djI2>P|EmPe!gGZ@A6Nm>I$_f%A-^!9hutg7J?{nedxi$;v!b2^0WMs z;IAnlL`#u8hnmtzYLUmA8Z9YLQj>`uuU2|y%%@N-=jmNOqk3whQJQO1*!gOy8;g3H zW4*4n%;p~DtMokyCvTm+NF`nltt2gO?aw-%BWgHe!Et(E=L-5Xn~;`av3W_?X|5~6 z$SeXf_nDSa)F<(8SoCavIAu-gMMGB6O_d1S`y;*>#t)dFN7EoXnV5kF!@zUM}pxO`*D z$jVS+h@E08856ruGoFlW7USWhunrF$M#IR;NTQ2rv0LmBd+%6OvpI_Q`TqMcIFWnA z6L*kvMeA|#5%k@Anh^U2o_2_jiYL`O)dS3&SGp5wGfxg=*X)6 zk@#45r*OJc%}=1_sSm8~&SZRZ62G0v-x$ zPNu~tK!<-4&op&0)1AUxr;;7XM6y$qL@AlV8V=k+PWZ+*ZoHoiuk1=d3Gu8rC}u&G z-O28iX;JR4bN9;bl|5ok9J)g^8s6M{mt4-_l_wGtJ91b&C!QCdyn~&F`SpqzdU&#V58Ok@gL~#Ur(kv zPU^uUN2uU>YF81923lKkje0e1x@nH1?L7C|^ZfT8Cu_es_1%|$+Ip;TCNqtOHB(rwIx@k+=$!Duh9Ep8l-|KpWBGi5hhtBaMviVn2+3>9wZanM&}Po zdz6;zE4_4B#kuh{`2MTZo7oCtUVmygE94>3oeU@NhA$AUvFje~N@}8CHGw>6MD~uF zuHiS>8d3(XPkrXpY~SV8$l8(%Y2p2r@+?FZP9F~4OXrum*!>8tVyyaI&pqZSmF0(jXtX$81U_2N9f zeVX2U0+-Yh?M4g2LwdVp%LDY{;htOW8pSc?L3+u`jW2A2j`Nr7aaSm?`ZKhN$Rja^ z48$)D-TuXdJpl>X|DpUdRBz*Ia@6_}e6suzwFf6F-d=|}25!wQKtIR&+A}%~PF?Hl zhz+49)(6@*jo4hVgh8g+n}~`dwq$+D5*8gyqtdFrs+H{a-=f=7NR8oW!9d=hHVKx= zQHo}4VXvc3Kj8d~AAkoB(m@Q`Q)Zb4(#&YC@@AA-CNA!!SottRFVRg$E+~a7LXT5_ zeC7bGN$+teNu43-W!VNs2X<4oj!lncOFDd>PkE?_cdiBg2Vh2P08<>NL(m=Vw)24eHmI1I-mU7x z`#%=TH5{^|<9`(>#UbEEMBHImNZRDae$j)p!aK(Q$-lu1@-F&+wE1L-uxGIk-^R$E zA^8F(*u*xojZP&)l~#~5%Mv;SY8sy5JBT49czVz97~m&l&z#87V=K{Davb_Lq9d^_ zM&tZSL5vaOh;iiYhVUMt!>-AtVU(eu?C2`KpLpndiFFGh&4QF-2R#wf!akpb*JF#F^wkGiGW3Bars5q5W{A=*lx!|d ze-qx);5;#Fh-rE zrGF|&{yCP;4oK(dBb{+t9ip%D|8)E)%d8)DqMqC-dFYr(+jhnP^`H>HLzKU>`;i`3 zVS{^4$8cuaBnu90fb)ajbviaje@mswwBX7bCrZjq-5uxfjS+gl~xq}S#nc5d!Tqo zsR#>IO~K;fI07=t?Ah}I(qV9D@8Kat!Pm`NC4?2P64?*oioA+3rU^_au2w+6qR9C$ z`#}M@v#o0p;SKt?A0U{);fw;;(DMW=tSRJ<5gA5wm}=|~Et7jr3zZ5_wNly%%~^pN zP^qNFN<|qTm5L "$OUTPUT_JSONL" -# Captures trufflehog stderr (incl. --print-avg-detector-time output) for downstream phases. +# Captures trufflehog stderr (incl. --print-avg-detector-time output) for postmortem inspection. STDERR_FILE="${STDERR_FILE:-/tmp/corpora-stderr.txt}" > "$STDERR_FILE" @@ -30,7 +30,7 @@ fi # When set, scope the scan to specific detectors. Comma-separated, lowercase # proto enum names with optional ".v" suffix (matches the format produced -# by scripts/detect_changed_detectors.sh). +# by scripts/test/detect_changed_detectors.sh). INCLUDE_DETECTORS="${INCLUDE_DETECTORS:-}" INCLUDE_FLAG=() if [[ -n "$INCLUDE_DETECTORS" ]]; then @@ -40,9 +40,7 @@ fi # When set, total uncompressed content bytes streamed to trufflehog (across # all datasets in this run) are written to this path. Used by the diff # script to compute blast-radius density. 
Awk inline-counts the post-jq -# stream so we don't double-read; END block runs before stdin EOF -# propagates out of the pipeline, so the value is written by the time the -# scan exits. +# stream so we don't double-read the corpus for byte accounting. CORPUS_BYTES_FILE="${CORPUS_BYTES_FILE:-}" TOTAL_BYTES=0 diff --git a/scripts/test/diff_corpora_results.py b/scripts/test/diff_corpora_results.py index eab856e0b15f..ed710b446bc4 100755 --- a/scripts/test/diff_corpora_results.py +++ b/scripts/test/diff_corpora_results.py @@ -18,7 +18,7 @@ 10 GB of scanned content. Phase 3a: --keyword-corpus-meta points at the sidecar JSON written by -scripts/build_keyword_corpus.py. When present, detectors whose Layer 1 +scripts/test/build_keyword_corpus.py. When present, detectors whose Layer 1 (GitHub Code Search) fetch returned zero results get a concise warning rendered above the summary table — they're flagged so reviewers know the bench's verdict for those detectors leans entirely on the S3 corpus and diff --git a/scripts/test_changed_detectors.sh b/scripts/test_changed_detectors.sh new file mode 100755 index 000000000000..a6cd5fdcba5c --- /dev/null +++ b/scripts/test_changed_detectors.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +set -uo pipefail + +CHANGED=$(git diff --name-only --no-commit-id origin/master | grep pkg/detectors | grep -v test) +while IFS= read -r FILE; do + DIRECTORY=$(basename $FILE ".go") + if [ -d "pkg/detectors/$DIRECTORY" ] + then + echo $DIRECTORY + go test -v "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/$DIRECTORY" + retVal=$? + if [ $retVal -ne 0 ]; then + exit 1 + fi + fi +done <<< "$CHANGED" From a00d1296b7c9b350131249202ee5c7833a1cc6b0 Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Mon, 4 May 2026 15:20:02 +0500 Subject: [PATCH 29/43] remove github corpus --- .github/workflows/detector-corpora-test.yml | 114 +--- hack/extract-keywords/main.go | 316 --------- scripts/test/build_keyword_corpus.py | 722 -------------------- scripts/test/diff_corpora_results.py | 68 +- 4 files changed, 13 insertions(+), 1207 deletions(-) delete mode 100644 hack/extract-keywords/main.go delete mode 100755 scripts/test/build_keyword_corpus.py diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index c5aa69d979ce..28eaf22b84c8 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -10,8 +10,6 @@ on: - 'scripts/test/detector_corpora_test.sh' - 'scripts/test/diff_corpora_results.py' - 'scripts/test/detect_changed_detectors.sh' - - 'scripts/test/build_keyword_corpus.py' - - 'hack/extract-keywords/**' env: DATASETS: | @@ -113,83 +111,18 @@ jobs: No detector source files changed in this PR. Bench skipped. - # Build extract-keywords first — it's a fast go build (~5s) and is - # needed to extract keywords for the cache key before we can restore - # the corpus cache. - - name: Build extract-keywords helper - if: steps.detect.outputs.any_changed == 'true' - shell: bash - run: CGO_ENABLED=0 go build -o /tmp/extract-keywords ./hack/extract-keywords - - # Compute a cache key from the actual Keywords() output of each changed - # detector. This is more precise than hashing detector source files — - # the cache only invalidates when keywords change, not when regex or - # verification logic changes. 
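For reference, the path normalization used inside the cache-key step below maps changed detector files back to their package directories, including versioned packages. A quick local check with representative paths (jdbc and github/v2 are just examples; any detector path works the same way):

  echo 'pkg/detectors/jdbc/jdbc.go
  pkg/detectors/github/v2/github.go' \
    | sed -E 's|^(pkg/detectors/[^/]+(/v[0-9]+)?)/[^/]+\.go$|\1|' \
    | sort -u
  # pkg/detectors/github/v2
  # pkg/detectors/jdbc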
- - name: Compute keyword corpus cache key - id: corpus_key - if: steps.detect.outputs.any_changed == 'true' - shell: bash - env: - MERGE_BASE: ${{ steps.merge_base.outputs.sha }} - run: | - set -o pipefail - KEYWORDS="" - while IFS= read -r dir; do - [[ -z "$dir" ]] && continue - kws=$(/tmp/extract-keywords "$dir" 2>/dev/null || echo "[]") - KEYWORDS+="$dir:$kws"$'\n' - done < <( - git diff --name-only "$MERGE_BASE"...HEAD -- 'pkg/detectors/**/*.go' \ - | grep -Ev '_test\.go$' \ - | grep -Ev '^pkg/detectors/(common|custom_detectors)/' \ - | sed -E 's|^(pkg/detectors/[^/]+(/v[0-9]+)?)/[^/]+\.go$|\1|' \ - | sort -u - ) - HASH=$(echo "$KEYWORDS" | sha256sum | cut -d' ' -f1) - echo "Keyword corpus cache key: keyword-corpus-v1-${HASH}" - echo "key=keyword-corpus-v1-${HASH}" >> "$GITHUB_OUTPUT" - - - name: Restore keyword corpus cache - id: corpus_cache - if: steps.detect.outputs.any_changed == 'true' - uses: actions/cache/restore@v4 - with: - path: /tmp/keyword-corpus-cache - key: ${{ steps.corpus_key.outputs.key }} - # Partial hit: fetch only the keywords not already in the cache dir. - restore-keys: keyword-corpus-v1- - - # Three independent chains run in parallel: - # A) build keyword corpus — loads per-keyword cache files, fetches - # only keywords not already cached. Always runs (fast on hit). - # B) prepare main worktree → build main binary (git I/O then CPU) - # C) build PR binary (CPU, no dependencies) - - name: Build binaries and keyword corpus - id: build + # Two independent builds run in parallel: + # A) prepare main worktree → build main binary (git I/O then CPU) + # B) build PR binary (CPU, no dependencies) + - name: Build binaries if: steps.detect.outputs.any_changed == 'true' shell: bash env: MERGE_BASE: ${{ steps.merge_base.outputs.sha }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - DETECTORS: ${{ steps.detect.outputs.pr_csv }} run: | set -o pipefail - # Chain A: build keyword corpus. Loads per-keyword cache files from - # /tmp/keyword-corpus-cache/ and only calls the GitHub API for keywords - # not present there, so it's fast when most keywords are cached. - ( - python3 scripts/test/build_keyword_corpus.py \ - --detectors="$DETECTORS" \ - --extract-keywords-bin=/tmp/extract-keywords \ - --output-corpus=/tmp/keyword-corpus.jsonl.zstd \ - --output-meta=/tmp/keyword-corpus-meta.json \ - --cache-dir=/tmp/keyword-corpus-cache \ - --max-results-per-detector="${KEYWORD_CORPUS_CAP:-100}" - ) & - PID_CORPUS=$! - - # Chain B: prepare worktree, then build main binary. + # Chain A: prepare worktree, then build main binary. ( git worktree add /tmp/trufflehog-main-src "$MERGE_BASE" cd /tmp/trufflehog-main-src @@ -197,38 +130,12 @@ jobs: ) & PID_MAIN_BUILD=$! - # Chain C: build PR binary (no dependencies). + # Chain B: build PR binary (no dependencies). CGO_ENABLED=0 go build -o /tmp/trufflehog-pr . & PID_PR_BUILD=$! - wait $PID_CORPUS || { echo "Keyword corpus build failed" >&2; exit 1; } - wait $PID_MAIN_BUILD || { echo "Main binary build failed" >&2; exit 1; } - wait $PID_PR_BUILD || { echo "PR binary build failed" >&2; exit 1; } - - # Save the per-keyword cache dir only when there were no keyword - # failures. A partial run leaves the cache dir incomplete — skipping - # the save lets the next push retry the missing keywords. 
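The corpus_useful gate that follows hinges on a single jq reduction over the meta sidecar; a worked example with an invented meta file shows the arithmetic it performs:

  # hypothetical /tmp/keyword-corpus-meta.json:
  #   {"reports":[{"keyword_failures":["acme_token"]},{"keyword_failures":[]}]}
  jq '[.reports[].keyword_failures | length] | add // 0' /tmp/keyword-corpus-meta.json
  # -> 1, so CORPUS_USEFUL stays false and the cache save is skipped
  # an empty "reports" array reduces to null, which `// 0` turns back into 0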
- CORPUS_USEFUL=false - if [[ -s /tmp/keyword-corpus-meta.json ]]; then - failures=$(jq '[.reports[].keyword_failures | length] | add // 0' /tmp/keyword-corpus-meta.json) - [[ "$failures" -eq 0 ]] && CORPUS_USEFUL=true - fi - echo "corpus_useful=$CORPUS_USEFUL" >> "$GITHUB_OUTPUT" - - # Always append keyword corpus to DATASETS for the scan step. - printf 'DATASETS<> "$GITHUB_ENV" - - # Save the per-keyword cache dir when the run completed without failures. - # Uses the exact key so subsequent pushes with the same keyword set get - # an exact hit and the Python script runs in cache-only mode (no API calls). - # Skipped on exact cache hit (cache-hit == 'true') since the dir is unchanged. - - name: Save keyword corpus cache - if: steps.detect.outputs.any_changed == 'true' && steps.corpus_cache.outputs.cache-hit != 'true' && steps.build.outputs.corpus_useful == 'true' - uses: actions/cache/save@v4 - with: - path: /tmp/keyword-corpus-cache - key: ${{ steps.corpus_key.outputs.key }} + wait $PID_MAIN_BUILD || { echo "Main binary build failed" >&2; exit 1; } + wait $PID_PR_BUILD || { echo "PR binary build failed" >&2; exit 1; } # PR and main scans run in parallel. Each streams the corpus files # independently from S3 — no shared state, different output files, @@ -290,16 +197,11 @@ jobs: if [[ -s /tmp/corpus-bytes.txt ]]; then CORPUS_BYTES=$(cat /tmp/corpus-bytes.txt) fi - META_ARG=() - if [[ -s /tmp/keyword-corpus-meta.json ]]; then - META_ARG=(--keyword-corpus-meta=/tmp/keyword-corpus-meta.json) - fi python3 scripts/test/diff_corpora_results.py \ /tmp/results-main.jsonl /tmp/results-pr.jsonl \ --changed-detectors="$CHANGED" \ --new-detectors="$NEW_DETECTORS" \ --corpus-bytes="$CORPUS_BYTES" \ - "${META_ARG[@]}" \ > /tmp/diff-report.md cat /tmp/diff-report.md diff --git a/hack/extract-keywords/main.go b/hack/extract-keywords/main.go deleted file mode 100644 index 3a95773f86b8..000000000000 --- a/hack/extract-keywords/main.go +++ /dev/null @@ -1,316 +0,0 @@ -// extract-keywords parses a detector package directory and prints the -// strings returned by its `Keywords() []string` method as a JSON array. -// -// Used by scripts/build_keyword_corpus.py to fan out per-detector GitHub -// Code Search queries during the corpora bench. Static parsing is preferred -// over compile-and-import because each detector lives in its own package -// and importing them dynamically requires either codegen or `plugin`. -// -// Resolution order: -// 1. Walk all non-test *.go files via go/parser. -// 2. Find a method named Keywords with no parameters and a single -// []string return; take its first ReturnStmt. -// 3. If the return expr is a []string composite literal, collect string -// literal elements. -// 4. If it's an identifier, look up a package-level var with that name and -// extract from its initializer composite literal. -// 5. If AST extraction yields nothing, fall back to a regex over the body -// of the same Keywords function — handles oddities like build-tag-gated -// bodies that the parser may have skipped. -// -// Exit codes: -// -// 0 — keywords printed (possibly empty array). -// 1 — directory unreadable or no Keywords method found anywhere. -// -// An empty array on exit 0 is a deliberate signal to the caller that this -// detector should be marked thin-L1 and skipped, distinct from a hard -// failure (exit 1). 
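To make the resolution order above concrete, the two return shapes the AST walker handles (steps 3 and 4) look roughly like this in a detector package — the receiver, names, and keyword strings are illustrative only, and the two shapes are alternatives rather than code that coexists in one package:

  // Shape 1: inline composite literal — string elements are read straight
  // off the ReturnStmt.
  func (s Scanner) Keywords() []string { return []string{"acme", "acme_token"} }

  // Shape 2: identifier resolved against a package-level var initializer.
  var defaultKeywords = []string{"acme", "acme_token"}

  func (s Scanner) Keywords() []string { return defaultKeywords }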
-package main - -import ( - "encoding/json" - "fmt" - "go/ast" - "go/parser" - "go/token" - "os" - "path/filepath" - "regexp" - "strconv" - "strings" -) - -func main() { - if len(os.Args) != 2 { - fmt.Fprintln(os.Stderr, "usage: extract-keywords ") - os.Exit(1) - } - dir := os.Args[1] - info, err := os.Stat(dir) - if err != nil || !info.IsDir() { - fmt.Fprintf(os.Stderr, "extract-keywords: %s is not a readable directory\n", dir) - os.Exit(1) - } - - keywords, found, err := extractFromDir(dir) - if err != nil { - fmt.Fprintf(os.Stderr, "extract-keywords: %v\n", err) - os.Exit(1) - } - if !found { - fmt.Fprintf(os.Stderr, "extract-keywords: no Keywords() method found in %s\n", dir) - os.Exit(1) - } - - out, _ := json.Marshal(keywords) - fmt.Println(string(out)) -} - -// extractFromDir parses all non-test Go files in dir and returns the -// keyword list. The found return distinguishes "Keywords method exists but -// returns nothing extractable" (found=true, empty slice) from "no Keywords -// method at all" (found=false). -func extractFromDir(dir string) (keywords []string, found bool, err error) { - fset := token.NewFileSet() - // parser.ParseDir is deprecated in favour of go/packages, but we - // deliberately want a build-tag-agnostic union of every file in the - // directory rather than the type-checked, build-tag-respecting view that - // go/packages produces. Switching would force a new direct module - // dependency for marginal gain on a CI helper. - //nolint:staticcheck - pkgs, err := parser.ParseDir(fset, dir, func(fi os.FileInfo) bool { - return !strings.HasSuffix(fi.Name(), "_test.go") - }, 0) - if err != nil { - return nil, false, fmt.Errorf("parse %s: %w", dir, err) - } - - // Most detector dirs have one package; versioned dirs (e.g. github/v2) - // also have one. Iterating handles both without a special case. - for _, pkg := range pkgs { - fnDecl, fnFile := findKeywordsFunc(pkg) - if fnDecl == nil { - continue - } - found = true - kws := extractFromFunc(fnDecl, pkg) - if len(kws) > 0 { - return kws, true, nil - } - // AST resolution failed — fall back to regex over the source range - // of the Keywords function body. Handles cases the AST walker - // can't statically resolve (helper calls, build-tagged variants). - if grepped := grepFallback(fset, fnFile, fnDecl); len(grepped) > 0 { - return grepped, true, nil - } - } - - if !found { - // Last-ditch: pure-grep across all source files in the dir. Catches - // cases where parser.ParseDir filtered the file out (rare; e.g. - // build-tag exclusion with the default ParseDir filter). - grepped, ok := grepDirFallback(dir) - if ok { - return grepped, true, nil - } - } - - return nil, found, nil -} - -// findKeywordsFunc returns the Keywords method decl (if any) and the file -// containing it. -func findKeywordsFunc(pkg *ast.Package) (*ast.FuncDecl, *ast.File) { - for _, file := range pkg.Files { - for _, decl := range file.Decls { - fn, ok := decl.(*ast.FuncDecl) - if !ok { - continue - } - if fn.Name == nil || fn.Name.Name != "Keywords" { - continue - } - if fn.Recv == nil || len(fn.Recv.List) != 1 { - continue - } - // Must look like `Keywords() []string`. Don't be picky about the - // receiver — both Scanner and scanner are seen in the codebase. 
- if fn.Type.Params != nil && len(fn.Type.Params.List) > 0 { - continue - } - if fn.Type.Results == nil || len(fn.Type.Results.List) != 1 { - continue - } - return fn, file - } - } - return nil, nil -} - -// extractFromFunc walks the function body for a return statement whose -// expression is either a []string composite literal or an identifier -// referring to a package-level var initialised with one. -func extractFromFunc(fn *ast.FuncDecl, pkg *ast.Package) []string { - if fn.Body == nil { - return nil - } - var out []string - ast.Inspect(fn.Body, func(n ast.Node) bool { - ret, ok := n.(*ast.ReturnStmt) - if !ok || len(ret.Results) == 0 { - return true - } - switch expr := ret.Results[0].(type) { - case *ast.CompositeLit: - out = append(out, stringLitsFromComposite(expr)...) - case *ast.Ident: - if vals := lookupPackageStringSlice(pkg, expr.Name); len(vals) > 0 { - out = append(out, vals...) - } - } - return false - }) - return dedupNonEmpty(out) -} - -// stringLitsFromComposite extracts string literal elements from a -// `[]string{"a", "b", ...}` composite literal. Non-literal elements (e.g. -// helper calls) are silently dropped — the caller falls back to regex. -func stringLitsFromComposite(c *ast.CompositeLit) []string { - if c == nil { - return nil - } - if !isStringSliceType(c.Type) { - return nil - } - var out []string - for _, el := range c.Elts { - if lit, ok := el.(*ast.BasicLit); ok && lit.Kind == token.STRING { - if s, err := strconv.Unquote(lit.Value); err == nil { - out = append(out, s) - } - } - } - return out -} - -func isStringSliceType(expr ast.Expr) bool { - at, ok := expr.(*ast.ArrayType) - if !ok { - return false - } - id, ok := at.Elt.(*ast.Ident) - return ok && id.Name == "string" -} - -// lookupPackageStringSlice resolves a package-level -// `var = []string{...}` declaration into its string literals. -func lookupPackageStringSlice(pkg *ast.Package, name string) []string { - for _, file := range pkg.Files { - for _, decl := range file.Decls { - gen, ok := decl.(*ast.GenDecl) - if !ok || gen.Tok != token.VAR { - continue - } - for _, spec := range gen.Specs { - vs, ok := spec.(*ast.ValueSpec) - if !ok { - continue - } - for i, n := range vs.Names { - if n.Name != name || i >= len(vs.Values) { - continue - } - if c, ok := vs.Values[i].(*ast.CompositeLit); ok { - if vals := stringLitsFromComposite(c); len(vals) > 0 { - return vals - } - } - } - } - } - } - return nil -} - -// stringLitRE matches Go double-quoted string literals (including escapes -// and \u sequences). Backtick raw strings are uncommon in keyword lists -// and are intentionally not handled. -var stringLitRE = regexp.MustCompile(`"((?:\\.|[^"\\])*)"`) - -// grepFallback extracts string literals from the source span of the -// Keywords function body using a regex. Used when AST resolution fails. -func grepFallback(fset *token.FileSet, file *ast.File, fn *ast.FuncDecl) []string { - if fn.Body == nil { - return nil - } - tokFile := fset.File(file.Pos()) - if tokFile == nil { - return nil - } - src, err := os.ReadFile(tokFile.Name()) - if err != nil { - return nil - } - start := tokFile.Offset(fn.Body.Lbrace) - end := tokFile.Offset(fn.Body.Rbrace) - if start < 0 || end <= start || end > len(src) { - return nil - } - return matchStringLits(string(src[start:end])) -} - -// grepDirFallback scans every .go file in dir for a `Keywords() []string` -// signature and extracts string literals from its body. Used when -// parser.ParseDir didn't surface any package (build-tag filtering, etc.). 
-func grepDirFallback(dir string) ([]string, bool) { - matches, err := filepath.Glob(filepath.Join(dir, "*.go")) - if err != nil { - return nil, false - } - bodyRE := regexp.MustCompile(`(?ms)Keywords\(\)\s*\[\]string\s*\{(.*?)^\}`) - var out []string - found := false - for _, m := range matches { - if strings.HasSuffix(m, "_test.go") { - continue - } - src, err := os.ReadFile(m) - if err != nil { - continue - } - for _, body := range bodyRE.FindAllStringSubmatch(string(src), -1) { - found = true - out = append(out, matchStringLits(body[1])...) - } - } - return dedupNonEmpty(out), found -} - -func matchStringLits(s string) []string { - var out []string - for _, m := range stringLitRE.FindAllStringSubmatch(s, -1) { - // m[0] is `"..."`, suitable for strconv.Unquote. - if v, err := strconv.Unquote(m[0]); err == nil { - out = append(out, v) - } - } - return dedupNonEmpty(out) -} - -func dedupNonEmpty(in []string) []string { - seen := make(map[string]struct{}, len(in)) - out := make([]string, 0, len(in)) - for _, s := range in { - if s == "" { - continue - } - if _, ok := seen[s]; ok { - continue - } - seen[s] = struct{}{} - out = append(out, s) - } - return out -} diff --git a/scripts/test/build_keyword_corpus.py b/scripts/test/build_keyword_corpus.py deleted file mode 100755 index be9d0e13de69..000000000000 --- a/scripts/test/build_keyword_corpus.py +++ /dev/null @@ -1,722 +0,0 @@ -#!/usr/bin/env python3 -"""Build the Layer 1 keyword corpus by querying GitHub Code Search for the -keywords each changed detector pre-filters on. - -Output is a zstd-compressed JSONL whose shape matches the S3 corpus: -each line is `{"provenance": {...}, "content": ""}`. -The corpora script extracts `.content` and pipes it to trufflehog via -stdin, so provenance fields are descriptive only — they aid postmortem -debugging of where a finding came from but don't reach trufflehog itself. - -A sidecar meta JSON is written next to the corpus. It reports per-detector -result counts plus a `thin_l1` list of detectors whose total returned -results was zero. The diff script reads it to render a thin-coverage -callout. - -Rate-limit policy: - - Search bucket is 30 requests/minute on the authenticated search API. - - We track X-RateLimit-Remaining and X-RateLimit-Reset on every search - response and pre-emptively sleep when remaining < safety threshold. - - Floor of 2.1s between consecutive search calls as belt-and-suspenders. - - 403/429 responses: honor Retry-After / X-RateLimit-Reset, sleep, retry - once. Two failures in a row → give up the keyword and move on. - -Cap: - - At most --max-results-per-detector unique results across all keywords - for that detector (default 100). - - Per-keyword sub-cap of ceil(cap / len(keywords)) so one popular - keyword can't starve the others. - - Identity for dedup: (repo_full_name, path, sha). - -Dependencies: - - Python stdlib only at runtime. - - `zstd` CLI (already installed in the corpora workflow) for the final - compression step. -""" -from __future__ import annotations - -import argparse -import hashlib -import json -import math -import os -import subprocess -import sys -import time -import urllib.error -import urllib.parse -import urllib.request -from dataclasses import dataclass, field -from typing import Any - - -GITHUB_API = "https://api.github.com" -USER_AGENT = "trufflehog-detector-bench/0.1" -SEARCH_PER_PAGE = 100 # API max — fewer round-trips means less rate budget eaten. -SEARCH_FLOOR_SLEEP = 2.1 # seconds — 30 req/min => 2s; 0.1 of cushion. 
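The cap and pacing figures above reduce to small, checkable numbers. A throwaway sketch (keyword list invented) mirroring the per_kw_cap computation that run_main_loop applies further down:

  import math

  cap = 100                                          # --max-results-per-detector default
  keywords = ["acme", "acme_token", "acme-secret"]   # hypothetical detector keywords
  per_kw_cap = max(1, math.ceil(cap / len(keywords)))
  print(per_kw_cap)   # 34 — one popular keyword cannot exhaust the whole budget
  print(60 / 2.1)     # ~28.6 search calls/minute, safely under the 30/min search quota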
-RAW_FETCH_TIMEOUT = 20.0 -SEARCH_TIMEOUT = 30.0 -MAX_RAW_BYTES = 384 * 1024 # GH Code Search index ceiling; defensive cap. - - -@dataclass -class RateState: - """Rate-limit state for the search bucket. Updated from every search - response and consulted before the next call.""" - - remaining: int = 30 # Optimistic; real value comes back in the first response. - reset_epoch: float = 0.0 - last_call: float = 0.0 - - def wait_before_call(self, safety: int = 2) -> None: - """Sleep just enough to respect both the 30/min header budget and - the per-call floor.""" - now = time.time() - # Floor pacing. - gap = SEARCH_FLOOR_SLEEP - (now - self.last_call) - if gap > 0: - time.sleep(gap) - # Header-driven pacing. - if self.remaining is not None and self.remaining < safety: - now = time.time() - wait = max(0.0, self.reset_epoch - now) + 1.0 - if wait > 0: - print( - f"[rate-limit] remaining={self.remaining}, sleeping {wait:.1f}s for reset", - file=sys.stderr, - ) - time.sleep(wait) - # After the reset window expires, the bucket is full again. - self.remaining = 30 - - -@dataclass -class DetectorReport: - detector: str - keywords: list[str] = field(default_factory=list) - fetched: int = 0 - cache_hits: int = 0 - keyword_failures: list[str] = field(default_factory=list) - thin_l1: bool = False - - -def main() -> int: - args = parse_args() - - token = os.environ.get("GITHUB_TOKEN", "").strip() - if not token: - print( - "[build_keyword_corpus] GITHUB_TOKEN is empty; writing an empty corpus " - "and marking all detectors thin_l1.", - file=sys.stderr, - ) - - detectors = [d.strip() for d in args.detectors.split(",") if d.strip()] - if not detectors: - # Nothing changed — write empty outputs and exit cleanly so the - # workflow can still append the path to DATASETS without a special - # case. - write_outputs(args.output_corpus, args.output_meta, [], {"reports": [], "thin_l1": []}) - return 0 - - rate = RateState() - reports: list[DetectorReport] = [] - corpus_lines: list[dict[str, Any]] = [] - seen_global: set[tuple[str, str, str]] = set() - - # Anything below can take time and touch the network. We want a written - # corpus + meta sidecar regardless of whether we got partway through, so - # downstream workflow steps stay deterministic even on fetch failures. 
- cache_dir = getattr(args, "cache_dir", None) or None - if cache_dir: - os.makedirs(cache_dir, exist_ok=True) - - try: - run_main_loop(args, detectors, token, rate, reports, corpus_lines, seen_global, cache_dir) - finally: - summary = build_summary(reports) - write_outputs(args.output_corpus, args.output_meta, corpus_lines, summary) - print( - f"[build_keyword_corpus] wrote {len(corpus_lines)} corpus lines, " - f"{len(summary['thin_l1'])} detector(s) marked thin_l1", - file=sys.stderr, - ) - return 0 - - -def build_summary(reports: list[DetectorReport]) -> dict[str, Any]: - return { - "reports": [ - { - "detector": r.detector, - "keywords": r.keywords, - "fetched": r.fetched, - "cache_hits": r.cache_hits, - "keyword_failures": r.keyword_failures, - "thin_l1": r.thin_l1, - } - for r in reports - ], - "thin_l1": [r.detector for r in reports if r.thin_l1], - } - - -def run_main_loop( - args: argparse.Namespace, - detectors: list[str], - token: str, - rate: RateState, - reports: list[DetectorReport], - corpus_lines: list[dict[str, Any]], - seen_global: set[tuple[str, str, str]], - cache_dir: str | None = None, -) -> None: - for raw_name in detectors: - # detect_changed_detectors.sh emits names like "github.v2"; the - # source dir is pkg/detectors/github/v2. Strip the .v suffix - # and translate it into a /v path component. - detector_name, version_suffix = split_version(raw_name) - package_dir = resolve_package_dir(detector_name, version_suffix, args.detectors_root) - - report = DetectorReport(detector=raw_name) - reports.append(report) - - if package_dir is None: - print( - f"[build_keyword_corpus] {raw_name}: cannot resolve package dir; " - "marking thin_l1", - file=sys.stderr, - ) - report.thin_l1 = True - continue - - keywords = run_extract_keywords(args.extract_keywords_bin, package_dir) - report.keywords = keywords - if not keywords: - print( - f"[build_keyword_corpus] {raw_name}: no keywords extracted from " - f"{package_dir}; marking thin_l1", - file=sys.stderr, - ) - report.thin_l1 = True - continue - - if not token: - report.thin_l1 = True - continue - - per_kw_cap = max(1, math.ceil(args.max_results_per_detector / len(keywords))) - cap_remaining = args.max_results_per_detector - - print( - f"[build_keyword_corpus] {raw_name}: keywords={keywords} " - f"cap={args.max_results_per_detector} per_kw_cap={per_kw_cap}", - file=sys.stderr, - ) - - for kw in keywords: - if cap_remaining <= 0: - break - try: - added, from_cache = fetch_keyword_results( - keyword=kw, - detector_label=raw_name, - cap_remaining=cap_remaining, - per_kw_cap=per_kw_cap, - rate=rate, - token=token, - seen_global=seen_global, - corpus_lines=corpus_lines, - cache_dir=cache_dir, - ) - except KeywordFetchError as exc: - print( - f"[build_keyword_corpus] {raw_name}: keyword '{kw}' failed: {exc}", - file=sys.stderr, - ) - report.keyword_failures.append(kw) - continue - except Exception as exc: # noqa: BLE001 — last-resort, see below - # We want partial outputs on the way out even if a fetch - # step blows up unexpectedly. Log, mark, continue — the - # finally block in main() still writes corpus/meta. 
- print( - f"[build_keyword_corpus] {raw_name}: keyword '{kw}' raised " - f"{type(exc).__name__}: {exc}", - file=sys.stderr, - ) - report.keyword_failures.append(kw) - continue - if from_cache: - report.cache_hits += added - else: - report.fetched += added - cap_remaining -= added - - if report.fetched == 0: - report.thin_l1 = True - - -def parse_args() -> argparse.Namespace: - p = argparse.ArgumentParser() - p.add_argument( - "--detectors", - default=os.environ.get("DETECTORS", ""), - help="Comma-separated detector list (matches detect_changed_detectors.sh format).", - ) - p.add_argument( - "--detectors-root", - default="pkg/detectors", - help="Path to the detectors source tree (default pkg/detectors).", - ) - p.add_argument( - "--extract-keywords-bin", - default=os.environ.get("EXTRACT_KEYWORDS_BIN", "/tmp/extract-keywords"), - help="Pre-built extract-keywords binary.", - ) - p.add_argument( - "--output-corpus", - default="/tmp/keyword-corpus.jsonl.zstd", - help="Path for the zstd-compressed JSONL corpus output.", - ) - p.add_argument( - "--output-meta", - default="/tmp/keyword-corpus-meta.json", - help="Path for the per-detector meta sidecar JSON.", - ) - p.add_argument( - "--cache-dir", - default="", - help="Directory for per-keyword result cache. Populated on fetch, read on subsequent runs.", - ) - p.add_argument( - "--max-results-per-detector", - type=int, - default=int(os.environ.get("KEYWORD_CORPUS_CAP", "100")), - help="Cap on unique results fetched per detector across all keywords.", - ) - return p.parse_args() - - -def split_version(name: str) -> tuple[str, str]: - """`jdbc` → ('jdbc', ''); `github.v2` → ('github', 'v2').""" - if "." in name: - base, _, ver = name.partition(".") - return base, ver - return name, "" - - -def resolve_package_dir(name: str, version: str, root: str) -> str | None: - """Map a detector identifier back to its package directory. - - detect_changed_detectors.sh emits the proto-enum name (lowercase), but - package directory names sometimes diverge (e.g. proto NpmToken lives in - pkg/detectors/npmtoken). When the simple lowercase mapping doesn't - exist we fall through with None and let the caller mark thin_l1 — this - is correct semantics: we couldn't find data for this detector, surface - it as thin coverage rather than failing the workflow. 
- """ - candidates = [name] - if version: - candidates = [os.path.join(c, version) for c in candidates] - for c in candidates: - path = os.path.join(root, c) - if os.path.isdir(path): - return path - return None - - -def run_extract_keywords(binary: str, package_dir: str) -> list[str]: - if not os.path.isfile(binary): - print( - f"[build_keyword_corpus] extract-keywords binary not found at {binary}", - file=sys.stderr, - ) - return [] - try: - out = subprocess.run( - [binary, package_dir], - capture_output=True, - text=True, - timeout=20, - check=False, - ) - except subprocess.TimeoutExpired: - print(f"[build_keyword_corpus] extract-keywords timed out on {package_dir}", file=sys.stderr) - return [] - if out.returncode != 0: - if out.stderr.strip(): - print(out.stderr.strip(), file=sys.stderr) - return [] - try: - loaded = json.loads(out.stdout.strip() or "[]") - except json.JSONDecodeError: - return [] - if not isinstance(loaded, list): - return [] - return [k for k in loaded if isinstance(k, str) and k] - - -class KeywordFetchError(Exception): - """Wraps a fatal failure for a single keyword lookup.""" - - -def _keyword_cache_key(keyword: str) -> str: - """Stable filename-safe key for a keyword's cache file.""" - return hashlib.sha256(keyword.encode()).hexdigest() - - -def fetch_keyword_results( - *, - keyword: str, - detector_label: str, - cap_remaining: int, - per_kw_cap: int, - rate: RateState, - token: str, - seen_global: set[tuple[str, str, str]], - corpus_lines: list[dict[str, Any]], - cache_dir: str | None = None, -) -> tuple[int, bool]: - """Returns (added, from_cache). - - added: number of corpus lines added for this keyword. - from_cache: True if results came from the on-disk keyword cache. - """ - # --- cache read --- - if cache_dir: - cache_file = os.path.join(cache_dir, _keyword_cache_key(keyword) + ".json") - if os.path.isfile(cache_file): - try: - with open(cache_file, encoding="utf-8") as f: - cached_lines: list[dict[str, Any]] = json.load(f) - added = 0 - for line in cached_lines: - if added >= per_kw_cap or (cap_remaining - added) <= 0: - break - prov = line.get("provenance") or {} - key = (prov.get("repo", ""), prov.get("path", ""), prov.get("sha", "")) - if not key[0] or key in seen_global: - continue - seen_global.add(key) - corpus_lines.append(line) - added += 1 - print( - f"[build_keyword_corpus] cache hit: keyword='{keyword}' loaded {added} lines", - file=sys.stderr, - ) - return added, True - except (OSError, json.JSONDecodeError) as exc: - print( - f"[build_keyword_corpus] cache read failed for '{keyword}': {exc}; fetching fresh", - file=sys.stderr, - ) - - # --- fresh fetch --- - added = 0 - fetched_lines: list[dict[str, Any]] = [] - page = 1 - while added < per_kw_cap and (cap_remaining - added) > 0: - items, has_more = search_code(keyword, page, rate, token) - if not items: - break - for item in items: - if added >= per_kw_cap or (cap_remaining - added) <= 0: - break - repo = (item.get("repository") or {}).get("full_name") or "" - path = item.get("path") or "" - sha = item.get("sha") or "" - key = (repo, path, sha) - if not repo or not path or key in seen_global: - continue - download_url = item.get("html_url") - # `git_url` (blob API) is the canonical content source; fall - # back to constructing a raw URL from the html_url when blob is - # absent. Keep both candidates for robustness. 
- raw_candidates = build_raw_candidates(item) - content = fetch_first_ok(raw_candidates, token=token) - if content is None: - continue - seen_global.add(key) - line = { - "provenance": { - "layer": "L1", - "detector": detector_label, - "keyword": keyword, - "repo": repo, - "path": path, - "sha": sha, - "url": download_url or "", - }, - "content": content, - } - corpus_lines.append(line) - fetched_lines.append(line) - added += 1 - if not has_more: - break - page += 1 - - # --- cache write --- - if cache_dir and fetched_lines: - cache_file = os.path.join(cache_dir, _keyword_cache_key(keyword) + ".json") - try: - with open(cache_file, "w", encoding="utf-8") as f: - json.dump(fetched_lines, f) - except OSError as exc: - print( - f"[build_keyword_corpus] cache write failed for '{keyword}': {exc}", - file=sys.stderr, - ) - - return added, False - - -def search_code( - keyword: str, - page: int, - rate: RateState, - token: str, -) -> tuple[list[dict[str, Any]], bool]: - """Single page of GitHub Code Search. Returns (items, has_more). - - `has_more` is True iff the response yielded a full page of results, - indicating the next page may have content. Using the size of the - returned items list (vs. parsing the total_count field) avoids - overshooting the 1000-result hard cap that the search API enforces. - """ - qs = urllib.parse.urlencode( - {"q": keyword, "per_page": SEARCH_PER_PAGE, "page": page} - ) - url = f"{GITHUB_API}/search/code?{qs}" - body, headers = github_request( - url, - token=token, - accept="application/vnd.github.v3+json", - rate=rate, - is_search=True, - ) - update_rate(rate, headers) - if body is None: - return [], False - try: - data = json.loads(body) - except json.JSONDecodeError: - return [], False - items = data.get("items") or [] - has_more = len(items) >= SEARCH_PER_PAGE - return items, has_more - - -def build_raw_candidates(item: dict[str, Any]) -> list[str]: - """Build candidate raw-content URLs from a code-search hit. - - The search API doesn't return a direct raw URL — `html_url` points at - the GitHub web UI. Translate it to raw.githubusercontent.com by - replacing `/blob/` with the raw host. Also include the `git_url` blob - API URL as a backup; that path is on the core 5000/hr token bucket - rather than the 30/min search bucket, so it's a safer fallback when - raw.githubusercontent.com gives us trouble. - """ - out: list[str] = [] - html_url = item.get("html_url") or "" - if html_url and "/blob/" in html_url: - raw = ( - html_url.replace("https://github.com/", "https://raw.githubusercontent.com/", 1) - .replace("/blob/", "/", 1) - ) - out.append(raw) - git_url = item.get("git_url") or "" - if git_url: - out.append(git_url) # GET on this returns a JSON envelope with base64 content. - return out - - -def fetch_first_ok(urls: list[str], *, token: str) -> str | None: - """Try each candidate URL in order and return the first successful - body, or None if all fail. The blob-API form returns a JSON envelope - that we decode separately.""" - for url in urls: - try: - if url.startswith("https://raw.githubusercontent.com/"): - req = urllib.request.Request(url, headers=raw_headers(token)) - with urllib.request.urlopen(req, timeout=RAW_FETCH_TIMEOUT) as resp: - data = resp.read(MAX_RAW_BYTES + 1) - if len(data) > MAX_RAW_BYTES: - return None - return decode_text(data) - # Blob API path: fetch JSON, base64-decode `content`. 
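  # For orientation only — a trimmed blob-API response body looks roughly like
  # this (field values invented):
  #   {"sha": "3f2a1c9...", "size": 10, "encoding": "base64", "content": "U0VDUkVUPWFiYw==\n"}
  # so after the JSON parse below, base64-decoding "content" recovers the raw
  # file bytes (here b"SECRET=abc"), which are then passed through decode_text().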
- body, _headers = github_request( - url, - token=token, - accept="application/vnd.github.v3+json", - rate=None, - is_search=False, - ) - if not body: - continue - try: - payload = json.loads(body) - except json.JSONDecodeError: - continue - if (payload.get("encoding") or "").lower() == "base64": - import base64 - - raw = base64.b64decode(payload.get("content") or "") - if len(raw) > MAX_RAW_BYTES: - return None - return decode_text(raw) - except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError, OSError): - continue - return None - - -def github_request( - url: str, - *, - token: str, - accept: str, - rate: RateState | None, - is_search: bool, - max_retries: int = 1, -) -> tuple[str | None, dict[str, str]]: - """Issue a GitHub API request, honoring rate-limit pacing for searches - and retrying once on 403/429 if the headers indicate a wait window. - Returns (body, headers) — body is None on hard failure.""" - headers = { - "User-Agent": USER_AGENT, - "Accept": accept, - } - if token: - headers["Authorization"] = f"Bearer {token}" - - attempt = 0 - while True: - if is_search and rate is not None: - rate.wait_before_call() - rate.last_call = time.time() - req = urllib.request.Request(url, headers=headers) - try: - with urllib.request.urlopen(req, timeout=SEARCH_TIMEOUT) as resp: - response_headers = {k.lower(): v for k, v in resp.headers.items()} - body = resp.read().decode("utf-8", errors="replace") - return body, response_headers - except urllib.error.HTTPError as exc: - response_headers = {k.lower(): v for k, v in (exc.headers or {}).items()} - if exc.code in (403, 429) and attempt < max_retries: - wait = compute_retry_wait(response_headers) - print( - f"[rate-limit] {exc.code} on {url}; sleeping {wait:.1f}s", - file=sys.stderr, - ) - time.sleep(wait) - attempt += 1 - continue - print(f"[github_request] {exc.code} on {url}: giving up", file=sys.stderr) - return None, response_headers - except (urllib.error.URLError, TimeoutError, OSError) as exc: - if attempt < max_retries: - time.sleep(2.0) - attempt += 1 - continue - print(f"[github_request] transport error on {url}: {exc}", file=sys.stderr) - return None, {} - except ValueError as exc: - # Malformed header (typically a corrupt token) — no point retrying. 
- print(f"[github_request] invalid request for {url}: {exc}", file=sys.stderr) - return None, {} - - -def compute_retry_wait(headers: dict[str, str]) -> float: - """Honor Retry-After (seconds) when present, else fall back to - X-RateLimit-Reset; floor at 1 second so we always make forward - progress even if the headers are wrong/missing.""" - if "retry-after" in headers: - try: - return max(1.0, float(headers["retry-after"])) - except ValueError: - pass - reset = headers.get("x-ratelimit-reset") - if reset: - try: - wait = float(reset) - time.time() + 1.0 - return max(1.0, wait) - except ValueError: - pass - return 60.0 - - -def update_rate(rate: RateState, headers: dict[str, str]) -> None: - rem = headers.get("x-ratelimit-remaining") - reset = headers.get("x-ratelimit-reset") - if rem is not None: - try: - rate.remaining = int(rem) - except ValueError: - pass - if reset is not None: - try: - rate.reset_epoch = float(reset) - except ValueError: - pass - - -def raw_headers(token: str) -> dict[str, str]: - h = {"User-Agent": USER_AGENT, "Accept": "application/vnd.github.v3.raw"} - if token: - h["Authorization"] = f"Bearer {token}" - return h - - -def decode_text(data: bytes) -> str: - """UTF-8 with replacement; raw blobs may contain odd bytes but trufflehog - consumes the JSON-extracted .content as text via stdin so we want a - valid string regardless.""" - return data.decode("utf-8", errors="replace") - - -def write_outputs( - output_corpus: str, - output_meta: str, - corpus_lines: list[dict[str, Any]], - summary: dict[str, Any], -) -> None: - """Write the JSONL corpus, compress it with zstd, and write the meta - sidecar. zstd is invoked as a subprocess so we don't depend on a Python - extension module — the `zstd` CLI is already installed in the corpora - workflow.""" - # 1. Plain JSONL → temp file. - if output_corpus.endswith(".zstd"): - tmp_jsonl = output_corpus[: -len(".zstd")] - elif output_corpus.endswith(".zst"): - tmp_jsonl = output_corpus[: -len(".zst")] - else: - tmp_jsonl = output_corpus + ".jsonl" - with open(tmp_jsonl, "w", encoding="utf-8") as f: - for line in corpus_lines: - f.write(json.dumps(line, ensure_ascii=False)) - f.write("\n") - - # 2. zstd compress in place. - if output_corpus.endswith(".zstd") or output_corpus.endswith(".zst"): - try: - subprocess.run( - ["zstd", "-q", "-f", "-o", output_corpus, tmp_jsonl], - check=True, - ) - except (subprocess.CalledProcessError, FileNotFoundError) as exc: - print(f"[build_keyword_corpus] zstd compression failed: {exc}", file=sys.stderr) - raise - os.unlink(tmp_jsonl) - else: - # Caller asked for an uncompressed output; leave it alone. - os.replace(tmp_jsonl, output_corpus) - - # 3. Sidecar meta. - with open(output_meta, "w", encoding="utf-8") as f: - json.dump(summary, f, indent=2) - f.write("\n") - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/scripts/test/diff_corpora_results.py b/scripts/test/diff_corpora_results.py index ed710b446bc4..2db488b7aa5d 100755 --- a/scripts/test/diff_corpora_results.py +++ b/scripts/test/diff_corpora_results.py @@ -11,25 +11,16 @@ calls against a large corpus where thousands of matches could dominate runtime. The diff measures regex match changes only. -Phase 2: when --changed-detectors is provided, the report focuses on the -detectors changed by the PR. Detectors flagged via --new-detectors are -rendered with 🆕 status and absolute density (no main baseline). When ---corpus-bytes is provided, a blast-radius column projects matches per -10 GB of scanned content. 
- -Phase 3a: --keyword-corpus-meta points at the sidecar JSON written by -scripts/test/build_keyword_corpus.py. When present, detectors whose Layer 1 -(GitHub Code Search) fetch returned zero results get a concise warning -rendered above the summary table — they're flagged so reviewers know the -bench's verdict for those detectors leans entirely on the S3 corpus and -may be under-sampled. +When --changed-detectors is provided, the report focuses on the detectors +changed by the PR. Detectors flagged via --new-detectors are rendered with 🆕 +status and absolute density (no main baseline). When --corpus-bytes is +provided, a blast-radius column projects matches per 10 GB of scanned content. Usage: diff_corpora_results.py [--changed-detectors=] [--new-detectors=] [--corpus-bytes=] - [--keyword-corpus-meta=] """ import argparse import json @@ -138,37 +129,8 @@ def render_blast_radius(matches, corpus_bytes, signed=False): return f"{projected:,.0f}" -def load_keyword_corpus_meta(path): - """Read the sidecar emitted by build_keyword_corpus.py. - - Returns a dict with `thin_l1` (set of detector names, lowercase, dotted - suffix stripped to match identity normalization elsewhere in this - file) and `reports` (kept as-is for future use). Missing/unreadable - file → empty result, surfaced silently — Phase 3a coverage is a - nice-to-have, not load-bearing. - """ - if not path: - return {"thin_l1": set(), "reports": []} - try: - with open(path, "r", encoding="utf-8") as f: - raw = json.load(f) - except (OSError, json.JSONDecodeError): - return {"thin_l1": set(), "reports": []} - thin = set() - for name in raw.get("thin_l1") or []: - if not isinstance(name, str): - continue - norm = name.split(".", 1)[0].strip().lower() - if norm: - thin.add(norm) - return {"thin_l1": thin, "reports": raw.get("reports") or []} - - - -def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, - keyword_meta=None): +def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None): new_detectors = new_detectors or set() - keyword_meta = keyword_meta or {"thin_l1": set(), "reports": []} if changed: all_names = {d for d in (set(main) | set(pr)) @@ -296,22 +258,6 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None, parts.append(f"- `{d}`") parts.append("") - thin_l1 = sorted(keyword_meta.get("thin_l1") or set()) - if changed: - thin_l1 = [d for d in thin_l1 if d in changed] - if thin_l1: - # Single contiguous blockquote: `>` on the spacer line keeps GitHub - # Markdown from splitting the bullet list into a second quote. - parts.append( - "> ⚠️ **Thin Layer 1 coverage:** GitHub Code Search returned no " - "snippets for the detectors below. The bench's verdict for them " - "leans entirely on the S3 corpus and may be under-sampled." 
- ) - parts.append(">") - for d in thin_l1: - parts.append(f"> - `{d}`") - parts.append("") - return "\n".join(parts) @@ -325,8 +271,6 @@ def main(): help="CSV of detectors present in PR but not main; rendered with 🆕.") parser.add_argument("--corpus-bytes", type=int, default=0, help="Total uncompressed bytes scanned; enables blast-radius column.") - parser.add_argument("--keyword-corpus-meta", default="", - help="Path to build_keyword_corpus.py sidecar; surfaces thin-L1 warnings.") args = parser.parse_args() main_findings = load_findings(args.main_jsonl) @@ -334,7 +278,6 @@ def main(): changed = parse_csv(args.changed_detectors) new_detectors = parse_csv(args.new_detectors) corpus_bytes = args.corpus_bytes if args.corpus_bytes > 0 else None - keyword_meta = load_keyword_corpus_meta(args.keyword_corpus_meta) sys.stdout.write(render( main_findings, @@ -342,7 +285,6 @@ def main(): changed=changed if changed else None, new_detectors=new_detectors, corpus_bytes=corpus_bytes, - keyword_meta=keyword_meta, )) From 8a20a972022f0a99dc8249735842f714ac476b0d Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Mon, 4 May 2026 15:39:32 +0500 Subject: [PATCH 30/43] revert changes for testing --- pkg/detectors/acmevault/eraser.go | 116 ------------- .../acmevault/eraser_integration_test.go | 161 ------------------ pkg/detectors/acmevault/eraser_test.go | 69 -------- pkg/detectors/jdbc/jdbc.go | 2 +- pkg/engine/defaults/defaults.go | 4 +- 5 files changed, 3 insertions(+), 349 deletions(-) delete mode 100644 pkg/detectors/acmevault/eraser.go delete mode 100644 pkg/detectors/acmevault/eraser_integration_test.go delete mode 100644 pkg/detectors/acmevault/eraser_test.go diff --git a/pkg/detectors/acmevault/eraser.go b/pkg/detectors/acmevault/eraser.go deleted file mode 100644 index d0bb694ef7e0..000000000000 --- a/pkg/detectors/acmevault/eraser.go +++ /dev/null @@ -1,116 +0,0 @@ -package acmevault - -import ( - "context" - "fmt" - "io" - "net/http" - "strings" - - regexp "github.com/wasilibs/go-re2" - - "github.com/trufflesecurity/trufflehog/v3/pkg/common" - "github.com/trufflesecurity/trufflehog/v3/pkg/detectors" - "github.com/trufflesecurity/trufflehog/v3/pkg/pb/detector_typepb" -) - -type Scanner struct { - client *http.Client -} - -// Ensure the Scanner satisfies the interface at compile time. -var _ detectors.Detector = (*Scanner)(nil) - -var ( - defaultClient = common.SaneHttpClient() - // Make sure that your group is surrounded in boundary characters such as below to reduce false positives. - keyPat = regexp.MustCompile(`acme-vault-[A-Za-z0-9]{32}defaults`) -) - -// Keywords are used for efficiently pre-filtering chunks. -// Use identifiers in the secret preferably, or the provider name. -func (s Scanner) Keywords() []string { - return []string{"acmevault"} -} - -// FromData will find and optionally verify acmevault secrets in a given set of bytes. 
-func (s Scanner) FromData(ctx context.Context, verify bool, data []byte) (results []detectors.Result, err error) { - dataStr := string(data) - - uniqueMatches := make(map[string]struct{}) - for _, match := range keyPat.FindAllStringSubmatch(dataStr, -1) { - uniqueMatches[match[1]] = struct{}{} - } - - for match := range uniqueMatches { - s1 := detectors.Result{ - DetectorType: detector_typepb.DetectorType_Eraser, - Raw: []byte(match), - SecretParts: map[string]string{"key": match}, - ExtraData: map[string]string{ - "rotation_guide": "https://howtorotate.com/docs/tutorials/acmevault/", - }, - } - - if verify { - client := s.client - if client == nil { - client = defaultClient - } - - isVerified, extraData, verificationErr := verifyMatch(ctx, client, match) - s1.Verified = isVerified - s1.ExtraData = extraData - s1.SetVerificationError(verificationErr, match) - } - - results = append(results, s1) - } - - return -} - -func verifyMatch(ctx context.Context, client *http.Client, token string) (bool, map[string]string, error) { - // https://docs.acmevault.io/reference/generate-diagram-from-acmevault-dsl - payload := strings.NewReader("{\"elements\":[{\"type\":\"diagram\"}]}") - - url := "https://app.acmevault.io/api/render/elements" - req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, payload) - if err != nil { - return false, nil, err - } - - req.Header = http.Header{"Authorization": []string{"Bearer " + token}} - req.Header.Add("content-type", "application/json") - - res, err := client.Do(req) - if err != nil { - return false, nil, err - } - defer func() { - _, _ = io.Copy(io.Discard, res.Body) - _ = res.Body.Close() - }() - - switch res.StatusCode { - case http.StatusOK: - return true, nil, nil - case http.StatusUnauthorized: - // 401 API token unauthorized - // The secret is determinately not verified (nothing to do) - return false, nil, nil - default: - // 400 The request is missing the 'text' parameter - // 500 acmevault was unable to generate a result - // 503 Service temporarily unavailable. This may be the result of too many requests. - return false, nil, fmt.Errorf("unexpected HTTP response status %d", res.StatusCode) - } -} - -func (s Scanner) Type() detector_typepb.DetectorType { - return detector_typepb.DetectorType_Eraser -} - -func (s Scanner) Description() string { - return "acmevault is a tool used for generating diagrams from DSL. acmevault API tokens can be used to authenticate and interact with the acmevault API." 
-} diff --git a/pkg/detectors/acmevault/eraser_integration_test.go b/pkg/detectors/acmevault/eraser_integration_test.go deleted file mode 100644 index d91273ecaa0a..000000000000 --- a/pkg/detectors/acmevault/eraser_integration_test.go +++ /dev/null @@ -1,161 +0,0 @@ -//go:build detectors -// +build detectors - -package acmevault - -import ( - "context" - "fmt" - "testing" - "time" - - "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" - - "github.com/trufflesecurity/trufflehog/v3/pkg/common" - "github.com/trufflesecurity/trufflehog/v3/pkg/detectors" - "github.com/trufflesecurity/trufflehog/v3/pkg/pb/detector_typepb" -) - -func TestEraser_FromChunk(t *testing.T) { - ctx, cancel := context.WithTimeout(context.Background(), time.Second*5) - defer cancel() - testSecrets, err := common.GetSecret(ctx, "trufflehog-testing", "detectors5") - if err != nil { - t.Fatalf("could not get test secrets from GCP: %s", err) - } - secret := testSecrets.MustGetField("ERASER") - inactiveSecret := testSecrets.MustGetField("ERASER_INACTIVE") - - type args struct { - ctx context.Context - data []byte - verify bool - } - tests := []struct { - name string - s Scanner - args args - want []detectors.Result - wantErr bool - wantVerificationErr bool - }{ - { - name: "found, verified", - s: Scanner{}, - args: args{ - ctx: context.Background(), - data: []byte(fmt.Sprintf("You can find a eraser secret %s within", secret)), - verify: true, - }, - want: []detectors.Result{ - { - DetectorType: detector_typepb.DetectorType_Eraser, - Verified: true, - }, - }, - wantErr: false, - wantVerificationErr: false, - }, - { - name: "found, unverified", - s: Scanner{}, - args: args{ - ctx: context.Background(), - data: []byte(fmt.Sprintf("You can find a eraser secret %s within but not valid", inactiveSecret)), // the secret would satisfy the regex but not pass validation - verify: true, - }, - want: []detectors.Result{ - { - DetectorType: detector_typepb.DetectorType_Eraser, - Verified: false, - }, - }, - wantErr: false, - wantVerificationErr: false, - }, - { - name: "not found", - s: Scanner{}, - args: args{ - ctx: context.Background(), - data: []byte("You cannot find the secret within"), - verify: true, - }, - want: nil, - wantErr: false, - wantVerificationErr: false, - }, - { - name: "found, would be verified if not for timeout", - s: Scanner{client: common.SaneHttpClientTimeOut(1 * time.Microsecond)}, - args: args{ - ctx: context.Background(), - data: []byte(fmt.Sprintf("You can find a eraser secret %s within", secret)), - verify: true, - }, - want: []detectors.Result{ - { - DetectorType: detector_typepb.DetectorType_Eraser, - Verified: false, - }, - }, - wantErr: false, - wantVerificationErr: true, - }, - { - name: "found, verified but unexpected api surface", - s: Scanner{client: common.ConstantResponseHttpClient(500, "")}, - args: args{ - ctx: context.Background(), - data: []byte(fmt.Sprintf("You can find a eraser secret %s within", secret)), - verify: true, - }, - want: []detectors.Result{ - { - DetectorType: detector_typepb.DetectorType_Eraser, - Verified: false, - }, - }, - wantErr: false, - wantVerificationErr: true, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got, err := tt.s.FromData(tt.args.ctx, tt.args.verify, tt.args.data) - if (err != nil) != tt.wantErr { - t.Errorf("Eraser.FromData() error = %v, wantErr %v", err, tt.wantErr) - return - } - for i := range got { - if len(got[i].Raw) == 0 { - t.Fatalf("no raw secret present: \n %+v", got[i]) - } - if 
(got[i].VerificationError() != nil) != tt.wantVerificationErr { - t.Fatalf("wantVerificationError = %v, verification error = %v", tt.wantVerificationErr, got[i].VerificationError()) - } - } - ignoreOpts := cmpopts.IgnoreFields(detectors.Result{}, "Raw", "verificationError") - if diff := cmp.Diff(got, tt.want, ignoreOpts); diff != "" { - t.Errorf("Eraser.FromData() %s diff: (-got +want)\n%s", tt.name, diff) - } - }) - } -} - -func BenchmarkFromData(benchmark *testing.B) { - ctx := context.Background() - s := Scanner{} - for name, data := range detectors.MustGetBenchmarkData() { - benchmark.Run(name, func(b *testing.B) { - b.ResetTimer() - for n := 0; n < b.N; n++ { - _, err := s.FromData(ctx, false, data) - if err != nil { - b.Fatal(err) - } - } - }) - } -} diff --git a/pkg/detectors/acmevault/eraser_test.go b/pkg/detectors/acmevault/eraser_test.go deleted file mode 100644 index a93cae06ae39..000000000000 --- a/pkg/detectors/acmevault/eraser_test.go +++ /dev/null @@ -1,69 +0,0 @@ -package acmevault - -import ( - "context" - "testing" - - "github.com/google/go-cmp/cmp" - - "github.com/trufflesecurity/trufflehog/v3/pkg/detectors" - "github.com/trufflesecurity/trufflehog/v3/pkg/engine/ahocorasick" -) - -func TestEraser_Pattern(t *testing.T) { - d := Scanner{} - ahoCorasickCore := ahocorasick.NewAhoCorasickCore([]detectors.Detector{d}) - tests := []struct { - name string - input string - want []string - }{ - { - name: "typical pattern", - input: "eraser_token = 'KkBmh6TUBIcyFAp20XXa'", - want: []string{"KkBmh6TUBIcyFAp20XXa"}, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - matchedDetectors := ahoCorasickCore.FindDetectorMatches([]byte(test.input)) - if len(matchedDetectors) == 0 { - t.Errorf("keywords '%v' not matched by: %s", d.Keywords(), test.input) - return - } - - results, err := d.FromData(context.Background(), false, []byte(test.input)) - if err != nil { - t.Errorf("error = %v", err) - return - } - - if len(results) != len(test.want) { - if len(results) == 0 { - t.Errorf("did not receive result") - } else { - t.Errorf("expected %d results, only received %d", len(test.want), len(results)) - } - return - } - - actual := make(map[string]struct{}, len(results)) - for _, r := range results { - if len(r.RawV2) > 0 { - actual[string(r.RawV2)] = struct{}{} - } else { - actual[string(r.Raw)] = struct{}{} - } - } - expected := make(map[string]struct{}, len(test.want)) - for _, v := range test.want { - expected[v] = struct{}{} - } - - if diff := cmp.Diff(expected, actual); diff != "" { - t.Errorf("%s diff: (-want +got)\n%s", test.name, diff) - } - }) - } -} diff --git a/pkg/detectors/jdbc/jdbc.go b/pkg/detectors/jdbc/jdbc.go index 3e759bb5df9e..cb9816f468f4 100644 --- a/pkg/detectors/jdbc/jdbc.go +++ b/pkg/detectors/jdbc/jdbc.go @@ -53,7 +53,7 @@ var ( // Matches typical JDBC connection strings. // The terminal character class additionally excludes () and & to avoid // capturing surrounding delimiters (e.g. "(jdbc:…)" or "…&user=x&"). - keyPat = regexp.MustCompile(`(?i)[\w]{3,10}:[^\s"'<>,{}[\]]{10,511}[^\s"'<>,{}[\]()&]`) + keyPat = regexp.MustCompile(`(?i)jdbc:[\w]{3,10}:[^\s"'<>,{}[\]]{10,511}[^\s"'<>,{}[\]()&]`) ) // Keywords are used for efficiently pre-filtering chunks. 
diff --git a/pkg/engine/defaults/defaults.go b/pkg/engine/defaults/defaults.go index e8253675d544..46db6f158115 100644 --- a/pkg/engine/defaults/defaults.go +++ b/pkg/engine/defaults/defaults.go @@ -7,7 +7,6 @@ import ( accuweatherv1 "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/accuweather/v1" accuweatherv2 "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/accuweather/v2" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/adafruitio" - "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/acmevault" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/adzuna" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/aeroworkflow" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/agora" @@ -166,6 +165,7 @@ import ( "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/cloudflarecakey" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/cloudflareglobalapikey" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/cloudimage" + "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/cloudinary" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/cloudmersive" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/cloudplan" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/cloudsmith" @@ -872,7 +872,6 @@ func buildDetectorList() []detectors.Detector { &accuweatherv1.Scanner{}, &accuweatherv2.Scanner{}, &adafruitio.Scanner{}, - &acmevault.Scanner{}, // &adobeio.Scanner{}, &adzuna.Scanner{}, &aeroworkflow.Scanner{}, @@ -1039,6 +1038,7 @@ func buildDetectorList() []detectors.Detector { &cloudflarecakey.Scanner{}, &cloudflareglobalapikey.Scanner{}, &cloudimage.Scanner{}, + &cloudinary.Scanner{}, &cloudmersive.Scanner{}, &cloudplan.Scanner{}, &cloudsmith.Scanner{}, From 30c98bbc2c597165073ab7f1d78142e647443611 Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Mon, 4 May 2026 15:44:35 +0500 Subject: [PATCH 31/43] move Configure AWS credentials step to run only when detector changes are detected --- .github/workflows/detector-corpora-test.yml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index 28eaf22b84c8..041ca48b142e 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -36,13 +36,6 @@ jobs: - name: Install dependencies run: sudo apt-get install -y zstd jq - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: us-east-1 - - name: Resolve merge-base id: merge_base shell: bash @@ -111,6 +104,14 @@ jobs: No detector source files changed in this PR. Bench skipped. 
+ - name: Configure AWS credentials + if: steps.detect.outputs.any_changed == 'true' + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-1 + # Two independent builds run in parallel: # A) prepare main worktree → build main binary (git I/O then CPU) # B) build PR binary (CPU, no dependencies) From 0d852e6c2beea5c3d1d0c1ab61abcbb9d7bc10a3 Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Mon, 4 May 2026 15:50:53 +0500 Subject: [PATCH 32/43] revert unnecessary changes --- pkg/engine/defaults/defaults.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/pkg/engine/defaults/defaults.go b/pkg/engine/defaults/defaults.go index 46db6f158115..f769af2c5614 100644 --- a/pkg/engine/defaults/defaults.go +++ b/pkg/engine/defaults/defaults.go @@ -165,7 +165,6 @@ import ( "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/cloudflarecakey" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/cloudflareglobalapikey" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/cloudimage" - "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/cloudinary" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/cloudmersive" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/cloudplan" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/cloudsmith" @@ -1038,7 +1037,6 @@ func buildDetectorList() []detectors.Detector { &cloudflarecakey.Scanner{}, &cloudflareglobalapikey.Scanner{}, &cloudimage.Scanner{}, - &cloudinary.Scanner{}, &cloudmersive.Scanner{}, &cloudplan.Scanner{}, &cloudsmith.Scanner{}, From 624cfbe0b2e5e31c6619bae45d5bf8cb1eaa54ad Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Mon, 4 May 2026 15:57:47 +0500 Subject: [PATCH 33/43] cleanup + bugbot fixes --- .github/workflows/detector-corpora-test.yml | 6 -- scripts/test/detect_changed_detectors.sh | 2 +- scripts/test/detector_corpora_test.sh | 65 ++++----------------- scripts/test/diff_corpora_results.py | 32 +--------- 4 files changed, 14 insertions(+), 91 deletions(-) diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index 041ca48b142e..f69c27a537ef 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -163,7 +163,6 @@ jobs: export OUTPUT_JSONL=/tmp/results-pr.jsonl export STDERR_FILE=/tmp/corpora-stderr-pr.txt export INCLUDE_DETECTORS="$PR_CSV" - export CORPUS_BYTES_FILE=/tmp/corpus-bytes.txt ./scripts/test/detector_corpora_test.sh "${files[@]}" ) & PID_PR=$! @@ -194,15 +193,10 @@ jobs: NEW_DETECTORS: ${{ steps.detect.outputs.new_csv }} run: | set -o pipefail - CORPUS_BYTES=0 - if [[ -s /tmp/corpus-bytes.txt ]]; then - CORPUS_BYTES=$(cat /tmp/corpus-bytes.txt) - fi python3 scripts/test/diff_corpora_results.py \ /tmp/results-main.jsonl /tmp/results-pr.jsonl \ --changed-detectors="$CHANGED" \ --new-detectors="$NEW_DETECTORS" \ - --corpus-bytes="$CORPUS_BYTES" \ > /tmp/diff-report.md cat /tmp/diff-report.md diff --git a/scripts/test/detect_changed_detectors.sh b/scripts/test/detect_changed_detectors.sh index 6f6b3ad33df2..bf74c6023935 100755 --- a/scripts/test/detect_changed_detectors.sh +++ b/scripts/test/detect_changed_detectors.sh @@ -107,7 +107,7 @@ detector_id_for_dir() { # the most common, so the modal value wins. 
local proto proto=$( - grep -hE 'return[[:space:]]+\S*DetectorType_[A-Za-z0-9]+' "$dir"/*.go 2>/dev/null \ + grep -E 'return[[:space:]]+\S*DetectorType_[A-Za-z0-9]+' "$dir"/*.go 2>/dev/null \ | grep -v '_test\.go' \ | grep -oE 'DetectorType_[A-Za-z0-9]+' \ | sort | uniq -c | sort -rn \ diff --git a/scripts/test/detector_corpora_test.sh b/scripts/test/detector_corpora_test.sh index 2d1f8dec041e..1b423a42daf0 100755 --- a/scripts/test/detector_corpora_test.sh +++ b/scripts/test/detector_corpora_test.sh @@ -37,65 +37,28 @@ if [[ -n "$INCLUDE_DETECTORS" ]]; then INCLUDE_FLAG=(--include-detectors="$INCLUDE_DETECTORS") fi -# When set, total uncompressed content bytes streamed to trufflehog (across -# all datasets in this run) are written to this path. Used by the diff -# script to compute blast-radius density. Awk inline-counts the post-jq -# stream so we don't double-read the corpus for byte accounting. -CORPUS_BYTES_FILE="${CORPUS_BYTES_FILE:-}" -TOTAL_BYTES=0 - # --no-verification avoids network calls against a large corpus where thousands # of matches could trigger API calls, dominating runtime. Verifier behavior is # covered by detector unit and integration tests. scan() { local input="$1" - local bytes_tmp="" - if [[ -n "$CORPUS_BYTES_FILE" ]]; then - bytes_tmp=$(mktemp) - fi # jq stderr is folded into STDERR_FILE so benign "Broken pipe" notices # (trufflehog exits before jq finishes draining the corpus) don't pollute # CI logs. Real jq parse errors land in the same file for postmortem. set +e - if [[ -n "$bytes_tmp" ]]; then - unzstd -c "$input" 2>> "$STDERR_FILE" \ - | jq -r .content 2>> "$STDERR_FILE" \ - | awk -v BF="$bytes_tmp" '{ b += length($0) + 1; print } END { printf "%d", b > BF; close(BF) }' \ - | "$TRUFFLEHOG_BIN" \ - --no-update \ - --no-verification \ - --allow-verification-overlap \ - --log-level=3 \ - --concurrency=6 \ - --json \ - --print-avg-detector-time \ - "${INCLUDE_FLAG[@]}" \ - stdin >> "$OUTPUT_JSONL" 2>> "$STDERR_FILE" - else - unzstd -c "$input" 2>> "$STDERR_FILE" \ - | jq -r .content 2>> "$STDERR_FILE" \ - | "$TRUFFLEHOG_BIN" \ - --no-update \ - --no-verification \ - --allow-verification-overlap \ - --log-level=3 \ - --concurrency=6 \ - --json \ - --print-avg-detector-time \ - "${INCLUDE_FLAG[@]}" \ - stdin >> "$OUTPUT_JSONL" 2>> "$STDERR_FILE" - fi + unzstd -c "$input" 2>> "$STDERR_FILE" \ + | jq -r .content 2>> "$STDERR_FILE" \ + | "$TRUFFLEHOG_BIN" \ + --no-update \ + --no-verification \ + --allow-verification-overlap \ + --log-level=3 \ + --concurrency=6 \ + --json \ + --print-avg-detector-time \ + "${INCLUDE_FLAG[@]}" \ + stdin >> "$OUTPUT_JSONL" 2>> "$STDERR_FILE" set -e - # awk's END block may not run if trufflehog exits before draining stdin - # (SIGPIPE kills awk first), leaving bytes_tmp empty. Default to 0 and - # require a clean integer before arithmetic so a partial read can't - # break the step with `$((TOTAL_BYTES + ))`. 
- if [[ -n "$bytes_tmp" ]]; then - bytes=$(cat "$bytes_tmp" 2>/dev/null || echo 0) - [[ "$bytes" =~ ^[0-9]+$ ]] || bytes=0 - TOTAL_BYTES=$((TOTAL_BYTES + bytes)) - rm -f "$bytes_tmp" - fi } for CORPORA_FILE in "$@"; do @@ -106,10 +69,6 @@ for CORPORA_FILE in "$@"; do fi done -if [[ -n "$CORPUS_BYTES_FILE" ]]; then - echo "$TOTAL_BYTES" > "$CORPUS_BYTES_FILE" -fi - if [[ "$RUN_DUCKDB_SUMMARY" == "1" ]]; then duckdb -c " CREATE TABLE t AS FROM read_json_auto('$OUTPUT_JSONL', ignore_errors=true); diff --git a/scripts/test/diff_corpora_results.py b/scripts/test/diff_corpora_results.py index 2db488b7aa5d..2081987a7709 100755 --- a/scripts/test/diff_corpora_results.py +++ b/scripts/test/diff_corpora_results.py @@ -50,9 +50,6 @@ # same literal — keep the two in sync. STICKY_COMMENT_MARKER = "" -# 10 GB notional monorepo for blast-radius projection. -BLAST_RADIUS_BYTES = 10 * 1024 * 1024 * 1024 - def parse_csv(s): """Parse a comma-separated detector list into normalized name set. @@ -118,18 +115,7 @@ def build_top_line_summary(rows, changed): return summary -def render_blast_radius(matches, corpus_bytes, signed=False): - if corpus_bytes is None or corpus_bytes <= 0: - return "" - density = matches / corpus_bytes # matches per byte - projected = density * BLAST_RADIUS_BYTES - if signed: - sign = "+" if projected > 0 else ("−" if projected < 0 else "") - return f"{sign}{abs(projected):,.0f}" - return f"{projected:,.0f}" - - -def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None): +def render(main, pr, changed=None, new_detectors=None): new_detectors = new_detectors or set() if changed: @@ -162,11 +148,6 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None): if new_ids or removed_ids or m["total"] != p["total"]: has_diff = True - if is_new: - blast = render_blast_radius(p["total"], corpus_bytes, signed=False) - else: - blast = render_blast_radius(p["total"] - m["total"], corpus_bytes, signed=True) - rows.append({ "detector": d, "is_new": is_new, @@ -177,7 +158,6 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None): "unique_pr": len(p["identities"]), "new_count": len(new_ids), "removed_count": len(removed_ids), - "blast": blast, }) parts = [ @@ -206,13 +186,9 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None): else: rows.sort(key=lambda r: r["detector"]) - show_blast = corpus_bytes is not None and corpus_bytes > 0 cols = ["Status", "Detector", "Unique matches (main)", "Unique matches (PR)", "New", "Removed"] aligns = ["", "", "---:", "---:", "---:", "---:"] - if show_blast: - cols.append("Δ per 10 GB") - aligns.append("---:") parts += [ "| " + " | ".join(cols) + " |", "|" + "|".join(a if a else "---" for a in aligns) + "|", @@ -237,8 +213,6 @@ def render(main, pr, changed=None, new_detectors=None, corpus_bytes=None): str(r["new_count"]), str(r["removed_count"]), ] - if show_blast: - cells.append(r["blast"] or "—") parts.append("| " + " | ".join(cells) + " |") parts.append("") parts.append(STATUS_KEY) @@ -269,22 +243,18 @@ def main(): help="CSV of detectors changed in PR; filters report.") parser.add_argument("--new-detectors", default="", help="CSV of detectors present in PR but not main; rendered with 🆕.") - parser.add_argument("--corpus-bytes", type=int, default=0, - help="Total uncompressed bytes scanned; enables blast-radius column.") args = parser.parse_args() main_findings = load_findings(args.main_jsonl) pr_findings = load_findings(args.pr_jsonl) changed = parse_csv(args.changed_detectors) new_detectors 
= parse_csv(args.new_detectors) - corpus_bytes = args.corpus_bytes if args.corpus_bytes > 0 else None sys.stdout.write(render( main_findings, pr_findings, changed=changed if changed else None, new_detectors=new_detectors, - corpus_bytes=corpus_bytes, )) From f222e57e87b20fb33a7af18dbe97ce8ea1fbf4cc Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Mon, 4 May 2026 16:33:14 +0500 Subject: [PATCH 34/43] run test with bigger (30gb) dataset, loosen jdbc regex --- .github/workflows/detector-corpora-test.yml | 1 + pkg/detectors/jdbc/jdbc.go | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index f69c27a537ef..98d182dcbc8d 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -14,6 +14,7 @@ on: env: DATASETS: | s3://trufflehog-corpora-datasets/contents.2025-11-04.jsonl.zstd + s3://trufflehog-corpora-datasets/contents.jsonl.zstd jobs: corpora-test: diff --git a/pkg/detectors/jdbc/jdbc.go b/pkg/detectors/jdbc/jdbc.go index cb9816f468f4..e69042988beb 100644 --- a/pkg/detectors/jdbc/jdbc.go +++ b/pkg/detectors/jdbc/jdbc.go @@ -53,7 +53,8 @@ var ( // Matches typical JDBC connection strings. // The terminal character class additionally excludes () and & to avoid // capturing surrounding delimiters (e.g. "(jdbc:…)" or "…&user=x&"). - keyPat = regexp.MustCompile(`(?i)jdbc:[\w]{3,10}:[^\s"'<>,{}[\]]{10,511}[^\s"'<>,{}[\]()&]`) + // TODO: revert before merging — regex intentionally loosened to trigger corpora test CI on this PR. + keyPat = regexp.MustCompile(`(?i)[\w]{3,10}:[^\s"'<>,{}[\]]{10,511}[^\s"'<>,{}[\]()&]`) ) // Keywords are used for efficiently pre-filtering chunks. From d0d94a234f35c359874266974c87256818eb5769 Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Mon, 4 May 2026 17:25:10 +0500 Subject: [PATCH 35/43] optimizations --- .github/workflows/detector-corpora-test.yml | 88 ++++++++++++--------- scripts/test/detect_changed_detectors.sh | 4 +- scripts/test/detector_corpora_test.sh | 66 +++++++++++++--- scripts/test/diff_corpora_results.py | 2 +- 4 files changed, 109 insertions(+), 51 deletions(-) diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index 98d182dcbc8d..4d985ebc51df 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -113,43 +113,60 @@ jobs: aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws-region: us-east-1 + # Cache the main scan results by merge-base + scoped detector set. + # On subsequent pushes to the same PR without a rebase, both are + # identical, so the main scan (35 GB of S3 streaming + trufflehog) is + # skipped entirely. + - name: Restore main scan cache + id: main_scan_cache + if: steps.detect.outputs.any_changed == 'true' && steps.detect.outputs.main_csv != '' + uses: actions/cache/restore@v4 + with: + path: /tmp/results-main.jsonl + key: main-scan-v1-${{ steps.merge_base.outputs.sha }}-${{ steps.detect.outputs.main_csv }} + # Two independent builds run in parallel: # A) prepare main worktree → build main binary (git I/O then CPU) + # Skipped on main scan cache hit — binary is not needed. 
# B) build PR binary (CPU, no dependencies) - name: Build binaries if: steps.detect.outputs.any_changed == 'true' shell: bash env: MERGE_BASE: ${{ steps.merge_base.outputs.sha }} + MAIN_SCAN_CACHE_HIT: ${{ steps.main_scan_cache.outputs.cache-hit }} run: | set -o pipefail # Chain A: prepare worktree, then build main binary. - ( - git worktree add /tmp/trufflehog-main-src "$MERGE_BASE" - cd /tmp/trufflehog-main-src - CGO_ENABLED=0 go build -o /tmp/trufflehog-main . - ) & - PID_MAIN_BUILD=$! + # Skipped when main scan results are already cached. + if [[ "$MAIN_SCAN_CACHE_HIT" != 'true' ]]; then + ( + git worktree add /tmp/trufflehog-main-src "$MERGE_BASE" + cd /tmp/trufflehog-main-src + CGO_ENABLED=0 go build -o /tmp/trufflehog-main . + ) & + PID_MAIN_BUILD=$! + fi # Chain B: build PR binary (no dependencies). CGO_ENABLED=0 go build -o /tmp/trufflehog-pr . & PID_PR_BUILD=$! - wait $PID_MAIN_BUILD || { echo "Main binary build failed" >&2; exit 1; } - wait $PID_PR_BUILD || { echo "PR binary build failed" >&2; exit 1; } + [[ -n "${PID_MAIN_BUILD:-}" ]] && { wait $PID_MAIN_BUILD || { echo "Main binary build failed" >&2; exit 1; }; } + wait $PID_PR_BUILD || { echo "PR binary build failed" >&2; exit 1; } - # PR and main scans run in parallel. Each streams the corpus files - # independently from S3 — no shared state, different output files, - # different binaries. The main scan is skipped when main_csv is empty - # (PR adds only new detectors). CORPUS_BYTES_FILE is only written by - # the PR scan (blast-radius needs one consistent byte count). + # PR and main scans share a single S3 stream per dataset file, teed to + # both binaries simultaneously. The main side is skipped on a cache hit + # (results already in /tmp/results-main.jsonl) or when main_csv is empty + # (PR adds only new detectors — no overlap with main). - name: Run corpora tests if: steps.detect.outputs.any_changed == 'true' shell: bash env: PR_CSV: ${{ steps.detect.outputs.pr_csv }} MAIN_CSV: ${{ steps.detect.outputs.main_csv }} + MAIN_SCAN_CACHE_HIT: ${{ steps.main_scan_cache.outputs.cache-hit }} run: | set -o pipefail files=() @@ -158,33 +175,32 @@ jobs: files+=("$dataset") done <<< "$DATASETS" - # PR scan. - ( - export TRUFFLEHOG_BIN=/tmp/trufflehog-pr - export OUTPUT_JSONL=/tmp/results-pr.jsonl - export STDERR_FILE=/tmp/corpora-stderr-pr.txt - export INCLUDE_DETECTORS="$PR_CSV" - ./scripts/test/detector_corpora_test.sh "${files[@]}" - ) & - PID_PR=$! - - # Main scan (skipped when no detectors overlap with main). - if [[ -n "$MAIN_CSV" ]]; then - ( - export TRUFFLEHOG_BIN=/tmp/trufflehog-main - export OUTPUT_JSONL=/tmp/results-main.jsonl - export STDERR_FILE=/tmp/corpora-stderr-main.txt - export INCLUDE_DETECTORS="$MAIN_CSV" - ./scripts/test/detector_corpora_test.sh "${files[@]}" - ) & - PID_MAIN=$! - else + export TRUFFLEHOG_BIN=/tmp/trufflehog-pr + export OUTPUT_JSONL=/tmp/results-pr.jsonl + export STDERR_FILE=/tmp/corpora-stderr-pr.txt + export INCLUDE_DETECTORS="$PR_CSV" + + if [[ -n "$MAIN_CSV" && "$MAIN_SCAN_CACHE_HIT" != 'true' ]]; then + # Dual-binary: single S3 download teed to both PR and main binaries. + export TRUFFLEHOG_BIN_MAIN=/tmp/trufflehog-main + export OUTPUT_JSONL_MAIN=/tmp/results-main.jsonl + export INCLUDE_DETECTORS_MAIN="$MAIN_CSV" + elif [[ -z "$MAIN_CSV" ]]; then echo "No overlapping detectors in main; skipping main scan." : > /tmp/results-main.jsonl + else + echo "Main scan cache hit; skipping main scan." 
fi - wait $PID_PR || { echo "PR scan failed" >&2; exit 1; } - [[ -n "${PID_MAIN:-}" ]] && { wait $PID_MAIN || { echo "Main scan failed" >&2; exit 1; }; } + ./scripts/test/detector_corpora_test.sh "${files[@]}" \ + || { echo "Corpora scan failed" >&2; exit 1; } + + - name: Save main scan cache + if: steps.detect.outputs.any_changed == 'true' && steps.detect.outputs.main_csv != '' && steps.main_scan_cache.outputs.cache-hit != 'true' + uses: actions/cache/save@v4 + with: + path: /tmp/results-main.jsonl + key: main-scan-v1-${{ steps.merge_base.outputs.sha }}-${{ steps.detect.outputs.main_csv }} - name: Diff results if: steps.detect.outputs.any_changed == 'true' diff --git a/scripts/test/detect_changed_detectors.sh b/scripts/test/detect_changed_detectors.sh index bf74c6023935..25931ac33726 100755 --- a/scripts/test/detect_changed_detectors.sh +++ b/scripts/test/detect_changed_detectors.sh @@ -86,8 +86,8 @@ mapfile -t BASE_IMPORTS < <(parse_defaults_imports "$MERGE_BASE") NEW_DIRS_FILE=$(mktemp) trap 'rm -f "$NEW_DIRS_FILE"' EXIT comm -23 \ - <(printf '%s\n' "${HEAD_IMPORTS[@]}") \ - <(printf '%s\n' "${BASE_IMPORTS[@]}") \ + <(printf '%s\n' "${HEAD_IMPORTS[@]+"${HEAD_IMPORTS[@]}"}") \ + <(printf '%s\n' "${BASE_IMPORTS[@]+"${BASE_IMPORTS[@]}"}") \ > "$NEW_DIRS_FILE" is_new_detector() { diff --git a/scripts/test/detector_corpora_test.sh b/scripts/test/detector_corpora_test.sh index 1b423a42daf0..3dbfe31ef10f 100755 --- a/scripts/test/detector_corpora_test.sh +++ b/scripts/test/detector_corpora_test.sh @@ -37,27 +37,69 @@ if [[ -n "$INCLUDE_DETECTORS" ]]; then INCLUDE_FLAG=(--include-detectors="$INCLUDE_DETECTORS") fi +if [[ -n "${OUTPUT_JSONL_MAIN:-}" ]]; then + > "$OUTPUT_JSONL_MAIN" +fi + # --no-verification avoids network calls against a large corpus where thousands # of matches could trigger API calls, dominating runtime. Verifier behavior is # covered by detector unit and integration tests. +# +# Dual-binary mode: when TRUFFLEHOG_BIN_MAIN / OUTPUT_JSONL_MAIN / +# INCLUDE_DETECTORS_MAIN are set, the corpus stream is teed to both the PR +# binary (stdout side) and the main binary (process substitution) so S3 is +# only downloaded once. scan() { local input="$1" # jq stderr is folded into STDERR_FILE so benign "Broken pipe" notices # (trufflehog exits before jq finishes draining the corpus) don't pollute # CI logs. Real jq parse errors land in the same file for postmortem. set +e - unzstd -c "$input" 2>> "$STDERR_FILE" \ - | jq -r .content 2>> "$STDERR_FILE" \ - | "$TRUFFLEHOG_BIN" \ - --no-update \ - --no-verification \ - --allow-verification-overlap \ - --log-level=3 \ - --concurrency=6 \ - --json \ - --print-avg-detector-time \ - "${INCLUDE_FLAG[@]}" \ - stdin >> "$OUTPUT_JSONL" 2>> "$STDERR_FILE" + + local main_include_flag=() + if [[ -n "${INCLUDE_DETECTORS_MAIN:-}" ]]; then + main_include_flag=(--include-detectors="$INCLUDE_DETECTORS_MAIN") + fi + + if [[ -n "${TRUFFLEHOG_BIN_MAIN:-}" ]]; then + # Single S3 download teed to both binaries simultaneously. 
+ unzstd -c "$input" 2>> "$STDERR_FILE" \ + | jq -r .content 2>> "$STDERR_FILE" \ + | tee >( + "${TRUFFLEHOG_BIN_MAIN}" \ + --no-update \ + --no-verification \ + --allow-verification-overlap \ + --log-level=3 \ + --concurrency=8 \ + --json \ + "${main_include_flag[@]}" \ + stdin >> "${OUTPUT_JSONL_MAIN}" 2>> "$STDERR_FILE" + ) \ + | "$TRUFFLEHOG_BIN" \ + --no-update \ + --no-verification \ + --allow-verification-overlap \ + --log-level=3 \ + --concurrency=8 \ + --json \ + --print-avg-detector-time \ + "${INCLUDE_FLAG[@]}" \ + stdin >> "$OUTPUT_JSONL" 2>> "$STDERR_FILE" + else + unzstd -c "$input" 2>> "$STDERR_FILE" \ + | jq -r .content 2>> "$STDERR_FILE" \ + | "$TRUFFLEHOG_BIN" \ + --no-update \ + --no-verification \ + --allow-verification-overlap \ + --log-level=3 \ + --concurrency=8 \ + --json \ + --print-avg-detector-time \ + "${INCLUDE_FLAG[@]}" \ + stdin >> "$OUTPUT_JSONL" 2>> "$STDERR_FILE" + fi set -e } diff --git a/scripts/test/diff_corpora_results.py b/scripts/test/diff_corpora_results.py index 2081987a7709..7b02be455bc3 100755 --- a/scripts/test/diff_corpora_results.py +++ b/scripts/test/diff_corpora_results.py @@ -38,7 +38,7 @@ STATUS_KEY = ( "🔴 regression: >5 new, >20% increase over main, or any removed" " \u00a0·\u00a0 " - "⚠️ warning: 1–5 new" + "⚠️ warning: 1–5 new and ≤20% increase over main" " \u00a0·\u00a0 " "✅ clean" " \u00a0·\u00a0 " From dacf850abba427b56dc32bcb83a9646cb8840243 Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Mon, 4 May 2026 18:16:27 +0500 Subject: [PATCH 36/43] bugbot fixes --- .github/workflows/detector-corpora-test.yml | 9 ++++++--- scripts/test/diff_corpora_results.py | 11 ++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index 4d985ebc51df..fd042ca46d2e 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -127,20 +127,23 @@ jobs: # Two independent builds run in parallel: # A) prepare main worktree → build main binary (git I/O then CPU) - # Skipped on main scan cache hit — binary is not needed. + # Skipped on main scan cache hit or when main_csv is empty + # (all changed detectors are new — no baseline needed). # B) build PR binary (CPU, no dependencies) - name: Build binaries if: steps.detect.outputs.any_changed == 'true' shell: bash env: MERGE_BASE: ${{ steps.merge_base.outputs.sha }} + MAIN_CSV: ${{ steps.detect.outputs.main_csv }} MAIN_SCAN_CACHE_HIT: ${{ steps.main_scan_cache.outputs.cache-hit }} run: | set -o pipefail # Chain A: prepare worktree, then build main binary. - # Skipped when main scan results are already cached. - if [[ "$MAIN_SCAN_CACHE_HIT" != 'true' ]]; then + # Skipped when main scan results are already cached, or when all + # changed detectors are new (main_csv empty — no baseline needed). 
+ if [[ -n "$MAIN_CSV" && "$MAIN_SCAN_CACHE_HIT" != 'true' ]]; then ( git worktree add /tmp/trufflehog-main-src "$MERGE_BASE" cd /tmp/trufflehog-main-src diff --git a/scripts/test/diff_corpora_results.py b/scripts/test/diff_corpora_results.py index 7b02be455bc3..1549bc1465c6 100755 --- a/scripts/test/diff_corpora_results.py +++ b/scripts/test/diff_corpora_results.py @@ -36,13 +36,10 @@ ) STATUS_KEY = ( - "🔴 regression: >5 new, >20% increase over main, or any removed" - " \u00a0·\u00a0 " - "⚠️ warning: 1–5 new and ≤20% increase over main" - " \u00a0·\u00a0 " - "✅ clean" - " \u00a0·\u00a0 " - "🆕 new detector (no baseline)" + "- 🔴 regression: >5 new, >20% increase over main, or any removed\n" + "- ⚠️ warning: 1–5 new and ≤20% increase over main\n" + "- ✅ clean\n" + "- 🆕 new detector (no baseline)" ) # Marker on the very first line of the body so peter-evans/find-comment can From 1cf22a87a57fc5917e305021a885cbcbbf769e11 Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Mon, 4 May 2026 19:15:03 +0500 Subject: [PATCH 37/43] revert jdbc changes and bugbot fix --- pkg/detectors/jdbc/jdbc.go | 3 +-- scripts/test/detector_corpora_test.sh | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/detectors/jdbc/jdbc.go b/pkg/detectors/jdbc/jdbc.go index e69042988beb..cb9816f468f4 100644 --- a/pkg/detectors/jdbc/jdbc.go +++ b/pkg/detectors/jdbc/jdbc.go @@ -53,8 +53,7 @@ var ( // Matches typical JDBC connection strings. // The terminal character class additionally excludes () and & to avoid // capturing surrounding delimiters (e.g. "(jdbc:…)" or "…&user=x&"). - // TODO: revert before merging — regex intentionally loosened to trigger corpora test CI on this PR. - keyPat = regexp.MustCompile(`(?i)[\w]{3,10}:[^\s"'<>,{}[\]]{10,511}[^\s"'<>,{}[\]()&]`) + keyPat = regexp.MustCompile(`(?i)jdbc:[\w]{3,10}:[^\s"'<>,{}[\]]{10,511}[^\s"'<>,{}[\]()&]`) ) // Keywords are used for efficiently pre-filtering chunks. diff --git a/scripts/test/detector_corpora_test.sh b/scripts/test/detector_corpora_test.sh index 3dbfe31ef10f..d7ebe2221a09 100755 --- a/scripts/test/detector_corpora_test.sh +++ b/scripts/test/detector_corpora_test.sh @@ -86,6 +86,7 @@ scan() { --print-avg-detector-time \ "${INCLUDE_FLAG[@]}" \ stdin >> "$OUTPUT_JSONL" 2>> "$STDERR_FILE" + wait else unzstd -c "$input" 2>> "$STDERR_FILE" \ | jq -r .content 2>> "$STDERR_FILE" \ From 5792a093e616dc5b2368ae96d8eb2ab9304ba093 Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Mon, 4 May 2026 19:46:54 +0500 Subject: [PATCH 38/43] run only on regex and/or keywords change --- scripts/test/detect_changed_detectors.sh | 32 ++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/scripts/test/detect_changed_detectors.sh b/scripts/test/detect_changed_detectors.sh index 25931ac33726..1e1bfbc7f2d1 100755 --- a/scripts/test/detect_changed_detectors.sh +++ b/scripts/test/detect_changed_detectors.sh @@ -94,6 +94,35 @@ is_new_detector() { grep -qxF "$1" "$NEW_DIRS_FILE" } +# Step 2b — skip detectors whose diff doesn't touch regex patterns or Keywords. +# Corpora results only change when the matching logic changes; verification, +# redaction, or structural changes don't affect match counts. +has_pattern_change() { + local dir="$1" + + # Fast path: regex or Keywords() signature on a changed line. 
+ git diff "$MERGE_BASE...$HEAD_REF" -- "$dir"/*.go 2>/dev/null \ + | grep -qE '^[+-][^+-].*(regexp\.|MustCompile|Keywords)' && return 0 + + # Slow path: compare the Keywords() function body between refs to catch + # changes to the return value (e.g. []string{"old"} → []string{"new"}) + # where the changed lines don't mention "Keywords" themselves. + local file + while IFS= read -r file; do + [[ "$file" == *_test.go ]] && continue + local head_body base_body + head_body=$(git show "$HEAD_REF:$file" 2>/dev/null \ + | awk '/func[[:space:]].*Keywords\(\)[[:space:]]*\[\]string/,/^[[:space:]]*\}/' \ + | tail -n +2) + base_body=$(git show "$MERGE_BASE:$file" 2>/dev/null \ + | awk '/func[[:space:]].*Keywords\(\)[[:space:]]*\[\]string/,/^[[:space:]]*\}/' \ + | tail -n +2) + [[ "$head_body" != "$base_body" ]] && return 0 + done < <(git diff --name-only "$MERGE_BASE...$HEAD_REF" -- "$dir"/*.go 2>/dev/null) + + return 1 +} + # Step 3 — for a dir, derive `[.v]`. detector_id_for_dir() { local dir="$1" @@ -127,6 +156,7 @@ emit_list() { local dir id for dir in "${CHANGED_DIRS[@]:-}"; do [[ -z "$dir" ]] && continue + has_pattern_change "$dir" || continue if id=$(detector_id_for_dir "$dir"); then echo "$id" else @@ -139,6 +169,7 @@ emit_main_list() { local dir id for dir in "${CHANGED_DIRS[@]:-}"; do [[ -z "$dir" ]] && continue + has_pattern_change "$dir" || continue # Strip `pkg/detectors/` prefix to get the import-path form, then # check against the new-detector set. local import_form="${dir#pkg/detectors/}" @@ -155,6 +186,7 @@ emit_new_list() { local dir id for dir in "${CHANGED_DIRS[@]:-}"; do [[ -z "$dir" ]] && continue + has_pattern_change "$dir" || continue local import_form="${dir#pkg/detectors/}" if ! is_new_detector "$import_form"; then continue From 032b2a8032e414fac612c4c2749df1b7a196f82c Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Mon, 4 May 2026 19:53:00 +0500 Subject: [PATCH 39/43] bugbot fixes --- scripts/test/diff_corpora_results.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/scripts/test/diff_corpora_results.py b/scripts/test/diff_corpora_results.py index 1549bc1465c6..63b5f771a5b1 100755 --- a/scripts/test/diff_corpora_results.py +++ b/scripts/test/diff_corpora_results.py @@ -102,11 +102,18 @@ def status_emoji(new_count, removed_count, unique_main): def build_top_line_summary(rows, changed): - regressed = sum(1 for r in rows if not r["is_new"] and r["emoji"] in ("🔴", "⚠️")) + regressed = sum(1 for r in rows if not r["is_new"] and r["emoji"] == "🔴") + warned = sum(1 for r in rows if not r["is_new"] and r["emoji"] == "⚠️") new_count = sum(1 for r in rows if r["is_new"]) clean = sum(1 for r in rows if r["emoji"] == "✅") scoped = ", ".join(f"`{d}`" for d in sorted(changed)) if changed else "" - summary = f"**{regressed} regressed · {new_count} new · {clean} clean**" + parts = [] + if regressed: + parts.append(f"{regressed} regressed") + if warned: + parts.append(f"{warned} warned") + parts += [f"{new_count} new", f"{clean} clean"] + summary = f"**{' · '.join(parts)}**" if scoped: summary += f" \u00a0|\u00a0 Scoped to: {scoped}" return summary @@ -131,7 +138,13 @@ def render(main, pr, changed=None, new_detectors=None): rows = [] has_diff = False for d in sorted(all_names): - is_new = d.lower() in new_detectors + # A detector is only treated as fully new if the new_detectors set + # says so AND main produced no findings for it. When a PR modifies an + # existing version and adds a new version of the same detector (e.g. 
+ # jdbc.v1 + jdbc.v2), both collapse to "jdbc" in new_detectors but + # main still ran against the existing version — its results must not + # be discarded. + is_new = d.lower() in new_detectors and d not in main m = main.get(d, _empty) p = pr.get(d, _empty) new_ids = p["identities"] - m["identities"] From 88c63e68c976145983f5e5a25d450b89841d6732 Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Mon, 4 May 2026 20:28:51 +0500 Subject: [PATCH 40/43] bugbot fix --- scripts/test/detector_corpora_test.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/test/detector_corpora_test.sh b/scripts/test/detector_corpora_test.sh index d7ebe2221a09..e8e11142fb82 100755 --- a/scripts/test/detector_corpora_test.sh +++ b/scripts/test/detector_corpora_test.sh @@ -61,6 +61,7 @@ scan() { main_include_flag=(--include-detectors="$INCLUDE_DETECTORS_MAIN") fi + local rc=0 if [[ -n "${TRUFFLEHOG_BIN_MAIN:-}" ]]; then # Single S3 download teed to both binaries simultaneously. unzstd -c "$input" 2>> "$STDERR_FILE" \ @@ -86,6 +87,7 @@ scan() { --print-avg-detector-time \ "${INCLUDE_FLAG[@]}" \ stdin >> "$OUTPUT_JSONL" 2>> "$STDERR_FILE" + rc=$? wait else unzstd -c "$input" 2>> "$STDERR_FILE" \ @@ -100,8 +102,10 @@ scan() { --print-avg-detector-time \ "${INCLUDE_FLAG[@]}" \ stdin >> "$OUTPUT_JSONL" 2>> "$STDERR_FILE" + rc=$? fi set -e + return $rc } for CORPORA_FILE in "$@"; do From 5a6e1a2d373ddc84b844815006c723e7013f7fd7 Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Tue, 5 May 2026 16:35:02 +0500 Subject: [PATCH 41/43] incorporate brad's comments, loosen jdbc regex to run a test to ensure everything works as expected --- .github/workflows/detector-corpora-test.yml | 98 ++++++++++++--------- pkg/detectors/jdbc/jdbc.go | 3 +- scripts/test/detector_corpora_test.sh | 5 +- 3 files changed, 58 insertions(+), 48 deletions(-) diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index fd042ca46d2e..3379c0b122f8 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -25,12 +25,13 @@ jobs: pull-requests: write steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 with: fetch-depth: 0 + persist-credentials: false - name: Install Go - uses: actions/setup-go@v5 + uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6 with: go-version: "1.25" @@ -86,7 +87,7 @@ jobs: - name: Find existing skip comment if: steps.detect.outputs.any_changed != 'true' && github.event_name == 'pull_request' id: find_skip_comment - uses: peter-evans/find-comment@v3 + uses: peter-evans/find-comment@b30e6a3c0ed37e7c023ccd3f1db5c6c0b0c23aad # v4 with: issue-number: ${{ github.event.pull_request.number }} comment-author: 'github-actions[bot]' @@ -94,7 +95,7 @@ jobs: - name: Post or update skip comment if: steps.detect.outputs.any_changed != 'true' && github.event_name == 'pull_request' - uses: peter-evans/create-or-update-comment@v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: comment-id: ${{ steps.find_skip_comment.outputs.comment-id }} issue-number: ${{ github.event.pull_request.number }} @@ -107,7 +108,7 @@ jobs: - name: Configure AWS credentials if: steps.detect.outputs.any_changed == 'true' - uses: aws-actions/configure-aws-credentials@v4 + uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 # v6 with: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY }} 
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} @@ -120,7 +121,7 @@ jobs: - name: Restore main scan cache id: main_scan_cache if: steps.detect.outputs.any_changed == 'true' && steps.detect.outputs.main_csv != '' - uses: actions/cache/restore@v4 + uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5 with: path: /tmp/results-main.jsonl key: main-scan-v1-${{ steps.merge_base.outputs.sha }}-${{ steps.detect.outputs.main_csv }} @@ -159,48 +160,59 @@ jobs: [[ -n "${PID_MAIN_BUILD:-}" ]] && { wait $PID_MAIN_BUILD || { echo "Main binary build failed" >&2; exit 1; }; } wait $PID_PR_BUILD || { echo "PR binary build failed" >&2; exit 1; } - # PR and main scans share a single S3 stream per dataset file, teed to - # both binaries simultaneously. The main side is skipped on a cache hit - # (results already in /tmp/results-main.jsonl) or when main_csv is empty - # (PR adds only new detectors — no overlap with main). + # TODO: remove before merging — fake results for testing the diff/comment steps without a full scan. + # Restore the real step below once comment rendering is verified. - name: Run corpora tests if: steps.detect.outputs.any_changed == 'true' shell: bash - env: - PR_CSV: ${{ steps.detect.outputs.pr_csv }} - MAIN_CSV: ${{ steps.detect.outputs.main_csv }} - MAIN_SCAN_CACHE_HIT: ${{ steps.main_scan_cache.outputs.cache-hit }} run: | - set -o pipefail - files=() - while IFS= read -r dataset; do - [[ -z "$dataset" ]] && continue - files+=("$dataset") - done <<< "$DATASETS" - - export TRUFFLEHOG_BIN=/tmp/trufflehog-pr - export OUTPUT_JSONL=/tmp/results-pr.jsonl - export STDERR_FILE=/tmp/corpora-stderr-pr.txt - export INCLUDE_DETECTORS="$PR_CSV" + echo '{"DetectorName":"JDBC","Raw":"jdbc:mysql://user:pass@host/db","RawV2":"","Redacted":"","ExtraData":null,"StructuredData":null,"Verified":false,"VerificationError":null}' > /tmp/results-pr.jsonl + echo '{"DetectorName":"JDBC","Raw":"jdbc:mysql://user:pass@host/db","RawV2":"","Redacted":"","ExtraData":null,"StructuredData":null,"Verified":false,"VerificationError":null}' >> /tmp/results-pr.jsonl + echo '{"DetectorName":"JDBC","Raw":"jdbc:postgresql://admin:secret@db.example.com/prod","RawV2":"","Redacted":"","ExtraData":null,"StructuredData":null,"Verified":false,"VerificationError":null}' >> /tmp/results-pr.jsonl + echo '{"DetectorName":"JDBC","Raw":"jdbc:mysql://user:pass@host/db","RawV2":"","Redacted":"","ExtraData":null,"StructuredData":null,"Verified":false,"VerificationError":null}' > /tmp/results-main.jsonl - if [[ -n "$MAIN_CSV" && "$MAIN_SCAN_CACHE_HIT" != 'true' ]]; then - # Dual-binary: single S3 download teed to both PR and main binaries. - export TRUFFLEHOG_BIN_MAIN=/tmp/trufflehog-main - export OUTPUT_JSONL_MAIN=/tmp/results-main.jsonl - export INCLUDE_DETECTORS_MAIN="$MAIN_CSV" - elif [[ -z "$MAIN_CSV" ]]; then - echo "No overlapping detectors in main; skipping main scan." - : > /tmp/results-main.jsonl - else - echo "Main scan cache hit; skipping main scan." - fi - - ./scripts/test/detector_corpora_test.sh "${files[@]}" \ - || { echo "Corpora scan failed" >&2; exit 1; } + # PR and main scans share a single S3 stream per dataset file, teed to + # both binaries simultaneously. The main side is skipped on a cache hit + # (results already in /tmp/results-main.jsonl) or when main_csv is empty + # (PR adds only new detectors — no overlap with main). 
+ # - name: Run corpora tests + # if: steps.detect.outputs.any_changed == 'true' + # shell: bash + # env: + # PR_CSV: ${{ steps.detect.outputs.pr_csv }} + # MAIN_CSV: ${{ steps.detect.outputs.main_csv }} + # MAIN_SCAN_CACHE_HIT: ${{ steps.main_scan_cache.outputs.cache-hit }} + # run: | + # set -o pipefail + # files=() + # while IFS= read -r dataset; do + # [[ -z "$dataset" ]] && continue + # files+=("$dataset") + # done <<< "$DATASETS" + # + # export TRUFFLEHOG_BIN=/tmp/trufflehog-pr + # export OUTPUT_JSONL=/tmp/results-pr.jsonl + # export STDERR_FILE=/tmp/corpora-stderr-pr.txt + # export INCLUDE_DETECTORS="$PR_CSV" + # + # if [[ -n "$MAIN_CSV" && "$MAIN_SCAN_CACHE_HIT" != 'true' ]]; then + # # Dual-binary: single S3 download teed to both PR and main binaries. + # export TRUFFLEHOG_BIN_MAIN=/tmp/trufflehog-main + # export OUTPUT_JSONL_MAIN=/tmp/results-main.jsonl + # export INCLUDE_DETECTORS_MAIN="$MAIN_CSV" + # elif [[ -z "$MAIN_CSV" ]]; then + # echo "No overlapping detectors in main; skipping main scan." + # : > /tmp/results-main.jsonl + # else + # echo "Main scan cache hit; skipping main scan." + # fi + # + # ./scripts/test/detector_corpora_test.sh "${files[@]}" \ + # || { echo "Corpora scan failed" >&2; exit 1; } - name: Save main scan cache if: steps.detect.outputs.any_changed == 'true' && steps.detect.outputs.main_csv != '' && steps.main_scan_cache.outputs.cache-hit != 'true' - uses: actions/cache/save@v4 + uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5 with: path: /tmp/results-main.jsonl key: main-scan-v1-${{ steps.merge_base.outputs.sha }}-${{ steps.detect.outputs.main_csv }} @@ -226,7 +238,7 @@ jobs: - name: Resolve PR number if: steps.detect.outputs.any_changed == 'true' id: resolve_pr - uses: actions/github-script@v7 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 with: script: | let issue_number; @@ -250,7 +262,7 @@ jobs: - name: Find existing diff comment if: steps.detect.outputs.any_changed == 'true' id: find_diff_comment - uses: peter-evans/find-comment@v3 + uses: peter-evans/find-comment@b30e6a3c0ed37e7c023ccd3f1db5c6c0b0c23aad # v4 with: issue-number: ${{ steps.resolve_pr.outputs.issue_number }} comment-author: 'github-actions[bot]' @@ -258,7 +270,7 @@ jobs: - name: Post or update diff comment if: steps.detect.outputs.any_changed == 'true' - uses: peter-evans/create-or-update-comment@v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: comment-id: ${{ steps.find_diff_comment.outputs.comment-id }} issue-number: ${{ steps.resolve_pr.outputs.issue_number }} diff --git a/pkg/detectors/jdbc/jdbc.go b/pkg/detectors/jdbc/jdbc.go index cb9816f468f4..34b0ee784e35 100644 --- a/pkg/detectors/jdbc/jdbc.go +++ b/pkg/detectors/jdbc/jdbc.go @@ -53,7 +53,8 @@ var ( // Matches typical JDBC connection strings. // The terminal character class additionally excludes () and & to avoid // capturing surrounding delimiters (e.g. "(jdbc:…)" or "…&user=x&"). - keyPat = regexp.MustCompile(`(?i)jdbc:[\w]{3,10}:[^\s"'<>,{}[\]]{10,511}[^\s"'<>,{}[\]()&]`) + // TODO: revert before merging — regex intentionally loosened to trigger corpora test CI. + keyPat = regexp.MustCompile(`(?i)[\w]{3,10}:[^\s"'<>,{}[\]]{10,511}[^\s"'<>,{}[\]()&]`) ) // Keywords are used for efficiently pre-filtering chunks. 
diff --git a/scripts/test/detector_corpora_test.sh b/scripts/test/detector_corpora_test.sh index e8e11142fb82..0cdf20111191 100755 --- a/scripts/test/detector_corpora_test.sh +++ b/scripts/test/detector_corpora_test.sh @@ -122,10 +122,7 @@ CREATE TABLE t AS FROM read_json_auto('$OUTPUT_JSONL', ignore_errors=true); SELECT t.DetectorName detector, - COUNT(*) total, - SUM(CASE WHEN Verified AND VerificationError IS NULL THEN 1 ELSE 0 END) verified, - SUM(CASE WHEN NOT Verified AND VerificationError IS NULL THEN 1 ELSE 0 END) unverified, - SUM(CASE WHEN VerificationError IS NOT NULL THEN 1 ELSE 0 END) \"unknown\" + COUNT(*) total FROM t GROUP BY all ORDER BY total DESC, detector From 5b722a2162c50453b9c74b3b3800a16cdf1a1b57 Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Tue, 5 May 2026 16:42:02 +0500 Subject: [PATCH 42/43] revert test changes --- .github/workflows/detector-corpora-test.yml | 77 +++++++++------------ pkg/detectors/jdbc/jdbc.go | 3 +- 2 files changed, 34 insertions(+), 46 deletions(-) diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index 3379c0b122f8..3de02bd3101d 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -160,55 +160,44 @@ jobs: [[ -n "${PID_MAIN_BUILD:-}" ]] && { wait $PID_MAIN_BUILD || { echo "Main binary build failed" >&2; exit 1; }; } wait $PID_PR_BUILD || { echo "PR binary build failed" >&2; exit 1; } - # TODO: remove before merging — fake results for testing the diff/comment steps without a full scan. - # Restore the real step below once comment rendering is verified. + # PR and main scans share a single S3 stream per dataset file, teed to + # both binaries simultaneously. The main side is skipped on a cache hit + # (results already in /tmp/results-main.jsonl) or when main_csv is empty + # (PR adds only new detectors — no overlap with main). - name: Run corpora tests if: steps.detect.outputs.any_changed == 'true' shell: bash + env: + PR_CSV: ${{ steps.detect.outputs.pr_csv }} + MAIN_CSV: ${{ steps.detect.outputs.main_csv }} + MAIN_SCAN_CACHE_HIT: ${{ steps.main_scan_cache.outputs.cache-hit }} run: | - echo '{"DetectorName":"JDBC","Raw":"jdbc:mysql://user:pass@host/db","RawV2":"","Redacted":"","ExtraData":null,"StructuredData":null,"Verified":false,"VerificationError":null}' > /tmp/results-pr.jsonl - echo '{"DetectorName":"JDBC","Raw":"jdbc:mysql://user:pass@host/db","RawV2":"","Redacted":"","ExtraData":null,"StructuredData":null,"Verified":false,"VerificationError":null}' >> /tmp/results-pr.jsonl - echo '{"DetectorName":"JDBC","Raw":"jdbc:postgresql://admin:secret@db.example.com/prod","RawV2":"","Redacted":"","ExtraData":null,"StructuredData":null,"Verified":false,"VerificationError":null}' >> /tmp/results-pr.jsonl - echo '{"DetectorName":"JDBC","Raw":"jdbc:mysql://user:pass@host/db","RawV2":"","Redacted":"","ExtraData":null,"StructuredData":null,"Verified":false,"VerificationError":null}' > /tmp/results-main.jsonl + set -o pipefail + files=() + while IFS= read -r dataset; do + [[ -z "$dataset" ]] && continue + files+=("$dataset") + done <<< "$DATASETS" - # PR and main scans share a single S3 stream per dataset file, teed to - # both binaries simultaneously. The main side is skipped on a cache hit - # (results already in /tmp/results-main.jsonl) or when main_csv is empty - # (PR adds only new detectors — no overlap with main). 
- # - name: Run corpora tests - # if: steps.detect.outputs.any_changed == 'true' - # shell: bash - # env: - # PR_CSV: ${{ steps.detect.outputs.pr_csv }} - # MAIN_CSV: ${{ steps.detect.outputs.main_csv }} - # MAIN_SCAN_CACHE_HIT: ${{ steps.main_scan_cache.outputs.cache-hit }} - # run: | - # set -o pipefail - # files=() - # while IFS= read -r dataset; do - # [[ -z "$dataset" ]] && continue - # files+=("$dataset") - # done <<< "$DATASETS" - # - # export TRUFFLEHOG_BIN=/tmp/trufflehog-pr - # export OUTPUT_JSONL=/tmp/results-pr.jsonl - # export STDERR_FILE=/tmp/corpora-stderr-pr.txt - # export INCLUDE_DETECTORS="$PR_CSV" - # - # if [[ -n "$MAIN_CSV" && "$MAIN_SCAN_CACHE_HIT" != 'true' ]]; then - # # Dual-binary: single S3 download teed to both PR and main binaries. - # export TRUFFLEHOG_BIN_MAIN=/tmp/trufflehog-main - # export OUTPUT_JSONL_MAIN=/tmp/results-main.jsonl - # export INCLUDE_DETECTORS_MAIN="$MAIN_CSV" - # elif [[ -z "$MAIN_CSV" ]]; then - # echo "No overlapping detectors in main; skipping main scan." - # : > /tmp/results-main.jsonl - # else - # echo "Main scan cache hit; skipping main scan." - # fi - # - # ./scripts/test/detector_corpora_test.sh "${files[@]}" \ - # || { echo "Corpora scan failed" >&2; exit 1; } + export TRUFFLEHOG_BIN=/tmp/trufflehog-pr + export OUTPUT_JSONL=/tmp/results-pr.jsonl + export STDERR_FILE=/tmp/corpora-stderr-pr.txt + export INCLUDE_DETECTORS="$PR_CSV" + + if [[ -n "$MAIN_CSV" && "$MAIN_SCAN_CACHE_HIT" != 'true' ]]; then + # Dual-binary: single S3 download teed to both PR and main binaries. + export TRUFFLEHOG_BIN_MAIN=/tmp/trufflehog-main + export OUTPUT_JSONL_MAIN=/tmp/results-main.jsonl + export INCLUDE_DETECTORS_MAIN="$MAIN_CSV" + elif [[ -z "$MAIN_CSV" ]]; then + echo "No overlapping detectors in main; skipping main scan." + : > /tmp/results-main.jsonl + else + echo "Main scan cache hit; skipping main scan." + fi + + ./scripts/test/detector_corpora_test.sh "${files[@]}" \ + || { echo "Corpora scan failed" >&2; exit 1; } - name: Save main scan cache if: steps.detect.outputs.any_changed == 'true' && steps.detect.outputs.main_csv != '' && steps.main_scan_cache.outputs.cache-hit != 'true' diff --git a/pkg/detectors/jdbc/jdbc.go b/pkg/detectors/jdbc/jdbc.go index 34b0ee784e35..cb9816f468f4 100644 --- a/pkg/detectors/jdbc/jdbc.go +++ b/pkg/detectors/jdbc/jdbc.go @@ -53,8 +53,7 @@ var ( // Matches typical JDBC connection strings. // The terminal character class additionally excludes () and & to avoid // capturing surrounding delimiters (e.g. "(jdbc:…)" or "…&user=x&"). - // TODO: revert before merging — regex intentionally loosened to trigger corpora test CI. - keyPat = regexp.MustCompile(`(?i)[\w]{3,10}:[^\s"'<>,{}[\]]{10,511}[^\s"'<>,{}[\]()&]`) + keyPat = regexp.MustCompile(`(?i)jdbc:[\w]{3,10}:[^\s"'<>,{}[\]]{10,511}[^\s"'<>,{}[\]()&]`) ) // Keywords are used for efficiently pre-filtering chunks. From 6c3bbaea807aa934f66ce9951c2093acb6f9b63a Mon Sep 17 00:00:00 2001 From: Mustansir Muzaffar Date: Tue, 5 May 2026 16:45:02 +0500 Subject: [PATCH 43/43] fix misleading bench skipped message --- .github/workflows/detector-corpora-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml index 3de02bd3101d..3dc3ffbe018b 100644 --- a/.github/workflows/detector-corpora-test.yml +++ b/.github/workflows/detector-corpora-test.yml @@ -104,7 +104,7 @@ jobs: ## Corpora Test Results - No detector source files changed in this PR. Bench skipped. 
+ No detector regex or keyword changes in this PR. Bench skipped. - name: Configure AWS credentials if: steps.detect.outputs.any_changed == 'true'