Commits (45)
720b203  add detector corpora test workflow and script (mustansir14, Apr 29, 2026)
942b25f  only run once per PR, make comment descriptive, add handling for manu… (mustansir14, Apr 29, 2026)
27b8867  comment out types to see result on all commits (mustansir14, Apr 29, 2026)
e360e6d  uncomment types (mustansir14, Apr 29, 2026)
1aae080  remove table from comment (mustansir14, Apr 29, 2026)
0a78ecc  comment out types (mustansir14, Apr 29, 2026)
b9d8506  Phase 0: add explicit pipefail and capture trufflehog stderr (shahzadhaider1, Apr 29, 2026)
26c1c03  Phase 1: differential diffing PR vs main (shahzadhaider1, Apr 29, 2026)
f46e86c  DEMO: loosen Stripe regex (will revert) (shahzadhaider1, Apr 29, 2026)
021e8c3  DEMO: loosen JDBC regex (will revert) (shahzadhaider1, Apr 29, 2026)
420ec56  Phase 1 fix: add --allow-verification-overlap, fix no-diff detection (shahzadhaider1, Apr 29, 2026)
b0c3d28  revert jdbc detector change (shahzadhaider1, Apr 29, 2026)
25f08fc  Phase 2: detector scoping, new-detector handling, blast radius, statu… (shahzadhaider1, Apr 29, 2026)
735522b  DEMO: loosen JDBC + add fictional acmevault detector (shahzadhaider1, Apr 29, 2026)
7b44c92  Phase 2 fix: harden corpus byte counting against early trufflehog exit (shahzadhaider1, May 1, 2026)
e0e33bc  Phase 3a (1/3): add hack/extract-keywords for detector keyword intros… (shahzadhaider1, May 2, 2026)
a93890e  Phase 3a (2/3): add Layer 1 keyword corpus builder + workflow integra… (shahzadhaider1, May 2, 2026)
24fdf36  Phase 4 complete - Heatmap visualization (shahzadhaider1, May 2, 2026)
c20a5dd  Phase 4 rework (1/2): emit heatmap-grid.json sidecar from render_heat… (shahzadhaider1, May 2, 2026)
80f6747  Phase 4 rework (2/2): replace data-URL embed with emoji-bucketed Mark… (shahzadhaider1, May 2, 2026)
02ae97b  Phase 5 complete - Polish (shahzadhaider1, May 2, 2026)
5186d12  cleanup, enable verification (mustansir14, May 4, 2026)
bbaa4af  fix bug (mustansir14, May 4, 2026)
648ae6a  optimizations (mustansir14, May 4, 2026)
b56b46b  cache keywords corpus (mustansir14, May 4, 2026)
3284602  rewrite comment message (mustansir14, May 4, 2026)
2421212  cache github api corpus per keyword (mustansir14, May 4, 2026)
6ba2661  cleanup (mustansir14, May 4, 2026)
a00d129  remove github corpus (mustansir14, May 4, 2026)
8a20a97  revert changes for testing (mustansir14, May 4, 2026)
30c98bb  move Configure AWS credentials step to run only when detector changes… (mustansir14, May 4, 2026)
0d852e6  revert unnecessary changes (mustansir14, May 4, 2026)
624cfbe  cleanup + bugbot fixes (mustansir14, May 4, 2026)
f222e57  run test with bigger (30gb) dataset, loosen jdbc regex (mustansir14, May 4, 2026)
d0d94a2  optimizations (mustansir14, May 4, 2026)
dacf850  bugbot fixes (mustansir14, May 4, 2026)
1cf22a8  revert jdbc changes and bugbot fix (mustansir14, May 4, 2026)
5792a09  run only on regex and/or keywords change (mustansir14, May 4, 2026)
032b2a8  bugbot fixes (mustansir14, May 4, 2026)
c53fbcb  Merge branch 'main' into hackathon/detector-tests-in-ci (mustansir14, May 4, 2026)
88c63e6  bugbot fix (mustansir14, May 4, 2026)
f7f08e6  Merge branch 'hackathon/detector-tests-in-ci' of mustansir:trufflesec… (mustansir14, May 4, 2026)
5a6e1a2  incorporate brad's comments, loosen jdbc regex to run a test to ensur… (mustansir14, May 5, 2026)
5b722a2  revert test changes (mustansir14, May 5, 2026)
6c3bbae  fix misleading bench skipped message (mustansir14, May 5, 2026)
131 changes: 131 additions & 0 deletions .github/workflows/detector-corpora-test.yml
@@ -0,0 +1,131 @@
name: Corpora Test

on:
  workflow_dispatch:
  pull_request:
    # types: [opened, reopened] TODO: Decide if we should run this on every push
    paths:
      - 'pkg/detectors/**'
      - '.github/workflows/detector-corpora-test.yml'
      - 'scripts/detector_corpora_test.sh'
      - 'scripts/diff_corpora_results.py'

env:
  DATASETS: |
    s3://trufflehog-corpora-datasets/contents.2025-11-04.jsonl.zstd

jobs:
[Comment thread on `jobs:`]
Contributor: Note, some of the actions used here are old versions. Also, you might consider pinning the action versions used here to reduce risk of possible supply-chain attacks. zizmor is helpful: https://docs.zizmor.sh/
Contributor Author: Thanks for this. Really helpful! I'll do the needful

  corpora-test:
    if: ${{ github.repository == 'trufflesecurity/trufflehog' && !github.event.pull_request.head.repo.fork }}
    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Install Go
        uses: actions/setup-go@v5
        with:
          go-version: "1.25"

      - name: Install dependencies
        run: sudo apt-get install -y zstd jq

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-east-1

      - name: Resolve merge-base and prepare main worktree
        shell: bash
        run: |
          set -o pipefail
          git fetch --no-tags --prune origin main
          MERGE_BASE=$(git merge-base origin/main HEAD)
          echo "Merge base: $MERGE_BASE"
          git worktree add /tmp/trufflehog-main-src "$MERGE_BASE"

      - name: Build trufflehog (PR HEAD)
        shell: bash
        run: |
          set -o pipefail
          CGO_ENABLED=0 go build -o /tmp/trufflehog-pr .

      - name: Build trufflehog (main merge-base)
        shell: bash
        working-directory: /tmp/trufflehog-main-src
        run: |
          set -o pipefail
          CGO_ENABLED=0 go build -o /tmp/trufflehog-main .

      - name: Run corpora test (main build)
        shell: bash
        env:
          TRUFFLEHOG_BIN: /tmp/trufflehog-main
          OUTPUT_JSONL: /tmp/results-main.jsonl
          STDERR_FILE: /tmp/corpora-stderr-main.txt
        run: |
          set -o pipefail
          files=()
          while IFS= read -r dataset; do
            [[ -z "$dataset" ]] && continue
            files+=("$dataset")
          done <<< "$DATASETS"
          ./scripts/detector_corpora_test.sh "${files[@]}"

      - name: Run corpora test (PR build)
        shell: bash
        env:
          TRUFFLEHOG_BIN: /tmp/trufflehog-pr
          OUTPUT_JSONL: /tmp/results-pr.jsonl
          STDERR_FILE: /tmp/corpora-stderr-pr.txt
        run: |
          set -o pipefail
          files=()
          while IFS= read -r dataset; do
            [[ -z "$dataset" ]] && continue
            files+=("$dataset")
          done <<< "$DATASETS"
          ./scripts/detector_corpora_test.sh "${files[@]}"

      - name: Diff results
        shell: bash
        run: |
          set -o pipefail
          python3 scripts/diff_corpora_results.py /tmp/results-main.jsonl /tmp/results-pr.jsonl > /tmp/diff-report.md
          cat /tmp/diff-report.md

      - name: Post results to PR
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const body = fs.readFileSync('/tmp/diff-report.md', 'utf8');
            let issue_number;
            if (context.eventName === 'workflow_dispatch') {
              const pulls = await github.rest.pulls.list({
                owner: context.repo.owner,
                repo: context.repo.repo,
                head: `${context.repo.owner}:${context.ref.replace('refs/heads/', '')}`,
                state: 'open',
              });
              if (pulls.data.length === 0) {
                core.setFailed(`No open PR found for branch ${context.ref}`);
                return;
              }
              issue_number = pulls.data[0].number;
            } else {
              issue_number = context.issue.number;
            }
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number,
              body,
            });
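Both scan steps above turn the multi-line `DATASETS` env var into an argument array with a `while read` loop that skips blank lines. The idiom can be exercised standalone (a sketch; the bucket and file names below are invented):

```shell
#!/bin/bash
set -euo pipefail

# Invented stand-in for the workflow's DATASETS env var: one S3 URI per
# line, with a stray blank line to show that blanks are skipped.
DATASETS='s3://example-bucket/contents.a.jsonl.zstd

s3://example-bucket/contents.b.jsonl.zstd'

files=()
while IFS= read -r dataset; do
  [[ -z "$dataset" ]] && continue   # skip blank lines from the YAML block scalar
  files+=("$dataset")
done <<< "$DATASETS"

printf '%s\n' "${files[@]}"
```

Passing `"${files[@]}"` instead of an unquoted string keeps each URI as a single argument even if a path ever contains whitespace.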
2 changes: 1 addition & 1 deletion pkg/detectors/jdbc/jdbc.go
@@ -53,7 +53,7 @@ var (
// Matches typical JDBC connection strings.
// The terminal character class additionally excludes () and & to avoid
// capturing surrounding delimiters (e.g. "(jdbc:…)" or "…&user=x&").
-	keyPat = regexp.MustCompile(`(?i)jdbc:[\w]{3,10}:[^\s"'<>,{}[\]]{10,511}[^\s"'<>,{}[\]()&]`)
+	keyPat = regexp.MustCompile(`(?i)[\w]{3,10}:[^\s"'<>,{}[\]]{10,511}[^\s"'<>,{}[\]()&]`)
)

// Keywords are used for efficiently pre-filtering chunks.
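The diff above drops the literal `jdbc:` prefix, which is how the DEMO commits force a corpus diff. A minimal sketch of the effect, using Python's `re` as a stand-in for Go's `regexp` (close enough for these character-class patterns; the sample strings are invented):

```python
import re

# The two patterns from the diff, verbatim.
original = re.compile(r"""(?i)jdbc:[\w]{3,10}:[^\s"'<>,{}[\]]{10,511}[^\s"'<>,{}[\]()&]""")
loosened = re.compile(r"""(?i)[\w]{3,10}:[^\s"'<>,{}[\]]{10,511}[^\s"'<>,{}[\]()&]""")

jdbc_url = "jdbc:mysql://db.internal:3306/app?user=svc&password=hunter2"
plain_url = "https://example.com/docs/getting-started"

# Both patterns match a real JDBC connection string...
assert original.search(jdbc_url)
assert loosened.search(jdbc_url)

# ...but without the "jdbc:" literal, any scheme-like prefix matches,
# which is exactly the false-positive blast radius the corpora diff surfaces.
assert original.search(plain_url) is None
assert loosened.search(plain_url)
```

This is why the bench report shows NEW identities appearing under the JDBC detector whenever the DEMO loosening is in place.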
85 changes: 85 additions & 0 deletions scripts/detector_corpora_test.sh
@@ -0,0 +1,85 @@
#!/bin/bash
set -euo pipefail

if [[ $# -lt 1 ]]; then
  echo "Usage: $0 <corpora_file.jsonl.zstd> [<corpora_file2.jsonl.zstd> ...]"
  exit 1
fi

# CI sets OUTPUT_JSONL to per-run paths and skips the human-readable DuckDB
# summary. Local invocations leave it unset and get the summary table for
# debugging.
if [[ -z "${OUTPUT_JSONL+x}" ]]; then
  OUTPUT_JSONL="/tmp/corpora_results.jsonl"
  RUN_DUCKDB_SUMMARY=1
else
  RUN_DUCKDB_SUMMARY=0
fi
> "$OUTPUT_JSONL"

# Captures trufflehog stderr (incl. --print-avg-detector-time output) for downstream phases.
STDERR_FILE="${STDERR_FILE:-/tmp/corpora-stderr.txt}"
> "$STDERR_FILE"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(dirname "$SCRIPT_DIR")"
TRUFFLEHOG_BIN="${TRUFFLEHOG_BIN:-${REPO_ROOT}/trufflehog}"

if [[ ! -x "$TRUFFLEHOG_BIN" ]]; then
  CGO_ENABLED=0 go build -o "$TRUFFLEHOG_BIN" "$REPO_ROOT"
fi

# --no-verification and --allow-verification-overlap are paired intentionally.
# This bench measures per-detector regex behavior in isolation:
# - --no-verification: avoids network-flake noise (rate limits, transient 5xx
#   errors) that would otherwise produce verified/unverified deltas
#   indistinguishable from real regex regressions. Verifier behavior is
#   covered by detector unit tests.
# - --allow-verification-overlap: bypasses the engine's cross-detector
#   overlap routing (pkg/engine/engine.go:862-872 + likelyDuplicate). That
#   routing exists for verification safety — when one chunk has matches from
#   multiple detectors, it dedups near-identical results so the same secret
#   isn't sent to multiple verifiers. With verification off, the routing has
#   no purpose, but its dedup side-effect (silently dropping a detector's
#   other matches in a multi-match chunk) makes a regex change in detector A
#   shift raw match counts in unrelated detector B, contaminating the diff.
#   Bypassing it gives each detector independent regex measurement.
scan() {
  local input="$1"
  set +e
  unzstd -c "$input" | jq -r .content | "$TRUFFLEHOG_BIN" \
    --no-update \
    --no-verification \
    --allow-verification-overlap \
    --log-level=3 \
    --concurrency=6 \
    --json \
    --print-avg-detector-time \
    stdin >> "$OUTPUT_JSONL" 2>> "$STDERR_FILE"
  set -e
}

for CORPORA_FILE in "$@"; do
  if [[ "$CORPORA_FILE" == s3://* ]]; then
    aws s3 cp "$CORPORA_FILE" - | scan /dev/stdin
  else
    scan "$CORPORA_FILE"
  fi
done

if [[ "$RUN_DUCKDB_SUMMARY" == "1" ]]; then
  duckdb -c "
  CREATE TABLE t AS FROM read_json_auto('$OUTPUT_JSONL', ignore_errors=true);

  SELECT
    t.DetectorName detector,
    COUNT(*) total,
    SUM(CASE WHEN Verified AND VerificationError IS NULL THEN 1 ELSE 0 END) verified,
    SUM(CASE WHEN NOT Verified AND VerificationError IS NULL THEN 1 ELSE 0 END) unverified,
    SUM(CASE WHEN VerificationError IS NOT NULL THEN 1 ELSE 0 END) \"unknown\"
  FROM t
  GROUP BY all
  ORDER BY total DESC, detector
  LIMIT 50;
  "
fi
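The DuckDB summary's verified/unverified/unknown bucketing can be sketched in plain Python for machines without `duckdb` installed (the sample findings below are invented, but the field names follow trufflehog's `--json` output as used above):

```python
import json
from collections import defaultdict

# Invented sample findings in the shape the script's OUTPUT_JSONL contains.
lines = [
    '{"DetectorName": "AWS", "Verified": true, "VerificationError": null}',
    '{"DetectorName": "AWS", "Verified": false, "VerificationError": null}',
    '{"DetectorName": "JDBC", "Verified": false, "VerificationError": "timeout"}',
]

summary = defaultdict(lambda: {"total": 0, "verified": 0, "unverified": 0, "unknown": 0})
for line in lines:
    obj = json.loads(line)
    row = summary[obj["DetectorName"]]
    row["total"] += 1
    if obj.get("VerificationError") is not None:
        row["unknown"] += 1          # verification attempted but errored
    elif obj["Verified"]:
        row["verified"] += 1
    else:
        row["unverified"] += 1

# Mirrors ORDER BY total DESC, detector from the DuckDB query.
for det, row in sorted(summary.items(), key=lambda kv: (-kv[1]["total"], kv[0])):
    print(det, row)
```

As with the SQL, a finding counts as "unknown" whenever a verification error is recorded, regardless of the `Verified` flag.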
111 changes: 111 additions & 0 deletions scripts/diff_corpora_results.py
@@ -0,0 +1,111 @@
#!/usr/bin/env python3
"""
Diffs two trufflehog JSONL outputs (main vs PR build) and emits a Markdown
report to stdout.

Identity per finding: (DetectorName, Raw or RawV2 fallback). Set semantics —
duplicates within a single scan collapse into one identity, so a regex change
either adds a new (detector, secret) identity or removes one.

Verification is disabled at scan time (see scripts/detector_corpora_test.sh),
so verified/unverified deltas are intentionally not surfaced — the diff
measures regex match changes only.

Usage: diff_corpora_results.py <main.jsonl> <pr.jsonl>
"""
import json
import sys
from collections import defaultdict


PREAMBLE = (
    "This bench measures regex match regressions only. It runs with "
    "`--no-verification --allow-verification-overlap` so each detector's "
    "regex behavior is measured independently — verifier behavior is tested "
    "separately by detector unit tests."
)


def load_findings(path):
    """Returns dict: detector_name -> {"identities": set[str], "total": int}."""
    by_detector = defaultdict(lambda: {"identities": set(), "total": 0})
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            detector = obj.get("DetectorName") or ""
            if not detector:
                continue
            raw = obj.get("Raw") or obj.get("RawV2") or ""
            by_detector[detector]["identities"].add(raw)
            by_detector[detector]["total"] += 1
    return by_detector


def render(main, pr):
    detectors = sorted(set(main) | set(pr))
    rows = []
    has_diff = False
    for d in detectors:
        m = main.get(d, {"identities": set(), "total": 0})
        p = pr.get(d, {"identities": set(), "total": 0})
        new = p["identities"] - m["identities"]
        removed = m["identities"] - p["identities"]
        # A row is "diff-clean" only when NEW, REMOVED, AND raw totals all match.
        # Total-count differences without identity changes are still real (e.g.,
        # a regex change in one detector can shift duplicate-match counts via
        # cross-detector dedup), so they must not be reported as ✅.
        if new or removed or m["total"] != p["total"]:
            has_diff = True
        rows.append({
            "detector": d,
            "total_main": m["total"],
            "total_pr": p["total"],
            "unique_main": len(m["identities"]),
            "unique_pr": len(p["identities"]),
            "new": len(new),
            "removed": len(removed),
        })

    title = "## Corpora Test Results — Diff (PR vs main)"
    parts = [title, "", PREAMBLE, ""]

    if not rows:
        parts += ["_(No findings on either side.)_", ""]
        return "\n".join(parts)

    if has_diff:
        rows.sort(key=lambda r: (r["new"] + r["removed"], r["detector"]), reverse=True)
    else:
        parts += ["✅ No diff vs main — regex matches are identical across both builds.", ""]
        rows.sort(key=lambda r: r["detector"])

    parts += [
        "| Detector | total main | total PR | unique main | unique PR | NEW | REMOVED |",
        "|---|---:|---:|---:|---:|---:|---:|",
    ]
    for r in rows:
        parts.append(
            f"| {r['detector']} | {r['total_main']} | {r['total_pr']} | "
            f"{r['unique_main']} | {r['unique_pr']} | {r['new']} | {r['removed']} |"
        )
    parts.append("")
    return "\n".join(parts)


def main():
    if len(sys.argv) != 3:
        print("Usage: diff_corpora_results.py <main.jsonl> <pr.jsonl>", file=sys.stderr)
        sys.exit(2)
    main_findings = load_findings(sys.argv[1])
    pr_findings = load_findings(sys.argv[2])
    sys.stdout.write(render(main_findings, pr_findings))


if __name__ == "__main__":
    main()
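The set-semantics identity model described in the docstring can be demonstrated on a toy run (the records and key values below are invented; the field names match trufflehog's `--json` output):

```python
import json
from collections import defaultdict

def load_findings_from_lines(lines):
    # Same shape as load_findings in the script, but reading an in-memory list.
    by_detector = defaultdict(lambda: {"identities": set(), "total": 0})
    for line in lines:
        obj = json.loads(line)
        raw = obj.get("Raw") or obj.get("RawV2") or ""
        by_detector[obj["DetectorName"]]["identities"].add(raw)
        by_detector[obj["DetectorName"]]["total"] += 1
    return by_detector

main_run = load_findings_from_lines([
    '{"DetectorName": "Stripe", "Raw": "sk_live_aaa"}',
    '{"DetectorName": "Stripe", "Raw": "sk_live_aaa"}',  # duplicate: same identity
])
pr_run = load_findings_from_lines([
    '{"DetectorName": "Stripe", "Raw": "sk_live_aaa"}',
    '{"DetectorName": "Stripe", "Raw": "sk_live_bbb"}',  # extra match from a loosened regex
])

new = pr_run["Stripe"]["identities"] - main_run["Stripe"]["identities"]
removed = main_run["Stripe"]["identities"] - pr_run["Stripe"]["identities"]
# Duplicates collapse: main has total=2 but only one identity, so the
# report surfaces exactly one NEW identity and zero REMOVED.
print(sorted(new), sorted(removed))
```

Total counts and identity counts diverge here (2 vs 1 on main), which is why the report tracks both columns rather than relying on totals alone.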