-
Notifications
You must be signed in to change notification settings - Fork 2.4k
Automate corpora testing in CI #4927
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 3 commits
720b203
942b25f
27b8867
e360e6d
1aae080
0a78ecc
b9d8506
26c1c03
f46e86c
021e8c3
420ec56
b0c3d28
25f08fc
735522b
7b44c92
e0e33bc
a93890e
24fdf36
c20a5dd
80f6747
02ae97b
5186d12
bbaa4af
648ae6a
b56b46b
3284602
2421212
6ba2661
a00d129
8a20a97
30c98bb
0d852e6
624cfbe
f222e57
d0d94a2
dacf850
1cf22a8
5792a09
032b2a8
c53fbcb
88c63e6
f7f08e6
5a6e1a2
5b722a2
6c3bbae
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,100 @@ | ||
| name: Corpora Test | ||
|
|
||
| on: | ||
| workflow_dispatch: | ||
| pull_request: | ||
| # types: [opened, reopened] TODO: commented out only to see results in the PR (uncomment this before merging) | ||
| paths: | ||
| - 'pkg/detectors/**' | ||
| - '.github/workflows/detector-corpora-test.yml' | ||
| - 'scripts/detector_corpora_test.sh' | ||
|
|
||
| env: | ||
| DATASETS: | | ||
| s3://trufflehog-corpora-datasets/contents.2025-11-04.jsonl.zstd | ||
|
|
||
| jobs: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Note: some of the actions used here are old versions. Also, you might consider pinning the action versions used here to reduce the risk of possible supply-chain attacks. zizmor is helpful: https://docs.zizmor.sh/
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for this. Really helpful! I'll do the needful |
||
| corpora-test: | ||
| if: ${{ github.repository == 'trufflesecurity/trufflehog' && !github.event.pull_request.head.repo.fork }} | ||
| runs-on: ubuntu-latest | ||
| permissions: | ||
| contents: read | ||
| pull-requests: write | ||
| steps: | ||
| - name: Checkout code | ||
| uses: actions/checkout@v4 | ||
|
|
||
| - name: Install Go | ||
| uses: actions/setup-go@v5 | ||
| with: | ||
| go-version: "1.25" | ||
|
|
||
| - name: Install dependencies | ||
| run: sudo apt-get install -y zstd jq | ||
|
|
||
| - name: Install DuckDB | ||
| run: | | ||
| wget -q https://github.com/duckdb/duckdb/releases/latest/download/duckdb_cli-linux-amd64.zip | ||
| unzip duckdb_cli-linux-amd64.zip | ||
| sudo mv duckdb /usr/local/bin/ | ||
|
|
||
| - name: Configure AWS credentials | ||
| uses: aws-actions/configure-aws-credentials@v4 | ||
| with: | ||
| aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY }} | ||
| aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} | ||
| aws-region: us-east-1 | ||
|
|
||
| - name: Run corpora test | ||
| run: | | ||
| # With no explicit `shell:`, this step runs under `bash -e` without pipefail, so the | ||
| # pipeline below would report tee's exit status and hide a failing test script. | ||
| set -o pipefail | ||
| # DATASETS is a newline-separated list; collect the non-empty lines as arguments. | ||
| files=() | ||
| while IFS= read -r dataset; do | ||
| [[ -z "$dataset" ]] && continue | ||
| files+=("$dataset") | ||
| done <<< "$DATASETS" | ||
| # Show the summary in the job log and keep a copy for the step that posts it to the PR. | ||
| ./scripts/detector_corpora_test.sh "${files[@]}" | tee /tmp/corpora-results.txt | ||
|
cursor[bot] marked this conversation as resolved.
Outdated
|
||
|
|
||
| - name: Post results to PR | ||
| uses: actions/github-script@v7 | ||
| with: | ||
| script: | | ||
| const fs = require('fs'); | ||
| const results = fs.readFileSync('/tmp/corpora-results.txt', 'utf8'); | ||
| const body = [ | ||
| `## Corpora Test Results`, | ||
| ``, | ||
| `This test scans a real-world dataset of public content to measure how often this detector fires. A high number of unverified or unknown results may indicate the detector is too noisy and could impact signal quality in production.`, | ||
| ``, | ||
| `| Column | Meaning |`, | ||
| `|--------|---------|`, | ||
| `| \`total\` | All findings for this detector |`, | ||
| `| \`verified\` | Confirmed valid credentials |`, | ||
| `| \`unverified\` | Matched pattern but could not verify (credential may be invalid or service unreachable) |`, | ||
| `| \`unknown\` | Verification attempted but errored |`, | ||
| ``, | ||
| `\`\`\``, | ||
| results, | ||
| `\`\`\``, | ||
| ].join('\n'); | ||
| let issue_number; | ||
| if (context.eventName === 'workflow_dispatch') { | ||
| const pulls = await github.rest.pulls.list({ | ||
| owner: context.repo.owner, | ||
| repo: context.repo.repo, | ||
| head: `${context.repo.owner}:${context.ref.replace('refs/heads/', '')}`, | ||
| state: 'open', | ||
| }); | ||
| if (pulls.data.length === 0) { | ||
| core.setFailed(`No open PR found for branch ${context.ref}`); | ||
| return; | ||
| } | ||
| issue_number = pulls.data[0].number; | ||
| } else { | ||
| issue_number = context.issue.number; | ||
| } | ||
| await github.rest.issues.createComment({ | ||
| owner: context.repo.owner, | ||
| repo: context.repo.repo, | ||
| issue_number, | ||
| body, | ||
| }); | ||
|
cursor[bot] marked this conversation as resolved.
Outdated
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,52 @@ | ||
| #!/bin/bash | ||
| # Builds trufflehog from the repository root, scans one or more zstd-compressed JSONL | ||
| # corpora files (local paths or s3:// URLs), and prints a per-detector summary of the | ||
| # findings (total / verified / unverified / unknown) for the 50 detectors with most hits. | ||
| # | ||
| # Usage: detector_corpora_test.sh <corpora_file.jsonl.zstd> [<corpora_file2.jsonl.zstd> ...] | ||
| set -euo pipefail | ||
|
|
||
| if [[ $# -lt 1 ]]; then | ||
| # Usage goes to stderr so it never ends up in captured results. | ||
| echo "Usage: $0 <corpora_file.jsonl.zstd> [<corpora_file2.jsonl.zstd> ...]" >&2 | ||
| exit 1 | ||
| fi | ||
|
|
||
| # Findings from every input are appended to this file; truncate it first. | ||
| OUTPUT_JSONL="/tmp/corpora_results.jsonl" | ||
| > "$OUTPUT_JSONL" | ||
|
|
||
| SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" | ||
| REPO_ROOT="$(dirname "$SCRIPT_DIR")" | ||
| TRUFFLEHOG_BIN="${REPO_ROOT}/trufflehog" | ||
|
|
||
| CGO_ENABLED=0 go build -o "$TRUFFLEHOG_BIN" "$REPO_ROOT" | ||
|
|
||
| # scan <input>: decompress <input>, extract each record's .content field and feed it to | ||
| # trufflehog on stdin, appending the JSON findings to $OUTPUT_JSONL. | ||
| # Best-effort: a failing pipeline stage does not abort the script (errexit is suspended), | ||
| # but it is reported on stderr instead of being dropped silently. Always returns 0. | ||
| scan() { | ||
| local input="$1" | ||
| local -a statuses | ||
| set +e | ||
| unzstd -c "$input" | jq -r .content | "$TRUFFLEHOG_BIN" \ | ||
| --no-update \ | ||
| --log-level=3 \ | ||
| --concurrency=6 \ | ||
| --json \ | ||
| --print-avg-detector-time \ | ||
| stdin >> "$OUTPUT_JSONL" 2>/dev/null | ||
| # PIPESTATUS must be read immediately, before any other command overwrites it. | ||
| statuses=("${PIPESTATUS[@]}") | ||
| set -e | ||
| if [[ "${statuses[*]}" != "0 0 0" ]]; then | ||
| echo "warning: scan of $input had failing stages (unzstd jq trufflehog exit codes: ${statuses[*]})" >&2 | ||
| fi | ||
| } | ||
|
|
||
| for CORPORA_FILE in "$@"; do | ||
| if [[ "$CORPORA_FILE" == s3://* ]]; then | ||
| # Stream the S3 object straight into scan instead of downloading it to disk. | ||
| aws s3 cp "$CORPORA_FILE" - | scan /dev/stdin | ||
| else | ||
| scan "$CORPORA_FILE" | ||
| fi | ||
| done | ||
|
|
||
| # Summarize per detector; a finding counts as "unknown" whenever VerificationError is set. | ||
| duckdb -c " | ||
| CREATE TABLE t AS FROM read_json_auto('$OUTPUT_JSONL', ignore_errors=true); | ||
|
|
||
| SELECT | ||
| t.DetectorName detector, | ||
| COUNT(*) total, | ||
| SUM(CASE WHEN Verified AND VerificationError IS NULL THEN 1 ELSE 0 END) verified, | ||
| SUM(CASE WHEN NOT Verified AND VerificationError IS NULL THEN 1 ELSE 0 END) unverified, | ||
| SUM(CASE WHEN VerificationError IS NOT NULL THEN 1 ELSE 0 END) \"unknown\" | ||
| FROM t | ||
| GROUP BY all | ||
| ORDER BY total DESC, detector | ||
| LIMIT 50; | ||
| " |
Uh oh!
There was an error while loading. Please reload this page.