-
Notifications
You must be signed in to change notification settings - Fork 2.4k
Expand file tree
/
Copy pathdetector_corpora_test.sh
More file actions
executable file
·56 lines (47 loc) · 1.46 KB
/
detector_corpora_test.sh
File metadata and controls
executable file
·56 lines (47 loc) · 1.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/bin/bash
#
# Scans one or more zstd-compressed JSONL corpora with a freshly built
# trufflehog binary and prints a per-detector summary of the findings.
#
# Arguments: one or more corpus files (local paths or s3:// URIs).
# Exits with status 1 when no corpus is given.
set -euo pipefail

# At least one corpus is required. The usage text is a diagnostic, so it is
# written to stderr rather than mixed into the script's regular output.
if [[ $# -lt 1 ]]; then
  echo "Usage: $0 <corpora_file.jsonl.zstd> [<corpora_file2.jsonl.zstd> ...]" >&2
  exit 1
fi
# Fixed locations for this run's artifacts:
#   OUTPUT_JSONL - trufflehog's JSON findings, later summarized with duckdb.
#   STDERR_FILE  - trufflehog's stderr (incl. --print-avg-detector-time output),
#                  kept for downstream phases.
OUTPUT_JSONL="/tmp/corpora_results.jsonl"
STDERR_FILE="/tmp/corpora-stderr.txt"

# Start every run from empty files; scans only ever append to them.
: > "$OUTPUT_JSONL"
: > "$STDERR_FILE"
# Resolve the repository root as the parent of this script's directory, then
# build the trufflehog binary into that root with cgo disabled.
script_path="${BASH_SOURCE[0]}"
SCRIPT_DIR="$(cd "$(dirname "$script_path")" && pwd)"
REPO_ROOT="$(dirname "$SCRIPT_DIR")"
TRUFFLEHOG_BIN="${REPO_ROOT}/trufflehog"

build_cmd=(go build -o "$TRUFFLEHOG_BIN" "$REPO_ROOT")
CGO_ENABLED=0 "${build_cmd[@]}"
#######################################
# Scan one zstd-compressed JSONL corpus with trufflehog.
#
# Decompresses the input, extracts each record's `.content` field with jq and
# pipes the text into trufflehog's stdin source.
#
# Globals:   TRUFFLEHOG_BIN (read), OUTPUT_JSONL (appended to),
#            STDERR_FILE (appended to)
# Arguments: $1 - path of the corpus to read (may be /dev/stdin)
# Outputs:   trufflehog stdout -> OUTPUT_JSONL, trufflehog stderr -> STDERR_FILE;
#            a one-line warning on stderr when the pipeline fails
# Returns:   always 0; scanning is best-effort so one bad corpus does not
#            abort the remaining ones
#######################################
scan() {
  local input="$1"
  local rc=0

  # `|| rc=$?` keeps a failing pipeline from tripping the caller's errexit
  # without toggling shell options, so the caller's `set` state is untouched.
  unzstd -c "$input" | jq -r .content | "$TRUFFLEHOG_BIN" \
    --no-update \
    --log-level=3 \
    --concurrency=6 \
    --json \
    --print-avg-detector-time \
    stdin >> "$OUTPUT_JSONL" 2>> "$STDERR_FILE" || rc=$?

  # Report the failure instead of swallowing it silently. The warning goes to
  # the script's own stderr, not STDERR_FILE, which holds trufflehog output only.
  if [[ "$rc" -ne 0 ]]; then
    printf 'warning: scan of %s failed with status %d; continuing\n' "$input" "$rc" >&2
  fi
  return 0
}
# Run every corpus named on the command line through scan. s3:// URIs are
# streamed with `aws s3 cp <uri> -` and scanned from /dev/stdin; any other
# argument is handed to scan as a local path.
for corpus in "$@"; do
  case "$corpus" in
    s3://*)
      aws s3 cp "$corpus" - | scan /dev/stdin
      ;;
    *)
      scan "$corpus"
      ;;
  esac
done
# Summarize the findings with DuckDB. The results JSONL is loaded into table t
# via read_json_auto (with ignore_errors=true), then one row per detector is
# printed with:
#   total      - number of findings
#   verified   - Verified is true and VerificationError is NULL
#   unverified - Verified is false and VerificationError is NULL
#   unknown    - VerificationError is not NULL
# Rows are ordered by total (descending), then detector, and capped at 50.
duckdb -c "
CREATE TABLE t AS FROM read_json_auto('$OUTPUT_JSONL', ignore_errors=true);
SELECT
t.DetectorName detector,
COUNT(*) total,
SUM(CASE WHEN Verified AND VerificationError IS NULL THEN 1 ELSE 0 END) verified,
SUM(CASE WHEN NOT Verified AND VerificationError IS NULL THEN 1 ELSE 0 END) unverified,
SUM(CASE WHEN VerificationError IS NOT NULL THEN 1 ELSE 0 END) \"unknown\"
FROM t
GROUP BY all
ORDER BY total DESC, detector
LIMIT 50;
"