Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 14 additions & 38 deletions detectzoo/__init__.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,25 @@
"""DetectZoo: A unified toolkit for detecting AI-generated content."""

import importlib
import warnings

from detectzoo.utils.hf_quiet import configure_hf_quiet

configure_hf_quiet()


def _try_import(module: str) -> None:
"""Import a modality subpackage, warning (not failing) if optional deps are missing.

This keeps pure-audio or pure-text workflows usable even when optional
image/text/audio extras are not installed.
"""
try:
importlib.import_module(module)
except ImportError as exc:
warnings.warn(
f"detectzoo: skipped loading '{module}' ({exc}). "
"Install the corresponding optional extra to enable it "
"(e.g. `pip install detectzoo[audio]`).",
stacklevel=2,
)


for _mod in (
"detectzoo.datasets.audio",
"detectzoo.datasets.image",
"detectzoo.datasets.text",
"detectzoo.detectors.audio",
"detectzoo.detectors.image",
"detectzoo.detectors.text",
):
_try_import(_mod)

from detectzoo.core.base import BaseDetector, DetectionResult # noqa: E402
from detectzoo.core.registry import ( # noqa: E402
# isort: off
import detectzoo.utils.hf_quiet # noqa: F401
# isort: on

# Eager-load modality subpackages so @register_detector / @register_dataset run.
import detectzoo.datasets.audio # noqa: F401
import detectzoo.datasets.image # noqa: F401
import detectzoo.datasets.text # noqa: F401
import detectzoo.detectors.audio # noqa: F401
import detectzoo.detectors.image # noqa: F401
import detectzoo.detectors.text # noqa: F401
Comment on lines +3 to +13
from detectzoo.core.base import BaseDetector, DetectionResult
from detectzoo.core.registry import (
list_datasets,
list_detectors,
load_dataset,
load_detector,
)

__version__ = "0.1.0"
__version__ = "0.1.4"

__all__ = [
"BaseDetector",
Expand Down
24 changes: 16 additions & 8 deletions detectzoo/benchmarks/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,15 @@
# anti-spoofing trio (EER / AUC / F1); image and text keep DetectZoo's
# original column set unchanged.
_DEFAULT_PRINT_COLUMNS = [
"detector", "accuracy", "precision", "recall", "f1",
"tpr", "fpr", "roc_auc", "pr_auc",
"detector",
"accuracy",
"precision",
"recall",
"f1",
"tpr",
"fpr",
"roc_auc",
"pr_auc",
]
_PRINT_VIEWS = {
"audio": ["detector", "eer", "roc_auc", "f1"],
Expand Down Expand Up @@ -98,10 +105,7 @@ def evaluate_single(
metrics["n_samples"] = len(labels)

if save_scores:
metrics["samples"] = [
{"label": lbl, "score": scr}
for lbl, scr in zip(labels, scores)
]
metrics["samples"] = [{"label": lbl, "score": scr} for lbl, scr in zip(labels, scores)]

return metrics

Expand Down Expand Up @@ -159,8 +163,12 @@ def run_and_print(self, detectors: Sequence[BaseDetector]) -> None:
print(header)
print("-" * len(header))
for metrics in all_results.values():
row = " | ".join(f"{metrics.get(k, ''):>18}" if isinstance(metrics.get(k), str)
else f"{metrics.get(k, 0):>18.4f}" for k in header_keys)
row = " | ".join(
f"{metrics.get(k, ''):>18}"
if isinstance(metrics.get(k), str)
else f"{metrics.get(k, 0):>18.4f}"
for k in header_keys
)
print(row)

def _save_payload(
Expand Down
16 changes: 4 additions & 12 deletions detectzoo/core/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,19 +56,15 @@ def load_detector(name: str, **kwargs: Any) -> BaseDetector:
resolved = _ALIASES.get(name, name)
if resolved not in _REGISTRY:
available = ", ".join(sorted(set(_REGISTRY) | set(_ALIASES))) or "(none)"
raise ValueError(
f"Unknown detector '{name}'. Available detectors: {available}"
)
raise ValueError(f"Unknown detector '{name}'. Available detectors: {available}")
return _REGISTRY[resolved](**kwargs)


def list_detectors(modality: str | None = None) -> list[str]:
"""Return names of all registered detectors, optionally filtered by modality."""
if modality is None:
return sorted(_REGISTRY)
return sorted(
name for name, cls in _REGISTRY.items() if cls.modality == modality
)
return sorted(name for name, cls in _REGISTRY.items() if cls.modality == modality)


# ======================================================================
Expand Down Expand Up @@ -127,16 +123,12 @@ def load_dataset(name: str, **kwargs: Any) -> BaseDataset:
resolved = _DATASET_ALIASES.get(name, name)
if resolved not in _DATASET_REGISTRY:
available = ", ".join(sorted(set(_DATASET_REGISTRY) | set(_DATASET_ALIASES))) or "(none)"
raise ValueError(
f"Unknown dataset '{name}'. Available datasets: {available}"
)
raise ValueError(f"Unknown dataset '{name}'. Available datasets: {available}")
return _DATASET_REGISTRY[resolved](**kwargs)


def list_datasets(modality: str | None = None) -> list[str]:
"""Return names of all registered datasets, optionally filtered by modality."""
if modality is None:
return sorted(_DATASET_REGISTRY)
return sorted(
name for name, cls in _DATASET_REGISTRY.items() if cls.modality == modality
)
return sorted(name for name, cls in _DATASET_REGISTRY.items() if cls.modality == modality)
1 change: 0 additions & 1 deletion detectzoo/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,3 @@
"WritingPromptsDataset",
"XSumDataset",
]

4 changes: 2 additions & 2 deletions detectzoo/datasets/_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,9 @@ def download_and_extract_tar(
*,
force: bool = False,
) -> Path:
"""Download a tar archive (optionally gzip/bzip2/xz-compressed), extract it, and cache the result.
"""Download a tar archive (gzip/bzip2/xz optional), extract it, and cache the result.

A ``.download_complete`` marker file prevents re-downloading on
A ``.download_complete`` marker file prevents re-downloading on
subsequent calls.
"""
marker = dest_dir / ".download_complete"
Expand Down
13 changes: 3 additions & 10 deletions detectzoo/datasets/audio/asvspoof2019.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,7 @@ def _looks_like_track_root(d: Path) -> bool:
if (d / proto_leaf).is_dir():
return True
# Tolerate extracts that contain only flac dirs (no protocols).
return any(
(d / leaf / "flac").is_dir()
for leaf in (train_leaf, dev_leaf, eval_leaf)
)
return any((d / leaf / "flac").is_dir() for leaf in (train_leaf, dev_leaf, eval_leaf))

# User pointed straight at one of the partition leaves.
if u.name in {train_leaf, dev_leaf, eval_leaf}:
Expand All @@ -119,9 +116,7 @@ def _looks_like_track_root(d: Path) -> bool:
def _protocol_file(track_root: Path, track: str, partition: str) -> Path:
proto_dir = track_root / f"ASVspoof2019_{track}_cm_protocols"
if not proto_dir.is_dir():
raise FileNotFoundError(
f"ASVspoof 2019: missing protocols directory {proto_dir}"
)
raise FileNotFoundError(f"ASVspoof 2019: missing protocols directory {proto_dir}")
if partition == "train":
names = (f"ASVspoof2019.{track}.cm.train.trn.txt",)
elif partition == "dev":
Expand All @@ -143,9 +138,7 @@ def _flac_dir(track_root: Path, track: str, partition: str) -> Path:
sub = "train" if partition == "train" else "dev" if partition == "dev" else "eval"
d = track_root / f"ASVspoof2019_{track}_{sub}" / "flac"
if not d.is_dir():
raise FileNotFoundError(
f"ASVspoof 2019: expected FLAC directory {d}"
)
raise FileNotFoundError(f"ASVspoof 2019: expected FLAC directory {d}")
return d


Expand Down
10 changes: 3 additions & 7 deletions detectzoo/datasets/audio/deepfake_eval_2024.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,7 @@ def _load_from_metadata(
meta_path = _find_metadata_csv(root)
audio_dir = root / _AUDIO_SUBDIR
if not audio_dir.is_dir():
raise FileNotFoundError(
f"Deepfake-Eval-2024: expected audio directory {audio_dir}"
)
raise FileNotFoundError(f"Deepfake-Eval-2024: expected audio directory {audio_dir}")

items: List[DatasetItem] = []
missing: List[str] = []
Expand Down Expand Up @@ -161,8 +159,7 @@ def _load_from_metadata(
if skip_missing:
continue
raise FileNotFoundError(
f"Deepfake-Eval-2024: audio missing for {filename!r} "
f"(looked under {audio_dir})"
f"Deepfake-Eval-2024: audio missing for {filename!r} (looked under {audio_dir})"
)

meta: dict[str, Any] = {
Expand All @@ -182,8 +179,7 @@ def _load_from_metadata(

if not items:
raise RuntimeError(
f"Deepfake-Eval-2024: no labelled audio loaded from {meta_path} "
f"(split={split!r})."
f"Deepfake-Eval-2024: no labelled audio loaded from {meta_path} (split={split!r})."
)
if missing and skip_missing:
from detectzoo.utils.logger import get_logger
Expand Down
6 changes: 2 additions & 4 deletions detectzoo/datasets/audio/for_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,15 +278,13 @@ def _load_preprocessed(
empty = [role for role, n in c.items() if n == 0]
if empty:
avail = {
sp: cs for sp, cs in per_split_counts.items()
if all(v > 0 for v in cs.values())
sp: cs for sp, cs in per_split_counts.items() if all(v > 0 for v in cs.values())
}
hint = (
f" Try `split={next(iter(avail))!r}` instead — that split "
f"has both classes ({avail[next(iter(avail))]})."
if avail
else " No other split has both classes either; the local "
"extraction is incomplete."
else " No other split has both classes either; the local extraction is incomplete."
)
raise RuntimeError(
f"FoR ({variant_key}, split={split!r}): the {empty!r} class "
Expand Down
8 changes: 4 additions & 4 deletions detectzoo/datasets/audio/in_the_wild.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,9 @@ def _load_from_metadata_csv(root: Path, meta_path: Path) -> List[DatasetItem]:
raise ValueError(f"In-The-Wild: empty metadata file {meta_path}")
field_map = {f.strip().lower(): f for f in reader.fieldnames}
file_col = field_map.get("file") or field_map.get("filename") or field_map.get("path")
label_col = field_map.get("label") or field_map.get("class") or field_map.get("ground_truth")
label_col = (
field_map.get("label") or field_map.get("class") or field_map.get("ground_truth")
)
if not file_col or not label_col:
raise ValueError(
f"In-The-Wild: {meta_path} must contain file and label columns; "
Expand Down Expand Up @@ -183,9 +185,7 @@ def _load_from_metadata_csv(root: Path, meta_path: Path) -> List[DatasetItem]:
def _load_from_class_dirs(root: Path) -> List[DatasetItem]:
pairs = _class_dirs(root)
if not pairs:
raise FileNotFoundError(
f"In-The-Wild: no real/fake class folders under {root}"
)
raise FileNotFoundError(f"In-The-Wild: no real/fake class folders under {root}")
items: List[DatasetItem] = []
for dir_path, label, role in pairs:
meta = {"modality": "audio", "class": role}
Expand Down
4 changes: 1 addition & 3 deletions detectzoo/datasets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,7 @@ def load(self) -> List[DatasetItem]:
return self._items

@staticmethod
def _balance_and_truncate(
items: List[DatasetItem], max_samples: int
) -> List[DatasetItem]:
def _balance_and_truncate(items: List[DatasetItem], max_samples: int) -> List[DatasetItem]:
"""Pick ``max_samples`` items balanced across labels 0 and 1.

Takes ``max_samples // 2`` from each class. If one class is short,
Expand Down
4 changes: 2 additions & 2 deletions detectzoo/datasets/image/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
"""Image-modality datasets for AI-generated image detection."""

from detectzoo.datasets.image.aigcdetect import AIGCDetectDataset
from detectzoo.datasets.image.chameleon import ChameleonDataset
from detectzoo.datasets.image.cnn_detection import CNNDetectionDataset
from detectzoo.datasets.image.drct2m import DRCT2MDataset
from detectzoo.datasets.image.genimage import GenImageDataset
from detectzoo.datasets.image.self_synthesis import SelfSynthesisDataset
from detectzoo.datasets.image.univfd import UnivFDDataset
from detectzoo.datasets.image.genimage import GenImageDataset
from detectzoo.datasets.image.chameleon import ChameleonDataset

__all__ = [
"AIGCDetectDataset",
Expand Down
26 changes: 13 additions & 13 deletions detectzoo/datasets/image/aigcdetect.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
Note:
In the original PatchCraft / AIGCDetectBenchmark setup, the **training split** is based on
the CNNSpot/CNNDetection training data (i.e., the ForenSynths-style ProGAN-based training
set), while AIGCDetect is primarily used as a large, unified test benchmark across many generators.
set), while AIGCDetect is primarily used as a large unified test benchmark across
many generators.

GitHub: https://github.com/Ekko-zn/AIGCDetectBenchmark
ModelScope: ``aemilia/AIGCDetectionBenchmark``
Expand All @@ -21,8 +22,8 @@
from pathlib import Path
from typing import Any, List, Optional, Sequence, Tuple

from detectzoo.datasets.base import BaseDataset, DatasetItem
from detectzoo.core.registry import register_dataset
from detectzoo.datasets.base import BaseDataset, DatasetItem

_MODELSCOPE_AIGCDETECT_DATASET: str = "aemilia/AIGCDetectionBenchmark"

Expand Down Expand Up @@ -67,11 +68,7 @@ def _partition_layout_ok(parent: Path, folder_name: str) -> bool:
return False
try:
for sub in base.iterdir():
if (
sub.is_dir()
and (sub / "0_real").is_dir()
and (sub / "1_fake").is_dir()
):
if sub.is_dir() and (sub / "0_real").is_dir() and (sub / "1_fake").is_dir():
return True
except OSError:
return False
Expand Down Expand Up @@ -198,9 +195,7 @@ def ensure_aigcdetect_downloaded(
if found is not None:
return found

raise RuntimeError(
"Could not locate AIGCDetectBenchmark after ModelScope download."
)
raise RuntimeError("Could not locate AIGCDetectBenchmark after ModelScope download.")


def resolve_aigcdetect_partition(partition: str) -> Tuple[str, str]:
Expand All @@ -219,7 +214,7 @@ class AIGCDetectDataset(BaseDataset):
Parameters
----------
root : str or Path, optional
Directory intended to contain partition folders, or a parent to search.
Directory intended to contain partition folders, or a parent to search.
When omitted, the default cache directory ``.detectzoo_data/aigcdetect/`` is used.
partitions : sequence of str, optional
Partition(s) to load. Each entry may be either a **column** name
Expand Down Expand Up @@ -248,7 +243,6 @@ def __init__(
self._resolved_root: Optional[Path] = None

def _data_root(self) -> Path:
from detectzoo.datasets._download import get_cache_dir

if self._resolved_root is not None:
return self._resolved_root
Expand Down Expand Up @@ -290,6 +284,12 @@ def _load_all(self) -> List[DatasetItem]:
):
for path in sorted(d.rglob("*")):
if path.is_file() and path.suffix.lower() in _IMAGE_EXTS:
items.append(DatasetItem(data=str(path), label=label, metadata={**base_meta, "source": source}))
items.append(
DatasetItem(
data=str(path),
label=label,
metadata={**base_meta, "source": source},
)
)

return items
Loading