Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

41 changes: 41 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""Shared pytest fixtures and helpers for the DetectZoo test-suite."""

from __future__ import annotations

import importlib

import pytest

import detectzoo # noqa: F401 (ensures registries are populated)
from detectzoo.core.base import BaseDetector, DetectionResult


def require_modality(modality: str) -> None:
    """Skip the running test when ``detectzoo.detectors.<modality>`` cannot be imported.

    According to the project's own notes, DetectZoo loads its modality
    subpackages on a best-effort basis (see ``detectzoo/__init__.py``): a
    missing optional heavy dependency such as ``diffusers`` or ``timm``
    causes the subpackage to be skipped with a warning instead of failing
    the top-level import. Tests that assert on one modality's detectors
    call this helper first so they are reported as skipped, not failed, on
    partial installs.

    Args:
        modality: Name of the detector subpackage to probe, e.g. ``"audio"``.
    """
    package = f"detectzoo.detectors.{modality}"
    try:
        importlib.import_module(package)
    except ImportError as exc:  # pragma: no cover - depends on environment
        # Carry the original import error text into the skip reason.
        reason = f"{modality} detectors unavailable ({exc})"
        pytest.skip(reason)


class DummyDetector(BaseDetector):
    """Model-free text detector whose score grows with the input's length."""

    name = "dummy"
    modality = "text"

    def predict(self, input_data) -> DetectionResult:
        """Score ``input_data`` as ``len(str(input_data)) / 100``, capped at 1.0."""
        length = len(str(input_data))
        score = min(length / 100.0, 1.0)
        return self._make_result(score)


@pytest.fixture
def dummy_detector() -> DummyDetector:
    """Provide a ``DummyDetector`` constructed with ``threshold=0.5``."""
    detector = DummyDetector(threshold=0.5)
    return detector
55 changes: 55 additions & 0 deletions tests/test_audio_detectors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""Tests for audio-modality detectors.

Audio detectors load pretrained checkpoints at construction time, so an
actual prediction requires a network download and is marked
``@pytest.mark.slow``. The non-slow tests verify registration and
interface invariants only, skipping when the audio subpackage cannot be
imported.
"""

from __future__ import annotations

import numpy as np
import pytest

from detectzoo.core.base import BaseDetector, DetectionResult
from detectzoo.core.registry import _REGISTRY, list_detectors, load_detector

from .conftest import require_modality


class TestAudioRegistry:
    """Registry-level checks for audio detectors; no detector is constructed."""

    def test_audio_detectors_registered(self):
        """The audio listing is non-empty and contains the expected names."""
        require_modality("audio")
        names = set(list_detectors("audio"))
        assert names, "No audio detectors registered"
        missing = {"aasist", "rawnet2", "res_tssdnet", "samo"} - names
        assert not missing, f"Missing expected audio detectors: {missing}"

    def test_audio_detector_invariants(self):
        """Every listed audio detector is a ``BaseDetector`` tagged ``"audio"``."""
        require_modality("audio")
        for detector_name in list_detectors("audio"):
            detector_cls = _REGISTRY[detector_name]
            assert issubclass(detector_cls, BaseDetector)
            assert detector_cls.modality == "audio"

    def test_rawnet2_alias(self):
        """The ``rawnet2_audio`` alias maps to ``rawnet2``."""
        require_modality("audio")
        # Imported here, after the modality check, as in the rest of this class.
        from detectzoo.core.registry import _ALIASES

        resolved = _ALIASES.get("rawnet2_audio")
        assert resolved == "rawnet2"


@pytest.mark.slow
class TestAASISTDetector:
    """End-to-end prediction check for the ``aasist`` detector (marked slow)."""

    def test_predict_with_synthetic_audio(self):
        """Predicting on seeded random noise yields a well-formed result."""
        require_modality("audio")

        detector = load_detector("aasist", device="cpu")
        # Seeded generator keeps the synthetic input reproducible.
        # NOTE(review): 16000 samples is presumably one second at 16 kHz - confirm.
        noise = np.random.default_rng(0).standard_normal(16000)
        prediction = detector.predict(noise.astype(np.float32))

        assert isinstance(prediction, DetectionResult)
        assert 0.0 <= prediction.score <= 1.0
        assert "score_spoof" in prediction.metadata
96 changes: 96 additions & 0 deletions tests/test_benchmarks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
"""Tests for the BenchmarkEvaluator (no model downloads).

A trivial in-memory dataset and a length-based dummy detector are used so
the evaluator's orchestration, metric aggregation, and persistence can be
exercised without any heavy dependencies.
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import List

import pytest

from detectzoo.benchmarks.evaluator import BenchmarkEvaluator
from detectzoo.core.base import BaseDetector, DetectionResult
from detectzoo.datasets.base import BaseDataset, DatasetItem


class _MemoryDataset(BaseDataset):
    """Text dataset that serves a list of items supplied at construction."""

    name = "memory"
    modality = "text"

    def __init__(self, items: List[DatasetItem], **kwargs):
        """Forward ``kwargs`` to ``BaseDataset`` and remember ``items``."""
        super().__init__(**kwargs)
        self._mem = items

    def _load_all(self) -> List[DatasetItem]:
        """Return the stored items unchanged."""
        return self._mem


class _KeywordDetector(BaseDetector):
    """Keyword-based detector: 0.9 when the text contains 'ai', otherwise 0.1."""

    name = "keyword"
    modality = "text"

    def predict(self, input_data) -> DetectionResult:
        """Return a result scored by a case-insensitive search for ``"ai"``."""
        lowered = str(input_data).lower()
        if "ai" in lowered:
            return self._make_result(0.9)
        return self._make_result(0.1)


@pytest.fixture
def dataset() -> _MemoryDataset:
    """Build a four-item dataset: two samples labelled 0 and two labelled 1."""
    labelled_texts = [
        ("a human wrote this", 0),
        ("another genuine note", 0),
        ("this is ai generated", 1),
        ("ai produced output", 1),
    ]
    items = [DatasetItem(data=text, label=label) for text, label in labelled_texts]
    return _MemoryDataset(items)


class TestBenchmarkEvaluator:
    """Exercise ``BenchmarkEvaluator`` with the in-memory dataset fixture."""

    def test_evaluate_single(self, dataset):
        """A single evaluation reports detector name, sample count and metrics."""
        evaluator = BenchmarkEvaluator(dataset)
        report = evaluator.evaluate_single(_KeywordDetector())
        assert report["detector"] == "keyword"
        assert report["n_samples"] == 4
        assert report["accuracy"] == 1.0
        assert report["roc_auc"] == 1.0

    def test_save_scores(self, dataset):
        """``save_scores=True`` adds one per-sample record per dataset item."""
        evaluator = BenchmarkEvaluator(dataset)
        report = evaluator.evaluate_single(_KeywordDetector(), save_scores=True)
        assert "samples" in report
        samples = report["samples"]
        assert len(samples) == 4
        assert {"label", "score"} <= set(samples[0])

    def test_run_multiple(self, dataset):
        """``run`` keys its results by detector name."""
        evaluator = BenchmarkEvaluator(dataset)
        by_detector = evaluator.run([_KeywordDetector()])
        assert "keyword" in by_detector
        assert by_detector["keyword"]["accuracy"] == 1.0

    def test_run_and_save(self, dataset, tmp_path: Path):
        """Results are written as JSON, even when the parent directory is new."""
        evaluator = BenchmarkEvaluator(dataset)
        # "nested" does not exist yet under tmp_path.
        target = tmp_path / "nested" / "results.json"
        evaluator.run_and_save([_KeywordDetector()], target)
        assert target.is_file()
        saved = json.loads(target.read_text())
        assert saved["keyword"]["n_samples"] == 4

    def test_run_and_save_with_meta(self, dataset, tmp_path: Path):
        """With ``meta`` given, the file holds ``meta`` and ``results`` keys."""
        evaluator = BenchmarkEvaluator(dataset)
        target = tmp_path / "results.json"
        evaluator.run_and_save([_KeywordDetector()], target, meta={"run": "test"})
        saved = json.loads(target.read_text())
        assert saved["meta"] == {"run": "test"}
        assert "keyword" in saved["results"]

    def test_modality_inferred_from_dataset(self, dataset):
        """Without an override the evaluator reports the dataset's modality."""
        evaluator = BenchmarkEvaluator(dataset)
        assert evaluator.modality == "text"

    def test_modality_override(self, dataset):
        """An explicit ``modality`` argument wins over the dataset's."""
        evaluator = BenchmarkEvaluator(dataset, modality="audio")
        assert evaluator.modality == "audio"
168 changes: 168 additions & 0 deletions tests/test_core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
"""Tests for the core infrastructure (registry, base classes, results)."""

from __future__ import annotations

import pytest
import torch

from detectzoo.core.base import BaseDetector, DetectionResult
from detectzoo.core.registry import (
_ALIASES,
_REGISTRY,
list_detectors,
load_detector,
)

VALID_MODALITIES = {"text", "image", "audio"}


class TestDetectionResult:
    """Field, default and ``repr`` checks for ``DetectionResult``."""

    def test_fields(self):
        """Constructor arguments are exposed as attributes."""
        result = DetectionResult(score=0.75, label="ai", confidence=0.6)
        assert result.score == 0.75
        assert result.label == "ai"
        assert result.confidence == 0.6
        assert result.metadata == {}

    def test_default_confidence_and_metadata(self):
        """Omitted ``confidence`` and ``metadata`` fall back to 0.0 and ``{}``."""
        result = DetectionResult(score=0.5, label="human")
        assert result.confidence == 0.0
        assert result.metadata == {}

    def test_metadata_is_independent_per_instance(self):
        """Mutating one result's metadata must not leak into another's."""
        first = DetectionResult(score=0.1, label="human")
        second = DetectionResult(score=0.9, label="ai")
        first.metadata["k"] = 1
        assert second.metadata == {}

    def test_repr(self):
        """``repr`` names the class and shows the score with four decimals."""
        result = DetectionResult(score=1.0, label="human", confidence=0.5)
        assert "DetectionResult" in repr(result)
        assert "1.0000" in repr(result)


class TestRegistry:
    """Checks on the detector registry, its aliases and ``list_detectors``."""

    def test_detectors_registered(self):
        """The unfiltered listing holds at least 24 detectors."""
        # NOTE(review): this floor presumably assumes every modality subpackage
        # imported; confirm it still holds on partial installs.
        names = list_detectors()
        assert len(names) >= 24, f"Expected >=24 detectors, got {len(names)}: {names}"

    def test_registry_invariants(self):
        """Every registered class must expose its registry name and a valid modality."""
        for name, cls in _REGISTRY.items():
            assert issubclass(cls, BaseDetector), f"{name} is not a BaseDetector"
            assert cls.name == name, f"{name}: cls.name={cls.name!r} mismatches key"
            assert cls.modality in VALID_MODALITIES, f"{name}: bad modality {cls.modality!r}"

    def test_text_detectors_present(self):
        """The text listing is large enough and includes a stable core subset."""
        # Text detectors have no heavy optional deps, so they always load.
        text = set(list_detectors("text"))
        assert len(text) >= 18, f"Expected >=18 text detectors, got {sorted(text)}"
        # A representative, stable subset that should always exist.
        expected = {
            "log_likelihood",
            "log_rank",
            "rank",
            "entropy",
            "detectgpt",
            "fast_detectgpt",
            "binoculars",
            "lrr",
            "npr",
            "dna_gpt",
            "revise_detect",
            "imbd",
            "lastde",
            "lastde_pp",
            "radar",
            "text_fluoroscopy",
            "coco",
            "roberta_base",
            "roberta_large",
        }
        missing = expected.difference(text)
        assert not missing, f"Missing expected text detectors: {missing}"

    def test_load_unknown_raises(self):
        """Loading an unregistered name raises ``ValueError``."""
        with pytest.raises(ValueError, match="Unknown detector"):
            load_detector("nonexistent_detector_xyz")

    def test_alias_resolution(self):
        """Known aliases map to their targets and no alias dangles."""
        # roberta aliases are pure-text and resolve without any download.
        roberta_base_target = _ALIASES.get("roberta_openai_base")
        roberta_large_target = _ALIASES.get("roberta_openai_large")
        assert roberta_base_target == "roberta_base"
        assert roberta_large_target == "roberta_large"
        # Every alias must point at a real, registered detector.
        for alias, target in _ALIASES.items():
            assert target in _REGISTRY, f"Alias {alias!r} -> unknown target {target!r}"

    def test_list_by_modality_filters(self):
        """Filtering by ``"text"`` returns only text-modality detectors."""
        for detector_name in list_detectors("text"):
            detector_cls = _REGISTRY[detector_name]
            assert detector_cls.modality == "text"

    def test_list_detectors_sorted(self):
        """The unfiltered listing is already in sorted order."""
        listing = list_detectors()
        in_order = sorted(listing)
        assert listing == in_order


class TestBaseDetector:
    """Behavioural checks on ``BaseDetector`` using throwaway subclasses."""

    def _dummy(self, score: float, threshold: float = 0.5):
        """Build a detector that always predicts the fixed ``score``."""

        class _Dummy(BaseDetector):
            name = "dummy_core"
            modality = "text"

            def predict(self, input_data):
                return self._make_result(score)

        detector = _Dummy(threshold=threshold)
        return detector

    def test_make_result_above_threshold(self):
        """A score above the threshold is labelled ``"ai"``."""
        outcome = self._dummy(0.8).predict("hello")
        assert outcome.label == "ai"
        assert outcome.score == 0.8

    def test_make_result_at_threshold_is_ai(self):
        """A score equal to the threshold is labelled ``"ai"``."""
        # label uses score >= threshold.
        detector = self._dummy(0.5, threshold=0.5)
        outcome = detector.predict("x")
        assert outcome.label == "ai"

    def test_make_result_below_threshold(self):
        """A score below the threshold is labelled ``"human"``."""
        outcome = self._dummy(0.2).predict("hello")
        assert outcome.label == "human"

    def test_confidence_in_unit_interval(self):
        """Confidence for a 0.8 score is positive and at most 1."""
        outcome = self._dummy(0.8).predict("hello")
        assert 0.0 <= outcome.confidence <= 1.0
        assert outcome.confidence > 0.0

    def test_make_result_passes_metadata(self):
        """Extra keyword arguments to ``_make_result`` become metadata."""

        class _Dummy(BaseDetector):
            name = "dummy_meta"
            modality = "text"

            def predict(self, input_data):
                return self._make_result(0.9, extra="info", n=3)

        outcome = _Dummy().predict("x")
        assert outcome.metadata == {"extra": "info", "n": 3}

    def test_predict_batch(self):
        """``predict_batch`` returns one ``DetectionResult`` per input."""

        class _LenDummy(BaseDetector):
            name = "dummy_len"
            modality = "text"

            def predict(self, input_data):
                length = len(str(input_data))
                return self._make_result(float(length) / 100.0)

        outcomes = _LenDummy().predict_batch(["a", "bb", "ccc"])
        assert len(outcomes) == 3
        for outcome in outcomes:
            assert isinstance(outcome, DetectionResult)

    def test_device_property_and_to(self):
        """The device is CPU on construction and after ``to("cpu")``."""
        detector = self._dummy(0.5)
        assert detector.device == torch.device("cpu")
        detector.to("cpu")
        assert detector.device == torch.device("cpu")

    def test_unload_clears_modules(self):
        """``unload`` replaces an ``nn.Module`` attribute with ``None``."""

        class _ModelDummy(BaseDetector):
            name = "dummy_model"
            modality = "text"

            def __init__(self, **kw):
                super().__init__(**kw)
                self.net = torch.nn.Linear(2, 2)

            def predict(self, input_data):
                return self._make_result(0.5)

        detector = _ModelDummy()
        assert isinstance(detector.net, torch.nn.Module)
        detector.unload()
        assert detector.net is None

    def test_repr(self):
        """``repr`` of a detector includes its registry name."""
        text = repr(self._dummy(0.5))
        assert "dummy_core" in text
Loading