diff --git a/README.md b/README.md index 6f04989c..d916935c 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ suites. - [Run](#run) - [Download](#download) - [Reference](#reference) + - [Local Mirror](#local-mirror) - [Report](#report) - [FAQ](#faq) - [Where does the name come from?](#where-does-the-name-come-from) @@ -642,7 +643,7 @@ optional arguments: ```bash ./fluster.py download --help -usage: fluster.py download [-h] [-j JOBS] [-k] [-r RETRIES] [-c CODEC] [testsuites ...] +usage: fluster.py download [-h] [-j JOBS] [-k] [-r RETRIES] [-m MIRROR] [-c CODEC] [testsuites ...] positional arguments: testsuites list of testsuites to download @@ -654,6 +655,8 @@ optional arguments: -k, --keep keep original downloaded file after extracting. Only applicable to compressed files such as .zip, .tar.gz, etc -r RETRIES, --retries RETRIES number of retries, before failing + -m MIRROR, --mirror MIRROR + base URL of a local mirror to download resources from (falls back to original source on failure) -c CODEC, --codec CODEC download test suites for specific codecs only (comma-separated) ``` @@ -663,6 +666,82 @@ optional arguments: - When using both `-c/--codec` and specific test suites, the behavior is **union-based**: - All test suites matching the codec filter are downloaded - Additionally, all specified test suites are downloaded, regardless of codec +### Local Mirror + +When running fluster on multiple machines or in a CI environment, downloading test vectors from the internet for each run can be slow. Fluster supports a **local mirror** to serve resources from a server on your LAN instead. + +#### How it works + +The `--mirror` option takes a base URL pointing to a mirror server. When downloading, fluster rewrites each source URL to point to the mirror first. If the mirror is unreachable or returns an error, fluster automatically falls back to the original internet source. 
+ +For example, given a source URL: +``` +https://storage.googleapis.com/aom-test-data/av1-1-b10-00-quantizer-00.ivf +``` +and a mirror base URL: +``` +http://mirror.local:8080/fluster/ +``` +fluster will first attempt to download from: +``` +http://mirror.local:8080/fluster/storage.googleapis.com/aom-test-data/av1-1-b10-00-quantizer-00.ivf +``` + +#### Usage + +```bash +./fluster.py download --mirror http://mirror.local:8080/fluster/ +``` + +The `--mirror` option works with all other download options: +```bash +./fluster.py download -c H.264,H.265 --mirror http://mirror.local:8080/fluster/ +./fluster.py download AV1-TEST-VECTORS -j 8 --mirror http://mirror.local:8080/fluster/ +``` + +#### Setting up a mirror + +Use the `scripts/mirror_sync.py` script to populate a directory with all test vector resources: + +```bash +python3 scripts/mirror_sync.py -o /path/to/mirror -j 8 +``` + +This will scan all test suite JSON files and download every source URL into a directory tree that mirrors the original URL structure. Already-downloaded files are skipped on subsequent runs. 
+ +Then serve the directory with any HTTP server: + +```bash +# Python (quick testing) +cd /path/to/mirror && python3 -m http.server 8080 + +# nginx (production) +# Point nginx root to /path/to/mirror +``` + +Use the same root path as the `--mirror` argument: +```bash +./fluster.py download --mirror http://mirror.local:8080/ +``` + +#### mirror_sync.py options + +```bash +python3 scripts/mirror_sync.py --help + +usage: mirror_sync.py [-h] [-o OUTPUT] [-t TEST_SUITES_DIR] [-j JOBS] [-r RETRIES] + +options: + -h, --help show this help message and exit + -o OUTPUT, --output OUTPUT + output directory for the mirror tree (default: ./mirror) + -t TEST_SUITES_DIR, --test-suites-dir TEST_SUITES_DIR + directory containing test suite JSON files + -j JOBS, --jobs JOBS number of parallel downloads (default: 4) + -r RETRIES, --retries RETRIES + number of retries per download (default: 2) +``` + ### Reference ```bash diff --git a/fluster/fluster.py b/fluster/fluster.py index 9ee3f5c0..18633e19 100644 --- a/fluster/fluster.py +++ b/fluster/fluster.py @@ -953,7 +953,13 @@ def _generate_global_summary(results: Dict[str, List[Tuple[Decoder, TestSuite]]] print(output) def download_test_suites( - self, test_suites: List[str], jobs: int, keep_file: bool, retries: int, codec_string: Optional[str] = None + self, + test_suites: List[str], + jobs: int, + keep_file: bool, + retries: int, + codec_string: Optional[str] = None, + mirror: Optional[str] = None, ) -> None: """Download a group of test suites""" self._load_test_suites() @@ -999,4 +1005,5 @@ def download_test_suites( verify=True, keep_file=keep_file, retries=retries, + mirror=mirror, ) diff --git a/fluster/main.py b/fluster/main.py index 4bb1e180..178dfa66 100644 --- a/fluster/main.py +++ b/fluster/main.py @@ -352,6 +352,13 @@ def _add_download_cmd(self, subparsers: Any) -> None: type=int, default=2, ) + subparser.add_argument( + "-m", + "--mirror", + help="base URL of a local mirror to download resources from (falls back to 
original source on failure)", + type=str, + default=None, + ) subparser.add_argument( "-c", "--codec", @@ -419,4 +426,5 @@ def _download_cmd(args: Any, fluster: Fluster) -> None: keep_file=args.keep, retries=args.retries, codec_string=args.codec, + mirror=args.mirror, ) diff --git a/fluster/test_suite.py b/fluster/test_suite.py index 0fc3eab6..d0602749 100644 --- a/fluster/test_suite.py +++ b/fluster/test_suite.py @@ -47,6 +47,7 @@ def __init__( keep_file: bool, test_suite_name: str, retries: int, + mirror: Optional[str] = None, ): self.out_dir = out_dir self.verify = verify @@ -54,6 +55,7 @@ def __init__( self.keep_file = keep_file self.test_suite_name = test_suite_name self.retries = retries + self.mirror = mirror # This is added to avoid having to create an extra ancestor class def set_test_vector(self, test_vector: TestVector) -> None: @@ -74,8 +76,9 @@ def __init__( test_suite_name: str, test_vectors: Dict[str, TestVector], retries: int, + mirror: Optional[str] = None, ): - super().__init__(out_dir, verify, extract_all, keep_file, test_suite_name, retries) + super().__init__(out_dir, verify, extract_all, keep_file, test_suite_name, retries, mirror) self.test_vectors = test_vectors @@ -230,7 +233,7 @@ def _download_single_test_vector(ctx: DownloadWork) -> None: return print(f"\tDownloading test vector {ctx.test_vector.name} from {ctx.test_vector.source}") - utils.download(ctx.test_vector.source, dest_dir, ctx.retries**ctx.retries) + utils.download(ctx.test_vector.source, dest_dir, ctx.retries**ctx.retries, mirror=ctx.mirror) if ctx.test_vector.source_checksum != "__skip__": checksum = utils.file_checksum(dest_path) @@ -264,7 +267,7 @@ def _download_single_archive(ctx: DownloadWorkSingleArchive) -> None: os.remove(dest_path) print(f"\tDownloading source file from {first_tv.source}") - utils.download(first_tv.source, dest_dir, ctx.retries**ctx.retries) + utils.download(first_tv.source, dest_dir, ctx.retries**ctx.retries, mirror=ctx.mirror) # Check that source 
# Exception types treated as a recoverable download failure.  Most entries are
# OSError subclasses (IOError is an alias of OSError); they are listed
# individually to keep the intent explicit.
MIRROR_NETWORK_ERRORS = (
    urllib.error.URLError,
    urllib.error.HTTPError,
    OSError,
    IOError,
    ConnectionError,
    TimeoutError,
    http.client.IncompleteRead,
)


def rewrite_url(source_url: str, mirror_base: str) -> str:
    """Map *source_url* onto the mirror rooted at *mirror_base*.

    The scheme of the source URL is dropped.  Its network location (host and
    optional port), path and query string are appended below the mirror base,
    with exactly one "/" joining the two parts.
    """
    parts = urllib.parse.urlparse(source_url)
    relative = f"{parts.netloc}{parts.path}"
    if parts.query:
        relative = f"{relative}?{parts.query}"
    base = mirror_base.rstrip("/")
    return f"{base}/{relative.lstrip('/')}"
def collect_source_urls(test_suites_dir: str) -> list[str]:
    """Return every unique ``source`` URL referenced by the test suite JSON files.

    Walks *test_suites_dir* recursively, loads each ``*.json`` file and gathers
    the ``source`` field of the entries listed under ``test_vectors`` and
    ``failing_test_vectors``.  URLs are returned in first-seen order.  Files
    that cannot be read, decoded or parsed, or whose top level is not a JSON
    object, are reported with a warning and skipped.
    """
    # A dict preserves insertion order and gives O(1) duplicate detection,
    # instead of a linear scan of a list for every test vector.
    unique: dict[str, None] = {}
    for root, _, files in os.walk(test_suites_dir):
        for filename in files:
            if not filename.endswith(".json"):
                continue
            filepath = os.path.join(root, filename)
            try:
                with open(filepath, encoding="utf-8") as f:
                    data = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError, OSError) as e:
                print(f"WARNING: skipping {filepath}: {e}")
                continue
            if not isinstance(data, dict):
                # A stray JSON file (e.g. a top-level list) has no test vectors
                # to offer; skip it instead of failing on data.get().
                print(f"WARNING: skipping {filepath}: top-level JSON value is not an object")
                continue
            for key in ("test_vectors", "failing_test_vectors"):
                for tv in data.get(key) or []:
                    source = tv.get("source")
                    if source:
                        unique.setdefault(source, None)
    return list(unique)


def url_to_mirror_path(url: str) -> str:
    """Return the location of *url* relative to the mirror root.

    Only the network location and the path of the URL are kept, joined as
    ``<netloc>/<path>``.
    """
    parsed = urllib.parse.urlparse(url)
    return os.path.join(parsed.netloc, parsed.path.lstrip("/"))


def _sync_one(url: str, output_dir: str, retries: int) -> None:
    """Download *url* to its place below *output_dir* unless it already exists."""
    mirror_path = url_to_mirror_path(url)
    dest_dir = os.path.join(output_dir, os.path.dirname(mirror_path))
    dest_file = os.path.join(output_dir, mirror_path)

    if os.path.exists(dest_file):
        print(f" SKIP (exists): {mirror_path}")
        return

    print(f" DOWNLOAD: {mirror_path}")
    utils.download(url, dest_dir, max_retries=retries)


def sync_urls(urls: list[str], output_dir: str, jobs: int, retries: int) -> None:
    """Download every URL in *urls* into the mirror tree rooted at *output_dir*.

    With ``jobs <= 1`` the downloads run sequentially in this process;
    otherwise they are distributed over a pool of *jobs* worker processes.
    """
    if jobs <= 1:
        for url in urls:
            _sync_one(url, output_dir, retries)
        return

    from multiprocessing import Pool

    # The worker has to be a module-level function: multiprocessing pickles the
    # callable to hand it to the worker processes, and a function nested inside
    # sync_urls() cannot be pickled.
    with Pool(jobs) as pool:
        pool.starmap(_sync_one, [(url, output_dir, retries) for url in urls])


def main() -> None:
    """Parse the command line and populate the mirror directory."""
    parser = argparse.ArgumentParser(
        description="Populate a local mirror directory with all fluster test suite resources. "
        "The resulting directory can be served by any HTTP server (nginx, Apache, etc.) "
        "and used with: fluster download --mirror http://HOST/ROOT"
    )
    parser.add_argument(
        "-o",
        "--output",
        default="mirror",
        help="output directory for the mirror tree (default: ./mirror)",
    )
    parser.add_argument(
        "-t",
        "--test-suites-dir",
        default=os.path.join(os.path.dirname(__file__), "..", "test_suites"),
        help="directory containing test suite JSON files",
    )
    parser.add_argument(
        "-j",
        "--jobs",
        type=int,
        default=4,
        help="number of parallel downloads (default: 4)",
    )
    parser.add_argument(
        "-r",
        "--retries",
        type=int,
        default=2,
        help="number of retries per download (default: 2)",
    )
    args = parser.parse_args()

    test_suites_dir = os.path.abspath(args.test_suites_dir)
    output_dir = os.path.abspath(args.output)

    if not os.path.isdir(test_suites_dir):
        sys.exit(f"Test suites directory not found: {test_suites_dir}")

    urls = collect_source_urls(test_suites_dir)
    if not urls:
        sys.exit(f"No source URLs found in {test_suites_dir}")

    print(f"Found {len(urls)} unique source URLs in {test_suites_dir}")
    print(f"Mirror output directory: {output_dir}\n")

    os.makedirs(output_dir, exist_ok=True)
    sync_urls(urls, output_dir, args.jobs, args.retries)

    print(f"\nDone. Serve {output_dir} with an HTTP server, e.g.:")
    print(f" cd {output_dir} && python3 -m http.server 8080")
    # The host is a placeholder for the machine serving the mirror.
    print("Then use: fluster download --mirror http://<host>:8080/")


if __name__ == "__main__":
    main()
class _SilentHandler(http.server.SimpleHTTPRequestHandler):
    """Static file request handler that does not log requests."""

    def log_message(self, fmt: str, *args: object) -> None:
        # Overridden to keep the test output free of per-request log lines.
        pass


class TestDownloadWithMirror(unittest.TestCase):
    """Exercise ``utils.download`` against throw-away local HTTP servers."""

    def _serve_dir(self, serve_root: str) -> tuple[http.server.HTTPServer, int]:
        """Serve *serve_root* on an ephemeral localhost port in a daemon thread.

        Returns the server and the port it is bound to.  The server is stopped
        and its listening socket is closed when the test finishes.
        """
        handler = functools.partial(_SilentHandler, directory=serve_root)
        server = http.server.HTTPServer(("127.0.0.1", 0), handler)
        port = server.server_address[1]
        thread = threading.Thread(target=server.serve_forever, daemon=True)
        thread.start()
        # Cleanups run in reverse registration order: stop the serve loop,
        # wait for the thread, then release the listening socket.  Calling
        # shutdown() alone would leave the socket open.
        self.addCleanup(server.server_close)
        self.addCleanup(thread.join)
        self.addCleanup(server.shutdown)
        return server, port

    @staticmethod
    def _write_file(directory: str, name: str, content: bytes) -> None:
        """Create *directory* if needed and write *content* to *name* inside it."""
        os.makedirs(directory, exist_ok=True)
        with open(os.path.join(directory, name), "wb") as f:
            f.write(content)

    def _assert_file_content(self, path: str, expected: bytes) -> None:
        """Assert that *path* exists and holds exactly *expected*."""
        self.assertTrue(os.path.exists(path))
        with open(path, "rb") as f:
            self.assertEqual(f.read(), expected)

    def test_download_from_mirror_success(self) -> None:
        test_content = b"mirror test content"
        with tempfile.TemporaryDirectory() as tmpdir:
            mirror_root = os.path.join(tmpdir, "mirror")
            self._write_file(
                os.path.join(mirror_root, "fake.example.com", "data"),
                "testfile.bin",
                test_content,
            )

            _, port = self._serve_dir(mirror_root)
            dest_dir = os.path.join(tmpdir, "dest")
            utils.download(
                "https://fake.example.com/data/testfile.bin",
                dest_dir,
                max_retries=1,
                mirror=f"http://127.0.0.1:{port}/",
            )
            self._assert_file_content(os.path.join(dest_dir, "testfile.bin"), test_content)

    def test_download_mirror_fallback_to_original(self) -> None:
        original_content = b"original source content"
        with tempfile.TemporaryDirectory() as tmpdir:
            serve_root = os.path.join(tmpdir, "original")
            self._write_file(serve_root, "fallback.bin", original_content)

            _, port = self._serve_dir(serve_root)
            dest_dir = os.path.join(tmpdir, "dest")
            utils.download(
                f"http://127.0.0.1:{port}/fallback.bin",
                dest_dir,
                max_retries=1,
                # Nothing listens on port 1, so the mirror attempt fails and
                # the download must fall back to the original URL.
                mirror="http://127.0.0.1:1/",
            )
            self._assert_file_content(os.path.join(dest_dir, "fallback.bin"), original_content)

    def test_download_without_mirror(self) -> None:
        content = b"no mirror content"
        with tempfile.TemporaryDirectory() as tmpdir:
            serve_root = os.path.join(tmpdir, "serve")
            self._write_file(serve_root, "plain.bin", content)

            _, port = self._serve_dir(serve_root)
            dest_dir = os.path.join(tmpdir, "dest")
            utils.download(
                f"http://127.0.0.1:{port}/plain.bin",
                dest_dir,
                max_retries=1,
            )
            self._assert_file_content(os.path.join(dest_dir, "plain.bin"), content)


if __name__ == "__main__":
    unittest.main(verbosity=2)