Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 80 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ suites.
- [Run](#run)
- [Download](#download)
- [Reference](#reference)
- [Local Mirror](#local-mirror)
- [Report](#report)
- [FAQ](#faq)
- [Where does the name come from?](#where-does-the-name-come-from)
Expand Down Expand Up @@ -642,7 +643,7 @@ optional arguments:
```bash
./fluster.py download --help

usage: fluster.py download [-h] [-j JOBS] [-k] [-r RETRIES] [-c CODEC] [testsuites ...]
usage: fluster.py download [-h] [-j JOBS] [-k] [-r RETRIES] [-m MIRROR] [-c CODEC] [testsuites ...]

positional arguments:
testsuites list of testsuites to download
Expand All @@ -654,6 +655,8 @@ optional arguments:
-k, --keep keep original downloaded file after extracting. Only applicable to compressed files such as .zip, .tar.gz, etc
-r RETRIES, --retries RETRIES
number of retries, before failing
-m MIRROR, --mirror MIRROR
base URL of a local mirror to download resources from (falls back to original source on failure)
-c CODEC, --codec CODEC
download test suites for specific codecs only (comma-separated)
```
Expand All @@ -663,6 +666,82 @@ optional arguments:
- When using both `-c/--codec` and specific test suites, the behavior is **union-based**:
- All test suites matching the codec filter are downloaded
- Additionally, all specified test suites are downloaded, regardless of codec
### Local Mirror

When running fluster on multiple machines or in a CI environment, downloading test vectors from the internet for each run can be slow. Fluster supports a **local mirror** to serve resources from a server on your LAN instead.

#### How it works

The `--mirror` option takes a base URL pointing to a mirror server. When downloading, fluster rewrites each source URL to point to the mirror and tries that URL first. The mirror is attempted once per file; if it is unreachable or returns an error, fluster automatically falls back to the original internet source, using the usual retry logic.

For example, given a source URL:
```
https://storage.googleapis.com/aom-test-data/av1-1-b10-00-quantizer-00.ivf
```
and a mirror base URL:
```
http://mirror.local:8080/fluster/
```
fluster will first attempt to download from:
```
http://mirror.local:8080/fluster/storage.googleapis.com/aom-test-data/av1-1-b10-00-quantizer-00.ivf
```

#### Usage

```bash
./fluster.py download --mirror http://mirror.local:8080/fluster/
```

The `--mirror` option works with all other download options:
```bash
./fluster.py download -c H.264,H.265 --mirror http://mirror.local:8080/fluster/
./fluster.py download AV1-TEST-VECTORS -j 8 --mirror http://mirror.local:8080/fluster/
```

#### Setting up a mirror

Use the `scripts/mirror_sync.py` script to populate a directory with all test vector resources:

```bash
python3 scripts/mirror_sync.py -o /path/to/mirror -j 8
```

This will scan all test suite JSON files and download every source URL into a directory tree that mirrors the original URL structure. Already-downloaded files are skipped on subsequent runs.

Then serve the directory with any HTTP server:

```bash
# Python (quick testing)
cd /path/to/mirror && python3 -m http.server 8080

# nginx (production)
# Point nginx root to /path/to/mirror
```

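Then pass the URL at which the mirror directory is served as the `--mirror` argument. With the `http.server` example above, the mirror directory is the server root: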
```bash
./fluster.py download --mirror http://mirror.local:8080/
```

#### mirror_sync.py options

```bash
python3 scripts/mirror_sync.py --help

usage: mirror_sync.py [-h] [-o OUTPUT] [-t TEST_SUITES_DIR] [-j JOBS] [-r RETRIES]

options:
-h, --help show this help message and exit
-o OUTPUT, --output OUTPUT
output directory for the mirror tree (default: ./mirror)
-t TEST_SUITES_DIR, --test-suites-dir TEST_SUITES_DIR
directory containing test suite JSON files
-j JOBS, --jobs JOBS number of parallel downloads (default: 4)
-r RETRIES, --retries RETRIES
number of retries per download (default: 2)
```

### Reference

```bash
Expand Down
9 changes: 8 additions & 1 deletion fluster/fluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -953,7 +953,13 @@ def _generate_global_summary(results: Dict[str, List[Tuple[Decoder, TestSuite]]]
print(output)

def download_test_suites(
self, test_suites: List[str], jobs: int, keep_file: bool, retries: int, codec_string: Optional[str] = None
self,
test_suites: List[str],
jobs: int,
keep_file: bool,
retries: int,
codec_string: Optional[str] = None,
mirror: Optional[str] = None,
) -> None:
"""Download a group of test suites"""
self._load_test_suites()
Expand Down Expand Up @@ -999,4 +1005,5 @@ def download_test_suites(
verify=True,
keep_file=keep_file,
retries=retries,
mirror=mirror,
)
8 changes: 8 additions & 0 deletions fluster/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,13 @@ def _add_download_cmd(self, subparsers: Any) -> None:
type=int,
default=2,
)
subparser.add_argument(
"-m",
"--mirror",
help="base URL of a local mirror to download resources from (falls back to original source on failure)",
type=str,
default=None,
)
subparser.add_argument(
"-c",
"--codec",
Expand Down Expand Up @@ -419,4 +426,5 @@ def _download_cmd(args: Any, fluster: Fluster) -> None:
keep_file=args.keep,
retries=args.retries,
codec_string=args.codec,
mirror=args.mirror,
)
16 changes: 10 additions & 6 deletions fluster/test_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,15 @@ def __init__(
keep_file: bool,
test_suite_name: str,
retries: int,
mirror: Optional[str] = None,
):
self.out_dir = out_dir
self.verify = verify
self.extract_all = extract_all
self.keep_file = keep_file
self.test_suite_name = test_suite_name
self.retries = retries
self.mirror = mirror

# This is added to avoid having to create an extra ancestor class
def set_test_vector(self, test_vector: TestVector) -> None:
Expand All @@ -74,8 +76,9 @@ def __init__(
test_suite_name: str,
test_vectors: Dict[str, TestVector],
retries: int,
mirror: Optional[str] = None,
):
super().__init__(out_dir, verify, extract_all, keep_file, test_suite_name, retries)
super().__init__(out_dir, verify, extract_all, keep_file, test_suite_name, retries, mirror)
self.test_vectors = test_vectors


Expand Down Expand Up @@ -230,7 +233,7 @@ def _download_single_test_vector(ctx: DownloadWork) -> None:
return

print(f"\tDownloading test vector {ctx.test_vector.name} from {ctx.test_vector.source}")
utils.download(ctx.test_vector.source, dest_dir, ctx.retries**ctx.retries)
utils.download(ctx.test_vector.source, dest_dir, ctx.retries**ctx.retries, mirror=ctx.mirror)

if ctx.test_vector.source_checksum != "__skip__":
checksum = utils.file_checksum(dest_path)
Expand Down Expand Up @@ -264,7 +267,7 @@ def _download_single_archive(ctx: DownloadWorkSingleArchive) -> None:
os.remove(dest_path)

print(f"\tDownloading source file from {first_tv.source}")
utils.download(first_tv.source, dest_dir, ctx.retries**ctx.retries)
utils.download(first_tv.source, dest_dir, ctx.retries**ctx.retries, mirror=ctx.mirror)

# Check that source file was downloaded correctly
if first_tv.source_checksum != "__skip__":
Expand Down Expand Up @@ -301,6 +304,7 @@ def download(
extract_all: bool = False,
keep_file: bool = False,
retries: int = 2,
mirror: Optional[str] = None,
) -> None:
"""Download the test suite"""
os.makedirs(out_dir, exist_ok=True)
Expand All @@ -314,14 +318,14 @@ def download(
# Download test suite of multiple test vectors from a single archive
print(f"Downloading test suite {self.name} using 1 job (single archive)")
dwork_single = DownloadWorkSingleArchive(
out_dir, verify, extract_all, keep_file, self.name, self.test_vectors, retries
out_dir, verify, extract_all, keep_file, self.name, self.test_vectors, retries, mirror
)
self._download_single_archive(dwork_single)
elif len(unique_sources) == 1 and len(self.test_vectors) == 1:
# Download test suite of single test vector
print(f"Downloading test suite {self.name} using 1 job (single file)")
single_tv = next(iter(self.test_vectors.values()))
dwork = DownloadWork(out_dir, verify, extract_all, keep_file, self.name, retries)
dwork = DownloadWork(out_dir, verify, extract_all, keep_file, self.name, retries, mirror)
dwork.set_test_vector(single_tv)
self._download_single_test_vector(dwork)
else:
Expand All @@ -338,7 +342,7 @@ def _callback_error(err: Any) -> None:

downloads = []
for tv in self.test_vectors.values():
dwork = DownloadWork(out_dir, verify, extract_all, keep_file, self.name, retries)
dwork = DownloadWork(out_dir, verify, extract_all, keep_file, self.name, retries, mirror)
dwork.set_test_vector(tv)
downloads.append(
pool.apply_async(
Expand Down
45 changes: 35 additions & 10 deletions fluster/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,24 @@

download_lock = Lock()

# Exception types that count as a failed download attempt. `download` catches
# this tuple around `_download_simple`, both for the single mirror attempt and
# for each retry against the original URL.
# NOTE(review): in Python 3, URLError, HTTPError, ConnectionError and
# TimeoutError are subclasses of OSError and IOError is an alias of it, so
# those entries are already covered by OSError; http.client.IncompleteRead is
# the only entry that is not.
MIRROR_NETWORK_ERRORS = (
urllib.error.URLError,
urllib.error.HTTPError,
OSError,
IOError,
ConnectionError,
TimeoutError,
http.client.IncompleteRead,
)


def rewrite_url(source_url: str, mirror_base: str) -> str:
    """Return the location of source_url underneath the mirror_base URL.

    The mirrored path is built from the host (netloc) and path of the source
    URL, followed by ``?query`` when a query string is present. The scheme,
    ``;params`` and fragment of the source URL are not carried over.
    Exactly one ``/`` separates the mirror base from the mirrored path,
    regardless of trailing slashes on mirror_base.

    Example:
        rewrite_url("https://host.example/dir/file.ivf", "http://mirror/base/")
        -> "http://mirror/base/host.example/dir/file.ivf"
    """
    parts = urllib.parse.urlparse(source_url)
    relative = f"{parts.netloc}{parts.path}"
    if parts.query:
        relative = f"{relative}?{parts.query}"
    # Normalise the joint so the result never contains a doubled or missing
    # slash between the mirror base and the mirrored path.
    return "/".join((mirror_base.rstrip("/"), relative.lstrip("/")))


def create_enhanced_opener() -> urllib.request.OpenerDirector:
"""Creates an enhanced URL opener with custom headers and cookie support."""
Expand Down Expand Up @@ -159,26 +177,33 @@ def download(
max_retries: int = 5,
timeout: int = 300,
chunk_size: int = 2048 * 2048, # 4MB
mirror: Optional[str] = None,
) -> None:
"""Downloads a file to a directory with a mutex lock
to avoid conflicts and retries with exponential backoff."""
to avoid conflicts and retries with exponential backoff.
If mirror is provided, tries the mirror URL first and falls back to the original URL."""
os.makedirs(dest_dir, exist_ok=True)
filename = os.path.basename(url)
dest_path = os.path.join(dest_dir, filename)

if mirror:
mirror_url = rewrite_url(url, mirror)
try:
with download_lock:
_download_simple(mirror_url, dest_path, filename, timeout, chunk_size)
return
except MIRROR_NETWORK_ERRORS as e:
if os.path.exists(dest_path):
os.remove(dest_path)
print(f"\tWARNING: Mirror download failed for {mirror_url}: {e}")
print(f"\tFalling back to original source: {url}")

for attempt in range(max_retries):
try:
with download_lock:
_download_simple(url, dest_path, filename, timeout, chunk_size)
break
except (
urllib.error.URLError,
urllib.error.HTTPError,
OSError,
IOError,
ConnectionError,
TimeoutError,
http.client.IncompleteRead,
) as e:
except MIRROR_NETWORK_ERRORS as e:
if os.path.exists(dest_path):
os.remove(dest_path)

Expand Down
Loading