From 8f17a0b9e1fd0cfe97a2d57ec907560805362687 Mon Sep 17 00:00:00 2001 From: jdidion Date: Fri, 12 Jun 2026 05:51:25 -0700 Subject: [PATCH] Fix #112: tolerate empty compressed files on the system read path Reading an empty (zero-byte) compressed file routed through SystemReader, which spawns gzip/pigz to decompress. System decompressors exit non-zero on empty input ("unexpected end of file"), unlike Python's gzip module, which treats an empty file as a valid empty stream. SystemReader._raise_if_error turned that exit code into an EOFError, but only when process.poll() observed the child exiting first, so the failure was a race that surfaced far more often on single-CPU machines. Suppress the non-zero exit code when the source file is empty, aligning the system-decompressor path with the Python path. Genuine truncation or corruption of a non-empty file still raises. Also make test_xopen_file write a real gzip stream before reading (it was relying on the empty-file race), and add test_xopen_empty_compressed_file as a deterministic regression test that forces the system read path on an empty .gz and reads to completion. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGES.md | 4 ++++ tests/test_xphyle.py | 21 +++++++++++++++++++++ xphyle/formats.py | 16 +++++++++++++++- 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 99ee60f..0d232a5 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,9 @@ # Changes +## Unreleased + +* Fix #112 - Reading an empty compressed file via a system-level decompressor (gzip/pigz) no longer raises a spurious `EOFError`. System decompressors exit non-zero on empty input, which surfaced as a nondeterministic failure depending on a process-exit race (more frequent on single-CPU machines). + ## v4.4.1 (2020.12.06) * Fix #41 - Windows does not support SIGPIPE diff --git a/tests/test_xphyle.py b/tests/test_xphyle.py index 3cea923..38fabcb 100644 --- a/tests/test_xphyle.py +++ b/tests/test_xphyle.py @@ -1,6 +1,7 @@ from unittest import TestCase, skipIf from . import * import gzip +import os from io import BytesIO, IOBase from xphyle import * from xphyle.paths import TempDir, STDIN, STDOUT, STDERR, EXECUTABLE_CACHE @@ -194,8 +195,15 @@ def test_xopen_file(self): with self.assertRaises(IOError): xopen("foobar", "r") path = self.root.make_file(suffix=".gz") + # Write a real gzip stream so the read path decompresses actual data + # rather than an empty file (see issue #112: system decompressors exit + # non-zero on empty input, which made this test fail nondeterministically + # depending on a process-exit race). + with gzip.open(path, "wt") as o: + o.write("bar") with xopen(path, "rU", context_wrapper=True) as i: assert "rt" == i.mode + assert i.read() == "bar" with xopen(path, "w", compression=True, context_wrapper=True) as o: assert cast(FileLikeWrapper, o).compression == "gzip" o.write("foo") @@ -211,6 +219,19 @@ def test_xopen_file(self): with xopen(existing_file, "wt", overwrite=False): pass + def test_xopen_empty_compressed_file(self): + # Regression test for issue #112: reading an empty (zero-byte) + # compressed file via the system-level decompressor must not raise. + # System tools such as gzip/pigz exit non-zero on empty input, which + # previously surfaced as a nondeterministic EOFError depending on a + # process-exit race (failing far more often on single-CPU machines). + path = self.root.make_file(suffix=".gz") + assert os.path.getsize(path) == 0 + # Force the system-level read path and consume the whole stream so the + # subprocess exit code is deterministically checked. + with xopen(path, "rb", use_system=True, context_wrapper=True) as i: + assert i.read() == b"" + def test_xopen_fileobj(self): path = self.root.make_file(suffix=".gz") with open(path, "wb") as out1: diff --git a/xphyle/formats.py b/xphyle/formats.py index c528bfd..dd60c16 100644 --- a/xphyle/formats.py +++ b/xphyle/formats.py @@ -195,14 +195,28 @@ def __iter__(self) -> Iterator: def _raise_if_error(self) -> None: """Raise EOFError if process is not running anymore and the exit code is nonzero. + + An empty source file is a special case: system decompressors such as + ``gzip``/``pigz`` exit non-zero on empty input ("unexpected end of + file"), whereas the Python implementations treat an empty file as a + valid, empty stream. To keep the system- and library-level read paths + consistent, a non-zero exit code is ignored when the source file is + empty. """ retcode = self.process.poll() - if retcode is not None and retcode != 0: # pragma: no-cover + if retcode is not None and retcode != 0 and not self._source_is_empty(): raise EOFError( f"{self.executable_name} process returned non-zero exit code " f"{retcode}. Is the input file truncated or corrupt?" ) + def _source_is_empty(self) -> bool: + """Return True if the source file exists and is zero bytes.""" + try: + return os.path.getsize(self._name) == 0 + except OSError: # pragma: no-cover + return False + def read(self, *args) -> bytes: """Read bytes from the stream. Arguments are passed through to the subprocess ``read`` method.