From 02889883083511db36e78d6126868d5a57e18258 Mon Sep 17 00:00:00 2001 From: Chris Geyer Date: Wed, 27 May 2026 19:07:08 +0000 Subject: [PATCH 1/2] Always filter .netrc/_netrc credential files from lineage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Credential files (.netrc, _netrc) were being recorded as run inputs whenever a tool (curl, requests, huggingface_hub, wandb) read them for auth — e.g. the nanochat ClimbMix download recorded /home/ubuntu/.netrc as an input. They hold secrets, not reproducibility-relevant data. Filter them from inputs and outputs unconditionally, mirroring _is_git_metadata, i.e. independent of filters.ignore_paths so a user clearing that list cannot re-expose them. Adds a credential_files filter category (count + "creds" label). Co-Authored-By: Claude Opus 4.7 (1M context) --- roar/core/models/provenance.py | 2 ++ roar/execution/provenance/file_filter.py | 19 +++++++++++++++++++ roar/presenters/run_report.py | 1 + tests/unit/test_verbosity.py | 12 ++++++++++++ 4 files changed, 34 insertions(+) diff --git a/roar/core/models/provenance.py b/roar/core/models/provenance.py index 6d5c5d1e..b8a42cc0 100644 --- a/roar/core/models/provenance.py +++ b/roar/core/models/provenance.py @@ -87,6 +87,7 @@ class FilterCounts(ImmutableModel): tmp_files: Annotated[int, Field(ge=0)] = 0 roar_internal: Annotated[int, Field(ge=0)] = 0 git_metadata: Annotated[int, Field(ge=0)] = 0 + credential_files: Annotated[int, Field(ge=0)] = 0 write_noise: Annotated[int, Field(ge=0)] = 0 ignore_paths: Annotated[int, Field(ge=0)] = 0 @@ -99,6 +100,7 @@ def total(self) -> int: + self.tmp_files + self.roar_internal + self.git_metadata + + self.credential_files + self.write_noise + self.ignore_paths ) diff --git a/roar/execution/provenance/file_filter.py b/roar/execution/provenance/file_filter.py index 403e57dc..96a5ed84 100644 --- a/roar/execution/provenance/file_filter.py +++ b/roar/execution/provenance/file_filter.py @@ -21,6 
+21,7 @@ _FILTER_CATEGORIES: tuple[str, ...] = ( "roar_internal", "git_metadata", + "credential_files", "system_reads", "package_reads", "torch_cache", @@ -218,6 +219,8 @@ def categorize_read(path: str) -> str | None: return "roar_internal" if self._is_git_metadata(path): return "git_metadata" + if self._is_credential_file(path): + return "credential_files" if ignore_system_reads and self._is_system_read(path): return "system_reads" if ignore_torch_cache and self._is_torch_cache(path): @@ -267,6 +270,9 @@ def filter_reads(paths: list[str]) -> list[str]: if self._is_write_noise(f): _record_drop("write_noise", f) continue + if self._is_credential_file(f): + _record_drop("credential_files", f) + continue if ignore_torch_cache and self._is_torch_cache(f): _record_drop("torch_cache", f) continue @@ -323,6 +329,19 @@ def _is_roar_internal(path: str) -> bool: or path == ".roar" ) + @staticmethod + def _is_credential_file(path: str) -> bool: + """Check if path is a credentials file that must never enter lineage. + + These hold secrets (not reproducibility-relevant data), so they are + always filtered from inputs and outputs regardless of config — a user + clearing ``filters.ignore_paths`` must not re-expose them. + """ + basename = path.rsplit("/", 1)[-1] + # .netrc / _netrc: machine login tokens (curl, requests, huggingface, + # wandb). _netrc is the Windows spelling; a $NETRC path with another name is NOT caught.
+ return basename in (".netrc", "_netrc") + @staticmethod def _is_git_metadata(path: str) -> bool: """Check if path is git metadata used for roar context capture.""" diff --git a/roar/presenters/run_report.py b/roar/presenters/run_report.py index 9b47a77a..10692bbb 100644 --- a/roar/presenters/run_report.py +++ b/roar/presenters/run_report.py @@ -33,6 +33,7 @@ ("tmp_files", "tmp"), ("write_noise", "write-noise"), ("git_metadata", "git-meta"), + ("credential_files", "creds"), ("roar_internal", "roar-internal"), ) diff --git a/tests/unit/test_verbosity.py b/tests/unit/test_verbosity.py index ca5d6cef..a2bf4302 100644 --- a/tests/unit/test_verbosity.py +++ b/tests/unit/test_verbosity.py @@ -160,6 +160,18 @@ def test_counts_per_category(self) -> None: assert result.counts.roar_internal == 1 assert result.counts.write_noise == 1 # /dev/null + def test_credential_files_always_filtered(self) -> None: + # .netrc / _netrc hold login tokens — never reproducibility-relevant, + # so they are dropped from reads and writes regardless of config. 
+ reads = ["/home/user/.netrc", "/home/user/project/data.parquet"] + writes = ["/home/user/_netrc"] + tracer, py = _make_data(reads, writes) + result = FileFilterService().filter_files(tracer, py, {}) + + assert result.counts.credential_files == 2 # .netrc read + _netrc write + assert result.read_files == ["/home/user/project/data.parquet"] + assert result.written_files == [] + def test_dropped_paths_empty_by_default(self) -> None: tracer, py = _make_data(["/sys/foo"], []) result = FileFilterService().filter_files(tracer, py, {}) From a25cf4a076453c1239dca6d201e6e6079a9552ce Mon Sep 17 00:00:00 2001 From: Chris Geyer Date: Sat, 30 May 2026 13:26:59 +0000 Subject: [PATCH 2/2] test: cover .netrc/_netrc credential filtering in file_filter Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/unit/test_file_filter.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/unit/test_file_filter.py b/tests/unit/test_file_filter.py index 622ec490..0b97e697 100644 --- a/tests/unit/test_file_filter.py +++ b/tests/unit/test_file_filter.py @@ -170,3 +170,27 @@ def test_filter_files_keeps_user_data_reads(monkeypatch) -> None: assert user_data_path in filtered.opened_files assert user_data_path in filtered.read_files + + +def test_filter_files_always_drops_netrc_credentials(monkeypatch) -> None: + """`.netrc`/`_netrc` hold machine login tokens (wandb, huggingface, curl) + and must NEVER enter lineage — dropped from reads AND writes, regardless of + any filter config. Observed leaking into a nanochat run (wandb reads ~/.netrc). 
+ """ + monkeypatch.setattr(file_filter, "_get_editable_install_dirs", lambda: frozenset()) + + netrc = str(Path("~/.netrc").expanduser()) + win_netrc = "/home/ubuntu/_netrc" + tracer_data = TracerData( + opened_files=[netrc, win_netrc], + read_files=[netrc, win_netrc], + written_files=[netrc], + ) + python_data = PythonInjectData(sys_prefix="", sys_base_prefix="", roar_inject_dir="") + + filtered = FileFilterService().filter_files(tracer_data, python_data, _filter_config()) + + for p in (netrc, win_netrc): + assert p not in filtered.opened_files + assert p not in filtered.read_files + assert p not in filtered.written_files