Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions roar/core/models/provenance.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ class FilterCounts(ImmutableModel):
tmp_files: Annotated[int, Field(ge=0)] = 0
roar_internal: Annotated[int, Field(ge=0)] = 0
git_metadata: Annotated[int, Field(ge=0)] = 0
credential_files: Annotated[int, Field(ge=0)] = 0
write_noise: Annotated[int, Field(ge=0)] = 0
ignore_paths: Annotated[int, Field(ge=0)] = 0

Expand All @@ -99,6 +100,7 @@ def total(self) -> int:
+ self.tmp_files
+ self.roar_internal
+ self.git_metadata
+ self.credential_files
+ self.write_noise
+ self.ignore_paths
)
Expand Down
19 changes: 19 additions & 0 deletions roar/execution/provenance/file_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
_FILTER_CATEGORIES: tuple[str, ...] = (
"roar_internal",
"git_metadata",
"credential_files",
"system_reads",
"package_reads",
"torch_cache",
Expand Down Expand Up @@ -218,6 +219,8 @@ def categorize_read(path: str) -> str | None:
return "roar_internal"
if self._is_git_metadata(path):
return "git_metadata"
if self._is_credential_file(path):
return "credential_files"
if ignore_system_reads and self._is_system_read(path):
return "system_reads"
if ignore_torch_cache and self._is_torch_cache(path):
Expand Down Expand Up @@ -267,6 +270,9 @@ def filter_reads(paths: list[str]) -> list[str]:
if self._is_write_noise(f):
_record_drop("write_noise", f)
continue
if self._is_credential_file(f):
_record_drop("credential_files", f)
continue
if ignore_torch_cache and self._is_torch_cache(f):
_record_drop("torch_cache", f)
continue
Expand Down Expand Up @@ -323,6 +329,19 @@ def _is_roar_internal(path: str) -> bool:
or path == ".roar"
)

@staticmethod
def _is_credential_file(path: str) -> bool:
    """Check if path is a credentials file that must never enter lineage.

    These hold secrets (not reproducibility-relevant data), so they are
    always filtered from inputs and outputs regardless of config — a user
    clearing ``filters.ignore_paths`` must not re-expose them.

    Args:
        path: File path to classify. Directory components may be separated
            by forward slashes or backslashes.

    Returns:
        True when the final path component is exactly ``.netrc`` or
        ``_netrc``; False otherwise (including for the empty string).
    """
    # Treat backslashes as separators too: the _netrc spelling exists for
    # Windows, where the path would look like C:\Users\me\_netrc. Splitting
    # on "/" alone would leave the whole string as the "basename" and let
    # the secret through.
    basename = path.replace("\\", "/").rsplit("/", 1)[-1]
    # .netrc / _netrc: machine login tokens (curl, requests, huggingface,
    # wandb). The _netrc spelling is the Windows / NETRC-env variant.
    return basename in (".netrc", "_netrc")

@staticmethod
def _is_git_metadata(path: str) -> bool:
"""Check if path is git metadata used for roar context capture."""
Expand Down
1 change: 1 addition & 0 deletions roar/presenters/run_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
("tmp_files", "tmp"),
("write_noise", "write-noise"),
("git_metadata", "git-meta"),
("credential_files", "creds"),
("roar_internal", "roar-internal"),
)

Expand Down
24 changes: 24 additions & 0 deletions tests/unit/test_file_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,3 +170,27 @@ def test_filter_files_keeps_user_data_reads(monkeypatch) -> None:

assert user_data_path in filtered.opened_files
assert user_data_path in filtered.read_files


def test_filter_files_always_drops_netrc_credentials(monkeypatch) -> None:
    """`.netrc`/`_netrc` hold machine login tokens (wandb, huggingface, curl)
    and must NEVER enter lineage — dropped from reads AND writes, regardless of
    any filter config. Observed leaking into a nanochat run (wandb reads ~/.netrc).
    """
    monkeypatch.setattr(file_filter, "_get_editable_install_dirs", lambda: frozenset())

    netrc = str(Path("~/.netrc").expanduser())
    win_netrc = "/home/ubuntu/_netrc"
    credential_paths = [netrc, win_netrc]
    # Feed BOTH spellings into every list (including writes) so that each
    # "not in" assertion below is checking a path that was actually present
    # in the input, rather than passing vacuously.
    tracer_data = TracerData(
        opened_files=list(credential_paths),
        read_files=list(credential_paths),
        written_files=list(credential_paths),
    )
    python_data = PythonInjectData(sys_prefix="", sys_base_prefix="", roar_inject_dir="")

    filtered = FileFilterService().filter_files(tracer_data, python_data, _filter_config())

    for p in credential_paths:
        assert p not in filtered.opened_files
        assert p not in filtered.read_files
        assert p not in filtered.written_files
12 changes: 12 additions & 0 deletions tests/unit/test_verbosity.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,18 @@ def test_counts_per_category(self) -> None:
assert result.counts.roar_internal == 1
assert result.counts.write_noise == 1 # /dev/null

def test_credential_files_always_filtered(self) -> None:
    """Credential files are dropped from reads and writes with an empty config.

    .netrc / _netrc hold login tokens and are never reproducibility-relevant,
    so only the ordinary data file should survive filtering.
    """
    kept_read = "/home/user/project/data.parquet"
    tracer_data, python_data = _make_data(
        ["/home/user/.netrc", kept_read],
        ["/home/user/_netrc"],
    )
    outcome = FileFilterService().filter_files(tracer_data, python_data, {})

    # One dropped read (.netrc) plus one dropped write (_netrc).
    assert outcome.counts.credential_files == 2
    assert outcome.read_files == [kept_read]
    assert outcome.written_files == []

def test_dropped_paths_empty_by_default(self) -> None:
tracer, py = _make_data(["/sys/foo"], [])
result = FileFilterService().filter_files(tracer, py, {})
Expand Down
Loading