diff --git a/fridata.py b/fridata.py index d270736..504021e 100644 --- a/fridata.py +++ b/fridata.py @@ -225,7 +225,8 @@ def create_parser(): add_common_arguments(verify_chains_parser) create_archive_parser = subparsers.add_parser( - "create_archive", help="Create PDB compressed archive" + "create_archive", + help="Write one PDB .zip per H5 shard under data_path/archives//", ) add_common_arguments(create_archive_parser) diff --git a/tests/test_archive.py b/tests/test_archive.py new file mode 100644 index 0000000..735da6e --- /dev/null +++ b/tests/test_archive.py @@ -0,0 +1,26 @@ +from toolbox.scripts.archive import _shard_zip_name + + +def test_shard_zip_name_distinct_batches_same_basename(): + data = "/data" + a = f"{data}/structures/PDB/subset_/fourth_7/0/pdbs.h5" + b = f"{data}/structures/PDB/subset_/fourth_7/1/pdbs.h5" + assert _shard_zip_name(a, data) != _shard_zip_name(b, data) + assert _shard_zip_name(a, data).endswith(".zip") + assert "0" in _shard_zip_name(a, data) or "pdbs" in _shard_zip_name(a, data) + + +def test_shard_zip_name_distinct_trees_same_batch_id(): + data = "/data" + a = f"{data}/structures/PDB/subset_/fourth_7/0/pdbs.h5" + b = f"{data}/structures/PDB/subset_/other_slug/0/pdbs.h5" + assert _shard_zip_name(a, data) != _shard_zip_name(b, data) + + +def test_shard_zip_name_long_path_uses_hash(): + data = "/data" + long_mid = "x" * 300 + h5 = f"{data}/structures/{long_mid}/0/pdbs.h5" + name = _shard_zip_name(h5, data) + assert name.endswith(".zip") + assert len(name) < 80 diff --git a/toolbox/models/embedding/embedding.py b/toolbox/models/embedding/embedding.py index cde4812..f24af07 100644 --- a/toolbox/models/embedding/embedding.py +++ b/toolbox/models/embedding/embedding.py @@ -75,7 +75,7 @@ def run(self): create_index(self.embeddings_index_path, present_embeddings, self.structures_dataset.config.data_path) end = time.time() - logger.info(f"Total time: {format_time(end - start)}") + logger.info(f"Total time: {format_time(end - start)}\n") def missing_ids_to_fasta(self, missing_ids: List[str]) -> Dict[str, str]: diff --git a/toolbox/scripts/archive.py b/toolbox/scripts/archive.py index 0c4ca30..7a5774a 100644 --- a/toolbox/scripts/archive.py +++ b/toolbox/scripts/archive.py @@ -1,6 +1,4 @@ -from toolbox.models.manage_dataset.index.handle_index import read_index -import datetime -import os +import hashlib import zipfile from pathlib import Path @@ -8,14 +6,29 @@ from tqdm import tqdm from tqdm.contrib.logging import logging_redirect_tqdm +from toolbox.models.manage_dataset.index.handle_index import read_index from toolbox.models.manage_dataset.utils import read_all_pdbs_from_h5 from toolbox.utlis.logging import logger -def process_h5_file(h5_file, dataset_path, output_dir): - full_h5_file_path = Path(dataset_path) / h5_file - prots = read_all_pdbs_from_h5(full_h5_file_path) - archive_name = os.path.basename(h5_file).replace(".h5", ".zip") +def _shard_zip_name(h5_file: str, data_path_str: str) -> str: + """Stable unique .zip basename per H5 shard (avoids .../N/pdbs.h5 → pdbs.zip collisions).""" + h5_posix = Path(h5_file).as_posix() + root = Path(data_path_str).as_posix().rstrip("/") + if root and h5_posix.startswith(root + "/"): + rel = h5_posix.removeprefix(root + "/") + else: + rel = h5_posix + stem = rel.removesuffix(".h5").replace("/", "__") + if len(stem) > 200: + stem = hashlib.sha256(rel.encode()).hexdigest()[:24] + return f"{stem}.zip" + + +def process_h5_file(h5_file, data_path_str, output_dir): + h5_path = Path(h5_file) + prots = read_all_pdbs_from_h5(str(h5_path)) + archive_name = _shard_zip_name(h5_file, data_path_str) archive_path = Path(output_dir) / archive_name with zipfile.ZipFile(archive_path, "w") as zipf: @@ -23,39 +36,32 @@ def process_h5_file(h5_file, dataset_path, output_dir): code = p.removesuffix(".pdb") zipf.writestr(f"{code}.pdb", pdb_file_content) - os.system(f"tar -czf {str(archive_path)}.tgz {str(archive_path)}") - return str(archive_path) def create_archive(structures_dataset: "StructuresDataset"): + data_path = structures_dataset.config.data_path dataset_path = structures_dataset.dataset_path() - proteins_index = read_index(Path(dataset_path) / "dataset_reversed.idx", structures_dataset.config.data_path) - output_dir = Path(dataset_path) / "archives" - output_dir.mkdir(exist_ok=True) + proteins_index = read_index( + Path(dataset_path) / "dataset_reversed.idx", data_path + ) + output_dir = Path(data_path) / "archives" / structures_dataset.dataset_dir_name() + output_dir.mkdir(parents=True, exist_ok=True) client = structures_dataset._client futures = [] for h5_file in proteins_index.keys(): - future = client.submit(process_h5_file, h5_file, dataset_path, output_dir) + future = client.submit( + process_h5_file, h5_file, data_path, output_dir + ) futures.append(future) n = len(futures) - logger.info("Building combined PDB archive from %s H5 shard(s)", n) - - current_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S") - final_archive_name = f"archive_pdb_{structures_dataset.dataset_dir_name()}_{current_time}.zip" - final_archive_path = Path.cwd() / final_archive_name - - with zipfile.ZipFile(final_archive_path, "w") as final_zip: - with logging_redirect_tqdm(): - with tqdm(total=n, desc="H5 shards → final zip", unit="h5") as pbar: - i = 0 - for fut in as_completed(futures): - archive_path = fut.result() - with open(archive_path, "rb") as f: - archive_data = f.read() - final_zip.writestr(f"{i}.zip", archive_data) - i += 1 - pbar.update(1) + logger.info("Writing %s PDB shard zip(s) under %s", n, output_dir) + + with logging_redirect_tqdm(): + with tqdm(total=n, desc="H5 shards → zip", unit="h5") as pbar: + for fut in as_completed(futures): + fut.result() + pbar.update(1)