tsenoner · tsenoner · May 28, 2026 · May 28, 2026 · May 28, 2026 · May 28, 2026
diff --git a/.gitignore b/.gitignore
@@ -11,11 +11,12 @@
 /docs/publication
 
 # --- data ---
-# Ignore all data directories except 3FTx, Pla2g2, and toxins
+# Ignore all data directories except 3FTx, Pla2g2, toxins, and jmb_2025
 /data/*
 !/data/3FTx
 !/data/Pla2g2
 !/data/toxins
+!/data/jmb_2025
 # Always ignore pdb subdirectories, tmp subdirectories, and .h5 files everywhere
 **/pdb/
 **/tmp/

diff --git a/data/jmb_2025/README.md b/data/jmb_2025/README.md
@@ -0,0 +1,69 @@
+# JMB 2025 figure data (archive)
+
+Archived inputs and outputs behind the figures in the original ProtSpace
+publication, kept for backwards compatibility and reproducibility:
+
+> Senoner T, Olenyi T, Heinzinger M, Spannagl A, Bouras G, Rost B, Koludarov I.
+> **ProtSpace: A Tool for Visualizing Protein Space.** *J Mol Biol* (2025).
+> DOI: [10.1016/j.jmb.2025.168940](https://doi.org/10.1016/j.jmb.2025.168940)
+> Submitted 30 Nov 2024 · Accepted 7 Jan 2025 · Online 10 Jan 2025
+
+These files were added to the repo in commit `7c0442e` (2024-11-28, two days
+before the 30 Nov 2024 submission) and later removed during the Oct 2025 data
+cleanup. This directory is a frozen archive — do not regenerate it against
+current UniProt.
+
+## `toxprot/` — venom-toxin dataset
+
+- **5,181 proteins** — reviewed (Swiss-Prot) Metazoan venom/toxin entries from
+  the UniProt Animal Toxin Annotation Project (ToxProt).
+- **Embeddings:** ProtT5 (`Rostlab/prot_t5_xl_uniref50`, 1024-dim), computed on
+  **mature sequences** — i.e. with the signal peptide cleaved.
+- Features per protein: taxonomic `Order` / `Family` / `Genus`, the curated
+  `protein_category`, and the raw UniProt `Protein families` string. The
+  category mapping (conotoxin, three_finger_toxin, phospholipase_a2, …) is
+  defined by the regex rules in the original `process_toxin.ipynb` notebook.
+
+### Files
+
+| File | Description |
+|------|-------------|
+| `toxins.json` | ProtSpace JSON — projections (PCA/UMAP/PaCMAP) + features for all 5,181 proteins |
+| `toxins_style.json` | `toxins.json` with manual styling applied (carries `visualization_state`) |
+| `toxins.csv` | Per-protein annotations: identifier, Order, Family, Genus, **protein_category** (curated), Protein families |
+| `toxins_full.fasta` | **Reconstructed** full UniProt sequences (signal peptide included) |
+| `toxins_mature.fasta` | **Reconstructed** mature sequences (signal peptide cleaved — the actual embedding input) |
+| `rebuild_mature_fasta.py` | Script that regenerates the FASTAs from the `toxins.csv` accessions |
+
+The ProtT5 embeddings (`toxins_prott5.h5`, 5,181 × 1024, keyed by accession) are
+**not tracked in git** (`**/*.h5` is ignored). They can be regenerated from
+`toxins_mature.fasta` with `protspace embed -e prot_t5`.
+
+### Dimensionality-reduction parameters (as used in the figures)
+
+Recovered from the projection metadata embedded in `toxins.json` (computed on
+the ProtT5 embeddings):
+
+| Method | Parameters |
+|--------|------------|
+| PCA 2D / 3D | `n_components=2` / `3` |
+| UMAP 2D / 3D | `n_neighbors=50`, `min_dist=0.5`, `metric=euclidean` |
+| PaCMAP 2D / 3D | `n_neighbors=50`, `MN_ratio=0.5`, `FP_ratio=2.0` |
+
+### Note on the FASTAs
+
+The original input FASTA was **never committed** — it lived only in an untracked
+`raw_data/` directory (`toxins.tsv`, `noise.csv`,
+`mature_seqs_prot_t5_xl_uniref50.h5`). No tracked file from 2024 contains
+sequences: the `.h5` stores embedding vectors keyed by accession, and the CSVs
+carry annotations only.
+
+`toxins_full.fasta` and `toxins_mature.fasta` are therefore **reconstructions**:
+`rebuild_mature_fasta.py` takes the 5,181 accessions from `toxins.csv`, re-fetches
+each entry's sequence and signal-peptide annotation from UniProt, and writes both
+the full sequence and the signal-peptide-stripped mature sequence (the latter is
+what was embedded). Because UniProt entries can change over time, these are
+faithful but not guaranteed byte-identical to the November 2024 input.
+
+Reconstruction result: **5,179 / 5,181** sequences recovered (3,532 had a signal
+peptide cleaved in the mature file). The remaining two accessions are now
+obsolete/merged and could not be retrieved: `D5KR58`, `Q2PE51`.
diff --git a/data/jmb_2025/toxprot/rebuild_mature_fasta.py b/data/jmb_2025/toxprot/rebuild_mature_fasta.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+"""Reconstruct the FASTA files for the JMB 2025 toxprot dataset.
+
+The original FASTAs were never committed; the embeddings (``toxins_prott5.h5``,
+not tracked in git) were generated from signal-peptide-stripped (mature)
+sequences. This script reads the 5,181 UniProt accessions from ``toxins.csv``,
+re-fetches their sequences + signal-peptide annotations from UniProt, and
+writes two files:
+
+  - ``toxins_full.fasta``   — full UniProt sequences (signal peptide included)
+  - ``toxins_mature.fasta`` — mature sequences (signal peptide cleaved; the
+                              actual input used to compute the embeddings)
+
+Note: UniProt entries may have changed since November 2024, so the result is a
+faithful reconstruction, not necessarily byte-identical to the original input.
+Accessions that are now obsolete/merged are reported and skipped.
+"""
+
+from __future__ import annotations
+
+import csv
+import re
+import sys
+from pathlib import Path
+
+import requests
+
+# All paths are resolved relative to this script's own directory, so the
+# script does not depend on the current working directory.
+HERE = Path(__file__).resolve().parent
+# Input: annotation table whose `identifier` column lists the accessions.
+ACCESSIONS_CSV = HERE / "toxins.csv"
+# Output: sequences exactly as returned by UniProt.
+FULL_OUT = HERE / "toxins_full.fasta"
+# Output: the same sequences after signal-peptide removal (see `mature`).
+MATURE_OUT = HERE / "toxins_mature.fasta"
+
+# Endpoint queried by `fetch_batch` with a comma-separated `accessions` list.
+UNIPROT_ACCESSIONS_URL = "https://rest.uniprot.org/uniprotkb/accessions"
+# Matches a fully numeric range such as "SIGNAL 1..21" and captures the start
+# and end positions; ranges with non-numeric bounds do not match.
+SIGNAL_RE = re.compile(r"SIGNAL\s+(\d+)\.\.(\d+)")
+# Number of accessions sent per HTTP request.
+BATCH = 100
+
+
+def read_accessions(csv_path: Path) -> list[str]:
+    """Read the UniProt accessions from the tracked annotation CSV.
+
+    The original embeddings .h5 (same keys) is not committed; the `identifier`
+    column here is the authoritative accession list for the dataset.
+
+    Args:
+        csv_path: CSV file with a header row that contains an ``identifier``
+            column.
+
+    Returns:
+        The distinct, non-empty identifiers in sorted order, with surrounding
+        whitespace removed.
+
+    Raises:
+        KeyError: If the file has data rows but no ``identifier`` column.
+    """
+    # "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order mark,
+    # which would otherwise be glued to the first column name and make the
+    # `identifier` lookup fail. It also removes the dependency on the
+    # platform's default encoding.
+    with csv_path.open(newline="", encoding="utf-8-sig") as f:
+        # A set keeps each accession once, so no FASTA record is written twice.
+        # `or ""` covers short rows where DictReader fills the field with None.
+        identifiers = {
+            (row["identifier"] or "").strip() for row in csv.DictReader(f)
+        }
+    # A blank identifier cannot be looked up and would only show up as noise.
+    identifiers.discard("")
+    return sorted(identifiers)
+
+
+def fetch_batch(accessions: list[str]) -> dict[str, tuple[str, str]]:
+    """Fetch sequence and signal-peptide annotation for one batch of accessions.
+
+    Args:
+        accessions: UniProt accessions to request in a single HTTP call.
+
+    Returns:
+        ``{accession: (sequence, ft_signal)}`` keyed by the ``Entry`` column of
+        the TSV response. Accessions that do not appear in the response are
+        simply absent from the mapping. An empty ``accessions`` list or an
+        empty response body yields ``{}``.
+
+    Raises:
+        requests.HTTPError: If the server answers with an error status.
+        ValueError: If the TSV header lacks one of the expected columns.
+    """
+    if not accessions:
+        # Nothing to look up: skip the network round-trip entirely.
+        return {}
+    params = {
+        "accessions": ",".join(accessions),
+        "format": "tsv",
+        "fields": "accession,sequence,ft_signal",
+    }
+    resp = requests.get(UNIPROT_ACCESSIONS_URL, params=params, timeout=300)
+    resp.raise_for_status()
+    lines = resp.text.splitlines()
+    if not lines:
+        # Empty body (not even a header row): there are no entries to report.
+        return {}
+    header = lines[0].split("\t")
+    try:
+        i_acc, i_seq, i_sig = (
+            header.index("Entry"),
+            header.index("Sequence"),
+            header.index("Signal peptide"),
+        )
+    except ValueError as err:
+        # Same exception type as before, but with the offending header shown.
+        raise ValueError(f"unexpected UniProt TSV header: {header!r}") from err
+    last_needed = max(i_acc, i_seq, i_sig)
+    out: dict[str, tuple[str, str]] = {}
+    for line in lines[1:]:
+        cols = line.split("\t")
+        if len(cols) <= last_needed:
+            # Blank or truncated row: it cannot be attributed to an accession.
+            continue
+        out[cols[i_acc]] = (cols[i_seq], cols[i_sig])
+    return out
+
+
+def mature(seq: str, signal: str) -> str:
+    """Return ``seq`` without its signal peptide when the annotation is unambiguous.
+
+    ``signal`` is scanned with ``SIGNAL_RE``. Only when exactly one numeric
+    range is found is the prefix up to that range's end position removed. With
+    zero or several matches, or an empty/``None`` annotation, ``seq`` is
+    returned unchanged.
+    """
+    ranges = SIGNAL_RE.findall(signal or "")
+    if len(ranges) != 1:
+        return seq
+    _start, end = ranges[0]
+    # `end` is used directly as the slice start, so residues 1..end are dropped.
+    return seq[int(end) :]
+
+
+def main() -> int:
+    """Rebuild both FASTA files from the accession list and report the outcome.
+
+    Reads the accessions, fetches them from UniProt in chunks of ``BATCH``,
+    then writes one record per retrievable accession to ``FULL_OUT`` and
+    ``MATURE_OUT``. Accessions absent from the responses, or returned with an
+    empty sequence, are collected and listed in a warning.
+
+    Returns:
+        Always ``0``; missing accessions are reported but are not an error.
+    """
+    accessions = read_accessions(ACCESSIONS_CSV)
+    total = len(accessions)
+    print(f"{total} accessions from {ACCESSIONS_CSV.name}")
+
+    # All fetching happens before the output files are opened, so a failed
+    # request leaves any existing FASTA files untouched.
+    records: dict[str, tuple[str, str]] = {}
+    for offset in range(0, total, BATCH):
+        records.update(fetch_batch(accessions[offset : offset + BATCH]))
+        done = min(offset + BATCH, total)
+        print(f"  fetched {done}/{total}")
+
+    missing = [acc for acc in accessions if acc not in records]
+    written = 0
+    stripped = 0
+    with FULL_OUT.open("w") as full_fh, MATURE_OUT.open("w") as mature_fh:
+        for acc in accessions:
+            entry = records.get(acc)
+            if entry is None:
+                # Already listed in `missing` above.
+                continue
+            full_seq, signal = entry
+            if not full_seq:
+                missing.append(acc)
+                continue
+            mature_seq = mature(full_seq, signal)
+            # A length change is the only sign that a signal peptide was cut.
+            if len(mature_seq) != len(full_seq):
+                stripped += 1
+            full_fh.write(f">{acc}\n{full_seq}\n")
+            mature_fh.write(f">{acc}\n{mature_seq}\n")
+            written += 1
+
+    print(
+        f"Wrote {FULL_OUT.name} + {MATURE_OUT.name}: {written} sequences, "
+        f"{stripped} with SP stripped in the mature file"
+    )
+    if missing:
+        unique_missing = sorted(set(missing))
+        # Only the first 20 are listed; an ellipsis marks a truncated list.
+        suffix = " ..." if len(unique_missing) > 20 else ""
+        print(f"WARNING: {len(missing)} accessions not retrievable (obsolete/merged):")
+        print("  " + ", ".join(unique_missing[:20]) + suffix)
+    return 0
+
+
+# Run only when executed as a script; main()'s return value becomes the
+# process exit status.
+if __name__ == "__main__":
+    sys.exit(main())