from io import StringIO

from gufe.vendor.pdb_file.pdbstructure import PdbStructure, _parse_atom_index


def _atom_record(serial, name, res_seq, x, y, z):
    """Build one fixed-column PDB ATOM record for a single-letter element.

    The record is assembled from explicitly sized fields instead of a
    hand-spaced literal, so the column layout cannot be broken by
    whitespace being collapsed or reflowed.

    Parameters
    ----------
    serial : str
        Atom serial exactly as it should appear in columns 7-11
        (decimal or letter-containing).
    name : str
        Single-letter element symbol; used for both the atom name and
        the element columns.
    res_seq : int
        Residue sequence number.
    x, y, z : float
        Coordinates, written with three decimals.
    """
    fields = (
        f"{'ATOM':<6}",  # cols  1-6   record name
        f"{serial:>5}",  # cols  7-11  atom serial
        " ",             # col   12    unused
        f" {name:<3}",   # cols 13-16  atom name (one-letter elements start in col 14)
        " ",             # col   17    alternate location
        "LIG",           # cols 18-20  residue name
        " ",             # col   21    unused
        "A",             # col   22    chain id
        f"{res_seq:>4}", # cols 23-26  residue number
        f"{'':4}",       # cols 27-30  insertion code + padding
        f"{x:8.3f}",     # cols 31-38
        f"{y:8.3f}",     # cols 39-46
        f"{z:8.3f}",     # cols 47-54
        f"{1.0:6.2f}",   # cols 55-60  occupancy
        f"{0.0:6.2f}",   # cols 61-66  temperature factor
        f"{'':10}",      # cols 67-76  unused / segment id
        f"{name:>2}",    # cols 77-78  element symbol
    )
    return "".join(fields)


def test_hex_conect_parsing():
    """Letter-containing atom serials are parsed in ATOM and CONECT records."""
    serial_fields = ["99999", "A000F", "A000G"]
    records = [
        _atom_record("99999", "C", 1, 0.0, 0.0, 0.0),
        _atom_record("A000F", "N", 2, 1.0, 0.0, 0.0),
        _atom_record("A000G", "O", 3, 0.0, 1.0, 0.0),
        # CONECT: record name followed by right-justified 5-column serials
        "CONECT" + "".join(f"{field:>5}" for field in serial_fields),
    ]

    f = StringIO("\n".join(records))
    pdb = PdbStructure(f, load_all_models=True)

    # Collect all atom serial numbers, including the letter-containing ones
    atom_serials = [atom.serial_number for atom in pdb.iter_atoms(use_all_models=True)]

    # There should be 3 atoms
    assert len(atom_serials) == 3

    # All serial numbers should be integers and unique
    assert all(isinstance(serial, int) for serial in atom_serials)
    assert len(set(atom_serials)) == 3

    # Convert the letter-containing serials the same way the parser does;
    # each must land beyond the five-digit decimal range
    a000f_serial = _parse_atom_index("A000F")
    a000g_serial = _parse_atom_index("A000G")
    for converted in (a000f_serial, a000g_serial):
        assert isinstance(converted, int)
        assert converted >= 100000
        assert converted in atom_serials

    # Check that the CONECT record refers to the same integers
    connects = pdb._current_model.connects
    assert len(connects) == 1
    central, bonded1, bonded2 = connects[0]
    assert central == 99999
    # The bonded atoms are exactly the two converted serials, one each
    assert {bonded1, bonded2} == {a000f_serial, a000g_serial}
    assert bonded1 != bonded2
_load(self, input_stream): if command == "ATOM " or command == "HETATM": self._add_atom(Atom(pdb_line, self, self.extraParticleIdentifier)) elif command == "CONECT": - atoms = [_parse_atom_index(pdb_line[6:11])] - for pos in (11, 16, 21, 26): - try: - atoms.append(_parse_atom_index(pdb_line[pos : pos + 5])) - except: - pass - self._current_model.connects.append(atoms) + try: + atoms = [_parse_atom_index(pdb_line[6:11])] + for pos in (11, 16, 21, 26): + try: + atoms.append(_parse_atom_index(pdb_line[pos : pos + 5])) + except: + pass + self._current_model.connects.append(atoms) + except: + pass # Notice MODEL punctuation, for the next level of detail # in the structure->model->chain->residue->atom->position hierarchy elif pdb_line[:5] == "MODEL": @@ -1060,12 +1063,27 @@ def __str__(self): return str(self.position) -def _parse_atom_index(index): - """Parse the string containing an atom index, which might be either decimal or hex.""" - try: - return int(index) - except: - return int(index, 16) - 0xA0000 + 100000 +def _parse_atom_index(index: str) -> int: + """ + Parse an atom serial index from a PDB file, supporting: + - Decimal numbers (e.g., 12345) + - Standard hex (0–9, A–F) + - Maestro-style extended hex (letters beyond F, e.g., A000G) + """ + index = index.strip() + + # Try decimal, then hex, then Maestro base36 + for base in (10, 16, 36): + try: + val = int(index, base) + if val >= 0xA0000: + val = val - 0xA0000 + 100000 + return val + except ValueError: + continue + + raise ValueError(f"Unable to parse atom index: '{index}'") + # run module directly for testing