diff --git a/news/fix_unk_resnames.rst b/news/fix_unk_resnames.rst new file mode 100644 index 000000000..0dd6b8302 --- /dev/null +++ b/news/fix_unk_resnames.rst @@ -0,0 +1,23 @@ +**Added:** + +* + +**Changed:** + +* Small molecules in RelativeHybridTopology topologies (including the output PDB) are now named LIG (alchemical ligand) and CF1, CF2… (cofactors) instead of UNK. If a residue name was already assigned, the assigned one is kept. + +**Deprecated:** + +* + +**Removed:** + +* + +**Fixed:** + +* Fixed inflated ligand RMSD in the RelativeHybridTopology protocol's structural analysis for systems containing cofactors; the ligand RMSD is now computed for the alchemical ligand alone rather than conflating it with cofactors that shared the UNK residue name. + +**Security:** + +* diff --git a/src/openfe/protocols/openmm_rfe/hybridtop_units.py b/src/openfe/protocols/openmm_rfe/hybridtop_units.py index 5f4c1c9f3..2e5436c18 100644 --- a/src/openfe/protocols/openmm_rfe/hybridtop_units.py +++ b/src/openfe/protocols/openmm_rfe/hybridtop_units.py @@ -46,6 +46,7 @@ from openmmtools import multistate import openfe +from openfe.protocols.openmm_utils.offmolecule_utils import _get_offmol_resname, _set_offmol_resname from openfe.protocols.openmm_utils.omm_settings import ( BasePartialChargeSettings, ) @@ -753,6 +754,20 @@ def run( alchem_comps = self._inputs["alchemical_components"] solvent_comp, protein_comp, small_mols = self._get_components(stateA, stateB) + alchemical = set(alchem_comps["stateA"]) | set(alchem_comps["stateB"]) + cofactors = [smc for smc in small_mols if smc not in alchemical] + + for smc, offmol in small_mols.items(): + if _get_offmol_resname(offmol) is not None: + continue + if smc in alchemical: + _set_offmol_resname(offmol, "LIG") + else: + _set_offmol_resname(offmol, f"CF{cofactors.index(smc) + 1}") + + # Record the resname the alchemical ligand actually ended up with + alchem_resname = _get_offmol_resname(small_mols[mapping.componentA]) + # Assign partial charges now to avoid any discrepancies later self._assign_partial_charges(settings["charge_settings"], small_mols) @@ -810,6 +825,7 @@ def run( "positions": positions_outfile, "pdb_structure": self.shared_basepath / settings["output_settings"].output_structure, "selection_indices": selection_indices, + "alchemical_resname": alchem_resname, } if dry: @@ -1505,6 +1521,7 @@ def _structural_analysis( trj_file: pathlib.Path, output_directory: pathlib.Path, dry: bool, + ligand_resname: str = "LIG", ) -> dict[str, str | pathlib.Path]: """ Run structural analysis using ``openfe-analysis``. @@ -1520,6 +1537,8 @@ def _structural_analysis( will be stored. dry : bool Whether or not we are running a dry run. + ligand_resname: str + The residue name of the hybrid topology ligand. Default: LIG Returns ------- @@ -1535,7 +1554,9 @@ def _structural_analysis( from openfe_analysis import rmsd try: - data = rmsd.gather_rms_data(pdb_file, trj_file) + data = rmsd.gather_rms_data( + pdb_file, trj_file, ligand_selection=f"resname {ligand_resname}" + ) # TODO: eventually change this to more specific exception types except Exception as e: return {"structural_analysis_error": str(e)} @@ -1573,6 +1594,7 @@ def run( pdb_file: pathlib.Path, trajectory: pathlib.Path, checkpoint: pathlib.Path, + ligand_resname: str = "LIG", dry: bool = False, verbose: bool = True, scratch_basepath: pathlib.Path | None = None, @@ -1588,6 +1610,8 @@ def run( Path to the MultiStateReporter generated NetCDF file. checkpoint : pathlib.Path Path to the checkpoint file generated by MultiStateReporter. + ligand_resname: str + The residue name of the hybrid topology ligand. Default: LIG dry : bool Do a dry run of the calculation, creating all necessary hybrid system components (topology, system, sampler, etc...) but without @@ -1641,6 +1665,7 @@ def run( trj_file=trajectory, output_directory=self.shared_basepath, dry=dry, + ligand_resname=ligand_resname, ) # Return relevant things @@ -1669,6 +1694,9 @@ def _execute( pdb_file=pdb_file, trajectory=trajectory, checkpoint=checkpoint, + ligand_resname=setup_results.outputs.get( + "alchemical_resname", "LIG" + ), # Adding this for backward compatibility scratch_basepath=ctx.scratch, shared_basepath=ctx.shared, ) diff --git a/src/openfe/protocols/openmm_utils/offmolecule_utils.py b/src/openfe/protocols/openmm_utils/offmolecule_utils.py new file mode 100644 index 000000000..a715af4e2 --- /dev/null +++ b/src/openfe/protocols/openmm_utils/offmolecule_utils.py @@ -0,0 +1,109 @@ +# This code is part of OpenFE and is licensed under the MIT license. +# For details, see https://github.com/OpenFreeEnergy/openfe +import logging +from typing import Any + +from openff.toolkit import Molecule as OFFMolecule + +logger = logging.getLogger(__name__) + + +def _set_offmol_metadata( + offmol: OFFMolecule, + key: Any, + val: Any | None, +) -> None: + """ + Set a given metadata entry for a whole Molecule. + + Parameters + ---------- + offmol : openff.toolkit.Molecule + The Molecule to set the metadata for. + key : Any + The metadata key. + val : Any + The value to set the metadata entry to. + """ + if val is None: + for a in offmol.atoms: + a.metadata.pop(key, None) + else: + for a in offmol.atoms: + a.metadata[key] = val + + +def _get_offmol_metadata(offmol: OFFMolecule, key: Any) -> Any | None: + """ + Get an offmol's given metadata entry and make sure it is + consistent across all atoms in the Molecule. + + Parameters + ---------- + offmol : openff.toolkit.Molecule + Molecule to get the metadata value from. + key: Any + The metadata entry key. + + Returns + ------- + value : Any | None + Metadata for the given key in the molecule. ``None`` if the + Molecule does not have that metadata entry set, or if + the value is inconsistent across all the atoms. + """ + value: Any | None = None + for a in offmol.atoms: + if value is None: + try: + value = a.metadata[key] + except KeyError: + return None + + if value != a.metadata[key]: + wmsg = f"Inconsistent metadata {key} in OFFMol: {offmol}" + logger.warning(wmsg) + return None + + return value + + +def _set_offmol_resname( + offmol: OFFMolecule, + resname: str | None, +) -> None: + """ + Helper method to set offmol residue names + + Parameters + ---------- + offmol : openff.toolkit.Molecule + Molecule to assign a residue name to. + resname : str | None + Residue name to be set. Set to None to clear it. + + Returns + ------- + None + """ + _set_offmol_metadata(offmol, "residue_name", resname) + + +def _get_offmol_resname(offmol: OFFMolecule) -> str | None: + """ + Helper method to get an offmol's residue name and make sure it is + consistent across all atoms in the Molecule. + + Parameters + ---------- + offmol : openff.toolkit.Molecule + Molecule to get the residue name from. + + Returns + ------- + resname : Optional[str] + Residue name of the molecule. ``None`` if the Molecule + does not have a residue name, or if the residue name is + inconsistent across all the atoms. + """ + return _get_offmol_metadata(offmol, "residue_name") diff --git a/src/openfe/tests/data/openmm_rfe/benzene_toluene_hybrid_top/hybrid_topology_atoms.csv b/src/openfe/tests/data/openmm_rfe/benzene_toluene_hybrid_top/hybrid_topology_atoms.csv index 3ed6ee0e8..86ed7f043 100644 --- a/src/openfe/tests/data/openmm_rfe/benzene_toluene_hybrid_top/hybrid_topology_atoms.csv +++ b/src/openfe/tests/data/openmm_rfe/benzene_toluene_hybrid_top/hybrid_topology_atoms.csv @@ -1,17 +1,17 @@ ,serial,name,element,resSeq,resName,chainID,segmentID,formal_charge -0,,C1x,C,0,UNK,0,, -1,,C2x,C,0,UNK,0,, -2,,C3x,C,0,UNK,0,, -3,,C4x,C,0,UNK,0,, -4,,C5x,C,0,UNK,0,, -5,,C6x,C,0,UNK,0,, -6,,H1x,H,0,UNK,0,, -7,,H2x,H,0,UNK,0,, -8,,H3x,H,0,UNK,0,, -9,,H4x,H,0,UNK,0,, -10,,H5x,H,0,UNK,0,, -11,,H6x,H,0,UNK,0,, -12,,H1x,H,0,UNK,0,, -13,,H2x,H,0,UNK,0,, -14,,C1x,C,0,UNK,0,, -15,,H3x,H,0,UNK,0,, +0,,C1x,C,0,LIG,0,, +1,,C2x,C,0,LIG,0,, +2,,C3x,C,0,LIG,0,, +3,,C4x,C,0,LIG,0,, +4,,C5x,C,0,LIG,0,, +5,,C6x,C,0,LIG,0,, +6,,H1x,H,0,LIG,0,, +7,,H2x,H,0,LIG,0,, +8,,H3x,H,0,LIG,0,, +9,,H4x,H,0,LIG,0,, +10,,H5x,H,0,LIG,0,, +11,,H6x,H,0,LIG,0,, +12,,H1x,H,0,LIG,0,, +13,,H2x,H,0,LIG,0,, +14,,C1x,C,0,LIG,0,, +15,,H3x,H,0,LIG,0,, diff --git a/src/openfe/tests/protocols/openmm_rfe/test_hybrid_top_protocol.py b/src/openfe/tests/protocols/openmm_rfe/test_hybrid_top_protocol.py index a1f9fef2d..662b46ae1 100644 --- a/src/openfe/tests/protocols/openmm_rfe/test_hybrid_top_protocol.py +++ b/src/openfe/tests/protocols/openmm_rfe/test_hybrid_top_protocol.py @@ -11,6 +11,7 @@ from unittest import mock import gufe +import MDAnalysis as mda import mdtraj as mdt import numpy as np import openmm @@ -50,6 +51,12 @@ HAS_NAGL, HAS_OPENEYE, ) +from openfe.protocols.openmm_utils.offmolecule_utils import ( + _get_offmol_metadata, + _get_offmol_resname, + _set_offmol_metadata, + _set_offmol_resname, +) def _get_units(protocol_units, unit_type): @@ -507,7 +514,7 @@ def test_dry_run_ligand( ): # this might be a bit time consuming solv_settings.simulation_settings.sampler_method = method - solv_settings.output_settings.output_indices = "resname UNK" + solv_settings.output_settings.output_indices = "resname LIG" protocol = openmm_rfe.RelativeHybridTopologyProtocol( settings=solv_settings, @@ -1091,7 +1098,7 @@ def test_dry_run_complex( ): # this will be very time consuming solv_settings.simulation_settings.sampler_method = method - solv_settings.output_settings.output_indices = "protein or resname UNK" + solv_settings.output_settings.output_indices = "protein or resname LIG" protocol = openmm_rfe.RelativeHybridTopologyProtocol( settings=solv_settings, @@ -2401,3 +2408,114 @@ def test_dry_run_vacuum_write_frequency( assert reporter.velocity_interval == velocities_write_frequency.m else: assert reporter.velocity_interval == 0 + + +@pytest.fixture +def eg5_vac_inputs(eg5_ligands, eg5_cofactor): + ligA, ligB = eg5_ligands[0], eg5_ligands[1] + mapper = openfe.LomapAtomMapper() + mapping = next(mapper.suggest_mappings(ligA, ligB)) + stateA = openfe.ChemicalSystem({"ligand": ligA, "cofactor": eg5_cofactor}) + stateB = openfe.ChemicalSystem({"ligand": ligB, "cofactor": eg5_cofactor}) + return stateA, stateB, mapping + + +def _run_setup_dry(stateA, stateB, mapping, settings, tmp_path): + protocol = openmm_rfe.RelativeHybridTopologyProtocol(settings=settings) + dag = protocol.create(stateA=stateA, stateB=stateB, mapping=mapping) + setup_unit = _get_units(dag.protocol_units, HybridTopologySetupUnit)[0] + return setup_unit.run(dry=True, scratch_basepath=tmp_path, shared_basepath=tmp_path) + + +def test_ligand_separable_from_cofactor(eg5_vac_inputs, vac_settings, tmp_path): + vac_settings.output_settings.output_indices = "all" + stateA, stateB, mapping = eg5_vac_inputs + out = _run_setup_dry(stateA, stateB, mapping, vac_settings, tmp_path) + + u = mda.Universe(out["pdb_structure"]) + lig = u.select_atoms("resname LIG") + cof = u.select_atoms("resname CF1") + assert lig.n_atoms > 0 + assert cof.n_atoms > 0 + assert set(lig.indices).isdisjoint(cof.indices) + + +def test_existing_resname_preserved(eg5_ligands, eg5_cofactor, vac_settings, tmp_path): + vac_settings.output_settings.output_indices = "all" + + ligA, ligB = eg5_ligands[0], eg5_ligands[1] + off = eg5_cofactor.to_openff() + _set_offmol_resname(off, "MYC") + cof = openfe.SmallMoleculeComponent.from_openff(off) + + mapper = openfe.setup.KartografAtomMapper() + mapping = next(mapper.suggest_mappings(ligA, ligB)) + stateA = openfe.ChemicalSystem({"ligand": ligA, "cofactor": cof}) + stateB = openfe.ChemicalSystem({"ligand": ligB, "cofactor": cof}) + + out = _run_setup_dry(stateA, stateB, mapping, vac_settings, tmp_path) + + u = mda.Universe(out["pdb_structure"]) + assert "MYC" in u.residues.resnames + assert "CF1" not in u.residues.resnames + assert "LIG" in u.residues.resnames + + +def test_hybrid_ligand_takes_componentA_resname(eg5_ligands, eg5_cofactor, vac_settings, tmp_path): + vac_settings.output_settings.output_indices = "all" + + ligA, ligB = eg5_ligands[0], eg5_ligands[1] + offA, offB = ligA.to_openff(), ligB.to_openff() + _set_offmol_resname(offA, "LGA") + _set_offmol_resname(offB, "LGB") + ligA_named = openfe.SmallMoleculeComponent.from_openff(offA) + ligB_named = openfe.SmallMoleculeComponent.from_openff(offB) + + mapper = openfe.setup.KartografAtomMapper() + mapping = next(mapper.suggest_mappings(ligA_named, ligB_named)) + stateA = openfe.ChemicalSystem({"ligand": ligA_named, "cofactor": eg5_cofactor}) + stateB = openfe.ChemicalSystem({"ligand": ligB_named, "cofactor": eg5_cofactor}) + + out = _run_setup_dry(stateA, stateB, mapping, vac_settings, tmp_path) + + u = mda.Universe(out["pdb_structure"]) + lig = u.select_atoms("resname LGA") + cof = u.select_atoms("resname CF1") + + # the setup unit records componentA's resname for the analysis unit + assert out["alchemical_resname"] == "LGA" + + # the whole hybrid ligand is under componentA's resname + assert lig.n_atoms > 0 + assert u.select_atoms("resname LGB").n_atoms == 0 + # user-set names are kept, not overwritten with the LIG default + assert u.select_atoms("resname LIG").n_atoms == 0 + assert "UNK" not in {r.resname for r in u.residues} + # cofactor stays separate + assert cof.n_atoms > 0 + assert set(lig.indices).isdisjoint(cof.indices) + + +def test_structural_analysis_uses_ligand_resname(): + captured = {} + + def fake_gather(pdb, trj, ligand_selection="resname UNK"): + captured["ligand_selection"] = ligand_selection + return { + "protein_RMSD": [], + "ligand_RMSD": [], + "ligand_wander": [], + "protein_2D_RMSD": [], + "time(ps)": [], + } + + with mock.patch("openfe_analysis.rmsd.gather_rms_data", side_effect=fake_gather): + HybridTopologyMultiStateAnalysisUnit._structural_analysis( + pdb_file=Path("dummy.pdb"), + trj_file=Path("dummy.nc"), + output_directory=Path("."), + dry=True, + ligand_resname="LGA", + ) + + assert captured["ligand_selection"] == "resname LGA" diff --git a/src/openfe/tests/protocols/test_openmmutils.py b/src/openfe/tests/protocols/test_openmmutils.py index ffacad813..e230afbce 100644 --- a/src/openfe/tests/protocols/test_openmmutils.py +++ b/src/openfe/tests/protocols/test_openmmutils.py @@ -2,6 +2,7 @@ # For details, see https://github.com/OpenFreeEnergy/openfe import copy import gzip +import logging import os import sys from importlib import resources @@ -9,9 +10,7 @@ from unittest import mock import numpy as np -import pooch import pytest -from gufe import BaseSolventComponent from gufe.components.errors import ComponentValidationError from gufe.settings import OpenMMSystemGeneratorFFSettings, ThermoSettings from numpy.testing import assert_allclose, assert_equal @@ -44,6 +43,11 @@ HAS_NAGL, HAS_OPENEYE, ) +from openfe.protocols.openmm_utils.offmolecule_utils import ( + _get_offmol_metadata, + _set_offmol_metadata, + _set_offmol_resname, +) from ..conftest import HAS_INTERNET @@ -1189,3 +1193,22 @@ def test_forward_backwards_failure(simulation_nc): ret = ana.get_forward_and_reverse_analysis() assert ret is None + + +def test_get_metadata_inconsistent_warns(caplog): + mol = OFFMol.from_smiles("CC") + _set_offmol_metadata(mol, "residue_name", "LIG") + mol.atoms[0].metadata["residue_name"] = "COF" + + with caplog.at_level(logging.WARNING): + result = _get_offmol_metadata(mol, "residue_name") + + assert result is None + assert "Inconsistent metadata" in caplog.text + + +def test_set_metadata_none_clears(): + mol = OFFMol.from_smiles("CC") + _set_offmol_metadata(mol, "residue_name", "LIG") + _set_offmol_metadata(mol, "residue_name", None) + assert all("residue_name" not in a.metadata for a in mol.atoms)