Source code for pestifer.charmmff.ligand_paramgen.protonation
# Author: Cameron F. Abrams, <cfa22@drexel.edu>
"""
Protonate a ligand residue at a target pH while preserving its PDB
heavy-atom order, names, and coordinates.
Used as the first stage of the on-the-fly CGenFF parameterization
pipeline: the protonated 3D Mol returned here feeds the mol2 writer
that drives the ``cgenff`` binary.
"""
from __future__ import annotations
import logging
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from rdkit import Chem
from ...molecule.residue import Residue
logger = logging.getLogger(__name__)
_PARAMGEN_INSTALL_HINT = (
"Install the optional dependencies with "
"`pip install pestifer[ligand-paramgen]`."
)
[docs]
class LigandProtonationError(RuntimeError):
"""Raised when a ligand cannot be protonated for CGenFF parameterization."""
def _import_rdkit():
"""Lazily import RDKit, raising a friendly error if it is not installed.
RDKit is an optional ``ligand-paramgen`` dependency, so it must not be
imported at module load time (that would break a plain install's CLI).
"""
try:
from rdkit import Chem
from rdkit.Chem import rdDetermineBonds
except ModuleNotFoundError as exc:
raise LigandProtonationError(
f"RDKit is required for ligand parameterization. {_PARAMGEN_INSTALL_HINT}"
) from exc
return Chem, rdDetermineBonds
def _import_dimorphite():
"""Lazily import Dimorphite-DL, raising a friendly error if it is missing."""
try:
import dimorphite_dl
except ModuleNotFoundError as exc:
raise LigandProtonationError(
f"Dimorphite-DL is required for ligand parameterization. "
f"{_PARAMGEN_INSTALL_HINT}"
) from exc
return dimorphite_dl
[docs]
def protonate_ligand(
residue: "Residue",
smiles: str,
ph: float = 7.4,
) -> Chem.Mol:
"""
Protonate a ligand residue at the given pH using Dimorphite-DL.
The input ``residue`` supplies heavy-atom coordinates and PDB atom names;
the ``smiles`` supplies bond orders and formal charges. Heavy-atom order
in the returned Mol matches the input residue's heavy-atom order, and
each heavy atom carries its PDB name as the RDKit property ``_pdb_name``.
Hydrogens are placed in 3D at the protonation state appropriate for ``ph``.
Parameters
----------
residue
A pestifer :class:`~pestifer.molecule.residue.Residue` for the ligand.
Hydrogens in ``residue.atoms`` are ignored; only heavy atoms are used.
smiles
SMILES describing the ligand's heavy-atom skeleton and bond orders.
Heavy-atom count must match the number of heavy atoms in ``residue``.
ph
pH at which to protonate (default 7.4). ``ph_min`` and ``ph_max``
passed to Dimorphite-DL are both set to this value.
"""
Chem, _ = _import_rdkit()
dimorphite_dl = _import_dimorphite()
heavy_pdb_atoms = [
a for a in residue.atoms.data
if (a.elem or "").strip().upper() != "H"
]
if not heavy_pdb_atoms:
raise LigandProtonationError(f"Residue {residue} has no heavy atoms.")
coord_mol = _build_coord_mol(heavy_pdb_atoms)
template = Chem.MolFromSmiles(smiles)
if template is None:
raise LigandProtonationError(f"Could not parse SMILES: {smiles!r}")
if template.GetNumHeavyAtoms() != len(heavy_pdb_atoms):
raise LigandProtonationError(
f"SMILES has {template.GetNumHeavyAtoms()} heavy atoms but residue "
f"{residue} has {len(heavy_pdb_atoms)}."
)
# Substructure match maps each template atom -> its position in coord_mol
# (which is PDB heavy-atom order). Stamp the template's atoms with 1-indexed
# map numbers that encode the PDB order; this survives the SMILES round-trip
# through Dimorphite-DL.
# coord_mol has all-single bonds from DetermineConnectivity, so we relax
# bond-order strictness on the template's match.
match = coord_mol.GetSubstructMatch(template)
if not match:
params = Chem.AdjustQueryParameters.NoAdjustments()
params.makeBondsGeneric = True
relaxed = Chem.AdjustQueryProperties(template, params)
match = coord_mol.GetSubstructMatch(relaxed)
if not match:
raise LigandProtonationError(
f"SMILES skeleton does not match {residue}'s heavy-atom graph."
)
for t_idx, m_idx in enumerate(match):
template.GetAtomWithIdx(t_idx).SetAtomMapNum(m_idx + 1)
mapped_smiles = Chem.MolToSmiles(template, canonical=False)
logger.debug("Dimorphite input SMILES: %s", mapped_smiles)
# precision=0.0 collapses Dimorphite-DL's pKa uncertainty window so it
# returns a single dominant microspecies at the target pH; with the default
# precision Dimorphite-DL widens to pKa +/- 1 and can return both forms
# near borderline groups (e.g. amines whose pKa is ~3 above pH).
protonated_smiles = dimorphite_dl.protonate_smiles(
mapped_smiles,
ph_min=ph,
ph_max=ph,
precision=0.0,
max_variants=1,
)
if not protonated_smiles:
raise LigandProtonationError(
f"Dimorphite-DL returned no protomers for {residue} at pH {ph}."
)
logger.debug("Dimorphite output SMILES: %s", protonated_smiles[0])
protonated = Chem.MolFromSmiles(protonated_smiles[0])
if protonated is None:
raise LigandProtonationError(
f"Could not parse Dimorphite output SMILES: {protonated_smiles[0]!r}"
)
protonated = Chem.RenumberAtoms(protonated, _heavy_atom_map_order(protonated))
n_heavy = len(heavy_pdb_atoms)
if protonated.GetNumAtoms() < n_heavy:
raise LigandProtonationError(
f"Protonated SMILES has {protonated.GetNumAtoms()} atoms but expected "
f"at least {n_heavy} heavy atoms."
)
# Copy heavy-atom 3D coords from coord_mol (which is in PDB order); stamp
# PDB names on the heavy atoms.
conf_src = coord_mol.GetConformer()
conf_dst = Chem.Conformer(protonated.GetNumAtoms())
for i in range(n_heavy):
conf_dst.SetAtomPosition(i, conf_src.GetAtomPosition(i))
protonated.GetAtomWithIdx(i).SetProp("_pdb_name", heavy_pdb_atoms[i].name)
protonated.GetAtomWithIdx(i).SetAtomMapNum(0)
protonated.RemoveAllConformers()
protonated.AddConformer(conf_dst, assignId=True)
return Chem.AddHs(protonated, addCoords=True)
def _build_coord_mol(heavy_atoms) -> Chem.Mol:
"""Build a single-conformer RDKit Mol from heavy atoms in input order.
Connectivity is inferred from interatomic distances by
:func:`rdDetermineBonds.DetermineConnectivity`; all bonds are single
and bond orders are not meaningful. This Mol is used solely as the
target of a (bond-order-relaxed) substructure match against the
SMILES template, so we can stamp PDB-order atom map numbers onto
the template's atoms.
"""
Chem, rdDetermineBonds = _import_rdkit()
rwmol = Chem.RWMol()
conf = Chem.Conformer(len(heavy_atoms))
for i, a in enumerate(heavy_atoms):
sym = (a.elem or "").strip()
if not sym:
raise LigandProtonationError(
f"Atom {a.name} in residue has no element symbol; "
f"cannot build RDKit Mol."
)
idx = rwmol.AddAtom(Chem.Atom(sym))
conf.SetAtomPosition(idx, (float(a.x), float(a.y), float(a.z)))
rwmol.AddConformer(conf, assignId=True)
mol = rwmol.GetMol()
try:
rdDetermineBonds.DetermineConnectivity(mol)
except ValueError as exc:
raise LigandProtonationError(
f"RDKit could not infer connectivity from heavy-atom coordinates: {exc}"
) from exc
return mol
def _heavy_atom_map_order(mol: Chem.Mol) -> list[int]:
"""Permutation that reorders ``mol`` so mapped atoms come first in
map-number order, with any unmapped atoms (e.g., explicit Hs) trailing."""
mapped = sorted(
(a.GetAtomMapNum(), a.GetIdx())
for a in mol.GetAtoms()
if a.GetAtomMapNum() > 0
)
unmapped = [a.GetIdx() for a in mol.GetAtoms() if a.GetAtomMapNum() == 0]
return [idx for _, idx in mapped] + unmapped