Source code for pestifer.charmmff.ligand_paramgen.rcsb

# Author: Cameron F. Abrams, <cfa22@drexel.edu>
"""
Look up SMILES for a PDB chemical-component ID via the RCSB Data API.

The RCSB Data API exposes per-component records at
``https://data.rcsb.org/rest/v1/core/chemcomp/{COMP_ID}``. Each record's
``pdbx_chem_comp_descriptor`` array carries several SMILES variants
(OpenEye/CACTVS/ACDLabs, with and without stereo); we prefer the
OpenEye canonical-stereo form when present.

Used by the CGenFF ligand-parameterization pipeline so a HETATM resname
can be turned into a usable SMILES without the user having to supply
one for any ligand the PDB already knows about.
"""
from __future__ import annotations

import logging
from functools import lru_cache

import requests

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# URL template for a single chemical-component record. ``{comp_id}`` is
# filled in by ``fetch_ligand_smiles`` with the stripped, upper-cased ID.
RCSB_CHEMCOMP_URL = "https://data.rcsb.org/rest/v1/core/chemcomp/{comp_id}"

# Ordered preference: (program-substring, type). Earlier entries win.
#
# Matching rules (as applied by ``_pick_best_smiles``):
#   * ``type`` must equal the descriptor's "type" field exactly;
#   * a non-empty program-substring must occur inside the descriptor's
#     "program" field;
#   * an empty program-substring ("") accepts any program.
#
# OpenEye canonical SMILES carry stereo; CACTVS canonical do too. The
# bare "SMILES" entries are non-canonical fallbacks. There are no explicit
# ACDLabs rows: ACDLabs (or any other program's) descriptors are only
# reachable through the two catch-all "" rows at the end, which act as
# last-ditch fallbacks for very old components that lack the others.
_SMILES_PREFERENCE: tuple[tuple[str, str], ...] = (
    ("OpenEye", "SMILES_CANONICAL"),
    ("CACTVS",  "SMILES_CANONICAL"),
    ("OpenEye", "SMILES"),
    ("CACTVS",  "SMILES"),
    ("",        "SMILES_CANONICAL"),  # any program, canonical
    ("",        "SMILES"),            # any program, non-canonical
)


class RCSBLookupError(RuntimeError):
    """Signal that a SMILES lookup against the RCSB Data API did not succeed.

    ``fetch_ligand_smiles`` raises this single exception type for every
    failure it reports (empty component ID, failed HTTP request, unknown
    component, non-success HTTP status, or a record without any SMILES
    descriptor), so callers only need one ``except`` clause.
    """
@lru_cache(maxsize=512)
def fetch_ligand_smiles(comp_id: str, *, timeout: float = 10.0) -> str:
    """
    Fetch the best-available SMILES for a PDB chemical component.

    Successful results are memoized by ``functools.lru_cache`` on the exact
    arguments passed. Failed lookups raise and are therefore never cached,
    so a transient network error does not poison later calls.

    Parameters
    ----------
    comp_id
        PDB chemical-component ID (e.g. ``"ATP"``, ``"NAG"``).
        Case-insensitive; whitespace stripped.
    timeout
        HTTP request timeout in seconds.

    Returns
    -------
    str
        SMILES string. Preference order: OpenEye canonical (stereo) >
        CACTVS canonical (stereo) > OpenEye non-canonical > CACTVS
        non-canonical > any remaining ``SMILES_CANONICAL`` > any remaining
        ``SMILES``.

    Raises
    ------
    RCSBLookupError
        If the component ID is empty or not found, the network call fails,
        the server answers with a non-success status, the response body is
        not a JSON object, or the response carries no SMILES descriptor.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    key = comp_id.strip().upper()
    if not key:
        raise RCSBLookupError("Empty PDB component ID.")

    # Percent-encode the ID so stray characters ("/", "?", "#", spaces)
    # cannot change which endpoint is requested. Ordinary alphanumeric
    # component IDs are left untouched by quote().
    url = RCSB_CHEMCOMP_URL.format(comp_id=quote(key, safe=""))

    try:
        resp = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        raise RCSBLookupError(f"RCSB request failed for {key!r}: {exc}") from exc

    if resp.status_code == 404:
        raise RCSBLookupError(f"No PDB chemical component found for {key!r}.")
    if not resp.ok:
        raise RCSBLookupError(
            f"RCSB returned HTTP {resp.status_code} for {key!r}: "
            f"{resp.text[:200]!r}"
        )

    # A 2xx response is not guaranteed to contain valid JSON (proxies,
    # captive portals, truncated bodies). Every JSON decode error raised by
    # Response.json() is a ValueError subclass, so this one clause turns
    # them all into the documented exception type.
    try:
        payload = resp.json()
    except ValueError as exc:
        raise RCSBLookupError(
            f"RCSB returned invalid JSON for {key!r}: {exc}"
        ) from exc
    if not isinstance(payload, dict):
        raise RCSBLookupError(
            f"Unexpected RCSB response for {key!r}: expected a JSON object, "
            f"got {type(payload).__name__}."
        )

    # Tolerate a missing/null descriptor array and ignore malformed entries
    # so that bad data surfaces as "no SMILES" rather than AttributeError.
    raw_descriptors = payload.get("pdbx_chem_comp_descriptor")
    if isinstance(raw_descriptors, list):
        descriptors = [d for d in raw_descriptors if isinstance(d, dict)]
    else:
        descriptors = []

    smiles = _pick_best_smiles(descriptors)
    if smiles is None:
        raise RCSBLookupError(f"No SMILES descriptor in RCSB response for {key!r}.")

    logger.debug("RCSB SMILES for %s: %s", key, smiles)
    return smiles
def _pick_best_smiles(descriptors: list[dict]) -> str | None:
    """Return the most-preferred SMILES found in a descriptor list.

    Walks ``_SMILES_PREFERENCE`` from most to least preferred. For each
    ``(program-substring, type)`` rule, the descriptors are scanned in
    their given order and the first one that matches the rule *and* has a
    non-empty ``"descriptor"`` value wins.

    Parameters
    ----------
    descriptors
        Entries of a ``pdbx_chem_comp_descriptor`` array; each is read via
        its ``"type"``, ``"program"`` and ``"descriptor"`` keys.

    Returns
    -------
    str | None
        The selected descriptor value, or ``None`` when no rule matches.
    """
    for wanted_program, wanted_type in _SMILES_PREFERENCE:
        # Lazy filter: entries are examined one at a time, in order, so the
        # first usable match short-circuits the scan.
        matching = (
            entry
            for entry in descriptors
            if entry.get("type") == wanted_type
            and (
                not wanted_program  # empty substring accepts any program
                or wanted_program in (entry.get("program") or "")
            )
        )
        for entry in matching:
            candidate = entry.get("descriptor")
            if candidate:
                return candidate
    return None