Source code for pestifer.molecule.molecule

#Author: Cameron F. Abrams, <cfa22@drexel.edu>
"""
A class for handling molecules
"""
import logging
import os

from pathlib import Path
from pidibble.pdbparse import PDBParser

from .asymmetricunit import AsymmetricUnit
from .bioassemb import BioAssembList, BioAssemb
from .chainidmanager import ChainIDManager
from .segment import Segment

from ..core.objmanager import ObjManager

from ..objs.cleavagesite import CleavageSiteList
from ..objs.graft import Graft

from ..util.cifutil import CIFload
from ..util.stringthings import my_logger

logger=logging.getLogger(__name__)


[docs]
class Molecule:
    """
    A class for handling molecules, including their asymmetric unit and biological assemblies.
    This class is initialized with a source dictionary that can contain various specifications
    such as PDB or mmCIF identifiers, prebuilt structures, or AlphaFold predictions
    and manages the parsing of these structures into an asymmetric unit and biological assemblies.
    
    Parameters
    ----------
    source : dict
        A dictionary containing the source specifications for the molecule. It can include:

        - ``id``: PDB or mmCIF identifier (e.g., {``id``: ``1ABC``, ``file_format``: ``PDB``})
        - ``prebuilt``: A dictionary with ``psf`` and ``pdb`` keys for prebuilt structures (e.g., {``prebuilt``: {``psf``: ``structure.psf``, ``pdb``: ``structure.pdb``}})
        - ``alphafold``: A dictionary with AlphaFold specifications (e.g., {``alphafold``: {``model``: ``AF-1234``}})
    
    objmanager : ObjManager, optional
        An instance of ObjManager to manage objects within the molecule. If not provided,
        a new ObjManager will be created.
    chainIDmanager : ChainIDManager, optional
        An instance of ChainIDManager to manage chain IDs within the molecule. If not provided,
        a new ChainIDManager will be created.
    reset_counter : bool, optional
        If True, resets the molecule counter to 0. This is useful for testing or reinitialization purposes.
        Default is False.

    Attributes
    ----------
    molid : int
        Unique identifier for the molecule instance, automatically incremented with each new instance.
    objmanager : ObjManager
        An instance of ObjManager that manages objects within the molecule.
    chainIDmanager : ChainIDManager
        An instance of ChainIDManager that manages chain IDs within the molecule.
    sourcespecs : dict
        A dictionary containing the source specifications for the molecule, such as PDB or mmCIF identifiers,
        prebuilt structures, or AlphaFold predictions.
    asymmetric_unit : AsymmetricUnit
        An instance of AsymmetricUnit representing the asymmetric unit of the molecule.
    biological_assemblies : BioAssembList
        An instance of BioAssembList containing the biological assemblies derived from the asymmetric unit.
    parsed_struct : dict
        A dictionary containing the parsed structure of the molecule, which may include atoms, residues, and
        segments parsed from the source specifications.
    rcsb_file_format : str
        The file format of the source specifications, either ``PDB`` or ``mmCIF``.
    """

    """
    Optional attributes for the Molecule class.
    
    Attributes
    ----------
    active_biological_assembly : BioAssemb
        An instance of BioAssemb representing the currently active biological assembly.
        This assembly is derived from the asymmetric unit and can be activated based on user requests.
    """

    _molcounter: int = 0

    def __init__(self, source: dict = {}, objmanager: ObjManager = None, chainIDmanager: ChainIDManager=None, **kwargs):
        psf = None
        reset = kwargs.get('reset_counter', False)
        file_format = source.get('file_format', 'PDB')
        if reset:
            Molecule._molcounter = 0
        else:
            Molecule._molcounter += 1
        if not source:
            logger.debug('Molecule initialized without source.')
            p_struct = None
        else:
            # logger.debug('Molecule initialization')
            if 'source_id' in source and 'source_db' in source:
                ext = '.pdb' if file_format in ['PDB', 'pdb'] else '.cif'
                source_name = source['source_id'] + ext
                source_path = Path(source_name)
                if source_path.exists():
                    if file_format in ['PDB', 'pdb']:
                        p_struct = PDBParser(filepath=source_path).parse().parsed
                    elif file_format in ['mmCIF', 'cif']:
                        logger.debug(f'CIF source {source['source_id']}')
                        p_struct = CIFload(source_path)
                else:
                    if file_format in ['PDB', 'pdb']:
                        p_struct = PDBParser(source_id=source['source_id'], source_db=source['source_db']).parse().parsed
                    elif file_format in ['mmCIF', 'cif']:
                        logger.debug(f'CIF source {source['source_id']}')
                        p_struct = CIFload(source['source_id'], source_db=source['source_db'])
            elif 'prebuilt' in source:
                logger.debug(f'Prebuilt record:')
                my_logger(source["prebuilt"], logger.debug)
                psf = source['prebuilt']['psf']
                pdb = source['prebuilt']['pdb']
                xsc = source['prebuilt'].get('xsc', '')
                logger.debug(f'Using prebuilt psf {psf} and pdb {pdb}')
                p_struct = PDBParser(filepath=pdb).parse().parsed
            else:
                logger.debug(f'Neither "source_id/source_db" or "prebuilt" specified; initializing an empty molecule')
                p_struct = None
        if objmanager is None:
            logger.debug(f'Making an empty ObjManager')
            objmanager = ObjManager()
        if chainIDmanager is None:
            logger.debug(f'Molecule instantiating its own ChainIDManager')
            chainIDmanager = ChainIDManager()
        self.sourcespecs = source
        self.objmanager = objmanager
        self.chainIDmanager = chainIDmanager
        self.rcsb_file_format = file_format
        self.molid = kwargs.get('molid', Molecule._molcounter)
        self.parsed_struct = p_struct
        self.asymmetric_unit = AsymmetricUnit(parsed=p_struct, 
                                              sourcespecs=source, 
                                              objmanager=objmanager, 
                                              chainIDmanager=chainIDmanager, 
                                              psf=psf)
        self.asymmetric_unit.set_parent_molecule(self)
        self.biological_assemblies = BioAssembList(p_struct)
        self.biological_assemblies.set_parent_molecule(self)

    def __repr__(self):
        """
        Return a string representation of the Molecule instance.
        This method provides a concise description of the molecule, including its ID and the number of atoms.
        
        Returns
        -------
        str
            A string representation of the Molecule instance.
        """
        return f'Molecule(molid={self.molid}, num_atoms={self.num_atoms()})'


[docs]
    def set_coords(self, altcoordsfile):
        """
        Set the coordinates of the asymmetric unit from an alternate coordinates file.
        This method reads the alternate coordinates from a PDB file and updates the asymmetric unit's coordinates.
        
        Parameters
        ----------
        altcoordsfile : str
            The path to the alternate coordinates file in PDB format.
        
        Raises
        ------
        AssertionError
            If the provided file is not in PDB format or if the file does not exist.
        """
        nm, ext = os.path.splitext(altcoordsfile)
        assert ext == '.pdb', f'Alt-coords file must be PDB format'
        altstruct = PDBParser(filepath=altcoordsfile).parse().parsed
        self.asymmetric_unit.set_coords(altstruct)



[docs]
    def activate_biological_assembly(self, index):
        """
        Activate a biological assembly by its index.
        This method sets the active biological assembly based on the provided index.
        
        Parameters
        ----------
        index : int
            The index of the biological assembly to activate. If the index is 0 or if no biological assemblies are specified,
            the asymmetric unit will be used as the biological assembly.
        
        Returns
        -------
        self : Molecule
            Returns the instance of the Molecule with the active biological assembly set.
            
        Raises
        ------
        AssertionError
            If the specified biological assembly index is invalid.
        """
        if index == 0 or len(self.biological_assemblies) == 0: # we will use the unadulterated A.U. as the B.A.
            self.active_biological_assembly = BioAssemb(self.asymmetric_unit)
            if index != 0:
                logger.warning(f'No biological assemblies specified in input structure.  Using A.U.; ignoring your request of B.A. "{index}"')
        else:
            self.active_biological_assembly = self.biological_assemblies.get(index=index)
            assert self.active_biological_assembly != None, f'No biological assembly "{index:d}" found.'
            logger.info(f'Activating biological assembly {self.active_biological_assembly.name} (idx {index})')
        self.active_biological_assembly.activate(self.asymmetric_unit, self.chainIDmanager)
        return self



[docs]
    def get_chainmaps(self):
        """
        Get a mapping of chain IDs in the active biological assembly.
        This method returns a dictionary where keys are original chain IDs and values are lists of dictionaries
        containing the transform index and the mapped chain ID.
        
        Returns
        -------
        maps : dict
            A dictionary mapping original chain IDs to lists of dictionaries with transform index and mapped chain ID.
            Each entry in the list corresponds to a transformation applied to the asymmetric unit.
        """
        ba=self.active_biological_assembly
        maps={}
        for transform in ba.transforms:
            for oc,mc in transform.chainIDmap.items():
                if not oc in maps:
                    maps[oc]=[]
                maps[oc].append({'transform':transform.index,'mappedchain':mc})
        return maps



[docs]
    def loop_counts(self,min_loop_length=1):
        """
        Check if the asymmetric unit contains loops (missing residues) of a specified minimum length.
        This method iterates through the segments of the asymmetric unit and counts the number of loops
        that are in the ``MISSING`` state and have a length greater than or equal to the specified minimum length.

        Parameters
        ----------
        min_loop_length : int
            The minimum length of loops to consider.

        Returns
        -------
        nloops : dict
            A dictionary containing the counts of loops in the asymmetric unit, categorized by segment type.
            The keys are segment types (e.g., ``protein``, ``nucleicacid``) and the values are the counts of loops.
            For example, ``{'protein': 3, 'nucleicacid': 2}`` indicates that there are 3 loops in protein segments
            and 2 loops in nucleic acid segments.
        """
        self.min_loop_length=min_loop_length
        self.has_protein_loops=False
        nloops={}
        nloops['protein']=0
        nloops['nucleicacid']=0
        au=self.asymmetric_unit
        for S in au.segments:
            if S.segtype=='protein':
                for b in S.subsegments:
                    if b.state=='MISSING' and b.num_items()>=min_loop_length:
                        nloops['protein']+=1
            elif S.segtype=='nucleicacid':
                for b in S.subsegments:
                    if b.state=='MISSING' and b.num_items()>=min_loop_length:
                        nloops['nucleicacid']+=1
        if nloops['protein']>0:
            self.has_protein_loops=True
        return nloops



[docs]
    def nglycans(self):
        """
        Count the number of glycan segments in the asymmetric unit.
        This method iterates through the segments of the asymmetric unit and counts the number of segments
        that are of type ``glycan``.

        Returns
        -------
        nglycans : int
            The number of glycan segments found in the asymmetric unit.
        """
        nglycans=0
        au=self.asymmetric_unit
        for S in au.segments:
            if S.segtype=='glycan':
                nglycans+=1
        return nglycans



[docs]
    def num_images(self):
        """
        Count the number of images in the active biological assembly.
        This method returns the number of transforms applied to the asymmetric unit in the active biological assembly.
        
        Returns
        -------
        num_images : int
            The number of images (transforms) in the active biological assembly."""
        return len(self.active_biological_assembly.transforms)



[docs]
    def num_atoms(self):
        """
        Count the number of atoms in the asymmetric unit.
        This method returns the total number of atoms present in the asymmetric unit.
        
        Returns
        -------
        num_atoms : int
            The total number of atoms present in the asymmetric unit.
        """
        return len(self.asymmetric_unit.atoms)

    

[docs]
    def num_residues(self):
        """
        Count the number of residues in the asymmetric unit.
        This method returns the total number of residues present in the asymmetric unit.

        Returns
        -------
        num_residues : int
            The total number of residues present in the asymmetric unit.
        """
        return len(self.asymmetric_unit.residues)



[docs]
    def num_segments(self):
        """
        Count the number of segments in the asymmetric unit.
        This method returns the total number of segments present in the asymmetric unit.
        
        Returns
        -------
        num_segments : int
            The total number of segments present in the asymmetric unit.
        """
        return len(self.asymmetric_unit.segments)


    # def write_protein_loop_lines(self,writer,cycles=100,**options):
    #     """
    #     Write Tcl commands to declare loops in the asymmetric unit that are in the ``MISSING``
    #     state and have a length greater than or equal to the specified minimum length.
    #     This method iterates through the segments of the asymmetric unit and generates Tcl commands
    #     to declare the loops.
        
    #     Parameters
    #     ----------
    #     writer : scriptwriter
    #         An instance of a script writer that will be used to write the Tcl commands.
    #     cycles : int, optional
    #         The number of cycles for the loop declaration. Default is 100.
    #     options : dict, optional
    #         Additional options for loop declaration. It can include:

    #         - ``min_length``: The minimum length of loops to consider. Default is 4.
    #         - ``include_c_termini``: If True, includes C-terminal loops in the declaration. If False, skips C-terminal loops. Default is False.
    #     """
    #     ba=self.active_biological_assembly
    #     au=self.asymmetric_unit
    #     min_length=self.min_loop_length
    #     include_c_termini=options.get('include_c_termini',False)
    #     for S in au.segments:
    #         # chainID=S.chainID
    #         if S.segtype in 'protein':
    #             asymm_segname=S.segname
    #             n_subsegs=len(S.subsegments)
    #             for b in S.subsegments:
    #                 is_c_terminus=(S.subsegments.index(b)==(n_subsegs-1))
    #                 is_processible=b.state=='MISSING' and b.num_items()>=min_length
    #                 if is_processible and (not include_c_termini) and is_c_terminus:
    #                     logger.debug(f'A.U. C-terminal loop (state={b.state}, bounds={b.bounds}) declashing is skipped')
    #                     is_processible=False
    #                 if is_processible:
    #                     reslist=[f'{r.resid.resid}' for r in S.residues[b.bounds[0]:b.bounds[1]+1]]
    #                     tcllist='[list '+' '.join(reslist)+']'
    #                     for transform in ba.transforms:
    #                         cm=transform.chainIDmap
    #                         act_segID=cm.get(asymm_segname,asymm_segname)
    #                         writer.addline(f'declash_loop $mLL {act_segID} {tcllist} {cycles}')


[docs]
    def write_gaps(self,writer,min_length=4):
        """
        Write Tcl commands to declare gaps in the asymmetric unit that are in the ``MISSING``
        state and have a length greater than or equal to the specified minimum length.
        This method iterates through the segments of the asymmetric unit and generates Tcl commands
        to declare the gaps.
        
        Parameters
        ----------
        writer : scriptwriter
            An instance of a script writer that will be used to write the Tcl commands.
        min_length : int, optional
            The minimum length of gaps to consider. Default is 4.
        """
        ba=self.active_biological_assembly
        au=self.asymmetric_unit
        writer.addline('# fields: segname loop-begin-res loop-end-res connect-to-res')
        for S in au.segments:
            # chainID=S.chainID
            if S.segtype=='protein':
                asymm_segname=S.segname
                for i,b in enumerate(S.subsegments):
                    if b.state=='MISSING':
                        if b.num_items()>=min_length and i<(len(S.subsegments)-1):
                            reslist=[f'{r.resid.resid}' for r in S.residues[b.bounds[0]:b.bounds[1]+1]]
                            bpp=S.subsegments[i+1]
                            nreslist=[f'{r.resid.resid}' for r in S.residues[bpp.bounds[0]:bpp.bounds[1]+1]]
                            assert bpp.state=='RESOLVED'
                            for transform in ba.transforms:
                                cm=transform.chainIDmap
                                act_segID=cm.get(asymm_segname,asymm_segname)
                                writer.addline(f'{act_segID} {reslist[0]} {reslist[-1]} {nreslist[0]}')



[docs]
    def write_connect_patches(self,writer,min_length=4):
        """
        Write Tcl commands to create connect patches (``LINK``s) for gaps in the asymmetric unit.
        This method iterates through the segments of the asymmetric unit and generates Tcl commands
        to create connect patches for gaps that are in the ``MISSING`` state and have a length
        greater than or equal to the specified minimum length.

        Parameters
        ----------
        writer : scriptwriter
            An instance of a script writer that will be used to write the Tcl commands.
        min_length : int, optional
            The minimum length of gaps to consider. Default is 4.
        """
        ba=self.active_biological_assembly
        au=self.asymmetric_unit
        for S in au.segments:
            # chainID=S.chainID
            if S.segtype=='protein':
                asymm_segname=S.segname
                for i,b in enumerate(S.subsegments):
                    if b.state=='MISSING':
                        if b.num_items()>=min_length and i<(len(S.subsegments)-1):
                            llres=S.residues[b.bounds[1]-1]
                            lres=S.residues[b.bounds[1]]
                            nextb=S.subsegments[i+1]
                            rres=S.residues[nextb.bounds[0]]
                            rrres=S.residues[nextb.bounds[0]+1]
                            for transform in ba.transforms:
                                cm=transform.chainIDmap
                                act_segID=cm.get(asymm_segname,asymm_segname)
                                ll=f'{act_segID}:{llres.resid.resid}'
                                l= f'{act_segID}:{ lres.resid.resid}'
                                r= f'{act_segID}:{ rres.resid.resid}'
                                rr=f'{act_segID}:{rrres.resid.resid}'
                                # undo the C-terminus on the left residue
                                writer.addline(f'patch XCTR {l}')
                                # under the N-terminus on the right residue
                                rname=rres.resname
                                if rname=='GLY':
                                    writer.addline(f'patch XGLP {r}')
                                elif rname=='PRO':
                                    writer.addline(f'patch XPRP {r}')
                                else:
                                    writer.addline(f'patch XNTR {r}')
                                # re-establish the peptide bond linkage
                                writer.addline(f'patch LINK {ll} {l} {r} {rr}')

                                # writer.addline(f'patch HEAL {ll} {l} {r} {rr}')


[docs]
    def cleave_chains(self, clv_list: CleavageSiteList):
        """
        Cleave segments in the asymmetric unit based on a list of cleavage specifications.
        This method iterates through the list of cleavage specifications, finds the corresponding segments in the asymmetric unit,
        and performs the cleavage operation. It also updates the chain IDs of disulfide bonds and links that are affected by the cleavage.

        Parameters
        ----------
        clv_list : list
            A list of cleavage specifications to apply to the asymmetric unit.
        """
        au = self.asymmetric_unit
        cm = self.chainIDmanager
        topomods = self.objmanager.get('topol', {})
        for clv in clv_list:
            chainID = clv.chainID
            matching = au.segments.filter(lambda x, cid=chainID: x.chainID == cid)
            if not matching:
                logger.debug(f'No segment with chainID {chainID} found; no cleavage performed.')
                continue
            for S in matching:
                if not S.residues.filter(lambda x, clv=clv: x.resid == clv.resid1):
                    continue
                DchainID = cm.next_unused_chainID()
                logger.debug(f'Cleaving segment {S.segname} at {clv} to make new segment with chainID {DchainID}')
                D = S.cleave(clv, DchainID)
                au.add_segment(D)

                ssbonds = topomods.get('ssbonds', [])
                if ssbonds:
                    for c in ssbonds.filter(lambda x, seg=S: x.chainID1 == seg.segname):
                        for d in D.residues.filter(lambda x, c=c: x.resid == c.resid1):
                            c.chainID1 = d.chainID
                    for c in ssbonds.filter(lambda x, seg=S: x.chainID2 == seg.segname):
                        for d in D.residues.filter(lambda x, c=c: x.resid == c.resid2):
                            c.chainID2 = d.chainID
                links = topomods.get('links', [])
                if links:
                    logger.debug(f'Examining {len(links)} links for ones needing chainID update from {S.segname} to {DchainID} due to cleavage')
                    for c in links.filter(lambda x, seg=S: x.chainID1 == seg.segname):
                        logger.debug(f'...link {str(c)}')
                        for d in D.residues.filter(lambda x, c=c: x.resid == c.resid1):
                            logger.debug(f'...hit! {c.chainID1}->{d.chainID}')
                            c.update_residue(1,chainID=d.chainID)
                    for c in links.filter(lambda x, seg=S: x.chainID2 == seg.segname):
                        logger.debug(f'...link {str(c)}')
                        for d in D.residues.filter(lambda x, c=c: x.resid == c.resid2):
                            logger.debug(f'...hit! {c.chainID2}->{d.chainID}')
                            c.update_residue(2,chainID=d.chainID)
                    for c in links:
                        logger.debug(str(c))



Graft.model_rebuild()
Segment.model_rebuild()