Source code for pestifer.tasks.psfgen

# Author: Cameron F. Abrams, <cfa22@drexel.edu>
"""
Definition of the :class:`PsfgenTask` class for handling invocations of psfgen which create a molecule from a base PDB/mmCIF file.

Usage is described in the :ref:`subs_buildtasks_psfgen` documentation.

"""
import logging
import networkx as nx
import os

from copy import deepcopy
from pathlib import Path
from typing import ClassVar


from .basetask import VMDTask
from ..molecule.chainidmanager import ChainIDManager
from ..molecule.molecule import Molecule
from ..core.objmanager import ObjManager
from ..core.artifacts import *
from ..core.errors import PestiferBuildError
from ..objs.graft import GraftList
from ..psfutil.psfatom import PSFAtomList
from ..psfutil.psfcontents import PSFContents
from ..scripters import VMDScripter, PsfgenScripter
from ..util.util import write_residue_map

logger = logging.getLogger(__name__)


[docs]
class PsfgenTask(VMDTask):
    """ 
    A class for handling invocations of psfgen which create a molecule from a base PDB/mmCIF file
    or from a PSF file generated previously by psfgen
    This class is a descendant of the :class:`BaseTask <pestifer.core.basetask.BaseTask>` class, and as such it has two parameters.

    Parameters
    ----------
    config_specs : dict
        Configuration specifications for the task.
    """

    _yaml_header: ClassVar[str] = 'psfgen'
    """
    YAML header for the PsfgenTask, used to identify the task in configuration files as part of a ``tasks`` list.
    """


[docs]
    def provision(self, packet: dict = {}):
        """
        Provision the task with the standard packet, and initialize molecule dict and base molecule placeholders.
        """
        super().provision(packet=packet)
        self.molecules = {}
        self.base_molecule: Molecule = None



[docs]
    def do(self):
        """
        Execute the psfgen task.
        This method initializes the task, ingests the base molecule, and runs the psfgen process.
        It also handles any necessary coormods and declashing of loops and glycans based on the task specifications.
        The results of the psfgen process are saved as a PSF/PDB fileset, and the state is updated accordingly.
        """
        logger.debug('ingesting molecule(s)')
        self.ingest_molecules()
        logger.debug(f'base mol num images {self.base_molecule.num_images()}')
        logger.debug('Running first psfgen')
        self.result = self.psfgen()
        if self.result != 0:
            return self.result
        # we now have a full coordinate set, so we can do coormods
        self.coormods()
        self.declash()
        return self.result



[docs]
    def coormods(self):
        """
        Perform coordinate modifications based on the specifications provided in the task.
        """
        coormods = self.objmanager.get('coord',{})
        logger.debug(f'psfgen task has {len(coormods)} coormods:')
        logger.debug(';'.join([str(_) for _ in coormods]))

        if coormods:
            logger.debug(f'performing coormods')
            for objtype, objlist in coormods.items():
                if len(objlist) > 0:
                    self.next_basename(objtype)
                    vm: VMDScripter = self.scripters['vmd']
                    packages = []
                    if objtype == 'crotations':
                        packages.append('PestiferCRot')
                    vm.newscript(self.basename, packages=packages)
                    state: StateArtifacts = self.get_current_artifact('state')
                    vm.load_psf_pdb(state.psf.name, state.pdb.name, new_molid_varname='mCM')
                    match objtype:
                        case 'crotations':
                            for transform in self.base_molecule.active_biological_assembly.transforms.data:
                                vm.write_crots(objlist, chainIDmap=transform.chainIDmap)
                        case 'orient':
                            vm.write_orients(objlist)
                        case 'rottrans':
                            vm.write_rottranslist(objlist)
                    vm.write_pdb(self.basename, 'mCM')
                    vm.writescript()
                    vm.runscript()
                    self.register(dict(
                        pdb=PDBFileArtifact(self.basename), 
                        psf=state.psf, 
                        xsc=state.xsc), key='state', artifact_type=StateArtifacts)
                    self.register(self.basename, key='tcl', artifact_type=VMDScriptArtifact)
                    self.register(self.basename, key='log', artifact_type=VMDLogFileArtifact)



[docs]
    def declash(self):
        """
        Manages the declashing of protein loops and glycans
        """
        self.min_loop_length = self.specs['source'].get('sequence',{}).get('loops',{}).get('min_loop_length',0)
        self.declash_counts = self.base_molecule.loop_counts(min_loop_length=self.min_loop_length)
        num_images = self.base_molecule.num_images()
        for segtype in ['protein','nucleicacid']:
            self.declash_counts[segtype] *= num_images
        self.declash_counts['glycan'] = self.base_molecule.nglycans() * num_images
        for segtype,speckey in zip(['protein','nucleicacid','glycan'],['loops','loops','glycans']):
            if self.declash_counts[segtype]>0 and self.specs['source']['sequence'][speckey]['declash']['maxcycles']>0:
                logger.debug(f'Declashing {self.declash_counts[segtype]} {segtype} segments')
                self.declash_segtype(self.specs['source']['sequence'][speckey]['declash'], segtype=segtype)



[docs]
    def resi_topologies(self):
        """
        Collect the topology files that are needed for the residues in the base molecule.
        """
        resis = set([x.resname for x in self.base_molecule.asymmetric_unit.residues.data])
        logger.debug(f'Residue names in base molecule: {resis}')
        CC = self.resource_manager.charmmff_content
        new_topfiles = set()
        for resname in resis:
            topfile = CC.get_topfile_of_resname(resname)
            if topfile:
                new_topfiles.add(topfile)
            else:
                raise PestiferBuildError(f'No topology file found for residue name {resname}')
        return list(new_topfiles)



[docs]
    def patch_topologies(self):
        """
        Collect the topology files that are needed for the patches and links in the base molecule.
        This method retrieves the CHARMMFF topology files associated with the patches and links defined 
        in the base molecule's object manager.
        """
        objmanager = self.base_molecule.objmanager
        seqmods = objmanager.get('seq', {})
        patches = seqmods.get('patches', [])
        CC = self.resource_manager.charmmff_content
        new_topfiles = set()
        # logger.debug(f'New topologies: {new_topfiles}')
        for patch in patches:
            topfile = CC.get_topfile_of_resname(patch.patchname)
            if topfile:
                new_topfiles.add(topfile)
        # logger.debug(f'New topologies: {new_topfiles}')
        topomods = objmanager.get('topol', {})
        links = topomods.get('links', [])
        for link in links:
            topfile = CC.get_topfile_of_resname(link.patchname)
            if topfile:
                new_topfiles.add(topfile)
        # logger.debug(f'New topologies: {new_topfiles}')
        return list(new_topfiles)



[docs]
    def psfgen(self):
        """
        Run the psfgen process to generate a PSF and PDB fileset from the base molecule.
        """
        self.next_basename('build')
        pg: PsfgenScripter = self.scripters['psfgen']
        required_topology_files = list(set(self.patch_topologies() + self.resi_topologies()))
        pg.newscript(self.basename, packages=['PestiferCRot'], additional_topologies=required_topology_files)
        pg.set_molecule(self.base_molecule, altcoords=self.specs.get('source', {}).get('altcoords', None))
        pg.describe_molecule(self.base_molecule)
        pg.writescript(self.basename)
        result = pg.runscript(keep_tempfiles=True)
        if result != 0:
            return result
        # register PSF, PDB, log, and all charmmff files in the pipeline context
        self.register(self.basename, key='tcl', artifact_type=PsfgenInputScriptArtifact)
        self.register(self.basename, key='log', artifact_type=PsfgenLogFileArtifact) 
        self.register(dict(
            pdb=PDBFileArtifact(self.basename, pytestable=True), 
            psf=PSFFileArtifact(self.basename, pytestable=True)), key='state', artifact_type=StateArtifacts)
        self.register([CharmmffTopFileArtifact(x) for x in pg.topologies if x.endswith('.rtf')], key='charmmff_topfiles', artifact_type=CharmmffTopFileArtifacts)
        self.register([CharmmffStreamFileArtifact(x) for x in pg.topologies if x.endswith('.str')], key='charmmff_streamfiles', artifact_type=CharmmffStreamFileArtifacts)
        self.register([PDBFileArtifact(x) for x in pg.F if x.endswith('.pdb')], key='psfgen_temp_pdbs', artifact_type=PDBFileArtifactList)
        self.pipeline.show_artifacts(header='Artifacts after psfgen')                              
        self.strip_remarks()
        return 0

        

[docs]
    def strip_remarks(self):
        """
        Strip REMARK lines from the PDB file generated by psfgen.
        This method removes any REMARK lines from the PDB file to ensure that it contains only the relevant atomic coordinates and structure information.
        """
        state: StateArtifacts = self.get_current_artifact('state')
        pdb = state.pdb.name
        if not pdb:
            logger.warning('No PDB file found to strip remarks from.')
            return
        if not os.path.exists(pdb):
            logger.warning(f'PDB file {pdb} does not exist, cannot strip remarks.')
            return
        logger.debug(f'Stripping REMARK lines from {pdb}.')
        with open(pdb, 'r') as infile:
            lines = infile.readlines()
        with open(pdb, 'w') as outfile:
            for line in lines:
                if not line.startswith('REMARK'):
                    outfile.write(line)
        self.register(dict(
            pdb=PDBFileArtifact(self.basename, pytestable=True), 
            psf=state.psf), key='state', artifact_type=StateArtifacts)



[docs]
    def declash_segtype(self, specs: dict, segtype='protein'):
        """
        Declash loops in the base molecule using the custom ``PestiferDeclash`` TcL package.
        This method generates a VMD script to identify and declash loops in the molecular structure.
        It uses the ``PestiferDeclash`` package to perform the declashing operation, which involves identifying
        and modifying the coordinates of atoms in loop regions.
        """
        specs_to_declashers = specs#['source']['sequence']['declash']
        if segtype == 'protein':
            self.declash_protein_loops(specs_to_declashers)
        elif segtype == 'nucleicacid':
            self.declash_na_loops(specs_to_declashers)
        elif segtype == 'glycan':
            self.declash_glycans(specs_to_declashers)
        else:
            raise PestiferBuildError(f'Unknown segment type {segtype} for declashing')



[docs]
    def declash_protein_loops(self, specs: dict):
        """
        Declash loops in the base molecule using the custom ``PestiferDeclash`` TcL package.
        This method generates a VMD script to identify and declash loops in the molecular structure.
        It uses the ``PestiferDeclash`` package to perform the declashing operation, which involves identifying
        and modifying the coordinates of atoms in loop regions.
        """
        mol = self.base_molecule
        cycles = specs['maxcycles']
        self.next_basename('declash-loops')
        state: StateArtifacts = self.get_current_artifact('state')
        psf = state.psf.name
        pdb = state.pdb.name
        vt: VMDScripter = self.scripters['vmd']
        vt.newscript(self.basename, packages=['PestiferDeclash'])
        vt.load_psf_pdb(psf, pdb, new_molid_varname='mLL')
        vt.write_protein_loop_lines(mol, cycles=cycles, include_c_termini=specs['include_C_termini'])
        vt.write_pdb(self.basename, 'mLL')
        vt.writescript()
        vt.runscript()
        self.register(dict(
            pdb=PDBFileArtifact(self.basename), 
            psf=state.psf), key='state', artifact_type=StateArtifacts)
        self.register(self.basename, key='tcl', artifact_type=VMDScriptArtifact)
        self.register(self.basename, key='log', artifact_type=VMDLogFileArtifact)



[docs]
    def declash_na_loops(self,specs):
        """
        Declash nucleic acid loops in the base molecule using the custom ``PestiferDeclash`` TcL package.
        This method generates a VMD script to identify and declash nucleic acid loops in the molecular structure.
        It uses the ``PestiferDeclash`` package to perform the declashing operation, which involves identifying
        and modifying the coordinates of atoms in nucleic acid loop regions.
        """
        cycles = specs['maxcycles']
        clashdist = specs['clashdist']
        self.next_basename('declash-na-loops')
        vt: VMDScripter = self.scripters['vmd']
        state: StateArtifacts = self.get_current_artifact('state')
        psf = state.psf.name
        pdb = state.pdb.name
        outpdb = f'{self.basename}.pdb'
        vt.newscript(self.basename, packages=['PestiferDeclash'])
        vt.addline(f'mol new {psf}')
        vt.addline(f'mol addfile {pdb} waitfor all')
        vt.addline(f'set a [atomselect top all]')
        vt.addline(f'set molid [molinfo top get id]')
        nna = self._write_na_loops(vt)
        vt.addline(f'set nna {nna}')
        vt.addline(f'vmdcon -info "Declashing $nna nucleic acid loops; clashdist {clashdist}; maxcycles {cycles}"')
        vt.addline(r'for {set i 0} {$i<$nna} {incr i} {')
        vt.addline(f'   declash_pendant $molid $na_idx($i) $rbonds($i) $movers($i) {cycles} {clashdist}')
        vt.addline(r'}')
        vt.addline(f'$a writepdb {outpdb}')
        vt.writescript()
        logger.debug(f'Declashing {nna} nucleic acid loops')
        vt.runscript(progress_title='declash-nucleic-acid-loops')
        self.register(dict(
            pdb=PDBFileArtifact(self.basename), 
            psf=state.psf), key='state', artifact_type=StateArtifacts)
        self.register(self.basename, key='tcl', artifact_type=VMDScriptArtifact)
        self.register(self.basename, key='log', artifact_type=VMDLogFileArtifact)


    def _write_na_loops(self, vt: VMDScripter, **options):
        mol = self.base_molecule
        au = mol.asymmetric_unit
        state: StateArtifacts = self.get_current_artifact('state')
        psf = state.psf.name
        logger.debug(f'ingesting {psf}')
        struct = PSFContents(psf, parse_topology=['bonds'])
        logger.debug(f'PSF has {len(struct.atoms)} atoms; extracting nucleic acid atoms...')
        na_atoms: PSFAtomList = struct.atoms.get(lambda x: x.segtype == 'nucleicacid')
        logger.debug(f'PSF has {len(na_atoms)} nucleic acid atoms')
        # my_rep = list(set([(x.chainID, x.resid) for x in na_atoms.data]))
        # my_rep.sort(key=lambda x: (x[0], x[1]))
        logger.debug(f'Getting loops from {len(na_atoms)} nucleic acid atoms in PSF file {psf}')
        # logger.debug(f'{my_rep}')
        min_length = self.min_loop_length
        include_c_termini = options.get('include_c_termini', False)
        i = 0
        SL = [S for S in au.segments.data if S.segtype == 'nucleicacid']
        logger.debug(f'Asymmetric unit has {len(SL)} nucleic acid segments')
        for S in SL:
            asymm_segname = S.segname
            logger.debug(f'Processing segment {asymm_segname} for nucleic acid loops')
            n_subsegs = len(S.subsegments)
            for b in S.subsegments.data:
                lr_resid = S.residues[b.bounds[0]].resid
                rr_resid = S.residues[b.bounds[1]].resid
                logger.debug(f'Processing subsegment {b.state} for segname {asymm_segname} with bounds {lr_resid}-{rr_resid}')
                is_c_terminus = (S.subsegments.index(b) == (n_subsegs - 1))
                is_processible = b.state == 'MISSING' and b.num_items() >= min_length
                if is_processible and (not include_c_termini) and is_c_terminus:
                    logger.debug(f'A.U. C-terminal loop {b.state} declashing is skipped')
                    is_processible = False
                if is_processible:
                    logger.debug(f'Processing loop {b.state} {b.bounds} for segname {asymm_segname}')
                    loop_atoms = PSFAtomList([x for x in na_atoms.data if x.segname == asymm_segname and x.resid >= lr_resid and x.resid <= rr_resid])
                    logger.debug(f'Loop {b.state} has {len(loop_atoms)} atoms from PSFAtomList')
                    na_graph = loop_atoms.graph()
                    logger.debug(f'{na_graph}')
                    G = [na_graph.subgraph(c).copy() for c in nx.connected_components(na_graph)]
                    assert len(G) == 1, f'NA loop {b.state} has more than one connected component'
                    logger.debug(f'Loop {b.state} has {len(loop_atoms)} atoms')
                    g = G[0]
                    serials = [x.serial for x in g]
                    for at in g:
                        lig_ser = [x.serial for x in at.ligands]
                        for k, ls in enumerate(lig_ser):
                            if not ls in serials:
                                at.is_root = True
                                rp = at.ligands[k]
                                logger.debug(f'-> Atom {str(at)} is the root, bound to atom {str(rp)}')
                    indices = ' '.join([str(x.serial-1) for x in g])
                    vt.addline(f'set na_idx({i}) [list {indices}]')
                    vt.addline(f'set rbonds({i}) [list]')
                    vt.addline(f'set movers({i}) [list]')
                    for bond in nx.bridges(g):
                        ai, aj = bond
                        if not (ai.isH() or aj.isH()) and not ai.is_pep(aj):
                            g.remove_edge(ai, aj)
                            CC = [g.subgraph(c).copy() for c in nx.connected_components(g)]
                            assert len(CC) == 2, f'Bond {ai.serial-1}-{aj.serial-1} when cut makes more than 2 components'
                            for sg in CC:
                                is_root = any([hasattr(x, 'is_root') for x in sg])
                                if not is_root:
                                    if ai in sg:
                                        sg.remove_node(ai)
                                    if aj in sg:
                                        sg.remove_node(aj)
                                    if len(sg) > 1 or (len(sg) == 1 and not [x for x in sg.nodes][0].isH()):
                                        mover_serials = [x.serial for x in sg]
                                        mover_indices = " ".join([str(x-1) for x in mover_serials])
                                        logger.debug(f'{str(ai)}--{str(aj)} is a rotatable bridging bond')
                                        vt.addline(f'lappend rbonds({i}) [list {ai.serial-1} {aj.serial-1}]')
                                        logger.debug(f'  -> movers: {" ".join([str(x) for x in sg])}')
                                        vt.addline(f'lappend movers({i}) [list {mover_indices}]')
                            g.add_edge(ai, aj)
                    i += 1
        return i


[docs]
    def declash_glycans(self, specs):
        """
        Declash glycans in the base molecule using the custom ``PestiferDeclash`` TcL package.
        This method generates a VMD script to identify and declash glycans in the molecular structure.
        It uses the ``PestiferDeclash`` package to perform the declashing operation,
        which involves identifying and modifying the coordinates of atoms in glycan regions.
        """
        mol: Molecule = self.base_molecule
        cycles: int = specs['maxcycles']
        clashdist: float = specs['clashdist']
        if not mol.nglycans() or not cycles:
            logger.debug(f'Glycan declashing is intentionally not done.')
            return
        self.next_basename('declash-glycans')
        outpdb = f'{self.basename}.pdb'
        state: StateArtifacts = self.get_current_artifact('state')
        psf = state.psf.name
        pdb = state.pdb.name
        vt: VMDScripter = self.scripters['vmd']
        vt.newscript(self.basename, packages=['PestiferDeclash'])
        vt.addline(f'mol new {psf}')
        vt.addline(f'mol addfile {pdb} waitfor all')
        vt.addline(f'set a [atomselect top all]')
        vt.addline(f'set molid [molinfo top get id]')
        nglycans = self._write_glycans(vt)
        vt.addline(f'vmdcon -info "Declashing $nglycans glycans; clashdist {clashdist}; maxcycles {cycles}"')
        vt.addline(r'for {set i 0} {$i<$nglycans} {incr i} {')
        vt.addline(f'   declash_pendant $molid $glycan_idx($i) $rbonds($i) $movers($i) {cycles} {clashdist}')
        vt.addline(r'}')
        vt.addline(f'$a writepdb {outpdb}')
        vt.writescript()
        logger.debug(f'Declashing {nglycans} glycans')
        vt.runscript(progress_title='declash-glycans')
        self.register(dict(
            pdb=PDBFileArtifact(self.basename), 
            psf=state.psf), key='state', artifact_type=StateArtifacts)
        self.register(self.basename, key='tcl', artifact_type=VMDScriptArtifact)
        self.register(self.basename, key='log', artifact_type=VMDLogFileArtifact)


    def _write_glycans(self, fw: VMDScripter):
        state: StateArtifacts = self.get_current_artifact('state')
        psf = state.psf.name
        logger.debug(f'ingesting {psf}')
        struct = PSFContents(psf, parse_topology=['bonds'])
        logger.debug(f'Making graph structure of glycan atoms...')
        glycanatoms = struct.atoms.get(lambda x: x.segtype == 'glycan')
        logger.debug(f'{len(glycanatoms)} total glycan atoms')
        glycangraph = glycanatoms.graph()
        G = [glycangraph.subgraph(c).copy() for c in nx.connected_components(glycangraph)]
        logger.debug(f'Preparing declash input for {len(G)} glycans')
        fw.addline(f'set nglycans {len(G)}')
        for i, g in enumerate(G):
            rooted = False
            logger.debug(f'Glycan {i} has {len(g)} atoms')
            serials = [x.serial for x in g] # list of atoms in glycan by serial
            for j, at in enumerate(g):
                lig_ser = [x.serial for x in at.ligands] # list of ligands of this atom by serial
                logger.debug(f'    {j}: {str(at)} {len(lig_ser)} ligands')
                for k, ls in enumerate(lig_ser):
                    if not ls in serials: # this atom has a ligand that is not in the glycan
                        at.is_root = True
                        rooted = True
                        rp = at.ligands[k]
                        logger.debug(f'        -> {str(at)} ({at.resname} {at.segtype}) is the root, bound to outside atom {rp.serial}')
                        break
                if rooted:
                    break
            assert rooted, f'    Could not find a root atom for glycan {i} -- this is a bug'
            indices = ' '.join([str(x.serial-1) for x in g])
            # logger.debug(f'indices {indices}')
            fw.comment(f'Glycan {i}:')
            fw.addline(f'set glycan_idx({i}) [list {indices}]')
            fw.addline(f'set rbonds({i}) [list]')
            fw.addline(f'set movers({i}) [list]')
            # for each rotatable bond, define the two atoms in the bond by INDICES 
            # and identify movers when pendant is rotated around this bond
            # a rotatable bond is one that is a bridge, is not a peptide bond, and is not a bond to an H
            for bond in nx.bridges(g):
                ai, aj = bond
                if not (ai.isH() or aj.isH()) and not ai.is_pep(aj):
                    g.remove_edge(ai, aj)
                    S = [g.subgraph(c).copy() for c in nx.connected_components(g)]
                    assert len(S) == 2, f'Bond {str(ai)}-{str(aj)} when cut makes more than 2 components (not a bridge)'
                    is_root = [any([getattr(x, 'is_root', False) for x in sg]) for sg in S]
                    assert any(is_root), f'No root atom found in either half split by bond {str(ai)}-{str(aj)}'
                    unrooted_subgraph = S[is_root.index(False)]
                    if ai in unrooted_subgraph:
                        unrooted_subgraph.remove_node(ai)
                    if aj in unrooted_subgraph:
                        unrooted_subgraph.remove_node(aj)
                    if len(unrooted_subgraph) > 1 or (len(unrooted_subgraph) == 1 and not list(unrooted_subgraph.nodes)[0].isH()):
                        mover_serials = [x.serial for x in unrooted_subgraph]
                        mover_indices = " ".join([str(x-1) for x in mover_serials])
                        logger.debug(f'{str(ai)}--{str(aj)} is a rotatable bridging bond')
                        fw.addline(f'lappend rbonds({i}) [list {ai.serial-1} {aj.serial-1}]')
                        logger.debug(f'  -> movers: {",".join([str(x) for x in unrooted_subgraph])}')
                        fw.addline(f'lappend movers({i}) [list {mover_indices}]')
                    g.add_edge(ai, aj)
        return len(G)


[docs]
    def ingest_molecules(self):
        """
        Ingests the base molecule from the specifications provided in the task.
        This molecule is initialized from either base coordinates fetched in a prior fetch task,
        or a state defined by a previous continuation task.
        It also handles any graft sources specified in the sequence modifications and
        activates the biological assembly of the base molecule.
        """
        specs = self.specs
        self.source_specs = specs['source']
        assert not 'id' in self.source_specs, f'Version 2.0+ of Pestifer does not support "id" in source specs.  Please begin your task list with either a `fetch` or `continuation` task.'
        this_source = {}
        base_coordinates: Path = self.get_current_artifact_path('base_coordinates')
        if not base_coordinates:
            state: StateArtifacts = self.get_current_artifact('state')
            if not (state.pdb and state.psf):
                raise PestiferBuildError(f'No base_coordinates artifact found, and no prebuilt PDB/PSF/XSC files found in the pipeline context. Cannot ingest base molecule.')
            this_source['prebuilt'] = {
                'pdb': state.pdb.name,
                'psf': state.psf.name,
                'xsc': state.xsc.name if state.xsc else None
            }
            this_source['file_format'] = 'PDB'
        else:
            basename, ext = os.path.splitext(base_coordinates)
            this_source['source_id'] = basename
            this_source['source_db'] = 'local'
            if ext.lower() == '.pdb':
                this_source['file_format'] = 'PDB'
            elif ext.lower() == '.cif':
                this_source['file_format'] = 'mmCIF'
            else:
                raise PestiferBuildError(f'Unknown file format {ext} for base_coordinates artifact {base_coordinates.name}')
        self.source_specs.update(this_source)
        logger.debug(f'User-input modspecs:')
        my_logger(self.specs["mods"], logger.debug, depth=1)
        self.objmanager = ObjManager(self.specs['mods'])
        seqmods = self.objmanager.get('seq', {})
        logger.debug(f'Ingesting seqmods:')
        my_logger(seqmods, logger.debug, depth=1)
        if 'grafts' in seqmods:
            # logger.debug(f'looking for graft sources to ingest')
            Grafts: GraftList = seqmods['grafts']
            graft_artifacts = []
            for g in Grafts.data:
                if not g.source_pdbid in self.molecules:
                    logger.debug(f'Ingesting graft source {g.source_pdbid}')
                    this_source = {
                        'source_db': 'rcsb',
                        'source_id': g.source_pdbid,
                        'file_format': 'PDB',
                        'sequence': {'skip_glycan_renaming': True}
                    }
                    # the Molecule call below will fetch coordinates for graft pdbs
                    self.molecules[g.source_pdbid] = Molecule(source=this_source)
                    graft_artifacts.append(PDBFileArtifact(g.source_pdbid))
                g.activate(deepcopy(self.molecules[g.source_pdbid]))
            if len(graft_artifacts) > 0:
                self.register(graft_artifacts, key='graft_sources', artifact_type=PDBFileArtifactList)
        self.chainIDmanager = ChainIDManager(
            format = self.source_specs['file_format'],
            transform_reserves = self.source_specs.get('transform_reserves', {}),
            remap = self.source_specs.get('remap_chainIDs', {}))
        # if 'vmdatomselections' in self.source_specs['exclude']:
        #     self.apply_vmdexclusions()
        self.base_molecule = Molecule(source = self.source_specs,
                                      objmanager = self.objmanager,
                                      chainIDmanager = self.chainIDmanager).activate_biological_assembly(self.source_specs['biological_assembly'])
        # register self.base_molecule in the pipeline context
        self.register(self.base_molecule, key='base_molecule')
        for molid, molecule in self.molecules.items():
            logger.debug(f'Molecule "{molid}": {molecule.num_atoms()} atoms in {molecule.num_residues()} residues; {molecule.num_segments()} segments.')
        if 'cif' in self.source_specs.get('file_format', '').lower():
            if self.source_specs.get('cif_residue_map_file', ''):
                write_residue_map(self.base_molecule.asymmetric_unit.residues.cif_residue_map(), self.source_specs['cif_residue_map_file'])
                self.register(Path(self.source_specs['cif_residue_map_file']), key='cif_residue_map', artifact_type=CSVDataFileArtifact)
            logger.debug(f'Base molecule has {self.base_molecule.num_atoms()} atoms in {self.base_molecule.num_residues()} residues; {self.base_molecule.num_segments()} segments.')



[docs]
    def update_molecule(self):
        """
        Updates all segments of the base molecule based on the 
        current coordinate file.  All ssbonds and links are 
        carried forward. No biological assembly beyond the apparent
        asymmetric unit is assumed. This should permit generation
        of a new psfgen script based on this new molecule to 
        recreate it or modify it.
        """
        # get the key of the base_molecule
        logger.debug(f'{self.taskname} has {len(self.molecules)} entries in its molecules dict')
        base_key = 'base'
        for k, v in self.molecules.items():
            if v == self.base_molecule:
                base_key = k
        # assert base_key!='UNSET',f'Cannot update a non-existent base molecule'
        # get the psf, pdb, xsc from the pipeline context
        state: StateArtifacts = self.get_current_artifact('state')
        pdb: Path = state.pdb
        psf: Path = state.psf
        xsc: Path | None = state.xsc
        if not (pdb and psf):
            raise PestiferBuildError(f'No base_coordinates artifact found, and no prebuilt PDB/PSF/XSC files found in the pipeline context. Cannot ingest base molecule.')
        source = {}
        source['prebuilt'] = {
            'pdb': pdb.name,
            'psf': psf.name,
            'xsc': xsc.name if xsc else None
        }
        if hasattr(self, 'chainIDmanager') and hasattr(self, 'objmanager'):
            updated_molecule = Molecule(source=source, chainIDmanager=self.chainIDmanager, objmanager=self.objmanager).activate_biological_assembly(0)
        else:
            updated_molecule = Molecule(source=source).activate_biological_assembly(0)

        self.molecules[base_key] = updated_molecule
        self.base_molecule = updated_molecule
        self.register(self.base_molecule, key='base_molecule')