Source code for pestifer.charmmff.graphhelpers

# Author: ChatGPT 5
# Assistant: Cameron F. Abrams <cfa22@drexel.edu>

import networkx as nx
import logging

logger = logging.getLogger(__name__)


[docs]
def elem(G: nx.Graph, n: int, element_attr: str = 'element'):
    e = G.nodes[n].get(element_attr)
    return e if e else str(n)[0]  # fallback: first char of label



[docs]
def is_H(G: nx.Graph, n: int, element_attr: str = 'element'):  return elem(G, n, element_attr) == 'H'


[docs]
def is_C(G: nx.Graph, n: int, element_attr: str = 'element'):  return elem(G, n, element_attr) == 'C'


[docs]
def is_O(G: nx.Graph, n: int, element_attr: str = 'element'):  return elem(G, n, element_attr) == 'O'


[docs]
def is_N(G: nx.Graph, n: int, element_attr: str = 'element'):  return elem(G, n, element_attr) == 'N'


[docs]
def is_P(G: nx.Graph, n: int, element_attr: str = 'element'):  return elem(G, n, element_attr) == 'P'


[docs]
def is_S(G: nx.Graph, n: int, element_attr: str = 'element'):  return elem(G, n, element_attr) == 'S'



[docs]
def mark_cc_doubles_by_degree(G: nx.Graph, element_attr: str = 'element', set_other_cc_single: bool = True):
    """
    Set bond 'order' on C-C edges using node degrees only:
      - C-C with both endpoints degree==3 -> order=2
      - (optional) all other C-C -> order=1
    Non-C-C edges are untouched.
    Returns the list of edges marked as double.
    """
    deg = dict(G.degree())  # snapshot the DegreeView once

    # Optionally preset all C-C to single
    if set_other_cc_single:
        for u, v in G.edges():
            if is_C(G, u, element_attr) and is_C(G, v, element_attr):
                G.edges[u, v]['order'] = 1

    cc_doubles = []
    for u, v in G.edges():
        if is_C(G, u, element_attr) and is_C(G, v, element_attr):
            if deg.get(u, 0) == 3 and deg.get(v, 0) == 3:
                G.edges[u, v]['order'] = 2
                cc_doubles.append((u, v))
    return cc_doubles


import networkx as nx

HETERO_HEAVY = {'O', 'N', 'P', 'S'}  # extend if you like (F, Cl, Br, I, etc.)
ATOMIC_Z = {'H':1, 'C':6, 'N':7, 'O':8, 'P':15, 'S':16}  # extend if needed

# ---------------- helpers ----------------

[docs]
def heavy_neighbors(G: nx.Graph, n: int, element_attr: str = 'element'):
    return [x for x in G.neighbors(n) if not is_H(G, x, element_attr)]



[docs]
def heavy_degree(G: nx.Graph, n: int, element_attr: str = 'element'):
    return sum(1 for _ in heavy_neighbors(G, n, element_attr))



[docs]
def heavy_only_nodes(G: nx.Graph, element_attr: str = 'element'):
    return [n for n in G.nodes if not is_H(G, n, element_attr)]



[docs]
def cycle_atoms_heavy(G: nx.Graph, element_attr: str = 'element'):
    """Return nodes on any cycle, computed on heavy-atom induced subgraph."""
    Hsub = G.subgraph(heavy_only_nodes(G, element_attr))
    cyc_nodes = set()
    for comp in nx.connected_components(Hsub):
        sub = Hsub.subgraph(comp)
        for cyc in nx.cycle_basis(sub):
            cyc_nodes.update(cyc)
    return cyc_nodes



[docs]
def seed_set(G: nx.Graph, branch_deg_threshold: int = 3, element_attr: str = 'element'):
    """
    Seeds = hetero heavy atoms, ring atoms (heavy graph), and heavy-branch carbons (heavy_degree>=3).
    Hydrogens are never seeds.
    """
    cyc = cycle_atoms_heavy(G, element_attr)
    S = set()
    for n in G.nodes:
        if is_H(G, n, element_attr):
            continue  # exclude hydrogens from seeds
        e = elem(G, n, element_attr)
        if e in HETERO_HEAVY:
            S.add(n)
            continue
        if n in cyc:
            S.add(n)
            continue
        if e == 'C' and heavy_degree(G, n, element_attr) >= branch_deg_threshold:
            S.add(n)
    return S



[docs]
def carbon_neighbors_outside_S(G: nx.Graph, node: int, S: set[int], element_attr: str = 'element'):
    """Immediate carbon neighbors not in S."""
    return [v for v in G.neighbors(node)
            if v not in S and is_C(G, v, element_attr)]



[docs]
def carbon_component_from_neighbor(G: nx.Graph, start: int, S: set[int], element_attr: str = 'element'):
    """
    Carbon-only connected component reachable from 'start' without entering S.
    Hydrogens are ignored implicitly since we require is_C.
    """
    frontier = [start]
    visited = set()
    while frontier:
        v = frontier.pop()
        if v in visited:
            continue
        visited.add(v)
        for w in G.neighbors(v):
            if w in S:
                continue
            if is_C(G, w, element_attr) and w not in visited:
                frontier.append(w)
    return G.subgraph(visited).copy()



[docs]
def leafs_in(sub: nx.Graph, avoid: int = None, element_attr: str = 'element'):
    """Degree-1 nodes within sub; optionally ignore 'avoid'."""
    leaves = []
    for n in sub.nodes:
        if avoid is not None and n == avoid:
            continue
        if sub.degree(n) <= 1:
            leaves.append(n)
    return leaves



[docs]
def anchor_paths(sub: nx.Graph, anchor_adj: int, element_attr: str = 'element'):
    """One path per terminal leaf within the connected piece containing anchor_adj."""
    if anchor_adj not in sub:
        return []
    comp_nodes = nx.node_connected_component(sub, anchor_adj)
    comp = sub.subgraph(comp_nodes).copy()
    leaves = [n for n in comp.nodes if n != anchor_adj and comp.degree(n) <= 1]
    if not leaves:
        return [[anchor_adj]]
    return [nx.shortest_path(comp, anchor_adj, leaf) for leaf in leaves]



[docs]
def count_unsats_cc(G: nx.Graph, path: list[int], element_attr: str = 'element') -> int:
    """Count C=C along a carbon path using 'order' if present; otherwise 0."""
    uns = 0
    for u, v in zip(path, path[1:]):
        if is_C(G, u, element_attr) and is_C(G, v, element_attr):
            if G.edges[u, v].get('order', 1) >= 2:
                uns += 1
    return uns


# ---------------- main API ----------------

[docs]
def label_lipid_chains_heavy_aware(
    G: nx.Graph,
    min_len=6,
    split_branches=True,
    include_carbonyl_C_as_seed=True,
    element_attr: str = 'element',
):
    """
    Identify lipid tails robustly when hydrogens are present.

    Seeds:
      - hetero heavy atoms (O,N,P,S),
      - heavy-graph ring atoms,
      - heavy-branch carbons (heavy_degree >= 3),
      - (optionally) carbonyl carbons are naturally included via heavy_degree rule.

    Returns:
      node_to_chain_id: dict node -> int or None
      chains: list of dicts {id, anchor, nodes, path, length, unsaturations}
    """

    node_to_chain = {n: None for n in G.nodes}
    chains = []
    used = set()  # already assigned carbon nodes (when split_branches=True)

    # Build seeds on HEAVY logic (hydrogens excluded)
    S = seed_set(G, branch_deg_threshold=3, element_attr=element_attr)

    # Optionally ensure carbonyl C are treated as seeds even if not >=3 heavy neighbors
    # (Often they already are: neighbors are O(=), O(-), and C -> heavy_degree=3.)
    if include_carbonyl_C_as_seed:
        for n in heavy_only_nodes(G, element_attr):
            if not is_C(G, n, element_attr):
                continue
            # Carbonyl-like: has at least one oxygen neighbor; tend to be anchors
            if any(is_O(G, x, element_attr) for x in heavy_neighbors(G, n, element_attr)):
                S.add(n)

    # Case A: seeds exist (typical)
    if S:
        chain_id = 0
        for s in S:
            for cn in carbon_neighbors_outside_S(G, s, S, element_attr):
                if split_branches and cn in used:
                    continue
                comp = carbon_component_from_neighbor(G, cn, S, element_attr)
                if comp.number_of_nodes() == 0:
                    continue

                if split_branches:
                    paths = anchor_paths(comp, cn, element_attr)
                    for p in paths:
                        L = sum(1 for x in p if is_C(G, x, element_attr))
                        if L < min_len:
                            continue
                        uns = count_unsats_cc(G, p, element_attr)
                        for n in p:
                            node_to_chain[n] = chain_id
                            used.add(n)
                        chains.append({
                            'id': chain_id,
                            'anchor': s,
                            'nodes': set(p),
                            'path': p,
                            'length': L,
                            'unsaturations': uns,
                        })
                        chain_id += 1
                else:
                    comp_nodes = set(comp.nodes)
                    L = sum(1 for n in comp_nodes if is_C(G, n, element_attr))
                    if L >= min_len:
                        leaves = [n for n in comp.nodes if comp.degree(n) <= 1]
                        if leaves:
                            a = leaves[0]
                            b = max(nx.single_source_shortest_path_length(comp, a),
                                    key=lambda kv: kv[1])[0]
                            c = max(nx.single_source_shortest_path_length(comp, b),
                                    key=lambda kv: kv[1])[0]
                            p = nx.shortest_path(comp, b, c)
                        else:
                            p = [cn]
                        uns = count_unsats_cc(G, p, element_attr)
                        for n in comp_nodes:
                            node_to_chain[n] = chain_id
                            used.add(n)
                        chains.append({
                            'id': chain_id,
                            'anchor': s,
                            'nodes': comp_nodes,
                            'path': p,
                            'length': L,
                            'unsaturations': uns,
                        })
                        chain_id += 1

    # Case B: no seeds (pure hydrocarbon, no rings/branches/heteroatoms)
    if not chains and not S and G.number_of_nodes() > 0:
        comp_nodes = max(nx.connected_components(G), key=len)
        sub = G.subgraph(comp_nodes)
        a = next(iter(sub.nodes))
        b = max(nx.single_source_shortest_path_length(sub, a), key=lambda kv: kv[1])[0]
        c = max(nx.single_source_shortest_path_length(sub, b), key=lambda kv: kv[1])[0]
        p = nx.shortest_path(sub, b, c)
        L = sum(1 for x in p if is_C(G, x, element_attr))
        uns = count_unsats_cc(G, p, element_attr)
        for n in p:
            node_to_chain[n] = 0
        chains.append({
            'id': 0,
            'anchor': None,
            'nodes': set(p),
            'path': p,
            'length': L,
            'unsaturations': uns,
        })

    for h in G.nodes:
        if not is_H(G, h, element_attr):
            continue
        # only consider C–H; ignore O–H, N–H, etc.
        carbon_nbrs = [n for n in G.neighbors(h) if is_C(G, n, element_attr)]
        if len(carbon_nbrs) != 1:
            continue  # skip weird/ambiguous cases
        c = carbon_nbrs[0]
        cid = node_to_chain.get(c)
        if cid is not None:
            node_to_chain[h] = cid
    return node_to_chain, chains


def _is_heavy(G, n, element_attr: str = "element"):  # exclude hydrogens
    return not is_H(G, n, element_attr)

def _is_hetero_heavy(G, n, element_attr: str = "element"):
    return not is_H(G, n, element_attr) and not is_C(G, n, element_attr)

def _Z(G, n, element_attr: str = "element"):
    return ATOMIC_Z.get(elem(G, n, element_attr), 0)


[docs]
def head_tail_pairs(G: nx.Graph, chains: list[dict], element_attr: str = "element"):
    """
    For each chain in `chains`, return:
      {'chain_id', 'head', 'tail', 'head_element', 'head_distance'}
    where:
      tail = last node of the chain's path
      head = heaviest hetero atom at the minimum heavy-graph distance from the chain's anchor
    """
    # Heavy-atom-only view for distances (ignore hydrogens)
    heavy_nodes = [n for n in G.nodes if _is_heavy(G, n, element_attr)]
    H = G.subgraph(heavy_nodes)
    hetero_nodes = [n for n in H.nodes if _is_hetero_heavy(G, n, element_attr)]

    results = []
    for ch in chains:
        cid   = ch['id']
        anchor = ch.get('anchor')
        path   = ch.get('path', [])
        tail   = path[-1] if path else None

        # Fallback if anchor is None: use path start (first carbon) as anchor proxy
        anchor_proxy = anchor if anchor is not None else (path[0] if path else None)

        head = None
        head_dist = None
        tail_dist = None

        if anchor_proxy in H:
            # Distances on heavy graph only
            dist = nx.single_source_shortest_path_length(H, anchor_proxy)
            # Candidates: hetero heavy reachable nodes
            candidates = [n for n in hetero_nodes if n in dist]
            if candidates:
                # First minimize distance, then maximize atomic number, tie-break by id
                dmin = min(dist[n] for n in candidates)
                near = [n for n in candidates if dist[n] == dmin]
                head = max(near, key=lambda n: (_Z(G, n), str(n)))
                head_dist = dmin
        tail_dist = nx.shortest_path_length(H, head, tail) if (anchor_proxy in H and tail in H) else None

        results.append({
            'chain_id'      : cid,
            'head'          : head,
            'tail'          : tail,
            'tail_distance' : tail_dist,
            'head_element'  : (elem(G, head, element_attr) if head is not None else None),
            'head_distance' : head_dist,
        })

    return results