# Author: ChatGPT 5
# Assistant: Cameron F. Abrams <cfa22@drexel.edu>
import networkx as nx
import logging
logger = logging.getLogger(__name__)
[docs]
def elem(G: nx.Graph, n: int, element_attr: str = 'element'):
e = G.nodes[n].get(element_attr)
return e if e else str(n)[0] # fallback: first char of label
[docs]
def is_H(G: nx.Graph, n: int, element_attr: str = 'element'): return elem(G, n, element_attr) == 'H'
[docs]
def is_C(G: nx.Graph, n: int, element_attr: str = 'element'): return elem(G, n, element_attr) == 'C'
[docs]
def is_O(G: nx.Graph, n: int, element_attr: str = 'element'): return elem(G, n, element_attr) == 'O'
[docs]
def is_N(G: nx.Graph, n: int, element_attr: str = 'element'): return elem(G, n, element_attr) == 'N'
[docs]
def is_P(G: nx.Graph, n: int, element_attr: str = 'element'): return elem(G, n, element_attr) == 'P'
[docs]
def is_S(G: nx.Graph, n: int, element_attr: str = 'element'): return elem(G, n, element_attr) == 'S'
[docs]
def mark_cc_doubles_by_degree(G: nx.Graph, element_attr: str = 'element', set_other_cc_single: bool = True):
"""
Set bond 'order' on C-C edges using node degrees only:
- C-C with both endpoints degree==3 -> order=2
- (optional) all other C-C -> order=1
Non-C-C edges are untouched.
Returns the list of edges marked as double.
"""
deg = dict(G.degree()) # snapshot the DegreeView once
# Optionally preset all C-C to single
if set_other_cc_single:
for u, v in G.edges():
if is_C(G, u, element_attr) and is_C(G, v, element_attr):
G.edges[u, v]['order'] = 1
cc_doubles = []
for u, v in G.edges():
if is_C(G, u, element_attr) and is_C(G, v, element_attr):
if deg.get(u, 0) == 3 and deg.get(v, 0) == 3:
G.edges[u, v]['order'] = 2
cc_doubles.append((u, v))
return cc_doubles
import networkx as nx
HETERO_HEAVY = {'O', 'N', 'P', 'S'} # extend if you like (F, Cl, Br, I, etc.)
ATOMIC_Z = {'H':1, 'C':6, 'N':7, 'O':8, 'P':15, 'S':16} # extend if needed
# ---------------- helpers ----------------
[docs]
def heavy_neighbors(G: nx.Graph, n: int, element_attr: str = 'element'):
return [x for x in G.neighbors(n) if not is_H(G, x, element_attr)]
[docs]
def heavy_degree(G: nx.Graph, n: int, element_attr: str = 'element'):
return sum(1 for _ in heavy_neighbors(G, n, element_attr))
[docs]
def heavy_only_nodes(G: nx.Graph, element_attr: str = 'element'):
return [n for n in G.nodes if not is_H(G, n, element_attr)]
[docs]
def cycle_atoms_heavy(G: nx.Graph, element_attr: str = 'element'):
"""Return nodes on any cycle, computed on heavy-atom induced subgraph."""
Hsub = G.subgraph(heavy_only_nodes(G, element_attr))
cyc_nodes = set()
for comp in nx.connected_components(Hsub):
sub = Hsub.subgraph(comp)
for cyc in nx.cycle_basis(sub):
cyc_nodes.update(cyc)
return cyc_nodes
[docs]
def seed_set(G: nx.Graph, branch_deg_threshold: int = 3, element_attr: str = 'element'):
"""
Seeds = hetero heavy atoms, ring atoms (heavy graph), and heavy-branch carbons (heavy_degree>=3).
Hydrogens are never seeds.
"""
cyc = cycle_atoms_heavy(G, element_attr)
S = set()
for n in G.nodes:
if is_H(G, n, element_attr):
continue # exclude hydrogens from seeds
e = elem(G, n, element_attr)
if e in HETERO_HEAVY:
S.add(n)
continue
if n in cyc:
S.add(n)
continue
if e == 'C' and heavy_degree(G, n, element_attr) >= branch_deg_threshold:
S.add(n)
return S
[docs]
def carbon_neighbors_outside_S(G: nx.Graph, node: int, S: set[int], element_attr: str = 'element'):
"""Immediate carbon neighbors not in S."""
return [v for v in G.neighbors(node)
if v not in S and is_C(G, v, element_attr)]
[docs]
def carbon_component_from_neighbor(G: nx.Graph, start: int, S: set[int], element_attr: str = 'element'):
"""
Carbon-only connected component reachable from 'start' without entering S.
Hydrogens are ignored implicitly since we require is_C.
"""
frontier = [start]
visited = set()
while frontier:
v = frontier.pop()
if v in visited:
continue
visited.add(v)
for w in G.neighbors(v):
if w in S:
continue
if is_C(G, w, element_attr) and w not in visited:
frontier.append(w)
return G.subgraph(visited).copy()
[docs]
def leafs_in(sub: nx.Graph, avoid: int = None, element_attr: str = 'element'):
"""Degree-1 nodes within sub; optionally ignore 'avoid'."""
leaves = []
for n in sub.nodes:
if avoid is not None and n == avoid:
continue
if sub.degree(n) <= 1:
leaves.append(n)
return leaves
[docs]
def anchor_paths(sub: nx.Graph, anchor_adj: int, element_attr: str = 'element'):
"""One path per terminal leaf within the connected piece containing anchor_adj."""
if anchor_adj not in sub:
return []
comp_nodes = nx.node_connected_component(sub, anchor_adj)
comp = sub.subgraph(comp_nodes).copy()
leaves = [n for n in comp.nodes if n != anchor_adj and comp.degree(n) <= 1]
if not leaves:
return [[anchor_adj]]
return [nx.shortest_path(comp, anchor_adj, leaf) for leaf in leaves]
[docs]
def count_unsats_cc(G: nx.Graph, path: list[int], element_attr: str = 'element') -> int:
"""Count C=C along a carbon path using 'order' if present; otherwise 0."""
uns = 0
for u, v in zip(path, path[1:]):
if is_C(G, u, element_attr) and is_C(G, v, element_attr):
if G.edges[u, v].get('order', 1) >= 2:
uns += 1
return uns
# ---------------- main API ----------------
[docs]
def label_lipid_chains_heavy_aware(
G: nx.Graph,
min_len=6,
split_branches=True,
include_carbonyl_C_as_seed=True,
element_attr: str = 'element',
):
"""
Identify lipid tails robustly when hydrogens are present.
Seeds:
- hetero heavy atoms (O,N,P,S),
- heavy-graph ring atoms,
- heavy-branch carbons (heavy_degree >= 3),
- (optionally) carbonyl carbons are naturally included via heavy_degree rule.
Returns:
node_to_chain_id: dict node -> int or None
chains: list of dicts {id, anchor, nodes, path, length, unsaturations}
"""
node_to_chain = {n: None for n in G.nodes}
chains = []
used = set() # already assigned carbon nodes (when split_branches=True)
# Build seeds on HEAVY logic (hydrogens excluded)
S = seed_set(G, branch_deg_threshold=3, element_attr=element_attr)
# Optionally ensure carbonyl C are treated as seeds even if not >=3 heavy neighbors
# (Often they already are: neighbors are O(=), O(-), and C -> heavy_degree=3.)
if include_carbonyl_C_as_seed:
for n in heavy_only_nodes(G, element_attr):
if not is_C(G, n, element_attr):
continue
# Carbonyl-like: has at least one oxygen neighbor; tend to be anchors
if any(is_O(G, x, element_attr) for x in heavy_neighbors(G, n, element_attr)):
S.add(n)
# Case A: seeds exist (typical)
if S:
chain_id = 0
for s in S:
for cn in carbon_neighbors_outside_S(G, s, S, element_attr):
if split_branches and cn in used:
continue
comp = carbon_component_from_neighbor(G, cn, S, element_attr)
if comp.number_of_nodes() == 0:
continue
if split_branches:
paths = anchor_paths(comp, cn, element_attr)
for p in paths:
L = sum(1 for x in p if is_C(G, x, element_attr))
if L < min_len:
continue
uns = count_unsats_cc(G, p, element_attr)
for n in p:
node_to_chain[n] = chain_id
used.add(n)
chains.append({
'id': chain_id,
'anchor': s,
'nodes': set(p),
'path': p,
'length': L,
'unsaturations': uns,
})
chain_id += 1
else:
comp_nodes = set(comp.nodes)
L = sum(1 for n in comp_nodes if is_C(G, n, element_attr))
if L >= min_len:
leaves = [n for n in comp.nodes if comp.degree(n) <= 1]
if leaves:
a = leaves[0]
b = max(nx.single_source_shortest_path_length(comp, a),
key=lambda kv: kv[1])[0]
c = max(nx.single_source_shortest_path_length(comp, b),
key=lambda kv: kv[1])[0]
p = nx.shortest_path(comp, b, c)
else:
p = [cn]
uns = count_unsats_cc(G, p, element_attr)
for n in comp_nodes:
node_to_chain[n] = chain_id
used.add(n)
chains.append({
'id': chain_id,
'anchor': s,
'nodes': comp_nodes,
'path': p,
'length': L,
'unsaturations': uns,
})
chain_id += 1
# Case B: no seeds (pure hydrocarbon, no rings/branches/heteroatoms)
if not chains and not S and G.number_of_nodes() > 0:
comp_nodes = max(nx.connected_components(G), key=len)
sub = G.subgraph(comp_nodes)
a = next(iter(sub.nodes))
b = max(nx.single_source_shortest_path_length(sub, a), key=lambda kv: kv[1])[0]
c = max(nx.single_source_shortest_path_length(sub, b), key=lambda kv: kv[1])[0]
p = nx.shortest_path(sub, b, c)
L = sum(1 for x in p if is_C(G, x, element_attr))
uns = count_unsats_cc(G, p, element_attr)
for n in p:
node_to_chain[n] = 0
chains.append({
'id': 0,
'anchor': None,
'nodes': set(p),
'path': p,
'length': L,
'unsaturations': uns,
})
for h in G.nodes:
if not is_H(G, h, element_attr):
continue
# only consider C–H; ignore O–H, N–H, etc.
carbon_nbrs = [n for n in G.neighbors(h) if is_C(G, n, element_attr)]
if len(carbon_nbrs) != 1:
continue # skip weird/ambiguous cases
c = carbon_nbrs[0]
cid = node_to_chain.get(c)
if cid is not None:
node_to_chain[h] = cid
return node_to_chain, chains
def _is_heavy(G, n, element_attr: str = "element"): # exclude hydrogens
return not is_H(G, n, element_attr)
def _is_hetero_heavy(G, n, element_attr: str = "element"):
return not is_H(G, n, element_attr) and not is_C(G, n, element_attr)
def _Z(G, n, element_attr: str = "element"):
return ATOMIC_Z.get(elem(G, n, element_attr), 0)
[docs]
def head_tail_pairs(G: nx.Graph, chains: list[dict], element_attr: str = "element"):
"""
For each chain in `chains`, return:
{'chain_id', 'head', 'tail', 'head_element', 'head_distance'}
where:
tail = last node of the chain's path
head = heaviest hetero atom at the minimum heavy-graph distance from the chain's anchor
"""
# Heavy-atom-only view for distances (ignore hydrogens)
heavy_nodes = [n for n in G.nodes if _is_heavy(G, n, element_attr)]
H = G.subgraph(heavy_nodes)
hetero_nodes = [n for n in H.nodes if _is_hetero_heavy(G, n, element_attr)]
results = []
for ch in chains:
cid = ch['id']
anchor = ch.get('anchor')
path = ch.get('path', [])
tail = path[-1] if path else None
# Fallback if anchor is None: use path start (first carbon) as anchor proxy
anchor_proxy = anchor if anchor is not None else (path[0] if path else None)
head = None
head_dist = None
tail_dist = None
if anchor_proxy in H:
# Distances on heavy graph only
dist = nx.single_source_shortest_path_length(H, anchor_proxy)
# Candidates: hetero heavy reachable nodes
candidates = [n for n in hetero_nodes if n in dist]
if candidates:
# First minimize distance, then maximize atomic number, tie-break by id
dmin = min(dist[n] for n in candidates)
near = [n for n in candidates if dist[n] == dmin]
head = max(near, key=lambda n: (_Z(G, n), str(n)))
head_dist = dmin
tail_dist = nx.shortest_path_length(H, head, tail) if (anchor_proxy in H and tail in H) else None
results.append({
'chain_id' : cid,
'head' : head,
'tail' : tail,
'tail_distance' : tail_dist,
'head_element' : (elem(G, head, element_attr) if head is not None else None),
'head_distance' : head_dist,
})
return results