Source code for tooluniverse.coexpression_module_tool
"""Co-expression module detection from an expression matrix (NumPy + NetworkX).
Builds a gene co-expression network from a genes x samples expression matrix and
partitions it into modules of co-regulated genes — the analysis at the heart of
WGCNA-style workflows. Genes are connected by their pairwise correlation,
soft-thresholded (``|r|^power``, the WGCNA scale-free emphasis), and the network
is partitioned by greedy modularity maximization. For each module a **module
eigengene** (the first principal component of the module's expression) summarizes
its activity across samples.
This is a dependency-light implementation (NumPy correlation + NetworkX community
detection + a NumPy SVD eigengene), not a full WGCNA port — it omits the
topological-overlap matrix and dynamic tree cut, using modularity communities
instead. It is deterministic and testable directly.
``run()`` returns a ``{status, data, metadata}`` dict and never raises.
Reference
---------
Langfelder P, Horvath S. "WGCNA: an R package for weighted correlation network
analysis." BMC Bioinformatics 9, 559 (2008).
"""
from typing import Any, Dict, List, Optional
import networkx as nx
import numpy as np
from .base_tool import BaseTool
from .tool_registry import register_tool
[docs]
@register_tool("CoexpressionModuleTool")
class CoexpressionModuleTool(BaseTool):
"""Detect co-expression modules + module eigengenes from an expression matrix."""
[docs]
def __init__(self, tool_config: Optional[Dict[str, Any]] = None):
super().__init__(tool_config)
self.tool_config = tool_config or {}
# ------------------------------------------------------------------ run
[docs]
def run(self, arguments: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
args = arguments or {}
parsed = self._parse_expression(args)
if isinstance(parsed, dict):
return parsed
genes, matrix = parsed # matrix: genes x samples
power = int(args.get("power") or 6)
threshold = float(args.get("correlation_threshold") or 0.5)
min_size = int(args.get("min_module_size") or 5)
corr = np.corrcoef(matrix)
corr = np.nan_to_num(corr) # constant genes -> 0 correlation
n = len(genes)
g = nx.Graph()
g.add_nodes_from(range(n))
iu = np.triu_indices(n, k=1)
for i, j in zip(*iu):
r = abs(corr[i, j])
if r >= threshold:
g.add_edge(int(i), int(j), weight=r**power)
communities = nx.community.greedy_modularity_communities(g, weight="weight")
modules = []
for k, comm in enumerate(communities):
idx = sorted(comm)
if len(idx) < min_size:
continue
modules.append(
{
"module_id": f"M{len(modules) + 1}",
"size": len(idx),
"genes": [genes[i] for i in idx],
"eigengene": self._eigengene(matrix[idx]),
}
)
assigned = sum(m["size"] for m in modules)
return {
"status": "success",
"data": {
"n_genes": n,
"n_samples": int(matrix.shape[1]),
"n_modules": len(modules),
"n_unassigned": n - assigned,
"modules": modules,
},
"metadata": {
"method": "co-expression modules (soft-thresholded correlation + modularity)",
"power": power,
"correlation_threshold": threshold,
"min_module_size": min_size,
"note": (
"Modules group co-expressed genes; eigengene = first PC of the "
"module's expression across samples (its summary profile). "
"Simplified WGCNA (no TOM / dynamic tree cut)."
),
},
}
# -------------------------------------------------------------- helpers
[docs]
@staticmethod
def _eigengene(block: np.ndarray) -> List[float]:
"""First principal component of a (genes x samples) block -> per-sample vector."""
centered = block - block.mean(axis=1, keepdims=True)
# right singular vector of the centered block = sample-space PC1
_, _, vt = np.linalg.svd(centered, full_matrices=False)
eig = vt[0]
# sign-align so the eigengene tracks the module's mean expression
if np.corrcoef(eig, block.mean(axis=0))[0, 1] < 0:
eig = -eig
return [float(x) for x in eig]
[docs]
def _parse_expression(self, args: Dict[str, Any]):
expr = args.get("expression")
if isinstance(expr, dict) and expr:
genes = [str(g) for g in expr]
try:
rows = [
[
float(v)
for v in (vals if isinstance(vals, (list, tuple)) else [vals])
]
for vals in expr.values()
]
except (TypeError, ValueError):
return self._err("expression values must be numbers (per sample).")
if len({len(r) for r in rows}) != 1:
return self._err(
"every gene must have the same number of sample values."
)
matrix = np.asarray(rows, dtype=float)
else:
return self._err(
"Provide expression ({gene: [value_per_sample]}) — a genes x samples matrix."
)
if len(genes) < 4:
return self._err("At least 4 genes are required to build a network.")
if matrix.shape[1] < 3:
return self._err("At least 3 samples are required to compute correlations.")
return genes, matrix
[docs]
@staticmethod
def _err(message: str) -> Dict[str, Any]:
return {"status": "error", "error": message, "source": "CoexpressionModuleTool"}