Source code for tooluniverse.msigdb_tool
"""MSigDB (Molecular Signatures Database) tool — access 33,000+ gene sets.
Covers GTRD TF targets, miRDB miRNA targets, oncogenic signatures,
hallmark gene sets, GO terms, KEGG pathways, and more.
API: https://www.gsea-msigdb.org/gsea/msigdb/ (no authentication required)
"""
import json
import urllib.parse
import urllib.request
from typing import Any, Dict, List
from .base_tool import BaseTool
from .tool_registry import register_tool
MSIGDB_BASE = "https://www.gsea-msigdb.org/gsea/msigdb"
[docs]
@register_tool("MSigDBTool")
class MSigDBTool(BaseTool):
"""Query MSigDB gene sets — TF targets, miRNA targets, oncogenic signatures, and more."""
[docs]
def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
operation = arguments.get("operation", "get_gene_set")
if operation == "get_gene_set":
return self._get_gene_set(arguments)
elif operation == "check_gene_in_set":
return self._check_gene_in_set(arguments)
elif operation == "search_gene_sets":
return self._search_gene_sets(arguments)
else:
return {"status": "error", "error": f"Unknown operation: {operation}"}
[docs]
def _get_gene_set(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Fetch a gene set by exact name."""
gene_set_name = arguments.get("gene_set_name", "").strip()
if not gene_set_name:
return {"status": "error", "error": "gene_set_name is required"}
url = (
f"{MSIGDB_BASE}/human/download_geneset.jsp"
f"?geneSetName={urllib.parse.quote(gene_set_name)}&fileType=json"
)
try:
req = urllib.request.Request(
url, headers={"User-Agent": "ToolUniverse/1.0"}
)
with urllib.request.urlopen(req, timeout=30) as resp:
data = json.loads(resp.read().decode("utf-8"))
if not data:
return {
"status": "error",
"error": f"Gene set '{gene_set_name}' not found in MSigDB",
}
# Data is {gene_set_name: {info}}
info = list(data.values())[0] if data else {}
genes = info.get("geneSymbols", [])
return {
"status": "success",
"data": {
"gene_set_name": gene_set_name,
"systematic_name": info.get("systematicName", ""),
"collection": info.get("collection", ""),
"pmid": info.get("pmid", ""),
"num_genes": len(genes),
"genes": genes,
"description": info.get(
"briefDescription", info.get("exactSource", "")
),
"external_url": info.get("externalDetailsURL", []),
},
}
except urllib.error.HTTPError as e:
return {"status": "error", "error": f"HTTP {e.code}: {gene_set_name}"}
except Exception as e:
return {"status": "error", "error": str(e)[:200]}
[docs]
def _check_gene_in_set(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Check if a gene is in a specific gene set."""
gene_set_name = arguments.get("gene_set_name", "").strip()
gene = arguments.get("gene", "").strip().upper()
if not gene_set_name or not gene:
return {
"status": "error",
"error": "Both 'gene_set_name' and 'gene' are required",
}
result = self._get_gene_set({"gene_set_name": gene_set_name})
if result.get("status") != "success":
return result
genes = result["data"]["genes"]
genes_upper = [g.upper() for g in genes]
found = gene in genes_upper
return {
"status": "success",
"data": {
"gene_set_name": gene_set_name,
"gene": gene,
"is_member": found,
"total_genes_in_set": len(genes),
},
}
[docs]
def _search_gene_sets(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Search for gene sets by name pattern."""
query = arguments.get("query", "").strip()
if not query:
return {"status": "error", "error": "query is required"}
# Try exact match first
result = self._get_gene_set({"gene_set_name": query})
if result.get("status") == "success":
return {
"status": "success",
"data": {
"query": query,
"matches": [
{
"name": query,
"collection": result["data"]["collection"],
"num_genes": result["data"]["num_genes"],
}
],
},
}
# Try common name patterns
candidates = self._generate_candidates(query)
matches = []
for candidate in candidates[:5]:
r = self._get_gene_set({"gene_set_name": candidate})
if r.get("status") == "success":
matches.append(
{
"name": candidate,
"collection": r["data"]["collection"],
"num_genes": r["data"]["num_genes"],
}
)
if matches:
return {"status": "success", "data": {"query": query, "matches": matches}}
return {
"status": "error",
"error": f"No gene sets found matching '{query}'",
}
[docs]
def _generate_candidates(self, query: str) -> List[str]:
"""Generate candidate gene set names from a query."""
q = query.strip()
candidates = [q]
# GTRD TF targets: "ZNF549" → "ZNF549_TARGET_GENES"
candidates.append(f"{q}_TARGET_GENES")
# miRDB targets: "MIR675_3P" → "MIR675_3P_TARGET_GENES"
candidates.append(f"{q}_TARGET_GENES")
# With prefix: "hsa-miR-675-3p" → "MIR675_3P"
if q.startswith("hsa-"):
normalized = (
q.replace("hsa-", "").replace("-", "_").replace("miR", "MIR").upper()
)
candidates.append(normalized)
candidates.append(f"{normalized}_TARGET_GENES")
return candidates