Source code for tooluniverse.msigdb_tool

"""MSigDB (Molecular Signatures Database) tool — access 33,000+ gene sets.

Covers GTRD TF targets, miRDB miRNA targets, oncogenic signatures,
hallmark gene sets, GO terms, KEGG pathways, and more.

API: https://www.gsea-msigdb.org/gsea/msigdb/ (no authentication required)
"""

import json
import urllib.parse
import urllib.request
from typing import Any, Dict, List

from .base_tool import BaseTool
from .tool_registry import register_tool


MSIGDB_BASE = "https://www.gsea-msigdb.org/gsea/msigdb"


[docs] @register_tool("MSigDBTool") class MSigDBTool(BaseTool): """Query MSigDB gene sets — TF targets, miRNA targets, oncogenic signatures, and more."""
[docs] def __init__(self, tool_config: Dict[str, Any], **kwargs): super().__init__(tool_config)
[docs] def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: operation = arguments.get("operation", "get_gene_set") if operation == "get_gene_set": return self._get_gene_set(arguments) elif operation == "check_gene_in_set": return self._check_gene_in_set(arguments) elif operation == "search_gene_sets": return self._search_gene_sets(arguments) else: return {"status": "error", "error": f"Unknown operation: {operation}"}
[docs] def _get_gene_set(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Fetch a gene set by exact name.""" gene_set_name = arguments.get("gene_set_name", "").strip() if not gene_set_name: return {"status": "error", "error": "gene_set_name is required"} url = ( f"{MSIGDB_BASE}/human/download_geneset.jsp" f"?geneSetName={urllib.parse.quote(gene_set_name)}&fileType=json" ) try: req = urllib.request.Request( url, headers={"User-Agent": "ToolUniverse/1.0"} ) with urllib.request.urlopen(req, timeout=30) as resp: data = json.loads(resp.read().decode("utf-8")) if not data: return { "status": "error", "error": f"Gene set '{gene_set_name}' not found in MSigDB", } # Data is {gene_set_name: {info}} info = list(data.values())[0] if data else {} genes = info.get("geneSymbols", []) return { "status": "success", "data": { "gene_set_name": gene_set_name, "systematic_name": info.get("systematicName", ""), "collection": info.get("collection", ""), "pmid": info.get("pmid", ""), "num_genes": len(genes), "genes": genes, "description": info.get( "briefDescription", info.get("exactSource", "") ), "external_url": info.get("externalDetailsURL", []), }, } except urllib.error.HTTPError as e: return {"status": "error", "error": f"HTTP {e.code}: {gene_set_name}"} except Exception as e: return {"status": "error", "error": str(e)[:200]}
[docs] def _check_gene_in_set(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Check if a gene is in a specific gene set.""" gene_set_name = arguments.get("gene_set_name", "").strip() gene = arguments.get("gene", "").strip().upper() if not gene_set_name or not gene: return { "status": "error", "error": "Both 'gene_set_name' and 'gene' are required", } result = self._get_gene_set({"gene_set_name": gene_set_name}) if result.get("status") != "success": return result genes = result["data"]["genes"] genes_upper = [g.upper() for g in genes] found = gene in genes_upper return { "status": "success", "data": { "gene_set_name": gene_set_name, "gene": gene, "is_member": found, "total_genes_in_set": len(genes), }, }
[docs] def _search_gene_sets(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Search for gene sets by name pattern.""" query = arguments.get("query", "").strip() if not query: return {"status": "error", "error": "query is required"} # Try exact match first result = self._get_gene_set({"gene_set_name": query}) if result.get("status") == "success": return { "status": "success", "data": { "query": query, "matches": [ { "name": query, "collection": result["data"]["collection"], "num_genes": result["data"]["num_genes"], } ], }, } # Try common name patterns candidates = self._generate_candidates(query) matches = [] for candidate in candidates[:5]: r = self._get_gene_set({"gene_set_name": candidate}) if r.get("status") == "success": matches.append( { "name": candidate, "collection": r["data"]["collection"], "num_genes": r["data"]["num_genes"], } ) if matches: return {"status": "success", "data": {"query": query, "matches": matches}} return { "status": "error", "error": f"No gene sets found matching '{query}'", }
[docs] def _generate_candidates(self, query: str) -> List[str]: """Generate candidate gene set names from a query.""" q = query.strip() candidates = [q] # GTRD TF targets: "ZNF549" → "ZNF549_TARGET_GENES" candidates.append(f"{q}_TARGET_GENES") # miRDB targets: "MIR675_3P" → "MIR675_3P_TARGET_GENES" candidates.append(f"{q}_TARGET_GENES") # With prefix: "hsa-miR-675-3p" → "MIR675_3P" if q.startswith("hsa-"): normalized = ( q.replace("hsa-", "").replace("-", "_").replace("miR", "MIR").upper() ) candidates.append(normalized) candidates.append(f"{normalized}_TARGET_GENES") return candidates