# Source code for tooluniverse.cosmic_tool

"""
COSMIC (Catalogue of Somatic Mutations in Cancer) API tool for ToolUniverse.

COSMIC is a comprehensive database of somatic mutations in human cancer.
This tool uses the NLM Clinical Tables Search Service API for COSMIC data.

API Documentation: https://clinicaltables.nlm.nih.gov/apidoc/cosmic/v4/doc.html
"""

import requests
from typing import Dict, Any, Optional, List
from .base_tool import BaseTool
from .tool_registry import register_tool

# Search endpoint of the NLM Clinical Tables Search Service for the COSMIC v4
# table. The query methods in this module issue their GET requests against
# this URL.
COSMIC_API_URL = "https://clinicaltables.nlm.nih.gov/api/cosmic/v4/search"


@register_tool("COSMICTool")
class COSMICTool(BaseTool):
    """
    Tool for querying COSMIC (Catalogue of Somatic Mutations in Cancer).

    COSMIC provides:
    - Somatic mutation data in human cancers
    - Gene-level mutation information
    - Mutation coordinates and amino acid changes
    - Associated cancer types

    Uses NLM Clinical Tables API. No authentication required.

    The query methods never raise for request or parsing failures; they
    return a dict whose ``status`` key is ``"success"`` or ``"error"``.
    """

    # Cap applied both to the caller's ``max_results`` and to the number of
    # records requested from the API in a single call.
    _MAX_RESULTS_LIMIT = 500
    # The API returns sample-level records (the same mutation repeated across
    # samples), so more rows than needed are requested before deduplication.
    _OVERFETCH_FACTOR = 20
    _SOURCE_LABEL = "COSMIC via NLM Clinical Tables API"
    _DISPLAY_FIELDS = "MutationID,GeneName,MutationCDS,MutationAA"

    def __init__(self, tool_config: Dict[str, Any]):
        """
        Store the request timeout and parameter schema from ``tool_config``.

        Args:
            tool_config: Tool configuration. ``timeout`` (seconds, default 30)
                and ``parameter`` (schema dict, default empty) are read here.
        """
        super().__init__(tool_config)
        self.timeout: int = tool_config.get("timeout", 30)
        # ``or {}`` also covers an explicit ``"parameter": None`` so that
        # ``run`` can always call ``.get`` on it.
        self.parameter = tool_config.get("parameter") or {}

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Execute the COSMIC API call based on operation type.

        Args:
            arguments: Call arguments. ``operation`` selects the handler
                (``"search"`` or ``"get_by_gene"``); the remaining keys are
                passed through to that handler.

        Returns:
            The handler's result dict, or an error dict for an unknown
            operation.
        """
        operation = arguments.get("operation")
        if not operation:
            # Infer operation from tool config const value (each COSMIC tool
            # has a fixed const in the schema, e.g., "search" or "get_by_gene")
            schema_op = (
                self.parameter.get("properties", {})
                .get("operation", {})
                .get("const")
            )
            operation = schema_op or "search"

        if operation == "search":
            return self._search_mutations(arguments)
        if operation == "get_by_gene":
            return self._get_mutations_by_gene(arguments)
        return {
            "status": "error",
            "error": f"Unknown operation: {operation}. Supported: search, get_by_gene",
        }

    @classmethod
    def _resolve_max_results(cls, value: Any, default: int) -> int:
        """
        Coerce a caller-supplied ``max_results`` to an int in ``[1, 500]``.

        Args:
            value: Raw argument value; ``None`` means "not provided".
            default: Value used when ``value`` is ``None``.

        Returns:
            The requested count, capped at the 500-record limit.

        Raises:
            ValueError: If ``value`` cannot be converted to an integer or is
                smaller than 1.
        """
        if value is None:
            return default
        message = (
            f"Invalid max_results: {value!r}. "
            f"Expected a positive integer (max {cls._MAX_RESULTS_LIMIT})"
        )
        # bool is an int subclass; True/False is never a meaningful count.
        if isinstance(value, bool):
            raise ValueError(message)
        try:
            count = int(value)
        except (TypeError, ValueError):
            raise ValueError(message) from None
        if count < 1:
            raise ValueError(message)
        return min(count, cls._MAX_RESULTS_LIMIT)

    @staticmethod
    def _column_value(
        extra_data: Dict[str, Any], field: str, index: int, default: Any = None
    ) -> Any:
        """
        Return ``extra_data[field][index]``, or ``default`` when the column is
        missing, null, or shorter than ``index``.
        """
        column = extra_data.get(field) or []
        return column[index] if index < len(column) else default

    @classmethod
    def _parse_payload(
        cls,
        data: Any,
        max_results: int,
        extra_outputs: tuple = (),
        gene_default: Optional[str] = None,
    ) -> Optional[tuple]:
        """
        Turn a decoded API response into deduplicated mutation records.

        Args:
            data: Decoded JSON. Expected layout is
                ``[total_count, code_list, extra_data, display_strings]``.
            max_results: Stop after this many unique records.
            extra_outputs: ``(result_key, api_field)`` pairs appended to each
                record after the common fields.
            gene_default: Gene value used when the ``GeneName`` column has no
                entry for a row.

        Returns:
            ``(total_count, results)``, or ``None`` when ``data`` does not
            have the expected list layout.
        """
        if not (isinstance(data, list) and len(data) >= 4):
            return None

        total_count = data[0]
        codes = data[1] or []
        # extra_data is a dict of field_name -> list (indexed by position)
        extra_data = data[2] or {}

        results: List[Dict[str, Any]] = []
        seen = set()
        for index, code in enumerate(codes):
            gene = cls._column_value(extra_data, "GeneName", index, gene_default)
            mutation_cds = cls._column_value(extra_data, "MutationCDS", index)
            mutation_aa = cls._column_value(extra_data, "MutationAA", index)

            # Skip entries where both CDS and AA changes are placeholders
            cds_placeholder = not mutation_cds or mutation_cds in ("c.?", "?")
            aa_placeholder = not mutation_aa or mutation_aa in ("p.?", "?")
            if cds_placeholder and aa_placeholder:
                continue

            # Deduplicate by mutation_id + gene + CDS + AA (more specific key)
            key = (code, gene, mutation_cds, mutation_aa)
            if key in seen:
                continue
            seen.add(key)

            record = {
                "mutation_id": code,
                "gene": gene,
                "mutation_cds": mutation_cds,
                "mutation_aa": mutation_aa,
                "primary_site": cls._column_value(extra_data, "PrimarySite", index),
                "primary_histology": cls._column_value(
                    extra_data, "PrimaryHistology", index
                ),
            }
            for result_key, api_field in extra_outputs:
                record[result_key] = cls._column_value(extra_data, api_field, index)
            results.append(record)

            if len(results) >= max_results:
                break

        return total_count, results

    def _request(self, params: Dict[str, Any]) -> Any:
        """
        GET ``COSMIC_API_URL`` with ``params`` and return the decoded JSON.

        Raises:
            requests.exceptions.RequestException: On timeout, connection
                failure, or a non-2xx status (via ``raise_for_status``).
        """
        response = requests.get(
            COSMIC_API_URL,
            params=params,
            timeout=self.timeout,
            headers={"User-Agent": "ToolUniverse/COSMIC"},
        )
        response.raise_for_status()
        return response.json()

    def _fetch_mutations(
        self,
        params: Dict[str, Any],
        max_results: int,
        extra_outputs: tuple = (),
        gene_default: Optional[str] = None,
    ) -> tuple:
        """
        Run one API query and parse it, converting failures to error dicts.

        Returns:
            ``(parsed, error)``. On failure ``error`` is an error dict and
            ``parsed`` is ``None``. On success ``error`` is ``None`` and
            ``parsed`` is the value returned by ``_parse_payload`` (which is
            itself ``None`` for an unexpected response layout).
        """
        try:
            data = self._request(params)
            parsed = self._parse_payload(
                data, max_results, extra_outputs, gene_default
            )
        except requests.exceptions.Timeout:
            # Report the configured timeout rather than a hard-coded value.
            return None, {
                "status": "error",
                "error": f"Request timeout after {self.timeout} seconds",
            }
        except requests.exceptions.HTTPError as e:
            status_code = (
                e.response.status_code if e.response is not None else "unknown"
            )
            return None, {"status": "error", "error": f"HTTP error: {status_code}"}
        except requests.exceptions.RequestException as e:
            return None, {"status": "error", "error": f"Request failed: {str(e)}"}
        except Exception as e:
            # Boundary handler: tool calls report failures in-band.
            return None, {"status": "error", "error": f"Unexpected error: {str(e)}"}
        return parsed, None

    def _search_mutations(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Search COSMIC for mutations by term.

        Args:
            arguments: Dict containing:
                - terms: Search query (gene name, mutation, etc.); ``query``
                  and ``gene`` are accepted as fallbacks
                - max_results: Maximum results to return (default 20, max 500)
                - genome_build: Genome build version (37 or 38, default 37)

        Returns:
            On success, ``{"status": "success", "data": {...}, "metadata":
            {...}}`` with deduplicated records under ``data["results"]``.
            On failure, ``{"status": "error", "error": <message>}``.
        """
        terms = (
            arguments.get("terms")
            or arguments.get("query")
            or arguments.get("gene", "")
        )
        if not terms:
            return {
                "status": "error",
                "error": "Missing required parameter: terms (or query/gene)",
            }

        try:
            max_results = self._resolve_max_results(arguments.get("max_results"), 20)
        except ValueError as exc:
            return {"status": "error", "error": str(exc)}
        genome_build = arguments.get("genome_build") or 37

        # Request more records than needed to compensate for deduplication:
        # COSMIC NLM API returns sample-level records (same mutation many times
        # across different cancer samples). Request up to 20x more to ensure
        # we get enough unique mutations after dedup. Cap at 500 (API limit).
        fetch_count = min(
            max_results * self._OVERFETCH_FACTOR, self._MAX_RESULTS_LIMIT
        )

        # Display fields: MutationID, GeneName, MutationCDS, MutationAA
        # Extra fields for more details
        params = {
            "terms": terms,
            "maxList": fetch_count,
            "grchv": genome_build,
            "df": self._DISPLAY_FIELDS,
            "ef": "MutationID,GeneName,MutationCDS,MutationAA,PrimarySite,PrimaryHistology,MutationGenomePosition,MutationStrand",
        }

        parsed, error = self._fetch_mutations(params, max_results)
        if error is not None:
            return error
        if parsed is None:
            # Response did not have the expected list layout.
            return {
                "status": "success",
                "data": {"total_count": 0, "results": []},
                "metadata": {"source": self._SOURCE_LABEL},
            }

        total_count, results = parsed
        return {
            "status": "success",
            "data": {
                "total_count": total_count,
                "results": results,
                "genome_build": f"GRCh{genome_build}",
            },
            "metadata": {
                "source": self._SOURCE_LABEL,
                "query": terms,
            },
        }

    def _get_mutations_by_gene(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Get all mutations for a specific gene.

        Args:
            arguments: Dict containing:
                - gene: Gene symbol (e.g., BRAF, TP53); ``gene_name`` is
                  accepted as a fallback
                - max_results: Maximum results (default 100, max 500)
                - genome_build: Genome build version (37 or 38, default 37)

        Returns:
            On success, ``{"status": "success", "data": {...}, "metadata":
            {...}}``; each record additionally carries ``genome_position``
            and ``fathmm_prediction``. On failure,
            ``{"status": "error", "error": <message>}``.
        """
        gene = arguments.get("gene") or arguments.get("gene_name", "")
        if not gene:
            return {
                "status": "error",
                "error": "Missing required parameter: gene (or gene_name)",
            }

        try:
            max_results = self._resolve_max_results(arguments.get("max_results"), 100)
        except ValueError as exc:
            return {"status": "error", "error": str(exc)}
        genome_build = arguments.get("genome_build") or 37
        fetch_count = min(
            max_results * self._OVERFETCH_FACTOR, self._MAX_RESULTS_LIMIT
        )

        params = {
            "terms": gene,
            "maxList": fetch_count,
            "grchv": genome_build,
            "q": f"GeneName:{gene}",
            "df": self._DISPLAY_FIELDS,
            "ef": "MutationID,GeneName,MutationCDS,MutationAA,PrimarySite,PrimaryHistology,MutationGenomePosition,MutationStrand,Fathmm",
        }

        parsed, error = self._fetch_mutations(
            params,
            max_results,
            extra_outputs=(
                ("genome_position", "MutationGenomePosition"),
                ("fathmm_prediction", "Fathmm"),
            ),
            # Rows without a GeneName entry are attributed to the queried gene.
            gene_default=gene,
        )
        if error is not None:
            return error
        if parsed is None:
            # Response did not have the expected list layout.
            return {
                "status": "success",
                "data": {"gene": gene, "total_count": 0, "results": []},
                "metadata": {"source": self._SOURCE_LABEL},
            }

        total_count, results = parsed
        return {
            "status": "success",
            "data": {
                "gene": gene,
                "total_count": total_count,
                "results": results,
                "genome_build": f"GRCh{genome_build}",
            },
            "metadata": {
                "source": self._SOURCE_LABEL,
                "gene": gene,
            },
        }