Source code for tooluniverse.gencc_tool
"""
GenCC (Gene Curation Coalition) tool for ToolUniverse.
GenCC aggregates gene-disease validity classifications from multiple expert
curators (ClinGen, Ambry Genetics, Genomics England PanelApp, OMIM, Orphanet,
etc.). It provides standardized gene-disease validity evidence levels:
Definitive, Strong, Moderate, Limited, Disputed, Refuted, Animal Model Only,
and No Known Disease Relationship.
Since GenCC has no REST API, this tool downloads and parses the TSV bulk
export from https://search.thegencc.org/download/action/submissions-export-tsv
API Documentation:
- GenCC website: https://thegencc.org
- Data downloads: https://search.thegencc.org/download
Data is cached for 1 hour to avoid repeated downloads.
"""
import csv
import io
import time
import requests
from typing import Dict, Any, List
from .base_tool import BaseTool
from .tool_registry import register_tool
# Bulk TSV export of every GenCC submission; this is the only data source
# the tool reads from.
GENCC_TSV_URL = (
    "https://search.thegencc.org/download/action/submissions-export-tsv"
)

# Process-wide cache shared by every caller of the download helper, so the
# export is fetched at most once per TTL window.
_gencc_cache = dict(
    data=None,  # parsed rows; None until a download has been stored
    timestamp=0,  # time.time() value recorded when `data` was stored
    ttl=60 * 60,  # seconds a stored download is considered fresh (1 hour)
)
# Maps each classification title to its rank, listed from most confident
# (rank 1) to least confident (rank 8). Ranks are assigned from the position
# in the tuple below, so reordering the tuple reorders the hierarchy.
CLASSIFICATION_ORDER = {
    title: rank
    for rank, title in enumerate(
        (
            "Definitive",
            "Strong",
            "Moderate",
            "Limited",
            "Animal Model Only",
            "Disputed",
            "Refuted",
            "No Known Disease Relationship",
        ),
        start=1,
    )
}
def _download_gencc_data(timeout: int = 120) -> List[Dict[str, str]]:
    """Return all GenCC submission rows, downloading the TSV export if needed.

    A copy held in the module-level ``_gencc_cache`` is returned as long as
    it is younger than the cache's ``ttl``; otherwise the export is fetched
    from ``GENCC_TSV_URL`` and parsed as tab-separated values.

    Args:
        timeout: Seconds passed to ``requests.get`` as the request timeout.
            Defaults to 120, the value that was previously hard-coded.

    Returns:
        One dict per data row, keyed by the column names found in the TSV
        header. Columns missing from a short row are returned as ``""``.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server answers with an HTTP error status.
    """
    now = time.time()
    cached = _gencc_cache["data"]
    if cached is not None and (now - _gencc_cache["timestamp"]) < _gencc_cache["ttl"]:
        return cached

    response = requests.get(
        GENCC_TSV_URL,
        timeout=timeout,
        headers={"User-Agent": "ToolUniverse/GenCC"},
    )
    response.raise_for_status()

    # restval="" makes rows with fewer cells than the header yield empty
    # strings instead of None, so callers can safely call str methods
    # (.upper(), .lower()) on every named column.
    reader = csv.DictReader(
        io.StringIO(response.text),
        delimiter="\t",
        restval="",
    )
    records = list(reader)

    # Only cache a non-empty parse: an empty or unparseable body would
    # otherwise make every query return zero results for a full TTL window.
    if records:
        _gencc_cache["data"] = records
        _gencc_cache["timestamp"] = now
    return records
[docs]
@register_tool("GenCCTool")
class GenCCTool(BaseTool):
    """
    Tool for querying GenCC gene-disease validity classifications.

    GenCC (Gene Curation Coalition) aggregates gene-disease validity
    assessments from multiple expert curators worldwide. Classifications
    range from Definitive to Refuted, following ClinGen gene-disease
    validity framework standards.

    No authentication required. Data is downloaded from GenCC bulk export.

    Supported operations (see ``run``): ``search_gene``, ``search_disease``
    and ``get_classifications``. Every operation returns a dict with a
    ``status`` key of ``"success"`` or ``"error"``.
    """

    # Label reported in the ``metadata.source`` field of every success response.
    _SOURCE_NAME = "GenCC (Gene Curation Coalition)"

    # Record columns copied into each submission entry: (output key, TSV column).
    _SUBMISSION_FIELDS = (
        ("gene_symbol", "gene_symbol"),
        ("gene_curie", "gene_curie"),
        ("disease_title", "disease_title"),
        ("disease_curie", "disease_curie"),
        ("classification", "classification_title"),
        ("mode_of_inheritance", "moi_title"),
        ("submitter", "submitter_title"),
        ("submitted_date", "submitted_as_date"),
    )

    def __init__(self, tool_config: Dict[str, Any]):
        """Store the tool configuration.

        Args:
            tool_config: Tool configuration dict. Its optional ``timeout``
                (default 120) and ``parameter`` (default ``{}``) entries are
                kept on the instance.
        """
        super().__init__(tool_config)
        # NOTE: the methods below call _download_gencc_data() without
        # arguments, so this value is stored but not forwarded to the
        # HTTP request.
        self.timeout: int = tool_config.get("timeout", 120)
        self.parameter = tool_config.get("parameter", {})

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the GenCC operation named in ``arguments["operation"]``.

        Args:
            arguments: Operation name plus the parameters of that operation.
                When ``operation`` is missing or empty it is taken from
                ``self.get_schema_const_operation()``.

        Returns:
            The selected operation's response dict, or an error dict when
            the operation name is not recognised.
        """
        arguments = arguments or {}
        operation = arguments.get("operation", "")
        # Auto-fill operation from tool config const if not provided by user
        if not operation:
            operation = self.get_schema_const_operation()

        if operation == "search_gene":
            return self._search_gene(arguments)
        if operation == "search_disease":
            return self._search_disease(arguments)
        if operation == "get_classifications":
            return self._get_classifications(arguments)
        return {
            "status": "error",
            "error": f"Unknown operation: {operation}. Supported: search_gene, search_disease, get_classifications",
        }

    @staticmethod
    def _argument_text(arguments: Dict[str, Any], name: str) -> str:
        """Return argument ``name`` as a stripped string ('' if absent or None)."""
        value = (arguments or {}).get(name)
        if value is None:
            return ""
        return str(value).strip()

    @staticmethod
    def _field(record: Dict[str, Any], name: str) -> str:
        """Return column ``name`` of a record, mapping missing/None to ''."""
        return record.get(name) or ""

    @staticmethod
    def _error_response(exc: Exception) -> Dict[str, Any]:
        """Translate an exception raised while serving a query into an error dict."""
        if isinstance(exc, requests.exceptions.RequestException):
            return {
                "status": "error",
                "error": f"Failed to download GenCC data: {str(exc)}",
            }
        return {"status": "error", "error": f"Unexpected error: {str(exc)}"}

    def _collect_submissions(
        self,
        matches: List[Dict[str, Any]],
        classification_filter: str,
        dedup_columns: tuple,
    ) -> List[Dict[str, str]]:
        """Filter, de-duplicate, format and rank matching records.

        Args:
            matches: Records already narrowed to the gene or disease queried.
            classification_filter: Optional case-insensitive substring that
                the record's ``classification_title`` must contain.
            dedup_columns: TSV columns whose combined values identify a
                duplicate; only the first record per combination is kept.

        Returns:
            Submission dicts sorted from strongest to weakest classification;
            titles missing from ``CLASSIFICATION_ORDER`` sort last.
        """
        if classification_filter:
            needle = classification_filter.lower()
            matches = [
                r
                for r in matches
                if needle in self._field(r, "classification_title").lower()
            ]

        results = []
        seen = set()
        for record in matches:
            key = tuple(self._field(record, column) for column in dedup_columns)
            if key in seen:
                continue
            seen.add(key)
            results.append(
                {
                    out_key: self._field(record, column)
                    for out_key, column in self._SUBMISSION_FIELDS
                }
            )

        # list.sort is stable, so equally ranked entries keep file order.
        results.sort(key=lambda x: CLASSIFICATION_ORDER.get(x["classification"], 99))
        return results

    def _search_gene(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Get gene-disease validity classifications for a gene.

        Args:
            arguments: Dict containing:
                - gene_symbol: HGNC gene symbol (e.g., BRCA2, TP53);
                  matched case-insensitively against the whole symbol
                - classification: Optional filter by classification level

        Returns:
            Success dict with the matching ``submissions``, their count and
            the number of distinct ``disease_curie`` values, or an error
            dict if ``gene_symbol`` is missing or the data cannot be loaded.
        """
        gene_symbol = self._argument_text(arguments, "gene_symbol").upper()
        if not gene_symbol:
            return {
                "status": "error",
                "error": "Missing required parameter: gene_symbol",
            }
        classification_filter = self._argument_text(arguments, "classification")

        try:
            records = _download_gencc_data()
            matches = [
                r for r in records if self._field(r, "gene_symbol").upper() == gene_symbol
            ]
            results = self._collect_submissions(
                matches,
                classification_filter,
                ("disease_curie", "submitter_title", "classification_title"),
            )
            return {
                "status": "success",
                "data": {
                    "gene_symbol": gene_symbol,
                    "submissions": results,
                    "submission_count": len(results),
                    "unique_diseases": len({r["disease_curie"] for r in results}),
                },
                "metadata": {
                    "source": self._SOURCE_NAME,
                    "gene_symbol": gene_symbol,
                },
            }
        except Exception as e:
            # Top-level boundary of the tool: report failures as error dicts.
            return self._error_response(e)

    def _search_disease(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Find genes with validity evidence for a disease.

        Args:
            arguments: Dict containing:
                - disease: Disease name or MONDO/OMIM ID to search; matched
                  as a case-insensitive substring of ``disease_title``,
                  ``disease_curie`` or ``disease_original_curie``
                - classification: Optional filter by classification level

        Returns:
            Success dict with the matching ``submissions``, their count and
            the number of distinct gene symbols, or an error dict if
            ``disease`` is missing or the data cannot be loaded.
        """
        disease = self._argument_text(arguments, "disease")
        if not disease:
            return {"status": "error", "error": "Missing required parameter: disease"}
        classification_filter = self._argument_text(arguments, "classification")

        try:
            records = _download_gencc_data()
            disease_lower = disease.lower()
            searched_columns = (
                "disease_title",
                "disease_curie",
                "disease_original_curie",
            )
            matches = [
                r
                for r in records
                if any(
                    disease_lower in self._field(r, column).lower()
                    for column in searched_columns
                )
            ]
            results = self._collect_submissions(
                matches,
                classification_filter,
                (
                    "gene_symbol",
                    "disease_curie",
                    "submitter_title",
                    "classification_title",
                ),
            )
            return {
                "status": "success",
                "data": {
                    "disease": disease,
                    "submissions": results,
                    "submission_count": len(results),
                    "unique_genes": len({r["gene_symbol"] for r in results}),
                },
                "metadata": {
                    "source": self._SOURCE_NAME,
                    "disease": disease,
                },
            }
        except Exception as e:
            # Top-level boundary of the tool: report failures as error dicts.
            return self._error_response(e)

    def _get_classifications(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Get summary of all gene-disease validity classification levels.

        Returns classification levels with counts and ranks, plus the 20
        submitters with the most records. Optionally restricted to one
        submitter organization.

        Args:
            arguments: Dict containing:
                - submitter: Optional case-insensitive substring of the
                  submitting organization name

        Returns:
            Success dict with per-classification counts, totals and top
            submitters, or an error dict if the data cannot be loaded.
        """
        submitter_filter = self._argument_text(arguments, "submitter")

        try:
            records = _download_gencc_data()
            if submitter_filter:
                needle = submitter_filter.lower()
                records = [
                    r
                    for r in records
                    if needle in self._field(r, "submitter_title").lower()
                ]

            # Tally records per classification and per submitter; blank or
            # missing titles are grouped under "Unknown".
            classification_counts: Dict[str, int] = {}
            submitter_counts: Dict[str, int] = {}
            for r in records:
                cls = self._field(r, "classification_title") or "Unknown"
                classification_counts[cls] = classification_counts.get(cls, 0) + 1
                sub = self._field(r, "submitter_title") or "Unknown"
                submitter_counts[sub] = submitter_counts.get(sub, 0) + 1

            # Strongest classification first; unrecognised titles get rank 99.
            classifications = [
                {
                    "classification": cls_name,
                    "count": classification_counts[cls_name],
                    "rank": CLASSIFICATION_ORDER.get(cls_name, 99),
                }
                for cls_name in sorted(
                    classification_counts,
                    key=lambda x: CLASSIFICATION_ORDER.get(x, 99),
                )
            ]

            # Stable descending sort: ties keep first-seen order.
            top_submitters = sorted(
                submitter_counts.items(), key=lambda item: item[1], reverse=True
            )[:20]

            return {
                "status": "success",
                "data": {
                    "classifications": classifications,
                    "total_submissions": len(records),
                    "unique_genes": len(
                        {self._field(r, "gene_symbol") for r in records}
                    ),
                    "unique_diseases": len(
                        {self._field(r, "disease_curie") for r in records}
                    ),
                    "top_submitters": [
                        {"name": name, "count": count}
                        for name, count in top_submitters
                    ],
                },
                "metadata": {
                    "source": self._SOURCE_NAME,
                    "note": "Classifications follow ClinGen gene-disease validity framework",
                },
            }
        except Exception as e:
            # Top-level boundary of the tool: report failures as error dicts.
            return self._error_response(e)