tooluniverse.gencc_tool 源代码

"""
GenCC (Gene Curation Coalition) tool for ToolUniverse.

GenCC aggregates gene-disease validity classifications from multiple expert
curators (ClinGen, Ambry Genetics, Genomics England PanelApp, OMIM, Orphanet,
etc.). It provides standardized gene-disease validity evidence levels:
Definitive, Strong, Moderate, Supportive, Limited, Disputed Evidence,
Refuted Evidence, Animal Model Only, and No Known Disease Relationship.

Since GenCC has no REST API, this tool downloads and parses the TSV bulk
export from https://search.thegencc.org/download/action/submissions-export-tsv

API Documentation:
- GenCC website: https://thegencc.org
- Data downloads: https://search.thegencc.org/download

Data is cached for 1 hour to avoid repeated downloads.
"""

import csv
import io
import time
import requests
from typing import Dict, Any, List
from .base_tool import BaseTool
from .tool_registry import register_tool

# Bulk TSV export that this module downloads and parses.
GENCC_TSV_URL = "https://search.thegencc.org/download/action/submissions-export-tsv"

# Module-level cache so repeated calls in one process share a single download.
_gencc_cache: Dict[str, Any] = {
    "data": None,  # parsed rows; None until the first successful download
    "timestamp": 0,  # time.time() value recorded when "data" was stored
    "ttl": 60 * 60,  # seconds a cached download stays valid (1 hour)
}

# Classification hierarchy from most to least confident; a lower number sorts
# first. Titles missing from this map are ranked 99 by the lookups in this
# module, i.e. they sort last.
#
# GenCC's published validity terms include "Supportive" (placed between
# Moderate and Limited) and spell the negative categories "Disputed Evidence" /
# "Refuted Evidence". The short "Disputed" / "Refuted" spellings are kept as
# aliases with the same rank so either form sorts correctly.
# NOTE(review): confirm the exact `classification_title` strings against a
# current TSV export.
CLASSIFICATION_ORDER = {
    "Definitive": 1,
    "Strong": 2,
    "Moderate": 3,
    "Supportive": 4,
    "Limited": 5,
    "Animal Model Only": 6,
    "Disputed Evidence": 7,
    "Disputed": 7,
    "Refuted Evidence": 8,
    "Refuted": 8,
    "No Known Disease Relationship": 9,
}


def _disease_matches(record: Dict[str, str], query_lower: str) -> bool:
    """Match a GenCC record by disease name or curie, with word-tokenized fallback.

    Exact substring match first, against the disease title, the disease curie
    and the originally submitted curie. If that fails, falls back to requiring
    all query words to appear in the disease title (handles hyphens in GenCC
    names, e.g. 'breast cancer' matches 'breast-ovarian cancer, familial...').

    Args:
        record: One parsed TSV row. Fields may be absent or None.
        query_lower: The search text, already lower-cased by the caller.

    Returns:
        True if the record matches the query, otherwise False.
    """
    # csv.DictReader fills the missing columns of a short row with None, so a
    # plain .get(key, "") default is not enough to keep .lower() safe.
    title = (record.get("disease_title") or "").lower()
    curie = (record.get("disease_curie") or "").lower()
    orig = (record.get("disease_original_curie") or "").lower()
    if query_lower in title or query_lower in curie or query_lower in orig:
        return True
    # Word-tokenized fallback: all words in the query must appear in title.
    # Single-word queries were already covered by the substring test above.
    words = query_lower.split()
    return len(words) > 1 and all(w in title for w in words)


def _gene_matches(record: Dict[str, str], gene_symbol: str) -> bool:
    """Match a GenCC record by gene symbol, checking both current and submitted HGNC symbol.

    Needed because HGNC renames genes (e.g. GBA → GBA1); submissions may use the old name.

    The comparison is case-insensitive on both sides, so mixed-case symbols
    such as 'C9orf72' match without the caller upper-casing them first.

    Args:
        record: One parsed TSV row. Fields may be absent or None.
        gene_symbol: Gene symbol to look for.

    Returns:
        True if either the current or the submitted symbol equals the query.
    """
    wanted = gene_symbol.upper()
    # csv.DictReader fills the missing columns of a short row with None, so
    # coalesce to "" before calling .upper().
    current = (record.get("gene_symbol") or "").upper()
    submitted = (record.get("submitted_as_hgnc_symbol") or "").upper()
    return current == wanted or submitted == wanted


def _download_gencc_data() -> List[Dict[str, str]]:
    """Download and parse GenCC TSV data with caching.

    A cached copy is returned while it is younger than the cache's ``ttl``.
    Otherwise the TSV export is fetched, parsed into one dict per row and
    stored in the module-level cache. A response that parses to no rows, or
    that lacks the columns this module filters on, is rejected instead of
    being cached, so a transient bad response (e.g. an HTML error page served
    with status 200) cannot blank out results for a whole TTL period.

    Returns:
        The parsed rows. The same list object is shared with the cache, so
        callers should treat it as read-only.

    Raises:
        requests.exceptions.RequestException: On network failure or a non-2xx
            HTTP status.
        ValueError: If the response is empty or missing expected columns.
    """
    now = time.time()
    cached = _gencc_cache["data"]
    if cached is not None and (now - _gencc_cache["timestamp"]) < _gencc_cache["ttl"]:
        return cached

    response = requests.get(
        GENCC_TSV_URL,
        timeout=120,
        headers={"User-Agent": "ToolUniverse/GenCC"},
    )
    response.raise_for_status()

    # Without a declared charset, requests assumes ISO-8859-1 for text/*
    # content types and otherwise falls back to charset detection over the
    # whole body. Default to UTF-8 instead.
    # NOTE(review): assumes the export is UTF-8 encoded - confirm.
    content_type = response.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        response.encoding = "utf-8"
    # A leading byte-order mark would otherwise become part of the first
    # column name.
    text = response.text.lstrip("\ufeff")

    reader = csv.DictReader(io.StringIO(text), delimiter="\t")
    records = list(reader)

    # Refuse to cache something that is clearly not the submissions export.
    required = {"gene_symbol", "disease_title", "classification_title"}
    missing = required.difference(reader.fieldnames or ())
    if missing or not records:
        raise ValueError(
            "GenCC export is empty or missing expected columns: "
            f"{sorted(missing)}"
        )

    _gencc_cache["data"] = records
    _gencc_cache["timestamp"] = now
    return records


@register_tool("GenCCTool")
class GenCCTool(BaseTool):
    """
    Tool for querying GenCC gene-disease validity classifications.

    GenCC (Gene Curation Coalition) aggregates gene-disease validity
    assessments from multiple expert curators worldwide. Classifications
    range from Definitive to Refuted, following ClinGen gene-disease
    validity framework standards.

    No authentication required. Data is downloaded from GenCC bulk export.

    Every operation returns a dict with ``status`` set to ``"success"`` or
    ``"error"``; failures are reported in the ``error`` field rather than
    raised.
    """

    def __init__(self, tool_config: Dict[str, Any]):
        """Store the tool configuration.

        Args:
            tool_config: Tool configuration dict. The optional ``timeout``
                (default 120) and ``parameter`` (default ``{}``) entries are
                kept on the instance.
        """
        super().__init__(tool_config)
        # NOTE(review): the module-level download helper is called without
        # arguments and uses its own fixed timeout, so this value is stored
        # but not forwarded to the HTTP request.
        self.timeout: int = tool_config.get("timeout", 120)
        self.parameter = tool_config.get("parameter", {})

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute GenCC operation.

        Args:
            arguments: Dict whose ``operation`` entry selects one of
                ``search_gene``, ``search_disease`` or
                ``get_classifications``; the remaining entries are passed to
                that operation unchanged.

        Returns:
            The selected operation's result dict, or an error dict for an
            unknown operation.
        """
        operation = arguments.get("operation", "")
        # Auto-fill operation from tool config const if not provided by user
        if not operation:
            operation = self.get_schema_const_operation()

        handlers = {
            "search_gene": self._search_gene,
            "search_disease": self._search_disease,
            "get_classifications": self._get_classifications,
        }
        # Guard the lookup so an unhashable value still yields the error dict.
        handler = handlers.get(operation) if isinstance(operation, str) else None
        if handler is None:
            return {
                "status": "error",
                "error": f"Unknown operation: {operation}. Supported: search_gene, search_disease, get_classifications",
            }
        return handler(arguments)

    @staticmethod
    def _text_arg(value: Any) -> str:
        """Return an argument value as a stripped string; None becomes ''."""
        if value is None:
            return ""
        return str(value).strip()

    @staticmethod
    def _field(record: Dict[str, Any], name: str) -> str:
        """Return a record field, treating a missing or None value as ''."""
        return record.get(name) or ""

    @staticmethod
    def _error_response(exc: Exception) -> Dict[str, Any]:
        """Build the error dict for an exception raised while serving a query."""
        if isinstance(exc, requests.exceptions.RequestException):
            return {
                "status": "error",
                "error": f"Failed to download GenCC data: {str(exc)}",
            }
        return {"status": "error", "error": f"Unexpected error: {str(exc)}"}

    def _filter_by_classification(
        self, records: List[Dict[str, Any]], classification_filter: str
    ) -> List[Dict[str, Any]]:
        """Keep records whose classification title contains the filter text."""
        if not classification_filter:
            return records
        needle = classification_filter.lower()
        return [
            r
            for r in records
            if needle in self._field(r, "classification_title").lower()
        ]

    def _build_submissions(
        self, records: List[Dict[str, Any]], key_fields: List[str]
    ) -> List[Dict[str, str]]:
        """Deduplicate records on key_fields, shape them, and sort by strength."""
        results = []
        seen = set()
        for r in records:
            key = tuple(self._field(r, name) for name in key_fields)
            if key in seen:
                continue
            seen.add(key)
            results.append(
                {
                    "gene_symbol": self._field(r, "gene_symbol"),
                    "gene_curie": self._field(r, "gene_curie"),
                    "disease_title": self._field(r, "disease_title"),
                    "disease_curie": self._field(r, "disease_curie"),
                    "classification": self._field(r, "classification_title"),
                    "mode_of_inheritance": self._field(r, "moi_title"),
                    "submitter": self._field(r, "submitter_title"),
                    "submitted_date": self._field(r, "submitted_as_date"),
                }
            )
        # Sort by classification strength; titles not in the map sort last.
        results.sort(key=lambda x: CLASSIFICATION_ORDER.get(x["classification"], 99))
        return results

    def _search_gene(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Get gene-disease validity classifications for a gene.

        Args:
            arguments: Dict containing:
                - gene_symbol: HGNC gene symbol (e.g., BRCA2, TP53)
                - classification: Optional filter by classification level

        Returns:
            Dict with the matching submissions (deduplicated per disease,
            submitter and classification), or an error dict.
        """
        gene_symbol = self._text_arg(arguments.get("gene_symbol")).upper()
        if not gene_symbol:
            return {
                "status": "error",
                "error": "Missing required parameter: gene_symbol",
            }
        classification_filter = self._text_arg(arguments.get("classification"))

        try:
            records = _download_gencc_data()
            matches = [r for r in records if _gene_matches(r, gene_symbol)]
            matches = self._filter_by_classification(matches, classification_filter)
            results = self._build_submissions(
                matches, ["disease_curie", "submitter_title", "classification_title"]
            )

            metadata: Dict[str, Any] = {
                "source": "GenCC (Gene Curation Coalition)",
                "gene_symbol": gene_symbol,
            }
            if not results:
                # Point the caller at the likeliest cause of an empty result.
                metadata["note"] = (
                    f"No GenCC submissions found for '{gene_symbol}'. "
                    "GenCC uses current HGNC-approved gene symbols. "
                    "If the gene was recently renamed (e.g. GBA→GBA1), "
                    "try the current approved symbol from HGNC."
                )

            return {
                "status": "success",
                "data": {
                    "gene_symbol": gene_symbol,
                    "submissions": results,
                    "submission_count": len(results),
                    "unique_diseases": len({r["disease_curie"] for r in results}),
                },
                "metadata": metadata,
            }
        except Exception as e:
            return self._error_response(e)

    def _search_disease(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Find genes with validity evidence for a disease.

        Args:
            arguments: Dict containing:
                - disease: Disease name or MONDO/OMIM ID to search
                  (``disease_name`` is accepted as an alternative key)
                - classification: Optional filter by classification level

        Returns:
            Dict with the matching submissions (deduplicated per gene,
            disease, submitter and classification), or an error dict.
        """
        disease = self._text_arg(
            arguments.get("disease") or arguments.get("disease_name")
        )
        if not disease:
            return {"status": "error", "error": "Missing required parameter: disease"}
        classification_filter = self._text_arg(arguments.get("classification"))

        try:
            records = _download_gencc_data()
            # Search by disease name (case-insensitive contains) or disease curie
            disease_lower = disease.lower()
            matches = [r for r in records if _disease_matches(r, disease_lower)]
            matches = self._filter_by_classification(matches, classification_filter)
            results = self._build_submissions(
                matches,
                [
                    "gene_symbol",
                    "disease_curie",
                    "submitter_title",
                    "classification_title",
                ],
            )

            return {
                "status": "success",
                "data": {
                    "disease": disease,
                    "submissions": results,
                    "submission_count": len(results),
                    "unique_genes": len({r["gene_symbol"] for r in results}),
                },
                "metadata": {
                    "source": "GenCC (Gene Curation Coalition)",
                    "disease": disease,
                },
            }
        except Exception as e:
            return self._error_response(e)

    def _get_classifications(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Get summary of all gene-disease validity classification levels.

        Returns per-classification counts (ordered by strength), overall
        totals, and the 20 submitters with the most submissions.

        Args:
            arguments: Dict containing:
                - submitter: Optional filter by submitting organization name
                - gene_symbol: Optional filter by gene symbol
                - disease: Optional filter by disease name or curie

        Returns:
            Dict with the summary, or an error dict.
        """
        submitter_filter = self._text_arg(arguments.get("submitter"))
        gene_symbol = self._text_arg(arguments.get("gene_symbol")).upper()
        disease_filter = self._text_arg(arguments.get("disease")).lower()

        try:
            records = _download_gencc_data()
            if gene_symbol:
                records = [r for r in records if _gene_matches(r, gene_symbol)]
            if disease_filter:
                records = [r for r in records if _disease_matches(r, disease_filter)]
            if submitter_filter:
                needle = submitter_filter.lower()
                records = [
                    r
                    for r in records
                    if needle in self._field(r, "submitter_title").lower()
                ]

            # Count classifications and submitters; blank or missing values
            # are grouped under "Unknown".
            classification_counts: Dict[str, int] = {}
            submitter_counts: Dict[str, int] = {}
            for r in records:
                cls_name = r.get("classification_title") or "Unknown"
                classification_counts[cls_name] = (
                    classification_counts.get(cls_name, 0) + 1
                )
                sub = r.get("submitter_title") or "Unknown"
                submitter_counts[sub] = submitter_counts.get(sub, 0) + 1

            # Build ordered classification summary
            classifications = [
                {
                    "classification": cls_name,
                    "count": classification_counts[cls_name],
                    "rank": CLASSIFICATION_ORDER.get(cls_name, 99),
                }
                for cls_name in sorted(
                    classification_counts,
                    key=lambda x: CLASSIFICATION_ORDER.get(x, 99),
                )
            ]

            # Top submitters
            top_submitters = sorted(submitter_counts.items(), key=lambda x: -x[1])[:20]

            return {
                "status": "success",
                "data": {
                    "classifications": classifications,
                    "total_submissions": len(records),
                    "unique_genes": len(
                        {self._field(r, "gene_symbol") for r in records}
                    ),
                    "unique_diseases": len(
                        {self._field(r, "disease_curie") for r in records}
                    ),
                    "top_submitters": [
                        {"name": s[0], "count": s[1]} for s in top_submitters
                    ],
                },
                "metadata": {
                    "source": "GenCC (Gene Curation Coalition)",
                    "note": "Classifications follow ClinGen gene-disease validity framework",
                },
            }
        except Exception as e:
            return self._error_response(e)