tooluniverse.gdc_tool 源代码

import json
from typing import Any, Dict
from urllib.parse import urlencode
from urllib.request import Request, urlopen

from tooluniverse.tool_registry import register_tool


def _http_get(
    url: str,
    headers: Dict[str, str] | None = None,
    timeout: int = 30,
) -> Dict[str, Any]:
    req = Request(url, headers=headers or {})
    with urlopen(req, timeout=timeout) as resp:
        data = resp.read()
        try:
            return json.loads(data.decode("utf-8", errors="ignore"))
        except Exception:
            return {"raw": data.decode("utf-8", errors="ignore")}


def _http_post(
    url: str,
    payload: Dict[str, Any],
    headers: Dict[str, str] | None = None,
    timeout: int = 30,
) -> Dict[str, Any]:
    """POST request helper for GDC API."""
    headers = headers or {}
    headers["Content-Type"] = "application/json"
    data = json.dumps(payload).encode("utf-8")
    req = Request(url, data=data, headers=headers, method="POST")
    with urlopen(req, timeout=timeout) as resp:
        response_data = resp.read()
        try:
            return json.loads(response_data.decode("utf-8", errors="ignore"))
        except Exception:
            return {"raw": response_data.decode("utf-8", errors="ignore")}


[文档] @register_tool( "GDCCasesTool", config={ "name": "GDC_search_cases", "type": "GDCCasesTool", "description": "Search NCI GDC cases via /cases", "parameter": { "type": "object", "properties": { "project_id": { "type": "string", "description": "GDC project identifier (e.g., 'TCGA-BRCA')", }, "size": { "type": "integer", "default": 10, "minimum": 1, "maximum": 100, "description": "Number of results (1–100)", }, "offset": { "type": "integer", "default": 0, "minimum": 0, "description": "Offset for pagination (0-based)", }, }, }, "settings": {"base_url": "https://api.gdc.cancer.gov", "timeout": 30}, }, ) class GDCCasesTool:
[文档] def __init__(self, tool_config=None): self.tool_config = tool_config or {}
[文档] def run(self, arguments: Dict[str, Any]): base = self.tool_config.get("settings", {}).get( "base_url", "https://api.gdc.cancer.gov" ) timeout = int(self.tool_config.get("settings", {}).get("timeout", 30)) query: Dict[str, Any] = {} if arguments.get("project_id"): # Build filters JSON for project_id filters = { "op": "=", "content": { "field": "project.project_id", "value": [arguments["project_id"]], }, } query["filters"] = json.dumps(filters) if arguments.get("size") is not None: query["size"] = int(arguments["size"]) if arguments.get("offset") is not None: query["from"] = int(arguments["offset"]) url = f"{base}/cases?{urlencode(query)}" try: data = _http_get( url, headers={"Accept": "application/json"}, timeout=timeout ) return { "status": "success", "source": "GDC", "endpoint": "cases", "query": query, "data": data, "success": True, } except Exception as e: return { "status": "error", "error": str(e), "source": "GDC", "endpoint": "cases", "success": False, }
[文档] @register_tool( "GDCFilesTool", config={ "name": "GDC_list_files", "type": "GDCFilesTool", "description": "List NCI GDC files via /files with optional data_type filter", "parameter": { "type": "object", "properties": { "data_type": { "type": "string", "description": "Data type filter (e.g., 'Gene Expression Quantification')", }, "size": { "type": "integer", "default": 10, "minimum": 1, "maximum": 100, "description": "Number of results (1–100)", }, "offset": { "type": "integer", "default": 0, "minimum": 0, "description": "Offset for pagination (0-based)", }, }, }, "settings": {"base_url": "https://api.gdc.cancer.gov", "timeout": 30}, }, ) class GDCFilesTool:
[文档] def __init__(self, tool_config=None): self.tool_config = tool_config or {}
[文档] def run(self, arguments: Dict[str, Any]): base = self.tool_config.get("settings", {}).get( "base_url", "https://api.gdc.cancer.gov" ) timeout = int(self.tool_config.get("settings", {}).get("timeout", 30)) query: Dict[str, Any] = {} if arguments.get("data_type"): filters = { "op": "=", "content": { "field": "files.data_type", "value": [arguments["data_type"]], }, } query["filters"] = json.dumps(filters) if arguments.get("size") is not None: query["size"] = int(arguments["size"]) if arguments.get("offset") is not None: query["from"] = int(arguments["offset"]) url = f"{base}/files?{urlencode(query)}" try: data = _http_get( url, headers={"Accept": "application/json"}, timeout=timeout ) return { "status": "success", "source": "GDC", "endpoint": "files", "query": query, "data": data, "success": True, } except Exception as e: return { "status": "error", "error": str(e), "source": "GDC", "endpoint": "files", "success": False, }
[文档] @register_tool( "GDCProjectsTool", config={ "name": "GDC_list_projects", "type": "GDCProjectsTool", "description": "List GDC projects (TCGA, TARGET, etc.) with summary statistics", "parameter": { "type": "object", "properties": { "program": { "type": "string", "description": "Filter by program (e.g., 'TCGA', 'TARGET')", }, "size": { "type": "integer", "default": 20, "minimum": 1, "maximum": 100, "description": "Number of results (1–100)", }, }, }, "settings": {"base_url": "https://api.gdc.cancer.gov", "timeout": 30}, }, ) class GDCProjectsTool: """List GDC projects including TCGA and TARGET cohorts."""
[文档] def __init__(self, tool_config=None): self.tool_config = tool_config or {}
[文档] def run(self, arguments: Dict[str, Any]): base = self.tool_config.get("settings", {}).get( "base_url", "https://api.gdc.cancer.gov" ) timeout = int(self.tool_config.get("settings", {}).get("timeout", 30)) query: Dict[str, Any] = { "fields": "project_id,name,primary_site,disease_type,program.name,summary.case_count,summary.file_count", } if arguments.get("program"): filters = { "op": "=", "content": { "field": "program.name", "value": [arguments["program"]], }, } query["filters"] = json.dumps(filters) if arguments.get("size") is not None: query["size"] = int(arguments["size"]) url = f"{base}/projects?{urlencode(query)}" try: data = _http_get( url, headers={"Accept": "application/json"}, timeout=timeout ) return { "status": "success", "source": "GDC", "endpoint": "projects", "data": data, } except Exception as e: return { "status": "error", "error": str(e), "source": "GDC", "endpoint": "projects", }
[文档] @register_tool( "GDCSSMTool", config={ "name": "GDC_get_ssm_by_gene", "type": "GDCSSMTool", "description": "Get somatic mutations (SSMs) for a gene across TCGA/GDC projects", "parameter": { "type": "object", "properties": { "gene_symbol": { "type": "string", "description": "Gene symbol (e.g., 'TP53', 'EGFR', 'BRAF')", }, "project_id": { "type": "string", "description": "Optional: Filter by project (e.g., 'TCGA-BRCA')", }, "size": { "type": "integer", "default": 20, "minimum": 1, "maximum": 100, "description": "Number of results (1–100)", }, }, "required": ["gene_symbol"], }, "settings": {"base_url": "https://api.gdc.cancer.gov", "timeout": 30}, }, ) class GDCSSMTool: """Query somatic mutations from GDC/TCGA."""
[文档] def __init__(self, tool_config=None): self.tool_config = tool_config or {}
[文档] def run(self, arguments: Dict[str, Any]): base = self.tool_config.get("settings", {}).get( "base_url", "https://api.gdc.cancer.gov" ) timeout = int(self.tool_config.get("settings", {}).get("timeout", 30)) gene_symbol = arguments.get("gene_symbol") if not gene_symbol: return {"status": "error", "error": "gene_symbol parameter is required"} # Build filters filter_content = [ { "op": "in", "content": { "field": "consequence.transcript.gene.symbol", "value": [gene_symbol], }, } ] if arguments.get("project_id"): filter_content.append( { "op": "=", "content": { "field": "cases.project.project_id", "value": [arguments["project_id"]], }, } ) filters = {"op": "and", "content": filter_content} query = { "filters": json.dumps(filters), "fields": "ssm_id,genomic_dna_change,mutation_type,consequence.transcript.gene.symbol,consequence.transcript.aa_change,consequence.transcript.consequence_type", "size": arguments.get("size", 20), } url = f"{base}/ssms?{urlencode(query)}" try: data = _http_get( url, headers={"Accept": "application/json"}, timeout=timeout ) return { "status": "success", "source": "GDC", "endpoint": "ssms", "gene": gene_symbol, "data": data, } except Exception as e: return { "status": "error", "error": str(e), "source": "GDC", "endpoint": "ssms", }
[文档] @register_tool( "GDCGeneExpressionTool", config={ "name": "GDC_get_gene_expression", "type": "GDCGeneExpressionTool", "description": "Query gene expression data availability from GDC/TCGA", "parameter": { "type": "object", "properties": { "project_id": { "type": "string", "description": "GDC project (e.g., 'TCGA-BRCA', 'TCGA-LUAD')", }, "gene_id": { "type": "string", "description": "Ensembl gene ID (e.g., 'ENSG00000141510' for TP53)", }, "size": { "type": "integer", "default": 10, "minimum": 1, "maximum": 100, "description": "Number of results", }, }, "required": ["project_id"], }, "settings": {"base_url": "https://api.gdc.cancer.gov", "timeout": 30}, }, ) class GDCGeneExpressionTool: """Query gene expression files from GDC."""
[文档] def __init__(self, tool_config=None): self.tool_config = tool_config or {}
[文档] def run(self, arguments: Dict[str, Any]): base = self.tool_config.get("settings", {}).get( "base_url", "https://api.gdc.cancer.gov" ) timeout = int(self.tool_config.get("settings", {}).get("timeout", 30)) project_id = arguments.get("project_id") if not project_id: return {"status": "error", "error": "project_id parameter is required"} # Build filters for gene expression files filters = { "op": "and", "content": [ { "op": "=", "content": { "field": "cases.project.project_id", "value": [project_id], }, }, { "op": "=", "content": { "field": "data_type", "value": ["Gene Expression Quantification"], }, }, { "op": "=", "content": { "field": "experimental_strategy", "value": ["RNA-Seq"], }, }, ], } query = { "filters": json.dumps(filters), "fields": "file_id,file_name,data_type,experimental_strategy,workflow_type,cases.case_id,cases.submitter_id", "size": arguments.get("size", 10), } url = f"{base}/files?{urlencode(query)}" try: data = _http_get( url, headers={"Accept": "application/json"}, timeout=timeout ) return { "status": "success", "source": "GDC", "endpoint": "gene_expression", "project": project_id, "data": data, } except Exception as e: return { "status": "error", "error": str(e), "source": "GDC", }
[文档] @register_tool( "GDCCNVTool", config={ "name": "GDC_get_cnv_data", "type": "GDCCNVTool", "description": "Query copy number variation (CNV) data from GDC/TCGA", "parameter": { "type": "object", "properties": { "project_id": { "type": "string", "description": "GDC project (e.g., 'TCGA-BRCA')", }, "gene_symbol": { "type": "string", "description": "Optional: Gene symbol to filter CNVs", }, "size": { "type": "integer", "default": 10, "minimum": 1, "maximum": 100, "description": "Number of results", }, }, "required": ["project_id"], }, "settings": {"base_url": "https://api.gdc.cancer.gov", "timeout": 30}, }, ) class GDCCNVTool: """Query copy number variation data from GDC."""
[文档] def __init__(self, tool_config=None): self.tool_config = tool_config or {}
[文档] def run(self, arguments: Dict[str, Any]): base = self.tool_config.get("settings", {}).get( "base_url", "https://api.gdc.cancer.gov" ) timeout = int(self.tool_config.get("settings", {}).get("timeout", 30)) project_id = arguments.get("project_id") if not project_id: return {"status": "error", "error": "project_id parameter is required"} # Build filters for CNV files filters = { "op": "and", "content": [ { "op": "=", "content": { "field": "cases.project.project_id", "value": [project_id], }, }, { "op": "in", "content": { "field": "data_type", "value": ["Copy Number Segment", "Gene Level Copy Number"], }, }, ], } query = { "filters": json.dumps(filters), "fields": "file_id,file_name,data_type,experimental_strategy,workflow_type,cases.case_id", "size": arguments.get("size", 10), } url = f"{base}/files?{urlencode(query)}" try: data = _http_get( url, headers={"Accept": "application/json"}, timeout=timeout ) return { "status": "success", "source": "GDC", "endpoint": "cnv", "project": project_id, "data": data, } except Exception as e: return { "status": "error", "error": str(e), "source": "GDC", }
[文档] @register_tool( "GDCMutationFrequencyTool", config={ "name": "GDC_get_mutation_frequency", "type": "GDCMutationFrequencyTool", "description": ( "Get pan-cancer mutation frequency statistics for a gene across all TCGA projects. " "Returns overall and per-project mutation rates. Note: this tool is pan-cancer only " "and does not support filtering by cancer type." ), "parameter": { "type": "object", "properties": { "gene_symbol": { "type": "string", "description": "Gene symbol (e.g., 'TP53', 'KRAS')", }, "gene": { "type": "string", "description": "Gene symbol alias — alternative to gene_symbol", }, }, "required": [], }, "settings": {"base_url": "https://api.gdc.cancer.gov", "timeout": 30}, }, ) class GDCMutationFrequencyTool: """Get mutation frequency for a gene across cancer types."""
[文档] def __init__(self, tool_config=None): self.tool_config = tool_config or {}
[文档] def run(self, arguments: Dict[str, Any]): base = self.tool_config.get("settings", {}).get( "base_url", "https://api.gdc.cancer.gov" ) timeout = int(self.tool_config.get("settings", {}).get("timeout", 30)) gene_symbol = arguments.get("gene_symbol") or arguments.get("gene") if not gene_symbol: return {"status": "error", "error": "gene_symbol parameter is required"} # Step 1: Get gene metadata gene_filters = json.dumps( {"op": "=", "content": {"field": "symbol", "value": [gene_symbol]}} ) gene_url = f"{base}/genes?{urlencode({'filters': gene_filters, 'fields': 'symbol,name,gene_id,biotype,description,is_cancer_gene_census'})}" gene_info = {} try: gene_data = _http_get( gene_url, headers={"Accept": "application/json"}, timeout=timeout ) hits = gene_data.get("data", {}).get("hits", []) if hits: gene_info = hits[0] except Exception: pass # Step 2: Get SSM occurrence count via /ssm_occurrences with gene filter + project facet # Feature-81A-003: /ssm_occurrences requires the nested "ssm." prefix; # /ssms uses "consequence.transcript.gene.symbol" directly. ssm_filters = json.dumps( { "op": "in", "content": { "field": "ssm.consequence.transcript.gene.symbol", "value": [gene_symbol], }, } ) # Feature-83A-004: /ssm_occurrences does not support facets on # cases.project.project_id (returns warnings and empty aggregations). # Use size=0 for a count-only query. ssm_query = {"filters": ssm_filters, "size": 0} ssm_url = f"{base}/ssm_occurrences?{urlencode(ssm_query)}" try: ssm_data = _http_get( ssm_url, headers={"Accept": "application/json"}, timeout=timeout ) pagination = ssm_data.get("data", {}).get("pagination", {}) total_ssm_occurrences = pagination.get("total", 0) return { "status": "success", "source": "GDC", "gene": gene_symbol, "data": { "gene_info": gene_info, "total_ssm_occurrences": total_ssm_occurrences, "is_cancer_gene_census": gene_info.get( "is_cancer_gene_census", None ), }, } except Exception as e: return { "status": "error", "error": str(e), "source": "GDC", }
[文档] @register_tool( "GDCClinicalDataTool", config={ "name": "GDC_get_clinical_data", "type": "GDCClinicalDataTool", "description": ( "Get detailed clinical data for cancer cases from NCI GDC/TCGA. " "Returns demographics (gender, race, vital_status, age_at_index), " "diagnoses (primary_diagnosis, tumor_stage, age_at_diagnosis, days_to_last_follow_up), " "and treatments (therapeutic_agents, treatment_type). " "Filter by project, primary_site, disease_type, or vital_status." ), "parameter": { "type": "object", "properties": { "project_id": { "type": "string", "description": "GDC project identifier (e.g., 'TCGA-BRCA', 'TCGA-LUAD', 'TARGET-AML')", }, "primary_site": { "type": "string", "description": "Primary anatomical site (e.g., 'Breast', 'Lung', 'Brain')", }, "disease_type": { "type": "string", "description": "Disease type filter (e.g., 'Ductal and Lobular Neoplasms')", }, "vital_status": { "type": "string", "description": "Vital status filter: 'Alive' or 'Dead'", "enum": ["Alive", "Dead"], }, "gender": { "type": "string", "description": "Gender filter: 'female' or 'male'", "enum": ["female", "male"], }, "size": { "type": "integer", "default": 10, "minimum": 1, "maximum": 100, "description": "Number of cases to return (1-100)", }, "offset": { "type": "integer", "default": 0, "minimum": 0, "description": "Pagination offset (0-based)", }, }, }, "settings": {"base_url": "https://api.gdc.cancer.gov", "timeout": 30}, }, ) class GDCClinicalDataTool: """Get detailed clinical data for GDC/TCGA cancer cases.""" _CLINICAL_FIELDS = ",".join( [ "case_id", "submitter_id", "project.project_id", "project.name", "primary_site", "disease_type", ] ) _FILTER_MAP = { "project_id": "project.project_id", "primary_site": "primary_site", "disease_type": "disease_type", "vital_status": "demographic.vital_status", "gender": "demographic.gender", }
[文档] def __init__(self, tool_config=None): self.tool_config = tool_config or {}
[文档] def run(self, arguments: Dict[str, Any]): base = self.tool_config.get("settings", {}).get( "base_url", "https://api.gdc.cancer.gov" ) timeout = int(self.tool_config.get("settings", {}).get("timeout", 30)) conditions = [] for param, field in self._FILTER_MAP.items(): value = arguments.get(param) if value: conditions.append( {"op": "=", "content": {"field": field, "value": [value]}} ) query: Dict[str, Any] = { "fields": self._CLINICAL_FIELDS, "expand": "diagnoses,demographic,treatments", "size": min( int(arguments.get("size") or arguments.get("limit") or 10), 100 ), "from": int(arguments.get("offset", 0)), } if conditions: if len(conditions) == 1: query["filters"] = json.dumps(conditions[0]) else: query["filters"] = json.dumps({"op": "and", "content": conditions}) url = f"{base}/cases?{urlencode(query)}" try: raw = _http_get( url, headers={"Accept": "application/json"}, timeout=timeout ) hits = raw.get("data", {}).get("hits", []) pagination = raw.get("data", {}).get("pagination", {}) cases = [] for hit in hits: demo = hit.get("demographic", {}) or {} diagnoses_raw = hit.get("diagnoses", []) or [] treatments_raw = hit.get("treatments", []) or [] project = hit.get("project", {}) or {} case_record = { "case_id": hit.get("case_id"), "submitter_id": hit.get("submitter_id"), "project_id": project.get("project_id"), "project_name": project.get("name"), "primary_site": hit.get("primary_site"), "disease_type": hit.get("disease_type"), "gender": demo.get("gender"), "race": demo.get("race"), "ethnicity": demo.get("ethnicity"), "vital_status": demo.get("vital_status"), "age_at_index": demo.get("age_at_index"), "days_to_birth": demo.get("days_to_birth"), "days_to_death": demo.get("days_to_death"), "year_of_death": demo.get("year_of_death"), "diagnoses": [ { "primary_diagnosis": dx.get("primary_diagnosis"), "age_at_diagnosis": dx.get("age_at_diagnosis"), "tumor_stage": dx.get("ajcc_pathologic_stage"), "tumor_grade": dx.get("tumor_grade"), "morphology": dx.get("morphology"), "tissue_or_organ_of_origin": dx.get( "tissue_or_organ_of_origin" ), "days_to_last_follow_up": dx.get("days_to_last_follow_up"), "classification_of_tumor": dx.get( "classification_of_tumor" ), "icd_10_code": dx.get("icd_10_code"), "year_of_diagnosis": dx.get("year_of_diagnosis"), } for dx in diagnoses_raw ], "treatments": [ { "treatment_type": tx.get("treatment_type"), "therapeutic_agents": tx.get("therapeutic_agents"), "treatment_or_therapy": tx.get("treatment_or_therapy"), } for tx in treatments_raw ], } cases.append(case_record) return { "status": "success", "data": { "cases": cases, "pagination": { "total": pagination.get("total", 0), "count": pagination.get("count", 0), "page": pagination.get("page", 0), "pages": pagination.get("pages", 0), }, }, } except Exception as e: return {"status": "error", "error": str(e)}
[文档] @register_tool( "GDCSurvivalTool", config={ "name": "GDC_get_survival", "type": "GDCSurvivalTool", "description": ( "Get Kaplan-Meier survival data for a GDC/TCGA cancer cohort. " "Returns time-to-event data with censoring status and survival estimates " "for each patient. Filter by project and optionally by gene mutation status. " "Use for overall survival analysis of TCGA cancer types." ), "parameter": { "type": "object", "properties": { "project_id": { "type": "string", "description": "GDC project identifier (e.g., 'TCGA-BRCA', 'TCGA-LUAD', 'TCGA-GBM')", }, "gene_symbol": { "type": "string", "description": "Optional: gene symbol to filter cases with mutations in this gene (e.g., 'TP53', 'KRAS')", }, }, "required": ["project_id"], }, "settings": {"base_url": "https://api.gdc.cancer.gov", "timeout": 30}, }, ) class GDCSurvivalTool: """Get Kaplan-Meier survival data for GDC/TCGA cohorts."""
[文档] def __init__(self, tool_config=None): self.tool_config = tool_config or {}
[文档] def run(self, arguments: Dict[str, Any]): base = self.tool_config.get("settings", {}).get( "base_url", "https://api.gdc.cancer.gov" ) timeout = int(self.tool_config.get("settings", {}).get("timeout", 30)) project_id = arguments.get("project_id") if not project_id: return {"status": "error", "error": "project_id parameter is required"} # Build filter for project conditions = [ { "op": "=", "content": { "field": "project.project_id", "value": project_id, }, } ] gene_symbol = arguments.get("gene_symbol") if gene_symbol: conditions.append( { "op": "in", "content": { "field": "gene.symbol", "value": [gene_symbol], }, } ) if len(conditions) == 1: filters = conditions[0] else: filters = {"op": "and", "content": conditions} query = {"filters": json.dumps(filters)} url = f"{base}/analysis/survival?{urlencode(query)}" try: raw = _http_get( url, headers={"Accept": "application/json"}, timeout=timeout ) results = raw.get("results", []) if not results: return { "status": "success", "data": { "project_id": project_id, "gene_symbol": gene_symbol, "total_donors": 0, "donors": [], }, } donors = results[0].get("donors", []) # Summarize survival statistics alive_count = sum(1 for d in donors if d.get("censored")) dead_count = len(donors) - alive_count times = [d.get("time", 0) for d in donors] max_time = max(times) if times else 0 median_time = sorted(times)[len(times) // 2] if times else 0 return { "status": "success", "data": { "project_id": project_id, "gene_symbol": gene_symbol, "total_donors": len(donors), "alive_censored": alive_count, "deceased": dead_count, "max_follow_up_days": max_time, "median_follow_up_days": median_time, "donors": donors[:50], "note": ( f"Showing first 50 of {len(donors)} donors. " "Each donor has: time (days), censored (true=alive), survivalEstimate (KM estimate)." if len(donors) > 50 else None ), }, } except Exception as e: return {"status": "error", "error": str(e)}