Source code for tooluniverse.pdc_tool

"""
PDC (Proteomics Data Commons) Tool - NCI Cancer Proteomics Database

Provides access to the PDC GraphQL API for querying cancer proteomics data
from programs like CPTAC, ICPC, APOLLO, HTAN, and others.

API: https://pdc.cancer.gov/graphql
Authentication: None required (free public API).
"""

import requests
from typing import Dict, Any, Optional
from .base_tool import BaseTool
from .tool_registry import register_tool

PDC_GRAPHQL_URL = "https://pdc.cancer.gov/graphql"


def _execute_graphql(
    query: str, variables: Optional[Dict] = None, timeout: int = 30
) -> Dict[str, Any]:
    """Execute a GraphQL query against PDC."""
    payload = {"query": query}
    if variables:
        payload["variables"] = variables
    try:
        response = requests.post(
            PDC_GRAPHQL_URL,
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=timeout,
        )
        if response.status_code != 200:
            return {
                "ok": False,
                "error": "PDC API returned HTTP %d" % response.status_code,
            }
        data = response.json()
        if "errors" in data:
            msgs = "; ".join(e.get("message", str(e)) for e in data["errors"])
            return {"ok": False, "error": "GraphQL error: %s" % msgs}
        return {"ok": True, "data": data.get("data", {})}
    except requests.exceptions.Timeout:
        return {"ok": False, "error": "PDC API request timed out"}
    except requests.exceptions.ConnectionError:
        return {"ok": False, "error": "Failed to connect to PDC API"}
    except Exception as e:
        return {"ok": False, "error": "Request failed: %s" % str(e)}


[docs] @register_tool("PDCTool") class PDCTool(BaseTool): """ Tool for querying the NCI Proteomics Data Commons (PDC). PDC houses annotated proteomics data from CPTAC, ICPC, APOLLO, CBTN, and other cancer research programs covering 19+ cancer types with 160+ datasets. Provides access to: - Study search and metadata (disease type, analytical fraction, experiment type) - Gene/protein information with spectral counts across studies - Program and project listings (CPTAC, ICPC, APOLLO, etc.) - Detailed study summaries with file counts - Clinical data per study (demographics, diagnoses) """
[docs] def __init__(self, tool_config: Dict[str, Any]): super().__init__(tool_config) self.parameter = tool_config.get("parameter", {}) self.required = self.parameter.get("required", [])
[docs] def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Execute a PDC query.""" operation = arguments.get("operation") if not operation: return {"status": "error", "error": "Missing required parameter: operation"} handlers = { "search_studies": self._search_studies, "get_gene_protein": self._get_gene_protein, "list_programs": self._list_programs, "get_study_summary": self._get_study_summary, "get_clinical_data": self._get_clinical_data, } handler = handlers.get(operation) if not handler: return { "status": "error", "error": "Unknown operation: %s" % operation, "available_operations": list(handlers.keys()), } try: return handler(arguments) except requests.exceptions.Timeout: return {"status": "error", "error": "PDC API request timed out"} except requests.exceptions.ConnectionError: return {"status": "error", "error": "Failed to connect to PDC API"} except Exception as e: return {"status": "error", "error": "Operation failed: %s" % str(e)}
[docs] def _search_studies(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Search PDC studies by name/keyword.""" query_text = arguments.get("query") if not query_text: return { "status": "error", "error": "query parameter is required for study search", } gql = """ { studySearch(name: "%s") { studies { study_id name pdc_study_id submitter_id_name } total } } """ % query_text.replace('"', '\\"') result = _execute_graphql(gql, timeout=30) if not result["ok"]: return {"status": "error", "error": result["error"]} search_data = result["data"].get("studySearch", {}) studies = search_data.get("studies", []) return { "status": "success", "data": { "query": query_text, "studies": studies, "num_results": len(studies), }, }
[docs] def _get_gene_protein(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get protein information and study coverage for a gene symbol.""" gene_name = arguments.get("gene_name") if not gene_name: return { "status": "error", "error": "gene_name parameter is required", } gql = """ { geneSpectralCount(gene_name: "%s") { gene_id gene_name NCBI_gene_id authority description organism proteins spectral_counts { study_id pdc_study_id project_id spectral_count distinct_peptide unshared_peptide } } } """ % gene_name.replace('"', '\\"') result = _execute_graphql(gql, timeout=30) if not result["ok"]: return {"status": "error", "error": result["error"]} gene_data = result["data"].get("geneSpectralCount", []) if not gene_data: return { "status": "error", "error": "Gene '%s' not found in PDC" % gene_name, } # The API returns a list but typically one entry for the gene gene_info = gene_data[0] # Parse protein accessions (semicolon-separated string) proteins_str = gene_info.get("proteins", "") protein_list = ( [p.strip() for p in proteins_str.split(";") if p.strip()] if proteins_str else [] ) return { "status": "success", "data": { "gene_id": gene_info.get("gene_id"), "gene_name": gene_info.get("gene_name"), "ncbi_gene_id": gene_info.get("NCBI_gene_id"), "authority": gene_info.get("authority"), "description": gene_info.get("description"), "organism": gene_info.get("organism"), "proteins": protein_list, "num_proteins": len(protein_list), "spectral_counts": gene_info.get("spectral_counts", []), "num_studies": len(gene_info.get("spectral_counts", [])), }, }
[docs] def _list_programs(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """List all PDC programs and their projects.""" gql = """ { allPrograms { program_id name projects { project_id name } } } """ result = _execute_graphql(gql, timeout=30) if not result["ok"]: return {"status": "error", "error": result["error"]} programs = result["data"].get("allPrograms", []) return { "status": "success", "data": { "programs": programs, "num_programs": len(programs), }, }
[docs] def _get_study_summary(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get detailed metadata for a specific study by PDC study ID.""" pdc_study_id = arguments.get("pdc_study_id") if not pdc_study_id: return { "status": "error", "error": "pdc_study_id parameter is required (e.g., 'PDC000127')", } gql = """ { study(pdc_study_id: "%s") { study_id study_name pdc_study_id disease_type primary_site analytical_fraction experiment_type cases_count aliquots_count program_name project_name embargo_date filesCount { data_category file_type files_count } } } """ % pdc_study_id.replace('"', '\\"') result = _execute_graphql(gql, timeout=30) if not result["ok"]: return {"status": "error", "error": result["error"]} study_data = result["data"].get("study", []) if not study_data: return { "status": "error", "error": "Study '%s' not found in PDC" % pdc_study_id, } study = study_data[0] return { "status": "success", "data": { "study_id": study.get("study_id"), "study_name": study.get("study_name"), "pdc_study_id": study.get("pdc_study_id"), "disease_type": study.get("disease_type"), "primary_site": study.get("primary_site"), "analytical_fraction": study.get("analytical_fraction"), "experiment_type": study.get("experiment_type"), "cases_count": study.get("cases_count"), "aliquots_count": study.get("aliquots_count"), "program_name": study.get("program_name"), "project_name": study.get("project_name"), "embargo_date": study.get("embargo_date"), "file_counts": study.get("filesCount", []), }, }
[docs] def _get_clinical_data(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get clinical metadata for samples in a study.""" pdc_study_id = arguments.get("pdc_study_id") if not pdc_study_id: return { "status": "error", "error": "pdc_study_id parameter is required (e.g., 'PDC000127')", } offset = arguments.get("offset", 0) limit = arguments.get("limit", 20) gql = """ { paginatedCaseDemographicsPerStudy( pdc_study_id: "%s", offset: %d, limit: %d ) { total caseDemographicsPerStudy { case_id case_submitter_id disease_type primary_site demographics { gender ethnicity race } } } } """ % (pdc_study_id.replace('"', '\\"'), offset, limit) result = _execute_graphql(gql, timeout=30) if not result["ok"]: return {"status": "error", "error": result["error"]} paginated = result["data"].get("paginatedCaseDemographicsPerStudy", {}) cases = paginated.get("caseDemographicsPerStudy", []) total = paginated.get("total", 0) return { "status": "success", "data": { "pdc_study_id": pdc_study_id, "total_cases": total, "offset": offset, "limit": limit, "cases": cases, "num_returned": len(cases), }, }