Source code for tooluniverse.rcsb_graphql_tool

# rcsb_graphql_tool.py
"""
RCSB PDB GraphQL Data API tool for ToolUniverse.

The RCSB PDB Data API (GraphQL) provides rich, structured access to
PDB structure details, ligand/chemical component information, and
polymer entity data. Supports batch queries for multiple entries.

API: https://data.rcsb.org/graphql
No authentication required. Free public access.
"""

import requests
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool

RCSB_GRAPHQL_URL = "https://data.rcsb.org/graphql"


[docs] @register_tool("RCSBGraphQLTool") class RCSBGraphQLTool(BaseTool): """ Tool for querying the RCSB PDB Data API via GraphQL. Supports: - Structure summary (title, resolution, method, citation, etc.) - Chemical component / ligand details (formula, SMILES, targets) - Polymer entity details (sequence, annotations, descriptions) No authentication required. """
[docs] def __init__(self, tool_config: Dict[str, Any]): super().__init__(tool_config) self.timeout = tool_config.get("timeout", 30) fields = tool_config.get("fields", {}) self.endpoint = fields.get("endpoint", "structure_summary")
[docs] def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Execute the RCSB GraphQL query.""" try: return self._query(arguments) except requests.exceptions.Timeout: return {"error": f"RCSB Data API timed out after {self.timeout}s"} except requests.exceptions.ConnectionError: return {"error": "Failed to connect to RCSB Data API"} except Exception as e: return {"error": f"Unexpected error querying RCSB Data API: {str(e)}"}
[docs] def _query(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Route to appropriate GraphQL query.""" if self.endpoint == "structure_summary": return self._get_structure_summary(arguments) elif self.endpoint == "ligand_info": return self._get_ligand_info(arguments) elif self.endpoint == "polymer_entity": return self._get_polymer_entity(arguments) else: return {"error": f"Unknown endpoint: {self.endpoint}"}
[docs] def _execute_graphql(self, query_str: str) -> Dict[str, Any]: """Execute a GraphQL query against the RCSB Data API.""" response = requests.post( RCSB_GRAPHQL_URL, json={"query": query_str}, headers={"Content-Type": "application/json"}, timeout=self.timeout, ) response.raise_for_status() result = response.json() if "errors" in result: error_msgs = [e.get("message", "") for e in result["errors"]] return {"error": f"GraphQL errors: {'; '.join(error_msgs)}"} return result
[docs] def _get_structure_summary(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get comprehensive structure summary for one or more PDB IDs.""" pdb_ids = arguments.get("pdb_ids") pdb_id = arguments.get("pdb_id") if pdb_id and not pdb_ids: pdb_ids = [pdb_id] elif isinstance(pdb_ids, str): pdb_ids = [p.strip() for p in pdb_ids.split(",")] if not pdb_ids: return { "error": "pdb_id or pdb_ids parameter is required (e.g., '4HHB' or '4HHB,1TUP')" } # Uppercase PDB IDs pdb_ids = [p.upper().strip() for p in pdb_ids] ids_str = '", "'.join(pdb_ids) query_str = f'''{{ entries(entry_ids: ["{ids_str}"]) {{ rcsb_id struct {{ title }} rcsb_entry_info {{ resolution_combined molecular_weight deposited_atom_count polymer_entity_count nonpolymer_entity_count experimental_method }} rcsb_accession_info {{ deposit_date initial_release_date }} rcsb_primary_citation {{ pdbx_database_id_PubMed title journal_abbrev year }} exptl {{ method }} cell {{ length_a length_b length_c }} }} }}''' result = self._execute_graphql(query_str) if "error" in result: return result entries = result.get("data", {}).get("entries", []) if not entries: return { "data": [], "metadata": { "source": "RCSB PDB GraphQL Data API", "requested_ids": pdb_ids, "returned": 0, }, } structures = [] for entry in entries: if entry is None: continue info = entry.get("rcsb_entry_info") or {} accession = entry.get("rcsb_accession_info") or {} citation = entry.get("rcsb_primary_citation") or {} struct = entry.get("struct") or {} cell = entry.get("cell") or {} resolution = info.get("resolution_combined") if isinstance(resolution, list) and resolution: resolution = resolution[0] structures.append( { "pdb_id": entry.get("rcsb_id"), "title": struct.get("title"), "resolution": resolution, "molecular_weight_kda": info.get("molecular_weight"), "atom_count": info.get("deposited_atom_count"), "polymer_entity_count": info.get("polymer_entity_count"), "nonpolymer_entity_count": info.get("nonpolymer_entity_count"), "experimental_method": info.get("experimental_method"), "deposit_date": accession.get("deposit_date"), "release_date": accession.get("initial_release_date"), "citation_pubmed_id": citation.get("pdbx_database_id_PubMed"), "citation_title": citation.get("title"), "citation_journal": citation.get("journal_abbrev"), "citation_year": citation.get("year"), "unit_cell_a": cell.get("length_a"), "unit_cell_b": cell.get("length_b"), "unit_cell_c": cell.get("length_c"), } ) return { "data": structures, "metadata": { "source": "RCSB PDB GraphQL Data API", "requested_ids": pdb_ids, "returned": len(structures), }, }
[docs] def _get_ligand_info(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get chemical component (ligand) information from PDB.""" comp_id = arguments.get("comp_id", "") if not comp_id: return { "error": "comp_id parameter is required (e.g., 'ATP', 'HEM', 'NAG')" } comp_id = comp_id.upper().strip() query_str = f'''{{ chem_comp(comp_id: "{comp_id}") {{ chem_comp {{ id name formula formula_weight type mon_nstd_parent_comp_id }} rcsb_chem_comp_descriptor {{ InChIKey SMILES }} rcsb_chem_comp_info {{ initial_release_date }} rcsb_chem_comp_target {{ target_actions comp_id name provenance_source reference_database_accession_code reference_database_name }} }} }}''' result = self._execute_graphql(query_str) if "error" in result: return result comp = result.get("data", {}).get("chem_comp") if not comp: return {"error": f"Chemical component '{comp_id}' not found in PDB"} basic = comp.get("chem_comp") or {} descriptor = comp.get("rcsb_chem_comp_descriptor") or {} info = comp.get("rcsb_chem_comp_info") or {} targets_raw = comp.get("rcsb_chem_comp_target") or [] targets = [] for t in targets_raw: targets.append( { "name": t.get("name"), "actions": t.get("target_actions"), "provenance": t.get("provenance_source"), "accession": t.get("reference_database_accession_code"), "database": t.get("reference_database_name"), } ) return { "data": { "comp_id": basic.get("id"), "name": basic.get("name"), "formula": basic.get("formula"), "formula_weight": basic.get("formula_weight"), "type": basic.get("type"), "parent_comp_id": basic.get("mon_nstd_parent_comp_id"), "inchikey": descriptor.get("InChIKey"), "smiles": descriptor.get("SMILES"), "initial_release_date": info.get("initial_release_date"), "targets": targets, }, "metadata": { "source": "RCSB PDB GraphQL Data API", "comp_id": comp_id, "target_count": len(targets), }, }
[docs] def _get_polymer_entity(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get polymer entity details (sequence, annotations, etc.).""" entity_ids = arguments.get("entity_ids") pdb_id = arguments.get("pdb_id") entity_num = arguments.get("entity_num") or 1 if pdb_id and not entity_ids: entity_ids = [f"{pdb_id.upper().strip()}_{entity_num}"] elif isinstance(entity_ids, str): entity_ids = [e.strip() for e in entity_ids.split(",")] if not entity_ids: return { "error": "pdb_id or entity_ids parameter is required (e.g., pdb_id='4HHB' or entity_ids='4HHB_1,4HHB_2')" } ids_str = '", "'.join(entity_ids) query_str = f'''{{ polymer_entities(entity_ids: ["{ids_str}"]) {{ rcsb_id rcsb_polymer_entity {{ pdbx_description }} entity_poly {{ pdbx_strand_id rcsb_entity_polymer_type type pdbx_seq_one_letter_code_can }} rcsb_polymer_entity_annotation {{ annotation_id type description }} rcsb_entity_source_organism {{ scientific_name ncbi_taxonomy_id }} }} }}''' result = self._execute_graphql(query_str) if "error" in result: return result entities = result.get("data", {}).get("polymer_entities", []) if not entities: return { "data": [], "metadata": { "source": "RCSB PDB GraphQL Data API", "requested_ids": entity_ids, "returned": 0, }, } polymer_list = [] for entity in entities: if entity is None: continue desc_obj = entity.get("rcsb_polymer_entity") or {} poly = entity.get("entity_poly") or {} annotations_raw = entity.get("rcsb_polymer_entity_annotation") or [] organisms_raw = entity.get("rcsb_entity_source_organism") or [] annotations = [] for a in annotations_raw: annotations.append( { "id": a.get("annotation_id"), "type": a.get("type"), "description": a.get("description"), } ) organisms = [] for o in organisms_raw: organisms.append( { "scientific_name": o.get("scientific_name"), "ncbi_taxonomy_id": o.get("ncbi_taxonomy_id"), } ) sequence = poly.get("pdbx_seq_one_letter_code_can", "") polymer_list.append( { "entity_id": entity.get("rcsb_id"), "description": desc_obj.get("pdbx_description"), "chain_ids": poly.get("pdbx_strand_id"), "polymer_type": poly.get("rcsb_entity_polymer_type"), "entity_type": poly.get("type"), "sequence": sequence, "sequence_length": len(sequence) if sequence else 0, "annotations": annotations, "source_organisms": organisms, } ) return { "data": polymer_list, "metadata": { "source": "RCSB PDB GraphQL Data API", "requested_ids": entity_ids, "returned": len(polymer_list), }, }