Source code for tooluniverse.rcsb_data_tool
# rcsb_data_tool.py
"""
RCSB PDB Data API tool for ToolUniverse.
Provides access to the RCSB PDB Data API REST endpoints for retrieving
detailed structural metadata, assembly information, and non-polymer entity
(ligand/small molecule) data from PDB structures.
API: https://data.rcsb.org/
No authentication required. Free public access.
"""
import requests
from typing import Dict, Any, Optional
from .base_tool import BaseTool
RCSB_DATA_BASE_URL = "https://data.rcsb.org/rest/v1/core"
[docs]
class RCSBDataTool(BaseTool):
"""
Tool for RCSB PDB Data API providing direct REST access to PDB
structure metadata, biological assembly info, and non-polymer entities.
Complements existing RCSB GraphQL and Search tools by providing
simpler, direct access to individual resource endpoints.
No authentication required.
"""
[docs]
def __init__(self, tool_config: Dict[str, Any]):
super().__init__(tool_config)
self.timeout = tool_config.get("timeout", 30)
fields = tool_config.get("fields", {})
self.endpoint = fields.get("endpoint", "entry")
[docs]
def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Execute the RCSB Data API call."""
try:
return self._query(arguments)
except requests.exceptions.Timeout:
return {"error": f"RCSB Data API timed out after {self.timeout}s"}
except requests.exceptions.ConnectionError:
return {"error": "Failed to connect to RCSB Data API"}
except requests.exceptions.HTTPError as e:
code = e.response.status_code if e.response is not None else "unknown"
if code == 404:
return {"error": f"Entry not found in RCSB PDB: {arguments}"}
return {"error": f"RCSB Data API HTTP error: {code}"}
except Exception as e:
return {"error": f"Unexpected error querying RCSB Data API: {str(e)}"}
[docs]
def _query(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Route to appropriate endpoint."""
if self.endpoint == "entry":
return self._get_entry(arguments)
elif self.endpoint == "assembly":
return self._get_assembly(arguments)
elif self.endpoint == "nonpolymer_entity":
return self._get_nonpolymer_entity(arguments)
else:
return {"error": f"Unknown endpoint: {self.endpoint}"}
[docs]
def _get_entry(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Get comprehensive entry details for a PDB structure."""
pdb_id = arguments.get("pdb_id", "").upper()
if not pdb_id:
return {"error": "pdb_id parameter is required (e.g., '4HHB', '1TUP')"}
url = f"{RCSB_DATA_BASE_URL}/entry/{pdb_id}"
response = requests.get(url, timeout=self.timeout)
response.raise_for_status()
data = response.json()
entry_info = data.get("rcsb_entry_info", {})
struct = data.get("struct", {})
exptl = data.get("exptl", [])
accession = data.get("rcsb_accession_info", {})
cell = data.get("cell", {})
symmetry = data.get("symmetry", {})
# Extract experiment details
experiments = []
for exp in exptl:
experiments.append(
{
"method": exp.get("method"),
"crystals_number": exp.get("crystals_number"),
}
)
# Extract resolution
resolution = entry_info.get("resolution_combined", [])
return {
"data": {
"pdb_id": data.get("rcsb_id"),
"title": struct.get("title"),
"method": experiments[0].get("method") if experiments else None,
"resolution": resolution[0] if resolution else None,
"deposit_date": accession.get("deposit_date"),
"release_date": accession.get("initial_release_date"),
"revision_date": accession.get("revision_date"),
"polymer_entity_count": entry_info.get("polymer_entity_count"),
"nonpolymer_entity_count": entry_info.get("nonpolymer_entity_count"),
"deposited_atom_count": entry_info.get("deposited_atom_count"),
"deposited_model_count": entry_info.get(
"deposited_modeled_polymer_monomer_count"
),
"molecular_weight": entry_info.get("molecular_weight"),
"assembly_count": entry_info.get("assembly_count"),
"space_group": symmetry.get("space_group_name_H_M"),
"unit_cell": {
"a": cell.get("length_a"),
"b": cell.get("length_b"),
"c": cell.get("length_c"),
"alpha": cell.get("angle_alpha"),
"beta": cell.get("angle_beta"),
"gamma": cell.get("angle_gamma"),
}
if cell
else None,
},
"metadata": {
"source": "RCSB PDB Data API",
"pdb_id": pdb_id,
},
}
[docs]
def _get_assembly(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Get biological assembly details for a PDB structure."""
pdb_id = arguments.get("pdb_id", "").upper()
assembly_id = arguments.get("assembly_id", "1")
if not pdb_id:
return {"error": "pdb_id parameter is required (e.g., '4HHB', '1TUP')"}
url = f"{RCSB_DATA_BASE_URL}/assembly/{pdb_id}/{assembly_id}"
response = requests.get(url, timeout=self.timeout)
response.raise_for_status()
data = response.json()
assembly_info = data.get("rcsb_assembly_info", {})
struct_assembly = data.get("pdbx_struct_assembly", {})
oper_list = data.get("pdbx_struct_oper_list", [])
auth_evidence = data.get("pdbx_struct_assembly_auth_evidence", [])
# Extract operations
operations = []
for op in oper_list[:10]:
operations.append(
{
"id": op.get("id"),
"type": op.get("type"),
"name": op.get("name"),
}
)
# Extract evidence
evidence = []
for ev in auth_evidence[:5]:
evidence.append(
{
"experimental_support": ev.get("experimental_support"),
"details": ev.get("details"),
}
)
return {
"data": {
"pdb_id": pdb_id,
"assembly_id": data.get("rcsb_id"),
"oligomeric_details": struct_assembly.get("oligomeric_details"),
"oligomeric_count": struct_assembly.get("oligomeric_count"),
"method_details": struct_assembly.get("method_details"),
"polymer_entity_count": assembly_info.get("polymer_entity_count"),
"nonpolymer_entity_count": assembly_info.get("nonpolymer_entity_count"),
"polymer_entity_instance_count": assembly_info.get(
"polymer_entity_instance_count"
),
"total_polymer_monomer_count": assembly_info.get(
"total_polymer_monomer_count"
),
"operations": operations,
"evidence": evidence,
},
"metadata": {
"source": "RCSB PDB Data API - Assembly",
"pdb_id": pdb_id,
"assembly_id": assembly_id,
},
}
[docs]
def _get_nonpolymer_entity(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Get non-polymer entity (ligand/small molecule) details."""
pdb_id = arguments.get("pdb_id", "").upper()
entity_id = arguments.get("entity_id", "")
if not pdb_id or not entity_id:
return {
"error": "Both pdb_id and entity_id are required (e.g., pdb_id='4HHB', entity_id='3')"
}
url = f"{RCSB_DATA_BASE_URL}/nonpolymer_entity/{pdb_id}/{entity_id}"
response = requests.get(url, timeout=self.timeout)
response.raise_for_status()
data = response.json()
entity_nonpoly = data.get("pdbx_entity_nonpoly", {})
nonpoly_info = data.get("rcsb_nonpolymer_entity", {})
container_ids = data.get("rcsb_nonpolymer_entity_container_identifiers", {})
data.get("nonpolymer_comp", {})
drugbank = data.get("rcsb_nonpolymer_entity_annotation", [])
# Extract annotations
annotations = []
for ann in drugbank[:10]:
annotations.append(
{
"type": ann.get("type"),
"annotation_id": ann.get("annotation_id"),
"name": ann.get("name"),
"description": ann.get("description"),
}
)
return {
"data": {
"pdb_id": pdb_id,
"entity_id": data.get("rcsb_id"),
"name": entity_nonpoly.get("name"),
"comp_id": entity_nonpoly.get("comp_id"),
"formula_weight": nonpoly_info.get("formula_weight"),
"details": nonpoly_info.get("details"),
"auth_asym_ids": container_ids.get("auth_asym_ids", []),
"annotations": annotations,
},
"metadata": {
"source": "RCSB PDB Data API - Nonpolymer Entity",
"pdb_id": pdb_id,
"entity_id": entity_id,
},
}