Source code for tooluniverse.rcsb_search_tool
"""
RCSB PDB Structure Search Tool
Tool for searching similar protein structures using RCSB PDB Search API v2.
Supports both sequence-based and structure-based similarity search.
"""
import requests
from typing import Dict, Any, Optional
from .base_tool import BaseTool
from .tool_registry import register_tool
[docs]
@register_tool("RCSBSearchTool")
class RCSBSearchTool(BaseTool):
"""
Tool for searching similar protein structures using RCSB PDB Search API v2.
Supports:
- Sequence-based similarity search
- Structure-based similarity search (using PDB ID)
- Text-based search (by name, keyword, etc.)
"""
[docs]
def __init__(self, tool_config: Dict):
    """Initialise the tool and record how to reach the RCSB Search API.

    Args:
        tool_config: Tool configuration; forwarded unchanged to the base
            class initialiser.
    """
    super().__init__(tool_config)
    # Seconds to wait for the search endpoint before a request is abandoned.
    self.timeout = 60
    # Single endpoint that receives every query built by this tool.
    self.api_url = "https://search.rcsb.org/rcsbsearch/v2/query"
[docs]
def _validate_pdb_id(self, pdb_id: str) -> bool:
"""Validate PDB ID format (4 characters, alphanumeric)"""
if not isinstance(pdb_id, str):
return False
pdb_id = pdb_id.strip().upper()
return len(pdb_id) == 4 and pdb_id.isalnum()
[docs]
def _validate_sequence(self, sequence: str) -> bool:
"""Validate protein sequence (amino acids only)"""
if not isinstance(sequence, str):
return False
sequence = sequence.strip().upper()
if len(sequence) < 10:
return False
# Valid amino acid codes
valid_aa = set("ACDEFGHIKLMNPQRSTVWY")
return all(c in valid_aa for c in sequence)
[docs]
def _build_sequence_query(
self, sequence: str, identity_cutoff: float, max_results: int
) -> Dict[str, Any]:
"""
Build sequence similarity search query.
Uses the correct RCSB Search API v2 format:
- Uses "value" parameter (not "target")
- Includes evalue_cutoff (required, default 0.1)
- Includes identity_cutoff (optional, 0-1)
- Includes sequence_type ("protein")
"""
# Convert identity_cutoff to evalue_cutoff if needed
# Lower identity_cutoff means higher similarity requirement
# We use a reasonable evalue_cutoff based on identity
evalue_cutoff = 0.1 # Default evalue cutoff
if identity_cutoff > 0.9:
evalue_cutoff = 0.001 # High similarity
elif identity_cutoff > 0.7:
evalue_cutoff = 0.01 # Medium-high similarity
else:
evalue_cutoff = 0.1 # Lower similarity
return {
"query": {
"type": "terminal",
"service": "sequence",
"parameters": {
"value": sequence.upper(),
"evalue_cutoff": evalue_cutoff,
"identity_cutoff": identity_cutoff,
"sequence_type": "protein",
},
},
"return_type": "entry",
"request_options": {
"paginate": {
"start": 0,
"rows": max_results,
},
"sort": [{"sort_by": "score", "direction": "desc"}],
},
}
[docs]
def _build_structure_query(
self, pdb_id: str, similarity_threshold: float, max_results: int
) -> Dict[str, Any]:
"""
Build structure similarity search query.
Uses the correct RCSB Search API v2 format:
- Uses "value" as an object with "entry_id" and "assembly_id"
- Includes "operator" (default: "strict_shape_match")
- Includes "target_search_space" (default: "assembly")
"""
return {
"query": {
"type": "terminal",
"service": "structure",
"parameters": {
"value": {
"entry_id": pdb_id.upper(),
"assembly_id": "1", # Default to first assembly
},
"operator": "strict_shape_match",
"target_search_space": "assembly",
},
},
"return_type": "entry",
"request_options": {
"paginate": {
"start": 0,
"rows": max_results,
},
"sort": [{"sort_by": "score", "direction": "desc"}],
},
}
[docs]
def _build_text_query(self, search_text: str, max_results: int) -> Dict[str, Any]:
"""
Build text search query.
Uses the correct RCSB Search API v2 format:
- Searches in multiple attributes
(struct.title, struct_keywords.pdbx_keywords)
- Uses OR logic to combine search conditions
- Supports pagination and sorting
"""
# Search in multiple attributes using OR logic
search_nodes = [
{
"type": "terminal",
"service": "text",
"parameters": {
"attribute": "struct.title",
"operator": "contains_words",
"value": search_text,
},
},
{
"type": "terminal",
"service": "text",
"parameters": {
"attribute": "struct_keywords.pdbx_keywords",
"operator": "contains_words",
"value": search_text,
},
},
]
return {
"query": {
"type": "group",
"logical_operator": "or",
"nodes": search_nodes,
},
"return_type": "entry",
"request_options": {
"paginate": {
"start": 0,
"rows": max_results,
},
"sort": [{"sort_by": "score", "direction": "desc"}],
},
}
[docs]
def _parse_search_results(self, response_data: Dict[str, Any]) -> list:
"""
Parse RCSB Search API response.
Expected response format:
{
"query_id": "...",
"result_type": "entry",
"total_count": 123,
"result_set": [
{"identifier": "6B3Q", "score": 1.0},
...
]
}
"""
results = []
if not isinstance(response_data, dict):
return results
# Extract result identifiers from result_set
result_set = response_data.get("result_set", [])
if not result_set:
return results
for idx, entry in enumerate(result_set):
# Entry is a dict with "identifier" and optionally "score"
if isinstance(entry, dict):
pdb_id = entry.get("identifier", entry.get("pdb_id", ""))
score = entry.get("score")
elif isinstance(entry, str):
# Fallback: if entry is just a string, use it as PDB ID
pdb_id = entry
score = None
else:
continue
if pdb_id:
result = {
"pdb_id": pdb_id,
"rank": idx + 1,
}
if score is not None:
result["score"] = score
results.append(result)
return results
[docs]
def run(
    self,
    arguments: Optional[Dict[str, Any]] = None,
    stream_callback: Optional[Any] = None,
    use_cache: bool = False,
    validate: bool = True,
) -> Dict[str, Any]:
    """Run a sequence, structure or text search against the RCSB Search API.

    Args:
        arguments: Dictionary containing:
            - query: PDB ID, protein sequence, or search text (required)
            - search_type: "sequence", "structure", or "text"
              (default: "sequence")
            - similarity_threshold: value clamped to 0-1 (default: 0.7).
              It is passed to the sequence query builder and the structure
              query builder, and is not used for text search. It is echoed
              back in the result for sequence and structure searches.
            - max_results: number of results, clamped to 1-100
              (default: 20)
            Values that cannot be converted fall back to the defaults.
        stream_callback: Accepted for interface compatibility; not used by
            this method.
        use_cache: Accepted for interface compatibility; not used by this
            method.
        validate: Accepted for interface compatibility; not used by this
            method.

    Returns:
        On success, a dictionary with ``query``, ``search_type``,
        ``total_found`` and ``results`` (plus ``similarity_threshold`` for
        non-text searches, and ``message`` when nothing was found). On any
        failure, a dictionary with an ``error`` message; request and
        parsing failures are reported this way rather than raised.
    """
    if arguments is None:
        arguments = {}

    query = arguments.get("query", "")
    if query:
        query = str(query).strip()

    search_type = arguments.get("search_type", "sequence")
    search_type = str(search_type).lower() if search_type else "sequence"

    # Coerce and clamp the numeric options. OverflowError is caught too:
    # float(10**400) and int(float("inf")) raise it rather than ValueError.
    similarity_threshold_raw = arguments.get("similarity_threshold", 0.7)
    try:
        similarity_threshold = float(similarity_threshold_raw)
        similarity_threshold = max(0.0, min(1.0, similarity_threshold))
    except (ValueError, TypeError, OverflowError):
        similarity_threshold = 0.7

    max_results_raw = arguments.get("max_results", 20)
    try:
        max_results = int(max_results_raw)
        max_results = max(1, min(100, max_results))
    except (ValueError, TypeError, OverflowError):
        max_results = 20

    if not query:
        return {
            "error": (
                "Missing required parameter: query. "
                "Provide either a PDB ID (e.g., '1ABC'), "
                "a protein sequence (amino acids), "
                "or search text (e.g., drug name, keyword)."
            ),
        }

    # Build the request body for the chosen search type. From here on
    # `query` is a non-empty, already stripped string.
    if search_type == "structure":
        if not self._validate_pdb_id(query):
            return {
                "error": (
                    f"Invalid PDB ID format: '{query}'. "
                    "PDB ID must be 4 alphanumeric characters "
                    "(e.g., '1ABC')."
                ),
            }
        api_query = self._build_structure_query(
            query, similarity_threshold, max_results
        )
    elif search_type == "sequence":
        if not self._validate_sequence(query):
            return {
                "error": (
                    f"Invalid protein sequence: '{query[:50]}...'. "
                    "Sequence must be at least 10 amino acids long "
                    "and contain only valid amino acid codes "
                    "(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, "
                    "R, S, T, V, W, Y)."
                ),
            }
        api_query = self._build_sequence_query(
            query, similarity_threshold, max_results
        )
    elif search_type == "text":
        api_query = self._build_text_query(query, max_results)
    else:
        return {
            "error": (
                f"Invalid search_type: '{search_type}'. "
                "Must be 'sequence', 'structure', or 'text'."
            ),
        }

    # Send the request and decode the response body.
    try:
        response = requests.post(
            self.api_url,
            json=api_query,
            headers={"Content-Type": "application/json"},
            timeout=self.timeout,
        )
        response.raise_for_status()
        # HTTP 204 or an empty body is treated as an empty result set.
        if response.status_code == 204 or not response.content:
            response_data = {
                "result_set": [],
                "total_count": 0,
            }
        else:
            try:
                response_data = response.json()
            except ValueError as e:
                # Handled here so an undecodable body is reported as a
                # parsing problem; recent requests versions raise a
                # JSONDecodeError that is also a RequestException, which
                # the handler below would label a network error.
                return {
                    "error": f"Error parsing search results: {str(e)}",
                    "raw_response": response.text[:500],
                }
    except requests.exceptions.Timeout:
        return {
            "error": (
                "Request timeout. The RCSB PDB Search API "
                "did not respond in time. Please try again later."
            ),
        }
    except requests.exceptions.HTTPError as e:
        http_response = getattr(e, "response", None)
        status_code = getattr(http_response, "status_code", None)
        error_detail = str(e)
        if http_response is not None:
            try:
                error_body = http_response.json()
            except Exception:
                # Deliberately best-effort: the error body is optional
                # extra detail, so any failure to decode it is ignored.
                error_body = None
            if isinstance(error_body, dict):
                api_message = error_body.get("message", "")
                if api_message:
                    error_detail = f"{str(e)}. API message: {api_message}"

        if status_code == 400:
            return {
                "error": (
                    f"Invalid request to RCSB PDB Search API: "
                    f"{error_detail}. "
                    "Please check your query parameters. "
                    "Note: The API query format may need adjustment. "
                    "See documentation at "
                    "https://search.rcsb.org/redoc/index.html"
                ),
            }
        if status_code == 404 and search_type == "structure":
            return {
                "error": (
                    "Structure not found or does not support "
                    "similarity search. "
                    f"The PDB ID '{query}' "
                    "may not exist in the database or may not support "
                    "structure similarity search. "
                    "Please verify the PDB ID is correct."
                ),
            }
        if status_code == 404:
            # Sequence and text searches carry no PDB ID, so the
            # structure-specific wording would be misleading here.
            return {
                "error": (
                    "RCSB PDB Search API returned HTTP 404 (not found): "
                    f"{error_detail}"
                ),
            }
        return {
            "error": (
                f"RCSB PDB Search API error "
                f"(HTTP {status_code}): {error_detail}"
            ),
        }
    except requests.exceptions.RequestException as e:
        return {
            "error": (
                f"Network error while connecting to RCSB PDB Search API: {str(e)}"
            ),
        }
    except Exception as e:
        return {
            "error": f"Unexpected error during search: {str(e)}",
        }

    # Turn the decoded body into the tool's result structure.
    try:
        results = self._parse_search_results(response_data)
        # total_count is taken from the response when present; otherwise
        # the number of parsed results is used.
        total_found = response_data.get("total_count", len(results))
    except Exception as e:
        return {
            "error": f"Error parsing search results: {str(e)}",
            "raw_response": str(response_data)[:500],
        }

    is_text_search = search_type == "text"
    if not results:
        if is_text_search:
            message = f"No structures found matching '{query}'."
        else:
            message = (
                f"No similar structures found with "
                f"similarity threshold >= {similarity_threshold}."
            )
        return {
            "query": query,
            "search_type": search_type,
            "similarity_threshold": (
                None if is_text_search else similarity_threshold
            ),
            "total_found": total_found,
            "results": [],
            "message": message,
        }

    result_dict = {
        "query": query,
        "search_type": search_type,
        "total_found": total_found,
        "results": results,
    }
    # Only sequence/structure searches report a similarity threshold.
    if not is_text_search:
        result_dict["similarity_threshold"] = similarity_threshold
    return result_dict