Source code for tooluniverse.proteins_api_tool

"""
Proteins API Tool

This tool provides access to the EBI Proteins API for comprehensive protein
annotations, variation data, proteomics, and reference genome mappings.
"""

import requests
from typing import Any, Dict, Optional, List, Union
from concurrent.futures import ThreadPoolExecutor, as_completed
from .base_tool import BaseTool
from .tool_registry import register_tool


@register_tool("ProteinsAPIRESTTool")
class ProteinsAPIRESTTool(BaseTool):
    """
    Proteins API REST tool.

    Generic wrapper for Proteins API endpoints defined in
    proteins_api_tools.json. The class is registered with the tool registry
    under the name "ProteinsAPIRESTTool" and derives from BaseTool.
    """
def __init__(self, tool_config: Dict):
    """Set up the HTTP session used for every Proteins API request.

    Args:
        tool_config: Tool configuration, forwarded unchanged to the base
            class initializer.
    """
    super().__init__(tool_config)

    # Headers sent with every request made through the shared session.
    default_headers = {
        "Accept": "application/json",
        "User-Agent": "ToolUniverse/1.0",
    }
    http_session = requests.Session()
    http_session.headers.update(default_headers)

    self.base_url = "https://www.ebi.ac.uk/proteins/api"
    self.session = http_session
    self.timeout = 30  # passed as ``timeout=`` to every session.get call
[docs] def _build_url(self, args: Dict[str, Any]) -> str: """Build URL from endpoint template and arguments""" endpoint_template = self.tool_config["fields"].get("endpoint", "") tool_name = self.tool_config.get("name", "") if endpoint_template: url = endpoint_template for k, v in args.items(): url = url.replace(f"{{{k}}}", str(v)) return url # Build URL based on tool name if tool_name == "proteins_api_get_protein": accession = args.get("accession", "") if accession: return f"{self.base_url}/proteins/{accession}" elif tool_name == "proteins_api_get_variants": accession = args.get("accession", "") if accession: # Use the variation API endpoint (not the proteins endpoint) return f"{self.base_url}/variation" elif tool_name == "proteins_api_get_proteomics": accession = args.get("accession", "") if accession: # Try proteomics endpoint, fallback to main protein endpoint return f"{self.base_url}/proteins/{accession}/proteomics" elif tool_name == "proteins_api_get_epitopes": accession = args.get("accession", "") if accession: # Try epitopes endpoint, fallback to main protein endpoint return f"{self.base_url}/proteins/{accession}/epitopes" elif tool_name == "proteins_api_search": # Proteins API search uses query parameter, not path return f"{self.base_url}/proteins/search" return self.base_url
[docs] def _build_params(self, args: Dict[str, Any]) -> Dict[str, Any]: """Build query parameters for Proteins API""" params = {} tool_name = self.tool_config.get("name", "") if tool_name == "proteins_api_search": # Proteins API search requires specific parameters: # gene, protein, accession, organism, taxid, etc. if "query" in args: query = args["query"] # Try to intelligently map query to the right parameter # If it looks like an accession (starts with letter and 5-6 chars) if query and len(query) <= 10 and any(c.isalpha() for c in query): if query[0].isalpha() and len(query) == 6: params["accession"] = query else: # Default to gene parameter (works for gene names like BRCA1) params["gene"] = query else: # For longer queries, try protein parameter params["protein"] = query if "size" in args: params["size"] = args["size"] if "offset" in args: params["offset"] = args["offset"] elif tool_name == "proteins_api_get_variants": # Variation API uses accession query parameter if "accession" in args: params["accession"] = args["accession"] if "size" in args: params["size"] = args.get("size", 100) if "offset" in args: params["offset"] = args["offset"] # Format parameter if "format" in args: params["format"] = args["format"] else: params["format"] = "json" return params
[docs] def _extract_from_protein_endpoint( self, accession: str, tool_name: str ) -> Optional[Dict[str, Any]]: """Extract data from main protein endpoint when specific endpoints don't exist""" try: protein_url = f"{self.base_url}/proteins/{accession}" response = self.session.get(protein_url, timeout=self.timeout) response.raise_for_status() protein_data = response.json() # Extract relevant data based on tool name if tool_name == "proteins_api_get_proteomics": # Look for proteomics-related data in response proteomics_data = [] # Check comments for proteomics information if "comments" in protein_data: for comment in protein_data["comments"]: comment_type = str(comment.get("commentType", "")).upper() if any( x in comment_type for x in [ "PTM", "MODIFIED", "MASS", "SPECTROMETRY", "PROTEOMICS", ] ): proteomics_data.append(comment) # Check features for proteomics-related features if "features" in protein_data: for feature in protein_data["features"]: feature_type = str(feature.get("type", "")).lower() if any( x in feature_type for x in ["modified", "mutagenesis", "site", "variant"] ): proteomics_data.append(feature) return { "status": "success", "data": proteomics_data, "url": response.url, "count": len(proteomics_data), "note": "Proteomics data extracted from main protein endpoint (proteomics endpoint not available). 
Includes PTM comments, modified residues, and related features.", "fallback_used": True, "source": "main_protein_endpoint", } elif tool_name == "proteins_api_get_epitopes": # Look for epitope-related data epitopes_data = [] # Check comments for epitope information if "comments" in protein_data: for comment in protein_data["comments"]: comment_str = str(comment).lower() comment_type = str(comment.get("commentType", "")).upper() if "epitope" in comment_str or comment_type == "IMMUNOLOGY": epitopes_data.append(comment) # Check features for epitope sites if "features" in protein_data: for feature in protein_data["features"]: feature_str = str(feature).lower() feature_type = str(feature.get("type", "")).lower() if "epitope" in feature_str or "epitope" in feature_type: epitopes_data.append(feature) return { "status": "success", "data": epitopes_data, "url": response.url, "count": len(epitopes_data), "note": "Epitope data extracted from main protein endpoint (epitopes endpoint not available). Includes immunology comments and epitope features if present.", "fallback_used": True, "source": "main_protein_endpoint", } elif tool_name == "proteins_api_get_features": # Extract features directly from main protein endpoint features_data = protein_data.get("features", []) return { "status": "success", "data": features_data, "url": response.url, "count": len(features_data), "note": "Features extracted from main protein endpoint (features endpoint not available as separate endpoint).", "fallback_used": True, "source": "main_protein_endpoint", } elif tool_name == "proteins_api_get_comments": # Extract comments directly from main protein endpoint comments_data = protein_data.get("comments", []) return { "status": "success", "data": comments_data, "url": response.url, "count": len(comments_data), "note": "Comments extracted from main protein endpoint (comments endpoint not available as separate endpoint).", "fallback_used": True, "source": "main_protein_endpoint", } elif tool_name == 
"proteins_api_get_xrefs": # Extract cross-references (dbReferences) from main protein endpoint xrefs_data = protein_data.get("dbReferences", []) return { "status": "success", "data": xrefs_data, "url": response.url, "count": len(xrefs_data), "note": "Cross-references extracted from main protein endpoint (xrefs endpoint not available as separate endpoint).", "fallback_used": True, "source": "main_protein_endpoint", } elif tool_name == "proteins_api_get_publications": # Extract references (publications) from main protein endpoint publications_data = protein_data.get("references", []) return { "status": "success", "data": publications_data, "url": response.url, "count": len(publications_data), "note": "Publications extracted from main protein endpoint (publications endpoint not available as separate endpoint).", "fallback_used": True, "source": "main_protein_endpoint", } elif tool_name == "proteins_api_get_genome_mappings": # Extract genome-related cross-references (Ensembl, RefSeq, etc.) genome_mappings = [] db_references = protein_data.get("dbReferences", []) # Look for Ensembl, RefSeq, and other genome-related cross-references genome_db_types = ["Ensembl", "RefSeq", "EMBL", "GenBank"] for ref in db_references: ref_type = ref.get("type", "") if ref_type in genome_db_types: # Try to extract genome-related information mapping_entry = { "database": ref_type, "id": ref.get("id", ""), "properties": ref.get("properties", {}), } genome_mappings.append(mapping_entry) return { "status": "success", "data": genome_mappings, "url": response.url, "count": len(genome_mappings), "note": "Genome mappings extracted from cross-references in main protein endpoint (genome endpoint not available as separate endpoint). Includes Ensembl, RefSeq, EMBL, and GenBank cross-references that may contain genomic location information.", "fallback_used": True, "source": "main_protein_endpoint", } except Exception: return None
[docs] def _parse_accessions(self, accession: Union[str, List[str]]) -> List[str]: """Parse accession parameter - handle string, list, or comma-separated string""" if isinstance(accession, list): return [str(acc).strip() for acc in accession if acc] elif isinstance(accession, str): # Check if it's comma-separated if "," in accession: return [acc.strip() for acc in accession.split(",") if acc.strip()] else: return [accession.strip()] else: return [str(accession).strip()]
[docs] def _handle_batch_request( self, accessions: List[str], tool_name: str, format: str = "json" ) -> Dict[str, Any]: """Handle batch requests by making multiple API calls and aggregating results""" results = [] errors = [] successful_count = 0 # Use ThreadPoolExecutor for parallel requests (max 5 concurrent) max_workers = min(5, len(accessions)) def fetch_single( acc: str, ) -> tuple[str, Optional[Dict[str, Any]], Optional[str]]: """Fetch data for a single accession""" try: # Build arguments for single accession single_args = {"accession": acc, "format": format} url = self._build_url(single_args) params = self._build_params(single_args) # For variants tool, params should contain accession if tool_name == "proteins_api_get_variants": params["accession"] = acc response = self.session.get(url, params=params, timeout=self.timeout) # Handle fallback for endpoints that may not exist fallback_tools = [ "proteins_api_get_proteomics", "proteins_api_get_epitopes", "proteins_api_get_features", "proteins_api_get_comments", "proteins_api_get_xrefs", "proteins_api_get_publications", "proteins_api_get_genome_mappings", ] if tool_name in fallback_tools and response.status_code == 404: fallback_result = self._extract_from_protein_endpoint( acc, tool_name ) if fallback_result: return (acc, fallback_result, None) response.raise_for_status() data = response.json() return ( acc, {"status": "success", "data": data, "url": response.url}, None, ) except Exception as e: return (acc, None, str(e)) # Execute requests in parallel with ThreadPoolExecutor(max_workers=max_workers) as executor: future_to_acc = { executor.submit(fetch_single, acc): acc for acc in accessions } for future in as_completed(future_to_acc): acc, result, error = future.result() if result: results.append({"accession": acc, **result}) successful_count += 1 else: errors.append({"accession": acc, "error": error}) # Aggregate results response_data = { "status": "success" if successful_count > 0 else "error", "data": 
results, "count": successful_count, "total_requested": len(accessions), "errors": errors if errors else None, } if errors: response_data["note"] = ( f"Successfully retrieved {successful_count} of {len(accessions)} accessions. {len(errors)} accessions failed." ) return response_data
def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Execute the Proteins API call described by the tool configuration.

    When the tool works on accessions and ``arguments["accession"]`` holds
    more than one accession (list or comma-separated string), the call is
    delegated to the batch handler. Otherwise a single request is made, with
    a fallback to the main protein endpoint for tools whose dedicated
    endpoint answers 404.

    Args:
        arguments: Call arguments. The dict passed in is never modified.

    Returns:
        On success a dict with ``status`` "success", ``data``, ``url`` and,
        when the payload is a list or has a list under ``results``, a
        ``count``. On failure a dict with ``status`` "error", an ``error``
        message and the request ``url`` (``None`` if it could not be built).
        Errors are reported in the returned dict rather than raised.
    """
    tool_name = self.tool_config.get("name", "")

    # Tools whose dedicated endpoint may not exist.
    fallback_tools = (
        "proteins_api_get_proteomics",
        "proteins_api_get_epitopes",
        "proteins_api_get_features",
        "proteins_api_get_comments",
        "proteins_api_get_xrefs",
        "proteins_api_get_publications",
        "proteins_api_get_genome_mappings",
    )
    # Tools that accept one or many accessions.
    batch_tools = (
        "proteins_api_get_protein",
        "proteins_api_get_variants",
    ) + fallback_tools

    # Work on a shallow copy so normalising the accession below does not
    # leak into the caller's dict.
    arguments = dict(arguments)

    if tool_name in batch_tools and "accession" in arguments:
        accessions = self._parse_accessions(arguments.get("accession"))
        if len(accessions) > 1:
            return self._handle_batch_request(
                accessions, tool_name, arguments.get("format", "json")
            )
        if len(accessions) == 1:
            arguments["accession"] = accessions[0]

    url = None  # stays None if URL construction itself fails
    fallback_attempted = False
    try:
        url = self._build_url(arguments)
        params = self._build_params(arguments)
        response = self.session.get(url, params=params, timeout=self.timeout)

        if tool_name in fallback_tools and response.status_code == 404:
            fallback_attempted = True
            fallback_result = self._extract_from_protein_endpoint(
                arguments.get("accession", ""), tool_name
            )
            if fallback_result:
                return fallback_result

        # The search endpoint may reject the request outright.
        if tool_name == "proteins_api_search" and response.status_code == 400:
            return {
                "status": "error",
                "error": "Proteins API search endpoint may not be available. Use proteins_api_get_protein with a specific accession instead, or use EBI Search API with 'uniprot' domain.",
                "url": response.url,
                "suggestion": "Try using ebi_search_domain with domain='uniprot' and your query instead.",
            }

        response.raise_for_status()
        data = response.json()

        response_data = {
            "status": "success",
            "data": data,
            "url": response.url,
        }
        if isinstance(data, list):
            response_data["count"] = len(data)
        elif isinstance(data, dict):
            if "results" in data and isinstance(data["results"], list):
                response_data["count"] = len(data["results"])
        return response_data

    except requests.exceptions.RequestException as e:
        # Classify by the real HTTP status instead of searching the message
        # text, which also contains the URL (and thus the accession).
        # Note: compare against None explicitly; a Response for an error
        # status is falsy.
        error_response = getattr(e, "response", None)
        status_code = getattr(error_response, "status_code", None)

        # Only try the fallback here if it was not already tried above;
        # repeating it would just duplicate a request that already failed.
        if (
            tool_name in fallback_tools
            and status_code == 404
            and not fallback_attempted
        ):
            fallback_result = self._extract_from_protein_endpoint(
                arguments.get("accession", ""), tool_name
            )
            if fallback_result:
                return fallback_result

        if tool_name == "proteins_api_get_variants":
            if status_code == 404:
                return {
                    "status": "error",
                    "error": "No variations found for this protein accession.",
                    "url": url,
                    "note": "The protein may not have annotated variants. Try using proteins_api_get_protein to get other protein information.",
                }
            if status_code == 400:
                return {
                    "status": "error",
                    "error": "Invalid accession format for variation query.",
                    "url": url,
                    "note": "Ensure you're using a valid UniProt accession (e.g., P05067).",
                }

        return {
            "status": "error",
            "error": f"Proteins API error: {str(e)}",
            "url": url,
        }
    except Exception as e:
        return {
            "status": "error",
            "error": f"Unexpected error: {str(e)}",
            "url": url,
        }