Source code for tooluniverse.proteins_api_tool

"""
Proteins API Tool

This tool provides access to the EBI Proteins API for comprehensive protein
annotations, variation data, proteomics, and reference genome mappings.
"""

import requests
from typing import Any, Dict, Optional, List, Union
from concurrent.futures import ThreadPoolExecutor, as_completed
from .base_tool import BaseTool
from .tool_registry import register_tool



[docs]
@register_tool("ProteinsAPIRESTTool")
class ProteinsAPIRESTTool(BaseTool):
    """
    Proteins API REST tool.
    Generic wrapper for Proteins API endpoints defined in proteins_api_tools.json.

    Each instance is driven by one tool configuration: its ``name`` selects
    the endpoint and query parameters, and an optional ``fields.endpoint``
    URL template takes precedence over the name-based URL. Requests are sent
    with a shared ``requests.Session``. Results are returned as dictionaries
    carrying a ``status`` key (``"success"`` or ``"error"``).

    For accession-based tools, several accessions (a list or a
    comma-separated string) are fetched concurrently and aggregated. For
    some tools a 404 from the dedicated endpoint triggers a fallback that
    extracts the data from the main ``/proteins/{accession}`` entry.
    """


[docs]
    def __init__(self, tool_config: Dict):
        """Initialise the tool and prepare a reusable HTTP session.

        Args:
            tool_config: Tool configuration, forwarded unchanged to the
                ``BaseTool`` constructor.
        """
        super().__init__(tool_config)

        # Root of the Proteins API; used when a URL is derived from the tool name.
        self.base_url = "https://www.ebi.ac.uk/proteins/api"
        # Timeout handed to every ``session.get`` call made by this tool.
        self.timeout = 30

        # One session for all requests so the default headers are set once.
        default_headers = {
            "Accept": "application/json",
            "User-Agent": "ToolUniverse/1.0",
        }
        http_session = requests.Session()
        http_session.headers.update(default_headers)
        self.session = http_session



[docs]
    def _build_url(self, args: Dict[str, Any]) -> str:
        """Build URL from endpoint template and arguments"""
        endpoint_template = self.tool_config["fields"].get("endpoint", "")
        tool_name = self.tool_config.get("name", "")

        if endpoint_template:
            url = endpoint_template
            for k, v in args.items():
                url = url.replace(f"{{{k}}}", str(v))
            return url

        # Build URL based on tool name
        if tool_name == "proteins_api_get_protein":
            accession = args.get("accession", "")
            if accession:
                return f"{self.base_url}/proteins/{accession}"

        elif tool_name == "proteins_api_get_variants":
            accession = args.get("accession", "")
            if accession:
                # Use the variation API endpoint (not the proteins endpoint)
                return f"{self.base_url}/variation"

        elif tool_name == "proteins_api_get_proteomics":
            accession = args.get("accession", "")
            if accession:
                # Try proteomics endpoint, fallback to main protein endpoint
                return f"{self.base_url}/proteins/{accession}/proteomics"

        elif tool_name == "proteins_api_get_epitopes":
            accession = args.get("accession", "")
            if accession:
                # Try epitopes endpoint, fallback to main protein endpoint
                return f"{self.base_url}/proteins/{accession}/epitopes"

        elif tool_name == "proteins_api_search":
            # Proteins API search uses query parameter, not path
            return f"{self.base_url}/proteins/search"

        return self.base_url



[docs]
    def _build_params(self, args: Dict[str, Any]) -> Dict[str, Any]:
        """Build query parameters for Proteins API"""
        params = {}
        tool_name = self.tool_config.get("name", "")

        if tool_name == "proteins_api_search":
            # Proteins API search requires specific parameters:
            # gene, protein, accession, organism, taxid, etc.
            if "query" in args:
                query = args["query"]
                # Try to intelligently map query to the right parameter
                # If it looks like an accession (starts with letter and 5-6 chars)
                if query and len(query) <= 10 and any(c.isalpha() for c in query):
                    if query[0].isalpha() and len(query) == 6:
                        params["accession"] = query
                    else:
                        # Default to gene parameter (works for gene names like BRCA1)
                        params["gene"] = query
                else:
                    # For longer queries, try protein parameter
                    params["protein"] = query
            if "size" in args:
                params["size"] = args["size"]
            if "offset" in args:
                params["offset"] = args["offset"]

        elif tool_name == "proteins_api_get_variants":
            # Variation API uses accession query parameter
            if "accession" in args:
                params["accession"] = args["accession"]
            if "size" in args:
                params["size"] = args.get("size", 100)
            if "offset" in args:
                params["offset"] = args["offset"]

        # Format parameter
        if "format" in args:
            params["format"] = args["format"]
        else:
            params["format"] = "json"

        return params



[docs]
    def _extract_from_protein_endpoint(
        self, accession: str, tool_name: str
    ) -> Optional[Dict[str, Any]]:
        """Extract data from main protein endpoint when specific endpoints don't exist"""
        try:
            protein_url = f"{self.base_url}/proteins/{accession}"
            response = self.session.get(protein_url, timeout=self.timeout)
            response.raise_for_status()
            protein_data = response.json()

            # Extract relevant data based on tool name
            if tool_name == "proteins_api_get_proteomics":
                # Look for proteomics-related data in response
                proteomics_data = []

                # Check comments for proteomics information
                if "comments" in protein_data:
                    for comment in protein_data["comments"]:
                        comment_type = str(comment.get("commentType", "")).upper()
                        if any(
                            x in comment_type
                            for x in [
                                "PTM",
                                "MODIFIED",
                                "MASS",
                                "SPECTROMETRY",
                                "PROTEOMICS",
                            ]
                        ):
                            proteomics_data.append(comment)

                # Check features for proteomics-related features
                if "features" in protein_data:
                    for feature in protein_data["features"]:
                        feature_type = str(feature.get("type", "")).lower()
                        if any(
                            x in feature_type
                            for x in ["modified", "mutagenesis", "site", "variant"]
                        ):
                            proteomics_data.append(feature)

                return {
                    "status": "success",
                    "data": proteomics_data,
                    "url": response.url,
                    "count": len(proteomics_data),
                    "note": "Proteomics data extracted from main protein endpoint (proteomics endpoint not available). Includes PTM comments, modified residues, and related features.",
                    "fallback_used": True,
                    "source": "main_protein_endpoint",
                }

            elif tool_name == "proteins_api_get_epitopes":
                # Look for epitope-related data
                epitopes_data = []

                # Check comments for epitope information
                if "comments" in protein_data:
                    for comment in protein_data["comments"]:
                        comment_str = str(comment).lower()
                        comment_type = str(comment.get("commentType", "")).upper()
                        if "epitope" in comment_str or comment_type == "IMMUNOLOGY":
                            epitopes_data.append(comment)

                # Check features for epitope sites
                if "features" in protein_data:
                    for feature in protein_data["features"]:
                        feature_str = str(feature).lower()
                        feature_type = str(feature.get("type", "")).lower()
                        if "epitope" in feature_str or "epitope" in feature_type:
                            epitopes_data.append(feature)

                return {
                    "status": "success",
                    "data": epitopes_data,
                    "url": response.url,
                    "count": len(epitopes_data),
                    "note": "Epitope data extracted from main protein endpoint (epitopes endpoint not available). Includes immunology comments and epitope features if present.",
                    "fallback_used": True,
                    "source": "main_protein_endpoint",
                }

            elif tool_name == "proteins_api_get_features":
                # Extract features directly from main protein endpoint
                features_data = protein_data.get("features", [])
                return {
                    "status": "success",
                    "data": features_data,
                    "url": response.url,
                    "count": len(features_data),
                    "note": "Features extracted from main protein endpoint (features endpoint not available as separate endpoint).",
                    "fallback_used": True,
                    "source": "main_protein_endpoint",
                }

            elif tool_name == "proteins_api_get_comments":
                # Extract comments directly from main protein endpoint
                comments_data = protein_data.get("comments", [])
                return {
                    "status": "success",
                    "data": comments_data,
                    "url": response.url,
                    "count": len(comments_data),
                    "note": "Comments extracted from main protein endpoint (comments endpoint not available as separate endpoint).",
                    "fallback_used": True,
                    "source": "main_protein_endpoint",
                }

            elif tool_name == "proteins_api_get_xrefs":
                # Extract cross-references (dbReferences) from main protein endpoint
                xrefs_data = protein_data.get("dbReferences", [])
                return {
                    "status": "success",
                    "data": xrefs_data,
                    "url": response.url,
                    "count": len(xrefs_data),
                    "note": "Cross-references extracted from main protein endpoint (xrefs endpoint not available as separate endpoint).",
                    "fallback_used": True,
                    "source": "main_protein_endpoint",
                }

            elif tool_name == "proteins_api_get_publications":
                # Extract references (publications) from main protein endpoint
                publications_data = protein_data.get("references", [])
                return {
                    "status": "success",
                    "data": publications_data,
                    "url": response.url,
                    "count": len(publications_data),
                    "note": "Publications extracted from main protein endpoint (publications endpoint not available as separate endpoint).",
                    "fallback_used": True,
                    "source": "main_protein_endpoint",
                }

            elif tool_name == "proteins_api_get_genome_mappings":
                # Extract genome-related cross-references (Ensembl, RefSeq, etc.)
                genome_mappings = []
                db_references = protein_data.get("dbReferences", [])

                # Look for Ensembl, RefSeq, and other genome-related cross-references
                genome_db_types = ["Ensembl", "RefSeq", "EMBL", "GenBank"]
                for ref in db_references:
                    ref_type = ref.get("type", "")
                    if ref_type in genome_db_types:
                        # Try to extract genome-related information
                        mapping_entry = {
                            "database": ref_type,
                            "id": ref.get("id", ""),
                            "properties": ref.get("properties", {}),
                        }
                        genome_mappings.append(mapping_entry)

                return {
                    "status": "success",
                    "data": genome_mappings,
                    "url": response.url,
                    "count": len(genome_mappings),
                    "note": "Genome mappings extracted from cross-references in main protein endpoint (genome endpoint not available as separate endpoint). Includes Ensembl, RefSeq, EMBL, and GenBank cross-references that may contain genomic location information.",
                    "fallback_used": True,
                    "source": "main_protein_endpoint",
                }

        except Exception:
            return None



[docs]
    def _parse_accessions(self, accession: Union[str, List[str]]) -> List[str]:
        """Parse accession parameter - handle string, list, or comma-separated string"""
        if isinstance(accession, list):
            return [str(acc).strip() for acc in accession if acc]
        elif isinstance(accession, str):
            # Check if it's comma-separated
            if "," in accession:
                return [acc.strip() for acc in accession.split(",") if acc.strip()]
            else:
                return [accession.strip()]
        else:
            return [str(accession).strip()]



[docs]
    def _handle_batch_request(
        self, accessions: List[str], tool_name: str, format: str = "json"
    ) -> Dict[str, Any]:
        """Handle batch requests by making multiple API calls and aggregating results"""
        results = []
        errors = []
        successful_count = 0

        # Use ThreadPoolExecutor for parallel requests (max 5 concurrent)
        max_workers = min(5, len(accessions))

        def fetch_single(
            acc: str,
        ) -> tuple[str, Optional[Dict[str, Any]], Optional[str]]:
            """Fetch data for a single accession"""
            try:
                # Build arguments for single accession
                single_args = {"accession": acc, "format": format}
                url = self._build_url(single_args)
                params = self._build_params(single_args)

                # For variants tool, params should contain accession
                if tool_name == "proteins_api_get_variants":
                    params["accession"] = acc

                response = self.session.get(url, params=params, timeout=self.timeout)

                # Handle fallback for endpoints that may not exist
                fallback_tools = [
                    "proteins_api_get_proteomics",
                    "proteins_api_get_epitopes",
                    "proteins_api_get_features",
                    "proteins_api_get_comments",
                    "proteins_api_get_xrefs",
                    "proteins_api_get_publications",
                    "proteins_api_get_genome_mappings",
                ]

                if tool_name in fallback_tools and response.status_code == 404:
                    fallback_result = self._extract_from_protein_endpoint(
                        acc, tool_name
                    )
                    if fallback_result:
                        return (acc, fallback_result, None)

                response.raise_for_status()
                data = response.json()

                return (
                    acc,
                    {"status": "success", "data": data, "url": response.url},
                    None,
                )
            except Exception as e:
                return (acc, None, str(e))

        # Execute requests in parallel
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_acc = {
                executor.submit(fetch_single, acc): acc for acc in accessions
            }

            for future in as_completed(future_to_acc):
                acc, result, error = future.result()
                if result:
                    results.append({"accession": acc, **result})
                    successful_count += 1
                else:
                    errors.append({"accession": acc, "error": error})

        # Aggregate results
        response_data = {
            "status": "success" if successful_count > 0 else "error",
            "data": results,
            "count": successful_count,
            "total_requested": len(accessions),
            "errors": errors if errors else None,
        }

        if errors:
            response_data["note"] = (
                f"Successfully retrieved {successful_count} of {len(accessions)} accessions. {len(errors)} accessions failed."
            )

        return response_data



[docs]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the Proteins API call"""
        tool_name = self.tool_config.get("name", "")

        # Check if this is a batch operation (accession is list or comma-separated)
        # Only apply to tools that use accession parameter
        batch_tools = [
            "proteins_api_get_protein",
            "proteins_api_get_variants",
            "proteins_api_get_proteomics",
            "proteins_api_get_epitopes",
            "proteins_api_get_features",
            "proteins_api_get_comments",
            "proteins_api_get_xrefs",
            "proteins_api_get_publications",
            "proteins_api_get_genome_mappings",
        ]

        if tool_name in batch_tools and "accession" in arguments:
            accession = arguments.get("accession")
            accessions = self._parse_accessions(accession)

            # If multiple accessions, use batch handler
            if len(accessions) > 1:
                format_param = arguments.get("format", "json")
                return self._handle_batch_request(accessions, tool_name, format_param)
            # Single accession - continue with normal flow
            elif len(accessions) == 1:
                arguments["accession"] = accessions[0]

        try:
            url = self._build_url(arguments)
            params = self._build_params(arguments)

            response = self.session.get(url, params=params, timeout=self.timeout)

            # Handle endpoints that may not exist - fallback to main protein endpoint
            fallback_tools = [
                "proteins_api_get_proteomics",
                "proteins_api_get_epitopes",
                "proteins_api_get_features",
                "proteins_api_get_comments",
                "proteins_api_get_xrefs",
                "proteins_api_get_publications",
                "proteins_api_get_genome_mappings",
            ]
            if tool_name in fallback_tools:
                if response.status_code == 404:
                    fallback_result = self._extract_from_protein_endpoint(
                        arguments.get("accession", ""), tool_name
                    )
                    if fallback_result:
                        return fallback_result

            # Handle search endpoint which may not exist
            if tool_name == "proteins_api_search" and response.status_code == 400:
                return {
                    "status": "error",
                    "error": "Proteins API search endpoint may not be available. Use proteins_api_get_protein with a specific accession instead, or use EBI Search API with 'uniprot' domain.",
                    "url": response.url,
                    "suggestion": "Try using ebi_search_domain with domain='uniprot' and your query instead.",
                }

            response.raise_for_status()
            data = response.json()

            response_data = {
                "status": "success",
                "data": data,
                "url": response.url,
            }

            if isinstance(data, list):
                response_data["count"] = len(data)
            elif isinstance(data, dict):
                if "results" in data and isinstance(data["results"], list):
                    response_data["count"] = len(data["results"])

            return response_data

        except requests.exceptions.RequestException as e:
            tool_name = self.tool_config.get("name", "")

            # For endpoints that may not exist, try fallback
            fallback_tools = [
                "proteins_api_get_proteomics",
                "proteins_api_get_epitopes",
                "proteins_api_get_features",
                "proteins_api_get_comments",
                "proteins_api_get_xrefs",
                "proteins_api_get_publications",
                "proteins_api_get_genome_mappings",
            ]
            if tool_name in fallback_tools:
                # Check if it's a 404 error (either in exception message or response status)
                is_404 = "404" in str(e) or (
                    hasattr(e, "response")
                    and e.response is not None
                    and e.response.status_code == 404
                )
                if is_404:
                    fallback_result = self._extract_from_protein_endpoint(
                        arguments.get("accession", ""), tool_name
                    )
                    if fallback_result:
                        return fallback_result

            # For variations endpoint, provide helpful error
            if tool_name == "proteins_api_get_variants":
                if "404" in str(e):
                    return {
                        "status": "error",
                        "error": "No variations found for this protein accession.",
                        "url": url if "url" in locals() else None,
                        "note": "The protein may not have annotated variants. Try using proteins_api_get_protein to get other protein information.",
                    }
                elif "400" in str(e):
                    return {
                        "status": "error",
                        "error": "Invalid accession format for variation query.",
                        "url": url if "url" in locals() else None,
                        "note": "Ensure you're using a valid UniProt accession (e.g., P05067).",
                    }
            return {
                "status": "error",
                "error": f"Proteins API error: {str(e)}",
                "url": url if "url" in locals() else None,
            }
        except Exception as e:
            tool_name = self.tool_config.get("name", "")
            return {
                "status": "error",
                "error": f"Unexpected error: {str(e)}",
                "url": url if "url" in locals() else None,
            }