Source code for tooluniverse.ebi_proteins_ext_tool

# ebi_proteins_ext_tool.py
"""
EBI Proteins API Extended tool for ToolUniverse.

Extended endpoints for the EBI Proteins API covering mutagenesis data
and post-translational modification (PTM) proteomics evidence.

API: https://www.ebi.ac.uk/proteins/api/
No authentication required. Free public access.
"""

import requests
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool

PROTEINS_API_BASE_URL = "https://www.ebi.ac.uk/proteins/api"



[docs]
@register_tool("EBIProteinsExtTool")
class EBIProteinsExtTool(BaseTool):
    """
    Extended tool for EBI Proteins API covering mutagenesis and PTM data.

    These endpoints provide detailed mutagenesis experiment results and
    mass spectrometry-based post-translational modification evidence
    mapped to UniProt protein sequences.

    Supports: mutagenesis data, proteomics PTM evidence.

    No authentication required.
    """


[docs]
    def __init__(self, tool_config: Dict[str, Any]):
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 30)
        fields = tool_config.get("fields", {})
        self.endpoint = fields.get("endpoint", "mutagenesis")



[docs]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the EBI Proteins API call."""
        try:
            return self._query(arguments)
        except requests.exceptions.Timeout:
            return {
                "status": "error",
                "error": f"EBI Proteins API timed out after {self.timeout}s",
            }
        except requests.exceptions.ConnectionError:
            return {"status": "error", "error": "Failed to connect to EBI Proteins API"}
        except requests.exceptions.HTTPError as e:
            return {
                "status": "error",
                "error": f"EBI Proteins API HTTP error: {e.response.status_code}",
            }
        except Exception as e:
            return {
                "status": "error",
                "error": f"Unexpected error querying EBI Proteins API: {str(e)}",
            }



[docs]
    def _query(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Route to appropriate endpoint."""
        if self.endpoint == "mutagenesis":
            return self._get_mutagenesis(arguments)
        elif self.endpoint == "proteomics_ptm":
            return self._get_proteomics_ptm(arguments)
        elif self.endpoint == "variation":
            return self._get_variation(arguments)
        elif self.endpoint == "features":
            return self._get_features(arguments)
        elif self.endpoint == "antigen":
            return self._get_antigen(arguments)
        elif self.endpoint == "coordinates":
            return self._get_coordinates(arguments)
        elif self.endpoint == "proteomics":
            return self._get_proteomics(arguments)
        else:
            return {"status": "error", "error": f"Unknown endpoint: {self.endpoint}"}



[docs]
    def _get_mutagenesis(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get mutagenesis experiment data for a protein."""
        accession = arguments.get("accession", "")
        if not accession:
            return {
                "status": "error",
                "error": "accession parameter is required (UniProt accession, e.g., P04637)",
            }

        url = f"{PROTEINS_API_BASE_URL}/mutagenesis/{accession}"
        headers = {"Accept": "application/json"}
        response = requests.get(url, headers=headers, timeout=self.timeout)
        response.raise_for_status()
        data = response.json()

        features = []
        for f in data.get("features", []):
            # Extract evidence details
            evidences = []
            for ev in f.get("evidences", []):
                src = ev.get("source", {})
                evidences.append(
                    {
                        "code": ev.get("code"),
                        "source_name": src.get("name"),
                        "source_id": src.get("id"),
                        "source_url": src.get("url"),
                    }
                )

            features.append(
                {
                    "type": f.get("type"),
                    "position_start": f.get("begin"),
                    "position_end": f.get("end"),
                    "original_sequence": f.get("alternativeSequence"),
                    "description": f.get("description"),
                    "evidences": evidences[:5],
                }
            )

        return {
            "status": "success",
            "data": {
                "accession": data.get("accession"),
                "entry_name": data.get("entryName"),
                "gene_name": None,
                "features": features[:100],
                "total_features": len(data.get("features", [])),
            },
            "metadata": {
                "source": "EBI Proteins API - Mutagenesis",
                "accession": accession,
            },
        }



[docs]
    def _get_proteomics_ptm(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get post-translational modification evidence from mass spec proteomics."""
        accession = arguments.get("accession", "")
        if not accession:
            return {
                "status": "error",
                "error": "accession parameter is required (UniProt accession, e.g., P04637)",
            }

        url = f"{PROTEINS_API_BASE_URL}/proteomics-ptm/{accession}"
        headers = {"Accept": "application/json"}
        response = requests.get(url, headers=headers, timeout=self.timeout)
        response.raise_for_status()
        data = response.json()

        features = []
        for f in data.get("features", []):
            # Extract source databases
            evidences = []
            for ev in f.get("evidences", []):
                src = ev.get("source", {})
                props = src.get("properties", {})
                evidences.append(
                    {
                        "source": src.get("name"),
                        "id": src.get("id"),
                        "url": src.get("url"),
                        "properties": props,
                    }
                )

            features.append(
                {
                    "type": f.get("type"),
                    "position_start": f.get("begin"),
                    "position_end": f.get("end"),
                    "description": f.get("description"),
                    "evidences": evidences[:5],
                }
            )

        return {
            "status": "success",
            "data": {
                "accession": data.get("accession"),
                "entry_name": data.get("entryName"),
                "features": features[:100],
                "total_features": len(data.get("features", [])),
            },
            "metadata": {
                "source": "EBI Proteins API - Proteomics PTM",
                "accession": accession,
            },
        }



[docs]
    def _get_variation(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get protein sequence variants from multiple sources."""
        accession = arguments.get("accession", "")
        if not accession:
            return {
                "status": "error",
                "error": "accession parameter is required (UniProt accession, e.g., P04637)",
            }
        source_type = arguments.get("source_type")
        disease_only = arguments.get("disease_only", False)

        url = f"{PROTEINS_API_BASE_URL}/variation/{accession}"
        headers = {"Accept": "application/json"}
        response = requests.get(url, headers=headers, timeout=self.timeout)
        response.raise_for_status()
        data = response.json()

        raw_features = data.get("features", [])

        # Filter by source type if specified
        if source_type:
            raw_features = [
                f for f in raw_features if f.get("sourceType") == source_type
            ]

        # Filter to disease-associated only
        if disease_only:
            raw_features = [
                f
                for f in raw_features
                if any(a.get("disease") for a in f.get("association", []))
            ]

        # Count by source
        source_counts = {}
        for f in data.get("features", []):
            src = f.get("sourceType", "unknown")
            source_counts[src] = source_counts.get(src, 0) + 1

        variants = []
        for f in raw_features[:100]:
            # Extract clinical significance
            clinical = []
            for cs in f.get("clinicalSignificances", []):
                clinical.append(
                    {
                        "type": cs.get("type"),
                        "sources": cs.get("sources", []),
                    }
                )

            # Extract disease associations
            associations = []
            for a in f.get("association", []):
                associations.append(
                    {
                        "name": a.get("name"),
                        "description": a.get("description"),
                        "is_disease": a.get("disease", False),
                    }
                )

            # Extract cross-references (first 3)
            xrefs = []
            for x in f.get("xrefs", [])[:3]:
                xrefs.append(
                    {
                        "database": x.get("name"),
                        "id": x.get("id"),
                        "url": x.get("url"),
                    }
                )

            variants.append(
                {
                    "position_start": f.get("begin"),
                    "position_end": f.get("end"),
                    "wild_type": f.get("wildType"),
                    "alternative": f.get("alternativeSequence"),
                    "source_type": f.get("sourceType"),
                    "clinical_significances": clinical if clinical else None,
                    "associations": associations if associations else None,
                    "xrefs": xrefs if xrefs else None,
                }
            )

        return {
            "status": "success",
            "data": {
                "accession": data.get("accession"),
                "entry_name": data.get("entryName"),
                "variants": variants,
                "total_variants": len(raw_features),
                "total_all_sources": len(data.get("features", [])),
                "source_counts": source_counts,
            },
            "metadata": {
                "source": "EBI Proteins API - Variation",
                "accession": accession,
            },
        }



[docs]
    def _get_features(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get protein features filtered by category."""
        accession = arguments.get("accession", "")
        if not accession:
            return {
                "status": "error",
                "error": "accession parameter is required (UniProt accession, e.g., P04637)",
            }
        category = arguments.get("category", "DOMAINS_AND_SITES")

        url = f"{PROTEINS_API_BASE_URL}/features/{accession}"
        params = {"categories": category}
        headers = {"Accept": "application/json"}
        response = requests.get(
            url, params=params, headers=headers, timeout=self.timeout
        )
        response.raise_for_status()
        data = response.json()

        features = []
        for f in data.get("features", []):
            evidences = []
            for ev in f.get("evidences", [])[:3]:
                src = ev.get("source", {})
                evidences.append(
                    {
                        "code": ev.get("code"),
                        "source_name": src.get("name"),
                        "source_id": src.get("id"),
                    }
                )

            features.append(
                {
                    "type": f.get("type"),
                    "category": f.get("category"),
                    "position_start": f.get("begin"),
                    "position_end": f.get("end"),
                    "description": f.get("description"),
                    "evidences": evidences if evidences else None,
                }
            )

        return {
            "status": "success",
            "data": {
                "accession": data.get("accession"),
                "entry_name": data.get("entryName"),
                "sequence_length": len(data.get("sequence") or ""),
                "category_queried": category,
                "features": features[:100],
                "total_features": len(data.get("features", [])),
            },
            "metadata": {
                "source": "EBI Proteins API - Features",
                "accession": accession,
            },
        }



[docs]
    def _get_antigen(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get antigenic regions for a protein (useful for antibody design)."""
        accession = arguments.get("accession", "")
        if not accession:
            return {
                "status": "error",
                "error": "accession parameter is required (UniProt accession, e.g., P04637)",
            }

        url = f"{PROTEINS_API_BASE_URL}/antigen/{accession}"
        headers = {"Accept": "application/json"}
        response = requests.get(url, headers=headers, timeout=self.timeout)
        response.raise_for_status()
        data = response.json()

        regions = []
        for f in data.get("features", []):
            evidences = []
            for ev in f.get("evidences", [])[:3]:
                src = ev.get("source", {})
                evidences.append(
                    {
                        "code": ev.get("code"),
                        "source_name": src.get("name"),
                        "source_id": src.get("id"),
                    }
                )

            regions.append(
                {
                    "type": f.get("type"),
                    "position_start": f.get("begin"),
                    "position_end": f.get("end"),
                    "match_score": f.get("matchScore"),
                    "antigen_sequence": f.get("antigenSequence"),
                    "evidences": evidences if evidences else None,
                }
            )

        return {
            "status": "success",
            "data": {
                "accession": data.get("accession"),
                "entry_name": data.get("entryName"),
                "antigenic_regions": regions,
                "total_regions": len(regions),
            },
            "metadata": {
                "source": "EBI Proteins API - Antigen",
                "accession": accession,
            },
        }



[docs]
    def _get_coordinates(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get genomic coordinates for a protein."""
        accession = arguments.get("accession", "")
        if not accession:
            return {
                "status": "error",
                "error": "accession parameter is required (UniProt accession, e.g., P04637)",
            }

        url = f"{PROTEINS_API_BASE_URL}/coordinates/{accession}"
        headers = {"Accept": "application/json"}
        response = requests.get(url, headers=headers, timeout=self.timeout)
        response.raise_for_status()
        data = response.json()

        mappings = []
        for gc in data.get("gnCoordinate", []):
            gen_loc = gc.get("genomicLocation", {})
            exons = gen_loc.get("exon", [])
            mappings.append(
                {
                    "ensembl_gene_id": gc.get("ensemblGeneId"),
                    "ensembl_transcript_id": gc.get("ensemblTranscriptId"),
                    "chromosome": gen_loc.get("chromosome"),
                    "start": gen_loc.get("start"),
                    "end": gen_loc.get("end"),
                    "reverseStrand": gen_loc.get("reverseStrand"),
                    "num_exons": len(exons),
                }
            )

        return {
            "status": "success",
            "data": {
                "accession": data.get("accession"),
                "name": data.get("name"),
                "sequence_length": len(data.get("sequence", "")),
                "genomic_mappings": mappings,
                "total_mappings": len(mappings),
            },
            "metadata": {
                "source": "EBI Proteins API - Coordinates",
                "accession": accession,
            },
        }



[docs]
    def _get_proteomics(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get proteomics peptide evidence for a protein."""
        accession = arguments.get("accession", "")
        if not accession:
            return {
                "status": "error",
                "error": "accession parameter is required (UniProt accession, e.g., P04637)",
            }

        url = f"{PROTEINS_API_BASE_URL}/proteomics/{accession}"
        headers = {"Accept": "application/json"}
        response = requests.get(url, headers=headers, timeout=self.timeout)
        response.raise_for_status()
        data = response.json()

        peptides = []
        for f in data.get("features", []):
            evidences = []
            for ev in f.get("evidences", [])[:3]:
                src = ev.get("source", {})
                evidences.append(
                    {
                        "source": src.get("name"),
                        "id": src.get("id"),
                    }
                )

            peptides.append(
                {
                    "type": f.get("type"),
                    "position_start": f.get("begin"),
                    "position_end": f.get("end"),
                    "peptide": f.get("peptide"),
                    "unique": f.get("unique"),
                    "evidences": evidences if evidences else None,
                }
            )

        return {
            "status": "success",
            "data": {
                "accession": data.get("accession"),
                "entry_name": data.get("entryName"),
                "peptides": peptides[:100],
                "total_peptides": len(data.get("features", [])),
            },
            "metadata": {
                "source": "EBI Proteins API - Proteomics",
                "accession": accession,
            },
        }