# Source code for tooluniverse.ensembl_compara_tool

# ensembl_compara_tool.py
"""
Ensembl Compara API tool for ToolUniverse.

Ensembl Compara provides access to comparative genomics data including
orthologues, paralogues, gene trees, and genome alignments across species.

API: https://rest.ensembl.org/
No authentication required (rate limited to 15 req/s).
"""

import re
from typing import Dict, Any
from urllib.parse import quote

import requests

from .base_tool import BaseTool
from .tool_registry import register_tool

# Root of the public Ensembl REST service; endpoint paths are appended to it.
ENSEMBL_BASE_URL = "https://rest.ensembl.org"

# Headers attached to every request: identify this client and ask for JSON.
ENSEMBL_HEADERS = {
    "User-Agent": "ToolUniverse/1.0",
    "Accept": "application/json",
}


@register_tool("EnsemblComparaTool")
class EnsemblComparaTool(BaseTool):
    """
    Tool for querying Ensembl Compara comparative genomics data.

    Ensembl Compara contains whole-genome alignments, gene trees, and
    homology data for vertebrates and other eukaryotes. Supports finding
    orthologues (between-species homologs) and paralogues (within-species
    gene duplications).

    Supports: orthologue search, paralogue search, gene tree retrieval.
    No authentication required.

    The operation performed by :meth:`run` is fixed per instance by
    ``tool_config["fields"]["endpoint"]`` (``"orthologues"``,
    ``"paralogues"`` or ``"gene_tree"``).
    """

    # Ensembl stable IDs are "ENS" + optional species letters + feature code +
    # 11 digits (+ optional ".version"), e.g. ENSG00000139618 or
    # ENSMUSG00000017167. A plain ``startswith("ENS")`` test would misroute
    # gene symbols such as "ENSA" to the ID endpoint.
    _STABLE_ID_PATTERN = re.compile(r"ENS[A-Z]*\d{11}(?:\.\d+)?")

    def __init__(self, tool_config: Dict[str, Any]):
        """
        Store the request timeout and the endpoint this instance serves.

        Args:
            tool_config: Tool configuration. ``timeout`` (seconds, default 30)
                and ``fields.endpoint`` (default ``"orthologues"``) are read
                here; the whole mapping is also passed to ``BaseTool``.
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 30)
        # ``or {}`` tolerates an explicit ``"fields": None`` in the config.
        fields = tool_config.get("fields") or {}
        self.endpoint = fields.get("endpoint", "orthologues")

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Execute the Ensembl Compara API call.

        Args:
            arguments: Query arguments; see the ``_get_*`` methods for the
                keys each endpoint reads.

        Returns:
            The endpoint result (``{"data": ..., "metadata": ...}``) or an
            ``{"error": message}`` dict. Exceptions are never propagated.
        """
        try:
            return self._query(arguments)
        except requests.exceptions.Timeout:
            return {"error": f"Ensembl Compara API timed out after {self.timeout}s"}
        except requests.exceptions.ConnectionError:
            return {"error": "Failed to connect to Ensembl REST API"}
        except requests.exceptions.HTTPError as e:
            # ``response`` can be None on an HTTPError that was not produced by
            # ``raise_for_status``; avoid raising from inside the handler.
            status = getattr(e.response, "status_code", "unknown")
            return {"error": f"Ensembl API HTTP error: {status}"}
        except Exception as e:
            return {"error": f"Unexpected error querying Ensembl Compara: {str(e)}"}

    def _query(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Route to the handler matching ``self.endpoint``."""
        if self.endpoint == "orthologues":
            return self._get_orthologues(arguments)
        if self.endpoint == "paralogues":
            return self._get_paralogues(arguments)
        if self.endpoint == "gene_tree":
            return self._get_gene_tree(arguments)
        return {"error": f"Unknown endpoint: {self.endpoint}"}

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _parse_gene_and_species(arguments: Dict[str, Any]):
        """
        Return ``(gene, species)`` as stripped strings.

        ``gene`` is ``""`` when missing, ``None`` or blank. ``species`` falls
        back to ``"human"`` when missing, ``None`` or blank.
        """
        raw_gene = arguments.get("gene")
        gene = "" if raw_gene is None else str(raw_gene).strip()
        raw_species = arguments.get("species")
        species = str(raw_species).strip() if raw_species else ""
        return gene, species or "human"

    @classmethod
    def _is_stable_id(cls, gene: str) -> bool:
        """Return True when *gene* has the shape of an Ensembl stable ID."""
        return cls._STABLE_ID_PATTERN.fullmatch(gene) is not None

    def _request_json(self, url: str, params: Dict[str, Any]) -> Any:
        """GET *url* with the shared headers and return the decoded JSON body.

        Raises ``requests`` exceptions (including ``HTTPError`` for non-2xx
        responses); :meth:`run` converts them to error dicts.
        """
        headers = {**ENSEMBL_HEADERS, "Content-Type": "application/json"}
        response = requests.get(
            url, params=params, headers=headers, timeout=self.timeout
        )
        response.raise_for_status()
        return response.json()

    def _fetch_homologies(self, gene, species, homology_type, extra_params=None):
        """
        Query the homology endpoint and flatten the response.

        Returns:
            A list of ``(source_gene_id, homology_dict)`` pairs, one per entry
            of every ``homologies`` list in the response's ``data`` array.
        """
        lookup = "id" if self._is_stable_id(gene) else "symbol"
        # Percent-encode user-supplied values so characters such as "/" or "?"
        # cannot alter the request path.
        url = (
            f"{ENSEMBL_BASE_URL}/homology/{lookup}/"
            f"{quote(species, safe='')}/{quote(gene, safe='')}"
        )
        params = {"type": homology_type, "format": "condensed"}
        if extra_params:
            params.update(extra_params)

        payload = self._request_json(url, params)
        pairs = []
        for entry in payload.get("data", []):
            source_id = entry.get("id")
            pairs.extend(
                (source_id, homology) for homology in entry.get("homologies", [])
            )
        return pairs

    # ------------------------------------------------------------------
    # Endpoint handlers
    # ------------------------------------------------------------------

    def _get_orthologues(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Get orthologues for a gene across species.

        Reads ``gene`` (required; symbol or Ensembl ID), ``species`` (default
        ``"human"``) and the optional ``target_species`` / ``target_taxon``
        filters, which are forwarded as query parameters when truthy.
        """
        gene, species = self._parse_gene_and_species(arguments)
        if not gene:
            return {"error": "gene parameter is required (symbol or Ensembl ID)"}

        filters = {
            key: arguments[key]
            for key in ("target_species", "target_taxon")
            if arguments.get(key)
        }
        results = [
            {
                "source_gene": source_id,
                "target_gene": h.get("id"),
                "target_protein": h.get("protein_id"),
                "target_species": h.get("species"),
                "homology_type": h.get("type"),
                "taxonomy_level": h.get("taxonomy_level"),
                "method": h.get("method_link_type"),
            }
            for source_id, h in self._fetch_homologies(
                gene, species, "orthologues", filters
            )
        ]
        return {
            "data": results,
            "metadata": {
                "source": "Ensembl Compara",
                "query_gene": gene,
                "query_species": species,
                "total_orthologues": len(results),
            },
        }

    def _get_paralogues(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Get within-species paralogues (gene duplicates) for a gene.

        Reads ``gene`` (required; symbol or Ensembl ID) and ``species``
        (default ``"human"``).
        """
        gene, species = self._parse_gene_and_species(arguments)
        if not gene:
            return {"error": "gene parameter is required (symbol or Ensembl ID)"}

        results = [
            {
                "source_gene": source_id,
                "paralogue_gene": h.get("id"),
                "paralogue_protein": h.get("protein_id"),
                "species": h.get("species"),
                "paralogy_type": h.get("type"),
                "taxonomy_level": h.get("taxonomy_level"),
            }
            for source_id, h in self._fetch_homologies(gene, species, "paralogues")
        ]
        return {
            "data": results,
            "metadata": {
                "source": "Ensembl Compara",
                "query_gene": gene,
                "query_species": species,
                "total_paralogues": len(results),
            },
        }

    def _get_gene_tree(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Get gene tree (phylogenetic tree of homologous genes).

        Reads ``gene`` (required) and ``species`` (default ``"human"``).

        Returns:
            ``data`` holds ``tree_id``, ``newick`` (``None`` when the response
            carries no ``newick`` field), ``rooted``, the first 50 collected
            ``members`` and ``total_members``, the count of every collected
            leaf rather than a capped count.
        """
        gene, species = self._parse_gene_and_species(arguments)
        if not gene:
            return {"error": "gene parameter is required (Ensembl gene ID)"}

        safe_species = quote(species, safe="")
        safe_gene = quote(gene, safe="")
        # Both member lookups carry the species in the path, matching the
        # homology/id/:species/:id form used elsewhere in this class.
        lookup = "id" if self._is_stable_id(gene) else "symbol"
        url = f"{ENSEMBL_BASE_URL}/genetree/member/{lookup}/{safe_species}/{safe_gene}"

        data = self._request_json(url, {"nh_format": "simple"})

        nested_tree = data.get("tree")
        tree_id = nested_tree.get("id") if isinstance(nested_tree, dict) else None
        if tree_id is None:
            # The tree identifier may live at the top level of the payload
            # rather than on the root node.
            tree_id = data.get("id")
        rooted = data.get("rooted", True)

        # When there is no "tree" key, treat the payload itself as the tree.
        tree_data = data.get("tree", data)
        newick = tree_data.get("newick") if isinstance(tree_data, dict) else None

        members = []
        # No cap here: ``total_members`` must reflect the whole tree; only the
        # returned sample is truncated.
        self._collect_members(tree_data, members, max_members=None)

        return {
            "data": {
                "tree_id": tree_id,
                "newick": newick,
                "rooted": rooted,
                "members": members[:50],
                "total_members": len(members),
            },
            "metadata": {
                "source": "Ensembl Compara",
                "query_gene": gene,
            },
        }

    def _collect_members(self, node, members, max_members=200):
        """
        Collect leaf members from a gene tree in depth-first, left-to-right order.

        Args:
            node: Root of the (sub)tree; non-dict nodes are ignored.
            members: List that receives ``{"gene_id", "species"}`` dicts;
                mutated in place.
            max_members: Stop once ``members`` holds this many entries.
                ``None`` disables the cap.
        """
        # An explicit stack replaces recursion, so deep trees cannot hit the
        # interpreter's recursion limit.
        pending = [node]
        while pending:
            if max_members is not None and len(members) >= max_members:
                return
            current = pending.pop()
            if not isinstance(current, dict):
                continue

            # A node counts as a leaf when it has both 'id' and 'species'.
            # NOTE(review): Ensembl's JSON gene-tree leaves may carry organism
            # info under "taxonomy" rather than "species" -- confirm against a
            # live response.
            if "id" in current and "species" in current:
                gene_id = current.get("id", {})
                if isinstance(gene_id, dict):
                    gene_id = gene_id.get("accession", "")
                species = current.get("species")
                if isinstance(species, dict):
                    species_name = species.get("scientific_name", "")
                else:
                    species_name = str(species)
                members.append({"gene_id": str(gene_id), "species": species_name})

            # Push children reversed so they are popped in their original order.
            children = current.get("children") or []
            pending.extend(reversed(list(children)))