# Source code for tooluniverse.ensembl_info_tool

# ensembl_info_tool.py
"""
Ensembl Info API tool for ToolUniverse.

Provides access to Ensembl REST API info endpoints for retrieving
genome assembly metadata and species information.

API: https://rest.ensembl.org/
No authentication required. Free public access.
"""

from typing import Dict, Any, Optional
from urllib.parse import quote

import requests

from .base_tool import BaseTool


# Root URL of the public Ensembl REST service; endpoint paths are appended to it.
ENSEMBL_REST_BASE_URL = "https://rest.ensembl.org"

# Headers sent with every request issued by this module.
ENSEMBL_HEADERS = {
    "User-Agent": "ToolUniverse/1.0",
    "Accept": "application/json",
}


class EnsemblInfoTool(BaseTool):
    """
    Tool for Ensembl info endpoints providing genome assembly metadata and species catalog.

    These endpoints complement existing Ensembl tools (sequence, variation,
    overlap, xrefs, etc.) by providing assembly-level and species-level
    information needed for genomic coordinate interpretation.

    The endpoint served by an instance is chosen at construction time from
    ``tool_config["fields"]["endpoint"]`` (``"assembly"`` or ``"species"``).

    No authentication required.
    """

    # Upper bounds on the number of entries included in a single result.
    _MAX_CHROMOSOMES = 30
    _MAX_SPECIES = 50

    def __init__(self, tool_config: Dict[str, Any]):
        """Store the request timeout and the endpoint selected by the config.

        Args:
            tool_config: Tool configuration. ``timeout`` (seconds, default 90)
                and ``fields.endpoint`` (default ``"assembly"``) are read here.
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 90)
        # ``or {}`` also covers a config where "fields" is present but null.
        fields = tool_config.get("fields") or {}
        self.endpoint = fields.get("endpoint", "assembly")

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the Ensembl info API call.

        Every failure is converted into an ``{"error": ...}`` dictionary so
        that this method itself never raises.

        Args:
            arguments: Call arguments; ``species`` is read by the assembly
                endpoint and ``search`` by the species endpoint.

        Returns:
            The result dictionary produced by the endpoint handler, or a
            dictionary with a single ``"error"`` key.
        """
        try:
            return self._query(arguments)
        except requests.exceptions.Timeout:
            return {"error": f"Ensembl API timed out after {self.timeout}s"}
        except requests.exceptions.ConnectionError:
            return {"error": "Failed to connect to Ensembl REST API"}
        except requests.exceptions.HTTPError as e:
            code = e.response.status_code if e.response is not None else "unknown"
            if code == 404:
                return {"error": f"Species not found: {arguments.get('species', '')}"}
            return {"error": f"Ensembl API HTTP error: {code}"}
        except Exception as e:
            # Last-resort boundary: report instead of propagating.
            return {"error": f"Unexpected error querying Ensembl API: {str(e)}"}

    def _query(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Route to the handler for the configured endpoint."""
        if self.endpoint == "assembly":
            return self._get_assembly_info(arguments)
        if self.endpoint == "species":
            return self._get_species_info(arguments)
        return {"error": f"Unknown endpoint: {self.endpoint}"}

    def _get_json(self, path: str) -> Any:
        """GET ``path`` below the Ensembl base URL and return the decoded JSON.

        Raises:
            requests.exceptions.RequestException: On timeout, connection
                failure, or a non-success HTTP status.
        """
        response = requests.get(
            f"{ENSEMBL_REST_BASE_URL}{path}",
            params={"content-type": "application/json"},
            headers=ENSEMBL_HEADERS,
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _chromosome_sort_key(chromosome: Dict[str, Any]):
        """Sort key placing numeric chromosome names first, then the rest by name."""
        name = chromosome.get("name")
        try:
            return (0, int(name))
        except (TypeError, ValueError):
            # TypeError covers a missing/None name; normalise to str so that
            # all non-numeric keys stay mutually comparable.
            return (1, "" if name is None else str(name))

    @staticmethod
    def _species_matches(entry: Dict[str, Any], needle: str, search: str) -> bool:
        """Return True when ``entry`` matches the search term.

        A match is a case-insensitive substring hit in ``display_name``,
        ``name`` or ``common_name``, or an exact match on ``taxon_id``.

        Args:
            entry: One species record from the API response.
            needle: Lower-cased search term used for substring matching.
            search: Original search term used for the taxon-id comparison.
        """
        for key in ("display_name", "name", "common_name"):
            value = entry.get(key)
            # A key may be present with a null value; skip it instead of
            # calling .lower() on None.
            if value is not None and needle in str(value).lower():
                return True
        taxon_id = entry.get("taxon_id")
        return taxon_id is not None and str(taxon_id) == search

    def _get_assembly_info(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get genome assembly metadata for a species.

        Args:
            arguments: Must contain a non-empty ``species`` value.

        Returns:
            A dictionary with ``data`` (assembly fields plus up to 30
            chromosomes sorted numerically, then alphabetically) and
            ``metadata``; or an ``{"error": ...}`` dictionary when
            ``species`` is missing or blank.
        """
        raw_species = arguments.get("species", "")
        species = str(raw_species).strip() if raw_species else ""
        if not species:
            return {
                "error": "species parameter is required (e.g., 'homo_sapiens', 'mus_musculus')"
            }

        # Encode the caller-supplied value as a single path segment so that
        # characters such as '/', '?' or '#' cannot alter the requested URL.
        data = self._get_json(f"/info/assembly/{quote(species, safe='')}")

        # Extract chromosome information
        top_level = data.get("top_level_region") or []
        chromosomes = [
            {"name": region.get("name"), "length": region.get("length")}
            for region in top_level
            if region.get("coord_system") == "chromosome"
        ]
        chromosomes.sort(key=self._chromosome_sort_key)

        return {
            "data": {
                "species": species,
                "assembly_name": data.get("assembly_name"),
                "assembly_accession": data.get("assembly_accession"),
                "assembly_date": data.get("assembly_date"),
                "genebuild_method": data.get("genebuild_method"),
                "genebuild_last_update": data.get("genebuild_last_geneset_update"),
                "golden_path_length": data.get("golden_path"),
                "karyotype": data.get("karyotype", []),
                "coordinate_system_versions": data.get("coord_system_versions", []),
                "total_regions": len(top_level),
                "chromosomes": chromosomes[: self._MAX_CHROMOSOMES],
            },
            "metadata": {
                "source": "Ensembl REST API - Assembly Info",
                "species": species,
            },
        }

    def _get_species_info(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get list of species available in Ensembl with genome info.

        Args:
            arguments: May contain ``search``, a name fragment or taxon id
                (string or number) used to filter the species list.

        Returns:
            A dictionary with ``data`` and ``metadata``. ``total_species`` is
            the size of the unfiltered catalog; ``matched_species`` is the
            number of entries actually returned, which is capped at 50.
        """
        raw_search = arguments.get("search", "")
        # Coerce to str so that a numeric taxon id can be used as search term.
        search = str(raw_search).strip() if raw_search else ""

        data = self._get_json("/info/species")
        all_species = data.get("species") or []

        # Filter if search term provided
        if search:
            needle = search.lower()
            selected = [
                entry
                for entry in all_species
                if self._species_matches(entry, needle, search)
            ]
        else:
            selected = all_species

        # Format results
        species_results = []
        for entry in selected[: self._MAX_SPECIES]:
            taxon_id_raw = entry.get("taxon_id")
            try:
                taxon_id = int(taxon_id_raw) if taxon_id_raw is not None else None
            except (ValueError, TypeError):
                taxon_id = None
            species_results.append(
                {
                    "name": entry.get("name"),
                    "display_name": entry.get("display_name"),
                    "common_name": entry.get("common_name"),
                    "taxon_id": taxon_id,
                    "assembly": entry.get("assembly"),
                    "accession": entry.get("accession"),
                    "division": entry.get("division"),
                    "strain": entry.get("strain"),
                    "strain_collection": entry.get("strain_collection"),
                }
            )

        return {
            "data": {
                "total_species": len(all_species),
                "matched_species": len(species_results),
                "search_term": search if search else None,
                "species": species_results,
            },
            "metadata": {
                "source": "Ensembl REST API - Species Info",
                "search": search if search else "all",
            },
        }