Source code for tooluniverse.ucsc_genome_tool

# ucsc_genome_tool.py
"""
UCSC Genome Browser REST API tool for ToolUniverse.

The UCSC Genome Browser provides access to genome assemblies, gene annotations,
regulatory elements, conservation scores, and hundreds of other tracks for
220+ organisms. The API enables genomic search, DNA sequence retrieval,
and annotation track data access.

API: https://api.genome.ucsc.edu
No authentication required. Rate limit: ~1 request/second recommended.
"""

import requests
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool

UCSC_BASE_URL = "https://api.genome.ucsc.edu"


[docs] @register_tool("UCSCGenomeTool") class UCSCGenomeTool(BaseTool): """ Tool for querying the UCSC Genome Browser REST API. Provides genomic search, DNA sequence retrieval, and annotation track data for 220+ genome assemblies (hg38, mm39, etc.). No authentication required. """
def __init__(self, tool_config: Dict[str, Any]):
    """Set up the tool from its configuration dictionary.

    Args:
        tool_config: Tool configuration. The optional ``timeout`` key is
            stored as ``self.timeout`` (default 30) and the optional
            ``fields.endpoint_type`` entry is stored as
            ``self.endpoint_type`` (default ``"search"``).
    """
    super().__init__(tool_config)
    self.timeout = tool_config.get("timeout", 30)
    # The endpoint selector lives one level down, under "fields".
    field_settings = tool_config.get("fields", {})
    self.endpoint_type = field_settings.get("endpoint_type", "search")
def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Execute the configured UCSC Genome Browser call.

    Args:
        arguments: Call arguments, passed through unchanged to
            ``self._dispatch``.

    Returns:
        Whatever ``self._dispatch`` returns on success. If it raises, a
        dict with a single ``"error"`` key describing the failure; no
        exception raised by ``_dispatch`` propagates to the caller.
    """
    try:
        outcome = self._dispatch(arguments)
    except requests.exceptions.Timeout:
        # Must stay ahead of ConnectionError: a connect timeout is both.
        message = f"UCSC Genome Browser API request timed out after {self.timeout} seconds"
    except requests.exceptions.ConnectionError:
        message = "Failed to connect to UCSC Genome Browser API. Check network connectivity."
    except requests.exceptions.HTTPError as exc:
        message = f"UCSC Genome Browser API HTTP error: {exc.response.status_code}"
    except Exception as exc:
        message = f"Unexpected error querying UCSC Genome Browser: {exc!s}"
    else:
        return outcome
    return {"error": message}
[docs] def _dispatch(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Route to appropriate endpoint based on config.""" if self.endpoint_type == "search": return self._search(arguments) elif self.endpoint_type == "get_sequence": return self._get_sequence(arguments) elif self.endpoint_type == "get_track": return self._get_track(arguments) else: return {"error": f"Unknown endpoint_type: {self.endpoint_type}"}
[docs] def _get_sequence(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get DNA sequence for a specified genomic region.""" genome = arguments.get("genome", "") chrom = arguments.get("chrom", "") start = arguments.get("start", None) end = arguments.get("end", None) if not genome or not chrom or start is None or end is None: return { "error": "genome, chrom, start, and end parameters are all required" } if end <= start: return {"error": "end must be greater than start"} if end - start > 100000: return { "error": "Maximum sequence length is 100,000 bp. Please reduce the range." } url = f"{UCSC_BASE_URL}/getData/sequence?genome={genome};chrom={chrom};start={start};end={end}" response = requests.get(url, timeout=self.timeout) response.raise_for_status() raw = response.json() dna = raw.get("dna", "") result = { "genome": genome, "chrom": chrom, "start": start, "end": end, "length": len(dna), "dna": dna, } return { "data": result, "metadata": { "source": "UCSC Genome Browser", "query": f"{genome}:{chrom}:{start}-{end}", "endpoint": "getData/sequence", }, }
[docs] def _get_track(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get annotation track data for a specified genomic region.""" genome = arguments.get("genome", "") track = arguments.get("track", "") chrom = arguments.get("chrom", "") start = arguments.get("start", None) end = arguments.get("end", None) max_items = arguments.get("maxItemsOutput", 100) if not genome or not track or not chrom or start is None or end is None: return { "error": "genome, track, chrom, start, and end parameters are all required" } url = ( f"{UCSC_BASE_URL}/getData/track?genome={genome};track={track};" f"chrom={chrom};start={start};end={end}" ) if max_items: url += f";maxItemsOutput={max_items}" response = requests.get(url, timeout=self.timeout) response.raise_for_status() raw = response.json() # Track data is keyed by the track name track_type = raw.get("trackType", None) items = raw.get(track, []) if not isinstance(items, list): items = [items] if items else [] result = { "genome": genome, "track": track, "track_type": track_type, "chrom": chrom, "start": start, "end": end, "item_count": len(items), "items": items, } return { "data": result, "metadata": { "source": "UCSC Genome Browser", "query": f"{genome}:{track}:{chrom}:{start}-{end}", "endpoint": "getData/track", }, }