# Source code for tooluniverse.ucsc_genome_tool

# ucsc_genome_tool.py
"""
UCSC Genome Browser REST API tool for ToolUniverse.

The UCSC Genome Browser provides access to genome assemblies, gene annotations,
regulatory elements, conservation scores, and hundreds of other tracks for
220+ organisms. The API enables genomic search, DNA sequence retrieval,
and annotation track data access.

API: https://api.genome.ucsc.edu
No authentication required. Rate limit: ~1 request/second recommended.
"""

from typing import Dict, Any
from urllib.parse import quote

import requests

from .base_tool import BaseTool
from .tool_registry import register_tool

# Root URL of the UCSC Genome Browser REST API; endpoint paths such as
# "list/tracks" or "getData/sequence" are appended to it when building requests.
UCSC_BASE_URL = "https://api.genome.ucsc.edu"


@register_tool("UCSCGenomeTool")
class UCSCGenomeTool(BaseTool):
    """
    Tool for querying the UCSC Genome Browser REST API.

    Provides genomic search, DNA sequence retrieval, and annotation track
    data for 220+ genome assemblies (hg38, mm39, etc.).

    The endpoint served by an instance is selected by
    ``tool_config["fields"]["endpoint_type"]`` (default ``"search"``).
    Every call returns a dict with ``"status"`` set to ``"success"`` (plus
    ``"data"`` and ``"metadata"``) or ``"error"`` (plus an ``"error"``
    message); ``run`` never raises.

    No authentication required.
    """

    # Maps each supported ``endpoint_type`` to the name of its handler method.
    # Handlers are resolved by name at call time so that a handler missing
    # from this class produces a clear error instead of an AttributeError.
    _ENDPOINT_HANDLERS = {
        "search": "_search",
        "get_sequence": "_get_sequence",
        "get_track": "_get_track",
        "list_tracks": "_list_tracks",
    }

    # Largest region (in bp) that _get_sequence will request in one call.
    _MAX_SEQUENCE_LENGTH = 100000

    # Cap on tracks returned by _list_tracks when max_tracks is absent/invalid.
    _DEFAULT_MAX_TRACKS = 500

    def __init__(self, tool_config: Dict[str, Any]):
        """Store the request timeout and endpoint type from ``tool_config``.

        Args:
            tool_config: Tool configuration. ``timeout`` (seconds, default 30)
                and ``fields.endpoint_type`` (default ``"search"``) are read.
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 30)
        # ``fields`` may be present but null in a config; treat that as empty.
        fields = tool_config.get("fields") or {}
        self.endpoint_type = fields.get("endpoint_type", "search")

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the UCSC Genome Browser API call.

        Args:
            arguments: Endpoint-specific parameters (see the handler methods).

        Returns:
            The handler's result dict, or an error dict describing the
            failure. All exceptions are converted to error dicts.
        """
        try:
            return self._dispatch(arguments)
        except requests.exceptions.Timeout:
            return {
                "status": "error",
                "error": f"UCSC Genome Browser API request timed out after {self.timeout} seconds",
            }
        except requests.exceptions.ConnectionError:
            return {
                "status": "error",
                "error": "Failed to connect to UCSC Genome Browser API. Check network connectivity.",
            }
        except requests.exceptions.HTTPError as e:
            response = e.response
            status_code = response.status_code if response is not None else "unknown"
            message = f"UCSC Genome Browser API HTTP error: {status_code}"
            # Surface the API's own explanation (e.g. unknown genome/track)
            # when the error body carries one.
            detail = self._api_error_detail(response)
            if detail:
                message = f"{message} - {detail}"
            return {"status": "error", "error": message}
        except Exception as e:
            # Top-level boundary: the tool contract is to return, not raise.
            return {
                "status": "error",
                "error": f"Unexpected error querying UCSC Genome Browser: {str(e)}",
            }

    def _dispatch(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Route to appropriate endpoint based on config.

        Returns an error dict when ``endpoint_type`` is unknown or when its
        handler method is not available on this object.
        """
        if arguments is None:
            arguments = {}

        endpoint_type = self.endpoint_type
        handler_name = (
            self._ENDPOINT_HANDLERS.get(endpoint_type)
            if isinstance(endpoint_type, str)
            else None
        )
        if handler_name is None:
            return {
                "status": "error",
                "error": f"Unknown endpoint_type: {self.endpoint_type}",
            }

        # NOTE(review): ``_search`` is not defined in the part of the class
        # visible here - confirm it is provided elsewhere.
        handler = getattr(self, handler_name, None)
        if not callable(handler):
            return {
                "status": "error",
                "error": f"endpoint_type '{endpoint_type}' is not implemented by this tool",
            }
        return handler(arguments)

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _api_error_detail(response: Any) -> str:
        """Return the ``error`` string from a JSON error body, or ``""``."""
        if response is None:
            return ""
        try:
            payload = response.json()
        except ValueError:
            # Body was not JSON; nothing extra to report.
            return ""
        if isinstance(payload, dict):
            detail = payload.get("error")
            if isinstance(detail, str):
                return detail
        return ""

    def _fetch_json(self, endpoint: str, params: Dict[str, Any]) -> Dict[str, Any]:
        """GET ``endpoint`` with ``params`` and return the decoded JSON object.

        Parameter values are percent-encoded so caller-supplied text cannot
        break the query string or inject additional parameters. Pairs are
        joined with ``;``, the separator this tool has always sent.

        Raises:
            requests.exceptions.RequestException: On network/HTTP failures.
            ValueError: If the response body is not a JSON object.
        """
        query = ";".join(
            f"{key}={quote(str(value), safe='')}" for key, value in params.items()
        )
        url = f"{UCSC_BASE_URL}/{endpoint}?{query}"
        response = requests.get(url, timeout=self.timeout)
        response.raise_for_status()
        payload = response.json()
        if not isinstance(payload, dict):
            raise ValueError(
                f"unexpected response format from {endpoint} (expected a JSON object)"
            )
        return payload

    @staticmethod
    def _parse_region(start: Any, end: Any) -> tuple:
        """Validate a ``start``/``end`` pair and return it as two ints.

        Accepts ints, integral floats, and numeric strings.

        Raises:
            ValueError: If either value is not an integer, ``start`` is
                negative, or ``end`` is not greater than ``start``.
        """
        coords = []
        for label, value in (("start", start), ("end", end)):
            # bool is an int subclass and int() silently truncates floats;
            # neither is a meaningful coordinate, so reject them explicitly.
            if isinstance(value, bool) or (
                isinstance(value, float) and not value.is_integer()
            ):
                raise ValueError(f"{label} must be an integer, got {value!r}")
            try:
                coords.append(int(value))
            except (TypeError, ValueError, OverflowError):
                raise ValueError(
                    f"{label} must be an integer, got {value!r}"
                ) from None

        begin, finish = coords
        if begin < 0:
            raise ValueError("start must be a non-negative integer")
        if finish <= begin:
            raise ValueError("end must be greater than start")
        return begin, finish

    @staticmethod
    def _success(data: Dict[str, Any], query: str, endpoint: str) -> Dict[str, Any]:
        """Wrap ``data`` in the standard success envelope."""
        return {
            "status": "success",
            "data": data,
            "metadata": {
                "source": "UCSC Genome Browser",
                "query": query,
                "endpoint": endpoint,
            },
        }

    # ------------------------------------------------------------------
    # Endpoint handlers
    # ------------------------------------------------------------------

    def _list_tracks(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """List annotation tracks for a genome, or the column schema of a track.

        With only ``genome`` set, returns every available track (leaf tracks
        only) with shortLabel/type/longLabel/parent so callers can discover
        valid track names for UCSC_get_track. When ``track`` is also provided,
        returns that track's column schema (name/sqlType/jsonType/description)
        from the list/schema endpoint.

        Args:
            arguments: ``genome`` (required); optional ``track``,
                ``name_filter`` (case-insensitive substring matched against
                track name, short label and long label) and ``max_tracks``
                (positive int, default 500; invalid values fall back to 500).
        """
        genome = arguments.get("genome", "")
        track = arguments.get("track")
        name_filter = arguments.get("name_filter")
        max_tracks = arguments.get("max_tracks", self._DEFAULT_MAX_TRACKS)

        if not genome:
            return {
                "status": "error",
                "error": "genome parameter is required (e.g., 'hg38', 'mm39').",
            }

        # Schema mode: a specific track's column definitions.
        if track:
            raw = self._fetch_json("list/schema", {"genome": genome, "track": track})
            column_types = raw.get("columnTypes", [])
            if not isinstance(column_types, list):
                column_types = []
            schema = {
                "genome": genome,
                "track": track,
                "track_type": raw.get("type"),
                "short_label": raw.get("shortLabel"),
                "long_label": raw.get("longLabel"),
                "column_count": len(column_types),
                "columns": column_types,
            }
            return self._success(schema, f"{genome}:{track}", "list/schema")

        # Listing mode: all leaf tracks for the genome.
        try:
            max_tracks = int(max_tracks)
        except (TypeError, ValueError, OverflowError):
            max_tracks = self._DEFAULT_MAX_TRACKS
        if max_tracks <= 0:
            max_tracks = self._DEFAULT_MAX_TRACKS

        raw = self._fetch_json(
            "list/tracks", {"genome": genome, "trackLeavesOnly": 1}
        )
        # The track table is keyed by the genome name in the response.
        track_dict = raw.get(genome, {})
        if not isinstance(track_dict, dict):
            track_dict = {}

        needle = (
            name_filter.lower()
            if isinstance(name_filter, str) and name_filter
            else None
        )

        tracks = []
        for name, info in track_dict.items():
            if not isinstance(info, dict):
                info = {}
            short_label = info.get("shortLabel", "")
            long_label = info.get("longLabel", "")
            if needle is not None:
                searchable = (
                    str(name).lower(),
                    str(short_label).lower(),
                    str(long_label).lower(),
                )
                if not any(needle in text for text in searchable):
                    continue
            tracks.append(
                {
                    "track": name,
                    "type": info.get("type"),
                    "short_label": short_label,
                    "long_label": long_label,
                    "parent": info.get("parent"),
                    "group": info.get("group"),
                }
            )

        # track_count is the number of matches; returned_count reflects the cap.
        total_matched = len(tracks)
        listing = {
            "genome": genome,
            "name_filter": name_filter,
            "track_count": total_matched,
            "returned_count": min(total_matched, max_tracks),
            "tracks": tracks[:max_tracks],
        }
        return self._success(listing, genome, "list/tracks")

    def _get_sequence(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get DNA sequence for a specified genomic region.

        Args:
            arguments: ``genome``, ``chrom``, ``start`` and ``end`` (all
                required). The region may span at most 100,000 bp.
        """
        genome = arguments.get("genome", "")
        chrom = arguments.get("chrom", "")
        start = arguments.get("start", None)
        end = arguments.get("end", None)

        if not genome or not chrom or start is None or end is None:
            return {
                "status": "error",
                "error": "genome, chrom, start, and end parameters are all required",
            }

        try:
            start, end = self._parse_region(start, end)
        except ValueError as exc:
            return {"status": "error", "error": str(exc)}

        if end - start > self._MAX_SEQUENCE_LENGTH:
            return {
                "status": "error",
                "error": "Maximum sequence length is 100,000 bp. Please reduce the range.",
            }

        raw = self._fetch_json(
            "getData/sequence",
            {"genome": genome, "chrom": chrom, "start": start, "end": end},
        )
        dna = raw.get("dna", "")

        sequence = {
            "genome": genome,
            "chrom": chrom,
            "start": start,
            "end": end,
            "length": len(dna),
            "dna": dna,
        }
        return self._success(
            sequence, f"{genome}:{chrom}:{start}-{end}", "getData/sequence"
        )

    def _get_track(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get annotation track data for a specified genomic region.

        Args:
            arguments: ``genome``, ``track``, ``chrom``, ``start`` and ``end``
                (all required); optional ``maxItemsOutput`` (default 100,
                omitted from the request when falsy).
        """
        genome = arguments.get("genome", "")
        track = arguments.get("track", "")
        chrom = arguments.get("chrom", "")
        start = arguments.get("start", None)
        end = arguments.get("end", None)
        max_items = arguments.get("maxItemsOutput", 100)

        if not genome or not track or not chrom or start is None or end is None:
            return {
                "status": "error",
                "error": "genome, track, chrom, start, and end parameters are all required",
            }

        try:
            start, end = self._parse_region(start, end)
        except ValueError as exc:
            return {"status": "error", "error": str(exc)}

        params = {
            "genome": genome,
            "track": track,
            "chrom": chrom,
            "start": start,
            "end": end,
        }
        if max_items:
            params["maxItemsOutput"] = max_items

        raw = self._fetch_json("getData/track", params)

        # Track data is keyed by the track name
        track_type = raw.get("trackType", None)
        items = raw.get(track, [])
        if not isinstance(items, list):
            # Normalise a single non-list payload to a one-element list.
            items = [items] if items else []

        track_data = {
            "genome": genome,
            "track": track,
            "track_type": track_type,
            "chrom": chrom,
            "start": start,
            "end": end,
            "item_count": len(items),
            "items": items,
        }
        return self._success(
            track_data, f"{genome}:{track}:{chrom}:{start}-{end}", "getData/track"
        )