# Source code for tooluniverse.gtdb_tool

"""
GTDB Tool - Genome Taxonomy Database

GTDB provides a standardized genome-based taxonomy for prokaryotes (Bacteria
and Archaea). The taxonomy is built from phylogenomic analysis of genome
sequences, resolving polyphyletic groups in NCBI taxonomy and providing a
consistent, genome-based classification system.

API base: https://gtdb-api.ecogenomic.org
No authentication required.

GTDB taxon naming convention: prefix__name
  d__ (domain), p__ (phylum), c__ (class), o__ (order),
  f__ (family), g__ (genus), s__ (species)

Reference: Parks et al., Nature Biotechnology 2018, 36:996-1004
"""

from typing import Dict, Any, Optional
from urllib.parse import quote

import requests

from .base_tool import BaseTool
from .tool_registry import register_tool


# Root URL of the GTDB REST API; request paths are appended after a "/".
GTDB_BASE_URL = "https://gtdb-api.ecogenomic.org"


@register_tool("GTDBTool")
class GTDBTool(BaseTool):
    """
    Tool for querying the Genome Taxonomy Database (GTDB).

    GTDB is a genome-based taxonomy for prokaryotes, maintained by the
    Ecogenomics lab at the University of Queensland.

    Supported operations:
    - search_taxon: Search for taxa by partial name
    - get_species: Get species cluster details with genomes
    - get_taxon_info: Get taxon card info (rank, genome count, lineage)
    - search_genomes: Search genomes by organism name
    - get_genome: Get detailed genome metadata

    Every operation returns a dict with a ``status`` key: ``"success"``
    together with a ``data`` payload, or ``"error"`` together with an
    ``error`` message.
    """

    # GTDB rank prefixes, ordered from domain down to species.
    _RANK_PREFIXES = ("d__", "p__", "c__", "o__", "f__", "g__", "s__")

    def __init__(self, tool_config: Dict[str, Any]):
        """Store the tool configuration and create a reusable HTTP session.

        Args:
            tool_config: Tool configuration mapping. Its optional
                ``parameter`` entry is kept on ``self.parameter`` and that
                entry's ``required`` list on ``self.required``.
        """
        super().__init__(tool_config)
        # ``or {}`` also covers an explicit ``"parameter": None``.
        self.parameter = tool_config.get("parameter") or {}
        self.required = self.parameter.get("required", [])
        self.session = requests.Session()
        self.timeout = 30  # seconds, applied to every request

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the GTDB API tool with given arguments.

        Args:
            arguments: Must contain ``operation`` (one of the supported
                operation names) plus the parameters of that operation.

        Returns:
            The handler's result dict, or an error dict when the operation
            is missing or unknown, or when the handler raises.
        """
        arguments = arguments or {}
        operation = arguments.get("operation")
        if not operation:
            return {"status": "error", "error": "Missing required parameter: operation"}

        operation_handlers = {
            "search_taxon": self._search_taxon,
            "get_species": self._get_species,
            "get_taxon_info": self._get_taxon_info,
            "search_genomes": self._search_genomes,
            "get_genome": self._get_genome,
        }

        handler = operation_handlers.get(operation)
        if not handler:
            return {
                "status": "error",
                "error": "Unknown operation: {}. Available: {}".format(
                    operation, list(operation_handlers.keys())
                ),
            }

        # Top-level boundary: turn any failure into an error dict so that
        # callers never see an exception.
        try:
            return handler(arguments)
        except requests.exceptions.Timeout:
            return {"status": "error", "error": "GTDB API request timed out"}
        except requests.exceptions.ConnectionError:
            return {"status": "error", "error": "Failed to connect to GTDB API"}
        except Exception as e:
            return {
                "status": "error",
                "error": "GTDB operation failed: {}".format(str(e)),
            }

    @staticmethod
    def _path_segment(value: Any) -> str:
        """Percent-encode *value* so it occupies exactly one URL path segment.

        With ``safe=""`` characters such as ``/``, ``?``, ``#`` and ``%`` are
        escaped instead of being interpreted as URL syntax.
        """
        return quote(str(value), safe="")

    @staticmethod
    def _bounded_int(
        value: Any,
        name: str,
        default: int,
        minimum: int,
        maximum: Optional[int] = None,
    ) -> int:
        """Coerce an optional numeric argument to an int within bounds.

        Args:
            value: Raw argument value; ``None`` selects ``default``.
            name: Argument name, used in the error message.
            default: Value used when ``value`` is ``None``.
            minimum: Lower bound; smaller values are raised to it.
            maximum: Optional upper bound; larger values are lowered to it.

        Raises:
            ValueError: If ``value`` cannot be converted to an integer.
        """
        if value is None:
            return default
        try:
            number = int(value)
        except (TypeError, ValueError):
            raise ValueError(
                "{} must be an integer, got {!r}".format(name, value)
            ) from None
        number = max(number, minimum)
        if maximum is not None:
            number = min(number, maximum)
        return number

    def _make_request(self, path: str, params: Optional[Dict] = None) -> Dict[str, Any]:
        """Make GET request to GTDB API.

        Args:
            path: Path relative to ``GTDB_BASE_URL`` (no leading slash).
            params: Optional query-string parameters.

        Returns:
            ``{"ok": True, "data": <parsed JSON>}`` for a 200 response,
            otherwise ``{"ok": False, "error": <message>}``.

        Network errors raised by ``requests`` are not caught here; ``run``
        handles them.
        """
        url = "{}/{}".format(GTDB_BASE_URL, path)
        response = self.session.get(url, params=params or {}, timeout=self.timeout)

        if response.status_code == 200:
            try:
                data = response.json()
            except ValueError:
                return {"ok": False, "error": "Invalid JSON response from GTDB API"}
            return {"ok": True, "data": data}

        if response.status_code == 400:
            try:
                err = response.json()
            except ValueError:
                return {"ok": False, "error": "Bad request"}
            # Only a JSON object can carry a "detail" field.
            if isinstance(err, dict):
                return {"ok": False, "error": err.get("detail", "Bad request")}
            return {"ok": False, "error": "Bad request"}

        if response.status_code == 404:
            return {"ok": False, "error": "Taxon or resource not found in GTDB"}

        return {
            "ok": False,
            "error": "GTDB API returned status {}".format(response.status_code),
        }

    def _search_taxon(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search for taxa by partial name.

        Args:
            arguments: ``query`` (required) and optional ``limit``
                (default 20, clamped to 1..100).

        Returns:
            On success, ``data`` holds the query, the ``matches`` list read
            from the response, and its length as ``count``.
        """
        query = arguments.get("query")
        if not query:
            return {"status": "error", "error": "query parameter is required"}

        limit = self._bounded_int(arguments.get("limit"), "limit", 20, 1, 100)
        result = self._make_request(
            "taxon/search/{}".format(self._path_segment(query)),
            params={"limit": limit},
        )
        if not result["ok"]:
            return {"status": "error", "error": result["error"]}

        matches = result["data"].get("matches", [])
        return {
            "status": "success",
            "data": {
                "query": query,
                "matches": matches,
                "count": len(matches),
            },
        }

    def _get_species(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get species cluster details.

        Args:
            arguments: ``species`` (required) and optional ``max_genomes``
                (default 20, never negative) capping the returned list.

        Returns:
            On success, ``data`` holds the species name, the total number
            of genomes in the response, and the possibly truncated list.
        """
        species = arguments.get("species")
        if not species:
            return {"status": "error", "error": "species parameter is required"}

        result = self._make_request(
            "species/search/{}".format(self._path_segment(species))
        )
        if not result["ok"]:
            return {"status": "error", "error": result["error"]}

        data = result["data"]
        all_genomes = data.get("genomes", [])
        # Limit genomes list to avoid huge responses. A floor of 0 keeps a
        # negative value from slicing off the tail of the list instead.
        max_genomes = self._bounded_int(
            arguments.get("max_genomes"), "max_genomes", 20, 0
        )
        genomes = all_genomes[:max_genomes]

        return {
            "status": "success",
            "data": {
                "species_name": data.get("name", species),
                "total_genomes": len(all_genomes),
                "genomes_shown": len(genomes),
                "genomes": genomes,
            },
        }

    def _get_taxon_info(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get taxon card info (rank, genome count, higher ranks).

        The taxon is looked up as given. If that fails and the name has no
        GTDB rank prefix, each prefix (``d__`` .. ``s__``) is tried in turn
        and the first successful lookup wins. When every attempt fails, the
        error for the name as supplied is reported.

        Args:
            arguments: ``taxon`` (required), with or without a rank prefix.

        Returns:
            On success, ``data`` holds the resolved taxon name, selected
            card fields, and the lineage (``None`` if that request failed).
        """
        taxon = arguments.get("taxon")
        if not taxon:
            return {"status": "error", "error": "taxon parameter is required"}

        result = self._make_request(
            "taxon/{}/card".format(self._path_segment(taxon))
        )

        if not result["ok"] and not taxon.startswith(self._RANK_PREFIXES):
            for prefix in self._RANK_PREFIXES:
                candidate = prefix + taxon
                attempt = self._make_request(
                    "taxon/{}/card".format(self._path_segment(candidate))
                )
                if attempt["ok"]:
                    result = attempt
                    taxon = candidate
                    break

        if not result["ok"]:
            return {"status": "error", "error": result["error"]}

        card = result["data"]

        # The lineage is best-effort: a failure here does not fail the call.
        lineage_result = self._make_request(
            "taxonomy/partial/{}".format(self._path_segment(taxon))
        )
        lineage = lineage_result["data"] if lineage_result["ok"] else None

        return {
            "status": "success",
            "data": {
                "taxon": taxon,
                "rank": card.get("rank"),
                "n_genomes": card.get("nGenomes"),
                "higher_ranks": card.get("higherRanks", []),
                "in_releases": card.get("inReleases", []),
                "lineage": lineage,
            },
        }

    def _search_genomes(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search genomes by organism name.

        Args:
            arguments: ``query`` (required), optional ``page`` (default 1,
                minimum 1) and ``items_per_page`` (default 10, clamped to
                1..50).

        Returns:
            On success, ``data`` holds the query, the total reported by the
            response (or the row count when absent), the page number, the
            rows, and their count.
        """
        query = arguments.get("query")
        if not query:
            return {"status": "error", "error": "query parameter is required"}

        page = self._bounded_int(arguments.get("page"), "page", 1, 1)
        items_per_page = self._bounded_int(
            arguments.get("items_per_page"), "items_per_page", 10, 1, 50
        )

        result = self._make_request(
            "search/gtdb",
            params={
                "search": query,
                "page": page,
                "itemsPerPage": items_per_page,
            },
        )
        if not result["ok"]:
            return {"status": "error", "error": result["error"]}

        data = result["data"]
        rows = data.get("rows", [])
        total = data.get("totalRows", len(rows))

        return {
            "status": "success",
            "data": {
                "query": query,
                "total_results": total,
                "page": page,
                "results": rows,
                "count": len(rows),
            },
        }

    def _get_genome(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get detailed genome metadata by accession.

        Args:
            arguments: ``accession`` (required).

        Returns:
            On success, ``data`` holds the genome card sections read from
            the response plus the taxon history (an empty list if that
            second request failed).
        """
        accession = arguments.get("accession")
        if not accession:
            return {"status": "error", "error": "accession parameter is required"}

        segment = self._path_segment(accession)
        result = self._make_request("genome/{}/card".format(segment))
        if not result["ok"]:
            return {"status": "error", "error": result["error"]}

        data = result["data"]

        # The taxon history is best-effort: a failure does not fail the call.
        history_result = self._make_request(
            "genome/{}/taxon-history".format(segment)
        )
        taxon_history = history_result["data"] if history_result["ok"] else []

        return {
            "status": "success",
            "data": {
                "genome": data.get("genome", {}),
                "metadata_nucleotide": data.get("metadata_nucleotide", {}),
                "metadata_gene": data.get("metadata_gene", {}),
                "metadata_ncbi": data.get("metadata_ncbi", {}),
                "gtdb_taxonomy": data.get("metadata_taxonomy", {}),
                "taxon_history": taxon_history,
            },
        }