Source code for tooluniverse.gene_ontology_tool

import requests
from typing import Any, Dict, Optional
from urllib.parse import quote
from .base_tool import BaseTool
from .tool_registry import register_tool



[docs]
@register_tool("GeneOntologyTool")
class GeneOntologyTool(BaseTool):
    """
    A general-purpose tool for calling the Gene Ontology (GO) API.
    It is configured via a dictionary that defines the specific API endpoint.
    """


[docs]
    def __init__(self, tool_config: Dict):
        """
        Set up the tool from its configuration dictionary.

        Args:
            tool_config (Dict): Configuration whose 'fields' entry must hold
                an 'endpoint' and may hold an 'extract_path'.

        Raises:
            KeyError: If 'fields' or its 'endpoint' entry is missing.
        """
        super().__init__(tool_config)
        fields = tool_config["fields"]
        # URL (possibly containing "{name}" placeholders) that requests target.
        self.endpoint = fields["endpoint"]
        # Optional selector applied to the JSON response; None when absent.
        self.extract_path = fields.get("extract_path")
        # Fixed value handed to requests.get(timeout=...) by run().
        self.timeout = 20


    def _build_url(self, args: Dict[str, Any]) -> str:
        """Builds the request URL from arguments."""
        url = self.endpoint
        for key, value in args.items():
            url = url.replace(f"{{{key}}}", quote(str(value)))
        return url

    def _extract_data(self, data: Dict, extract_path: str) -> Any:
        """Extract specific data from the GO API response using custom paths."""

        if extract_path == "response.docs[0]":
            # Extract single document from GOlr response
            response = data.get("response", {})
            docs = response.get("docs", [])
            if docs:
                return docs[0]
            else:
                return {"error": "No GO term found"}

        elif extract_path == "response.docs":
            # Extract all documents from GOlr response
            response = data.get("response", {})
            docs = response.get("docs", [])
            return docs

        elif extract_path == "associations[*].subject":
            # Extract gene/protein information from Biolink associations
            result = []
            # Handle both dict with associations and direct list from Biolink API
            if isinstance(data, list):
                # Direct list of associations from Biolink API
                associations = data
            else:
                # Dictionary response with associations key
                associations = data.get("associations", [])

            for assoc in associations:
                subject = assoc.get("subject", {})
                result.append(subject)
            return result

        # For simple paths, try direct access
        try:
            if "." in extract_path:
                keys = extract_path.split(".")
                result = data
                for key in keys:
                    if "[" in key and "]" in key:
                        # Handle array indexing like "docs[0]"
                        array_key = key.split("[")[0]
                        index_str = key.split("[")[1].split("]")[0]
                        result = result.get(array_key, [])
                        if index_str.isdigit():
                            index = int(index_str)
                            if index < len(result):
                                result = result[index]
                            else:
                                return {"error": f"Index {index} out of range"}
                        else:
                            return {"error": f"Invalid array index: {index_str}"}
                    else:
                        result = result.get(key, {})
                return result
            else:
                return data.get(extract_path)
        except Exception as e:
            return {"error": f"Failed to extract data using path '{extract_path}': {e}"}


[docs]
    def run(self, arguments: Any = None) -> Any:
        """
        Executes the API call and returns the data.

        Args:
            arguments (Dict[str, Any]): Parameters for the API call.

        Returns:
            Any: The JSON data from the API or an error dictionary.
        """
        # Normalize arguments
        if arguments is None:
            arguments = {}
        if not isinstance(arguments, dict):
            return {"error": "Invalid arguments type; expected a mapping/dict."}

        # Handle different endpoint formats
        if "?" in self.endpoint:
            # This is a complete URL with query parameters (GOlr format)
            url = self.endpoint
            for key, value in arguments.items():
                url = url.replace(f"{{{key}}}", quote(str(value)))
            params = {}
        else:
            # This is a template URL (Biolink format)
            url_args = arguments.copy()
            params = {}

            # Move query parameters to params dict for Biolink API
            if "taxon" in arguments:
                params["taxon"] = url_args.pop("taxon")
            if "rows" in arguments:
                params["rows"] = url_args.pop("rows")
            if "start" in arguments:
                params["start"] = url_args.pop("start")

            # Build URL with remaining arguments
            url = self._build_url(url_args)

        try:
            resp = requests.get(
                url,
                params=params,
                timeout=self.timeout,
                headers={"Accept": "application/json"},
            )
            resp.raise_for_status()
            data = resp.json()
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 404:
                return {
                    "error": "The requested resource was not found (404 Not Found)."
                }
            return {
                "error": f"GO API request failed with HTTP status: {e.response.status_code}",
                "detail": e.response.text,
            }
        except requests.exceptions.RequestException as e:
            return {
                "error": f"A network error occurred while requesting the GO API: {e}"
            }
        except ValueError:
            return {
                "error": "Failed to parse GO API response, which may not be valid JSON.",
                "content": resp.text,
            }

        # If extract_path is configured, extract the corresponding subset
        if self.extract_path:
            result = self._extract_data(data, self.extract_path)

            # Handle empty results
            if isinstance(result, list) and len(result) == 0:
                return {"error": f"No data found for path: {self.extract_path}"}
            elif isinstance(result, dict) and "error" in result:
                return result

            return result

        return data


    # Method bindings for backward compatibility and convenience

[docs]
    def search_terms(self, query: str) -> Any:
        return self.run({"query": query})



[docs]
    def get_term_details(self, id: str) -> Any:
        return self.run({"id": id})



[docs]
    def get_genes_for_term(
        self, id: str, taxon: Optional[str] = None, rows: Optional[int] = None
    ) -> Any:
        args = {"id": id}
        if taxon:
            args["taxon"] = taxon
        if rows:
            args["rows"] = rows
        return self.run(args)



[docs]
    def get_terms_for_gene(self, id: str) -> Any:
        return self.run({"id": id})