Source code for tooluniverse.metabolomics_workbench_tool

# metabolomics_workbench_tool.py
"""
Metabolomics Workbench API tool for ToolUniverse.

Metabolomics Workbench is a comprehensive data repository for metabolomics
data, providing access to metabolite structures, study metadata, and
experimental results.

API Documentation: https://www.metabolomicsworkbench.org/tools/mw_rest.php
"""

import requests
from typing import Dict, Any
from urllib.parse import quote
from .base_tool import BaseTool
from .tool_registry import register_tool

# Base URL for Metabolomics Workbench REST API
MWBENCH_BASE_URL = "https://www.metabolomicsworkbench.org/rest"



[docs]
@register_tool("MetabolomicsWorkbenchTool")
class MetabolomicsWorkbenchTool(BaseTool):
    """
    Tool for querying Metabolomics Workbench REST API.

    Metabolomics Workbench provides metabolomics data including:
    - Study metadata and experimental results
    - Compound/metabolite information and structures
    - RefMet standardized nomenclature
    - Mass spectrometry data searches

    No authentication required. Free for academic/research use.

    Each instance serves a single query "context", read from
    ``tool_config["fields"]["context"]``. ``run`` dispatches on it; the
    contexts it recognizes are ``study``, ``compound``, ``refmet``,
    ``moverz``, ``exactmass``, ``metstat``, ``gene``, ``protein`` and
    ``gene_protein``. Handlers report their outcome as a dict carrying a
    ``"status"`` key (``"success"`` or ``"error"``).
    """


[docs]
    def __init__(self, tool_config: Dict[str, Any]):
        """Read the per-tool settings out of ``tool_config``.

        Args:
            tool_config: Tool configuration mapping. ``timeout`` (default
                ``30``) is later passed as the HTTP request timeout.
                ``fields.context`` (default ``"compound"``) selects the
                handler that ``run`` dispatches to, and
                ``fields.output_format`` (default ``"json"``) is stored
                unchanged on the instance.
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 30)
        # Both remaining settings live under the optional "fields" mapping.
        field_settings = tool_config.get("fields", {})
        self.context = field_settings.get("context", "compound")
        self.output_format = field_settings.get("output_format", "json")



[docs]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the Metabolomics Workbench API call."""
        # Resolve compound_name/name aliases to input_value
        if "input_value" not in arguments:
            for alias in ("compound_name", "name"):
                if alias in arguments:
                    arguments["input_value"] = arguments.pop(alias)
                    break

        context = self.context

        try:
            if context == "study":
                return self._query_study(arguments)
            elif context == "compound":
                return self._query_compound(arguments)
            elif context == "refmet":
                return self._query_refmet(arguments)
            elif context == "moverz":
                return self._search_moverz(arguments)
            elif context == "exactmass":
                return self._search_exactmass(arguments)
            elif context == "metstat":
                return self._query_metstat(arguments)
            elif context == "gene":
                return self._query_gene(arguments)
            elif context == "protein":
                return self._query_protein(arguments)
            elif context == "gene_protein":
                return self._query_gene_protein(arguments)
            else:
                return {"status": "error", "error": f"Unknown context: {context}"}
        except Exception as e:
            raise self.handle_error(e)



[docs]
    def _make_request(self, sub_path: str) -> Dict[str, Any]:
        """Fetch ``sub_path`` from the REST API and wrap the outcome in a status dict.

        Args:
            sub_path: Path below ``MWBENCH_BASE_URL``. A ``/json`` suffix is
                appended unless ``sub_path`` already ends with it.

        Returns:
            ``{"status": "success", "data": ...}`` on success. An empty body,
            the literal ``null`` / ``""`` or an empty JSON array yields
            ``data == []`` plus a guidance ``message``. A JSON object whose
            ``status`` is ``"error"`` yields
            ``{"status": "error", "error": ...}``. A body that is not valid
            JSON is returned as parsed tab-separated rows when it looks like
            a table, otherwise as the raw text.

        Raises:
            Whatever ``self.handle_error`` returns for a
            ``requests.RequestException`` (including HTTP error statuses).
        """
        # Always request the "/json" form of the endpoint.
        trimmed_path = sub_path.strip("/")
        if sub_path.endswith("/json"):
            url = f"{MWBENCH_BASE_URL}/{trimmed_path}"
        else:
            url = f"{MWBENCH_BASE_URL}/{trimmed_path}/json"

        # Guidance attached to every "nothing came back" result.
        no_results_message = (
            "No results found. RefMet requires exact metabolite names "
            "(e.g., 'Cholic acid' not 'bile acid', 'Cer(d18:1/16:0)' not 'ceramide'). "
            "Try a specific compound name or use ChEBI_search for class-level terms."
        )

        try:
            response = requests.get(url, timeout=self.timeout)
            response.raise_for_status()

            # The API sometimes answers 200 OK with an empty body, "null" or '""'.
            body = response.text.strip()
            if body == "" or body == '""' or body.lower() == "null":
                return {"status": "success", "data": [], "message": no_results_message}

            try:
                payload = response.json()
            except ValueError:
                # Some endpoints ignore the "/json" suffix and answer with
                # plain tab-separated text (noted for the moverz/REFMET
                # exact-mass search and the study "metabolites" output).
                # Prefer structured rows over one large string full of
                # literal tabs and newlines; fall back to the raw text when
                # the body is not table-shaped.
                table_rows = self._parse_tsv_text(response.text)
                if table_rows is None:
                    return {"status": "success", "data": response.text}
                return {"status": "success", "data": table_rows}

            # The API can report failures inside an otherwise successful response.
            if isinstance(payload, dict) and payload.get("status") == "error":
                api_error = payload.get("message", "API returned an error status")
                return {"status": "error", "error": api_error}

            # Turn "exactmass" strings into numbers where possible.
            payload = self._normalize_numeric_fields(payload)

            # An empty array gets the same guidance as an empty body.
            if isinstance(payload, list) and not payload:
                return {"status": "success", "data": [], "message": no_results_message}

            return {"status": "success", "data": payload}

        except requests.RequestException as exc:
            raise self.handle_error(exc)



[docs]
    @staticmethod
    def _parse_tsv_text(text: str):
        """Parse a tab-separated response body into a list of row dicts.

        Returns None (caller falls back to the raw string) if the text
        doesn't actually look like a tab-delimited table.
        """
        lines = [ln for ln in text.strip().split("\n") if ln]
        if len(lines) < 2 or "\t" not in lines[0]:
            return None
        headers = lines[0].split("\t")
        rows = []
        for line in lines[1:]:
            values = line.split("\t")
            rows.append(
                {h: values[i] if i < len(values) else "" for i, h in enumerate(headers)}
            )
        return rows



[docs]
    def _normalize_numeric_fields(self, data: Any) -> Any:
        """Convert numeric string fields to actual numbers."""
        if isinstance(data, dict):
            # Convert exactmass from string to float
            if "exactmass" in data and isinstance(data["exactmass"], str):
                try:
                    data["exactmass"] = float(data["exactmass"])
                except (ValueError, TypeError):
                    pass
            # Recursively process nested dicts
            return {k: self._normalize_numeric_fields(v) for k, v in data.items()}
        elif isinstance(data, list):
            return [self._normalize_numeric_fields(item) for item in data]
        return data



[docs]
    def _query_study(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Query study metadata."""
        study_id = arguments.get("study_id", "")
        output_item = arguments.get("output_item", "summary")
        if not study_id:
            return {"status": "error", "error": "study_id parameter is required"}
        return self._make_request(f"study/study_id/{study_id}/{output_item}")



[docs]
    def _query_compound(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Query compound information."""
        input_item = self.tool_config.get("fields", {}).get("input_item", "formula")
        input_value = arguments.get("input_value", "")
        output_item = arguments.get("output_item", "all")
        if not input_value:
            return {"status": "error", "error": "input_value parameter is required"}
        return self._make_request(f"compound/{input_item}/{input_value}/{output_item}")



[docs]
    def _query_refmet(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Query RefMet nomenclature."""
        input_item = self.tool_config.get("fields", {}).get("input_item", "name")
        input_value = arguments.get("input_value", "")
        output_item = arguments.get("output_item", "all")
        if not input_value:
            return {"status": "error", "error": "input_value parameter is required"}
        return self._make_request(f"refmet/{input_item}/{input_value}/{output_item}")



[docs]
    def _search_moverz(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search by m/z value. Requires database as first URL path segment."""
        mz_value = arguments.get("mz_value")
        adduct = arguments.get("adduct", "M+H")
        tolerance = arguments.get("tolerance", 0.1)
        database = arguments.get("database", "MB")  # MB, LIPIDS, or REFMET
        if mz_value is None:
            return {"status": "error", "error": "mz_value parameter is required"}
        # URL-encode adduct: '+' in 'M+H' must be %2B or the server drops the connection
        encoded_adduct = quote(str(adduct), safe="")
        return self._make_request(
            f"moverz/{database}/{mz_value}/{encoded_adduct}/{tolerance}"
        )



[docs]
    def _search_exactmass(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search by exact mass using moverz endpoint with neutral adduct."""
        mass_value = arguments.get("mass_value")
        tolerance = arguments.get("tolerance", 0.1)
        if mass_value is None:
            return {"status": "error", "error": "mass_value parameter is required"}
        # exactmass endpoint is non-functional; use moverz/REFMET with neutral adduct M
        return self._make_request(f"moverz/REFMET/{mass_value}/M/{tolerance}")


    # Filter names accepted by the METSTAT context, in the order
    # `_query_metstat` joins their values with ';' to build the request path:
    # analysis;polarity;chromatography;species;source;disease;kegg_id;refmet_name
    # Only the values are sent, so a filter is identified purely by its
    # position in this tuple -- keep the order in step with the REST API path.
    _METSTAT_SLOTS = (
        "analysis",
        "polarity",
        "chromatography",
        "species",
        "source",
        "disease",
        "kegg_id",
        "refmet_name",
    )


[docs]
    def _query_metstat(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Discover studies by phenotype via the METSTAT context.

        Builds the 8-slot semicolon-delimited filter path. Every slot is
        optional; empty slots act as wildcards. At least one filter must be
        provided so the query is not fully unconstrained.
        """
        slots = [str(arguments.get(name) or "").strip() for name in self._METSTAT_SLOTS]
        if not any(slots):
            return {
                "status": "error",
                "error": (
                    "At least one filter is required for METSTAT. Provide one or more of: "
                    + ", ".join(self._METSTAT_SLOTS)
                ),
            }
        # URL-encode each slot value (e.g. spaces) but keep the ';' separators literal.
        encoded = ";".join(quote(s, safe="") for s in slots)
        result = self._make_request(f"metstat/{encoded}")
        return self._rows_to_list(result)



[docs]
    def _query_gene(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Look up a Metabolomics Workbench gene (MGP) record."""
        input_item = arguments.get("id_type") or self.tool_config.get("fields", {}).get(
            "input_item", "gene_symbol"
        )
        input_value = arguments.get("input_value", "")
        if not input_value:
            return {
                "status": "error",
                "error": "input_value parameter is required (gene symbol, gene_id, or mgp_id)",
            }
        encoded = quote(str(input_value), safe="")
        result = self._make_request(f"gene/{input_item}/{encoded}/all")
        return self._rows_to_list(result)



[docs]
    def _query_protein(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Look up a Metabolomics Workbench protein (MGP) record."""
        input_item = arguments.get("id_type") or self.tool_config.get("fields", {}).get(
            "input_item", "uniprot_id"
        )
        input_value = arguments.get("input_value", "")
        if not input_value:
            return {
                "status": "error",
                "error": "input_value parameter is required (uniprot_id, gene_symbol, mgp_id, or refseq_id)",
            }
        encoded = quote(str(input_value), safe="")
        result = self._make_request(f"protein/{input_item}/{encoded}/all")
        return self._rows_to_list(result)



[docs]
    def _query_gene_protein(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Combined gene/protein MGP lookup; routes on the 'entity' argument.

        entity='gene' (default) queries the gene endpoint; entity='protein'
        queries the protein endpoint. id_type selects the lookup namespace.
        """
        entity = str(arguments.get("entity") or "gene").strip().lower()
        if entity == "protein":
            return self._query_protein(arguments)
        if entity == "gene":
            return self._query_gene(arguments)
        return {
            "status": "error",
            "error": "entity must be 'gene' or 'protein'",
        }



[docs]
    @staticmethod
    def _rows_to_list(result: Dict[str, Any]) -> Dict[str, Any]:
        """Flatten the Workbench 'Row1','Row2',... dict into a list under data.

        Multi-result Workbench endpoints return {"Row1": {...}, "Row2": {...}}.
        Single-result endpoints return a bare object. Normalize both to a list
        so consuming agents get a consistent shape.
        """
        if result.get("status") != "success":
            return result
        data = result.get("data")
        if isinstance(data, dict) and any(
            k.lower().startswith("row") for k in data.keys()
        ):
            rows = [v for k, v in data.items() if k.lower().startswith("row")]
            result = dict(result)
            result["data"] = rows
            result["count"] = len(rows)
        elif isinstance(data, dict):
            result = dict(result)
            result["data"] = [data]
            result["count"] = 1
        return result