Source code for tooluniverse.metabolomics_workbench_tool

# metabolomics_workbench_tool.py
"""
Metabolomics Workbench API tool for ToolUniverse.

Metabolomics Workbench is a comprehensive data repository for metabolomics
data, providing access to metabolite structures, study metadata, and
experimental results.

API Documentation: https://www.metabolomicsworkbench.org/tools/mw_rest.php
"""

import requests
from typing import Dict, Any
from urllib.parse import quote
from .base_tool import BaseTool
from .tool_registry import register_tool

# Base URL for Metabolomics Workbench REST API.
# Context-specific path segments (e.g. "study/study_id/<id>/summary") and a
# trailing "/json" output selector are appended to this root when a request
# URL is built.
MWBENCH_BASE_URL = "https://www.metabolomicsworkbench.org/rest"


@register_tool("MetabolomicsWorkbenchTool")
class MetabolomicsWorkbenchTool(BaseTool):
    """
    Tool for querying Metabolomics Workbench REST API.

    Metabolomics Workbench provides metabolomics data including:
    - Study metadata and experimental results
    - Compound/metabolite information and structures
    - RefMet standardized nomenclature
    - Mass spectrometry data searches

    No authentication required. Free for academic/research use.

    The configured ``fields.context`` selects which handler ``run`` dispatches
    to. Every handler returns a dict with a ``"status"`` key that is either
    ``"success"`` (with ``"data"``) or ``"error"`` (with ``"error"``).
    """

    # Guidance attached to every successful-but-empty response.
    _NO_RESULTS_MESSAGE = (
        "No results found. RefMet requires exact metabolite names "
        "(e.g., 'Cholic acid' not 'bile acid', 'Cer(d18:1/16:0)' not 'ceramide'). "
        "Try a specific compound name or use ChEBI_search for class-level terms."
    )

    # METSTAT slot order matches the REST API path:
    # analysis;polarity;chromatography;species;source;disease;kegg_id;refmet_name
    _METSTAT_SLOTS = (
        "analysis",
        "polarity",
        "chromatography",
        "species",
        "source",
        "disease",
        "kegg_id",
        "refmet_name",
    )

    # Maps the configured context to the name of the method that serves it.
    _CONTEXT_HANDLERS = {
        "study": "_query_study",
        "compound": "_query_compound",
        "refmet": "_query_refmet",
        "moverz": "_search_moverz",
        "exactmass": "_search_exactmass",
        "metstat": "_query_metstat",
        "gene": "_query_gene",
        "protein": "_query_protein",
        "gene_protein": "_query_gene_protein",
    }

    def __init__(self, tool_config: Dict[str, Any]):
        """Store the request timeout, context and output format from the config.

        Args:
            tool_config: Tool configuration. ``timeout`` defaults to 30 and the
                optional ``fields`` mapping supplies ``context`` (default
                ``"compound"``) and ``output_format`` (default ``"json"``).
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 30)
        # ``fields`` may be absent or explicitly None; treat both as empty.
        fields = tool_config.get("fields") or {}
        # Context is one of the keys of _CONTEXT_HANDLERS.
        self.context = fields.get("context", "compound")
        self.output_format = fields.get("output_format", "json")

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the Metabolomics Workbench API call.

        Args:
            arguments: Call arguments. ``compound_name`` or ``name`` is accepted
                as an alias for ``input_value`` when the latter is absent.

        Returns:
            The handler's result dict, or an error dict for an unknown context.

        Raises:
            Whatever ``self.handle_error`` produces for an exception raised by
            the handler.
        """
        # Work on a copy so alias resolution never mutates the caller's dict.
        arguments = dict(arguments or {})

        # Resolve compound_name/name aliases to input_value
        if "input_value" not in arguments:
            for alias in ("compound_name", "name"):
                if alias in arguments:
                    arguments["input_value"] = arguments.pop(alias)
                    break

        context = self.context
        handler_name = (
            self._CONTEXT_HANDLERS.get(context) if isinstance(context, str) else None
        )
        if handler_name is None:
            return {"status": "error", "error": f"Unknown context: {context}"}

        try:
            return getattr(self, handler_name)(arguments)
        except Exception as e:
            raise self.handle_error(e) from e

    def _fields(self) -> Dict[str, Any]:
        """Return the ``fields`` mapping of the tool config, or ``{}`` if unset."""
        return self.tool_config.get("fields") or {}

    @staticmethod
    def _encode_segment(value: Any, safe: str = "") -> str:
        """Percent-encode *value* for use as a single URL path segment.

        Characters listed in *safe* are left literal.
        """
        return quote(str(value), safe=safe)

    def _no_results(self) -> Dict[str, Any]:
        """Build a fresh success payload describing an empty result set."""
        return {
            "status": "success",
            "data": [],
            "message": self._NO_RESULTS_MESSAGE,
        }

    def _make_request(self, sub_path: str) -> Dict[str, Any]:
        """Central method to handle API requests and response validation.

        Args:
            sub_path: Path below ``MWBENCH_BASE_URL``; ``/json`` is appended
                when it is not already the final segment.

        Returns:
            A success dict carrying the parsed JSON (or the raw text when the
            body is not JSON), a success dict with empty ``data`` and a
            guidance message when nothing was found, or an error dict when the
            API itself reports ``status == "error"``.

        Raises:
            Whatever ``self.handle_error`` produces for a
            ``requests.RequestException``.
        """
        # Strip first, then test: "x/json/" must not turn into "x/json/json".
        path = sub_path.strip("/")
        if not path.endswith("/json"):
            path = f"{path}/json"
        url = f"{MWBENCH_BASE_URL}/{path}"

        try:
            response = requests.get(url, timeout=self.timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            raise self.handle_error(e) from e

        # The API sometimes returns "null" as a string or an empty string with 200 OK
        raw_text = response.text.strip()
        if not raw_text or raw_text.lower() == "null" or raw_text == '""':
            return self._no_results()

        try:
            data = response.json()
        except ValueError:
            # Return as text if not JSON (though we requested JSON)
            return {"status": "success", "data": response.text}

        # Check for API-level error status
        if isinstance(data, dict) and data.get("status") == "error":
            return {
                "status": "error",
                "error": data.get("message", "API returned an error status"),
            }

        # Convert exactmass from string to number if present
        data = self._normalize_numeric_fields(data)

        # An empty array or empty object both mean "nothing matched"; report
        # them uniformly so downstream row flattening never sees a blank record.
        if isinstance(data, (list, dict)) and not data:
            return self._no_results()

        return {"status": "success", "data": data}

    def _normalize_numeric_fields(self, data: Any) -> Any:
        """Convert numeric string fields to actual numbers.

        Walks nested dicts and lists and returns a new structure in which every
        string-valued ``exactmass`` entry that parses as a float is replaced by
        that float. Other values are returned unchanged.
        """
        if isinstance(data, dict):
            normalized = {}
            for key, value in data.items():
                if key == "exactmass" and isinstance(value, str):
                    try:
                        value = float(value)
                    except ValueError:
                        # Leave non-numeric text as the API sent it.
                        pass
                normalized[key] = self._normalize_numeric_fields(value)
            return normalized
        if isinstance(data, list):
            return [self._normalize_numeric_fields(item) for item in data]
        return data

    def _query_study(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Query study metadata.

        Reads ``study_id`` (required) and ``output_item`` (default
        ``"summary"``) from *arguments*.
        """
        study_id = arguments.get("study_id", "")
        output_item = arguments.get("output_item", "summary")

        if not study_id:
            return {"status": "error", "error": "study_id parameter is required"}

        # Encode reserved characters; '/' stays literal so it is sent as before.
        encoded_id = self._encode_segment(study_id, safe="/")
        return self._make_request(f"study/study_id/{encoded_id}/{output_item}")

    def _query_compound(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Query compound information.

        The lookup namespace comes from the configured ``fields.input_item``
        (default ``"formula"``); *arguments* supplies ``input_value``
        (required) and ``output_item`` (default ``"all"``).
        """
        input_item = self._fields().get("input_item", "formula")
        input_value = arguments.get("input_value", "")
        output_item = arguments.get("output_item", "all")

        if not input_value:
            return {"status": "error", "error": "input_value parameter is required"}

        # Encode reserved characters; '/' stays literal so it is sent as before.
        encoded_value = self._encode_segment(input_value, safe="/")
        return self._make_request(f"compound/{input_item}/{encoded_value}/{output_item}")

    def _query_refmet(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Query RefMet nomenclature.

        The lookup namespace comes from the configured ``fields.input_item``
        (default ``"name"``); *arguments* supplies ``input_value`` (required)
        and ``output_item`` (default ``"all"``).
        """
        input_item = self._fields().get("input_item", "name")
        input_value = arguments.get("input_value", "")
        output_item = arguments.get("output_item", "all")

        if not input_value:
            return {"status": "error", "error": "input_value parameter is required"}

        # Encode reserved characters; '/' stays literal so it is sent as before.
        encoded_value = self._encode_segment(input_value, safe="/")
        return self._make_request(f"refmet/{input_item}/{encoded_value}/{output_item}")

    def _search_moverz(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search by m/z value. Requires database as first URL path segment.

        Reads ``mz_value`` (required), ``adduct`` (default ``"M+H"``),
        ``tolerance`` (default 0.1) and ``database`` (default ``"MB"``).
        """
        mz_value = arguments.get("mz_value")
        adduct = arguments.get("adduct", "M+H")
        tolerance = arguments.get("tolerance", 0.1)
        database = arguments.get("database", "MB")  # MB, LIPIDS, or REFMET

        if mz_value is None:
            return {"status": "error", "error": "mz_value parameter is required"}

        # URL-encode adduct: '+' in 'M+H' must be %2B or the server drops the connection
        encoded_adduct = self._encode_segment(adduct)
        return self._make_request(
            f"moverz/{database}/{mz_value}/{encoded_adduct}/{tolerance}"
        )

    def _search_exactmass(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search by exact mass using moverz endpoint with neutral adduct.

        Reads ``mass_value`` (required) and ``tolerance`` (default 0.1).
        """
        mass_value = arguments.get("mass_value")
        tolerance = arguments.get("tolerance", 0.1)

        if mass_value is None:
            return {"status": "error", "error": "mass_value parameter is required"}

        # exactmass endpoint is non-functional; use moverz/REFMET with neutral adduct M
        return self._make_request(f"moverz/REFMET/{mass_value}/M/{tolerance}")

    def _query_metstat(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Discover studies by phenotype via the METSTAT context.

        Builds the 8-slot semicolon-delimited filter path. Every slot is
        optional; empty slots act as wildcards. At least one filter must be
        provided so the query is not fully unconstrained.
        """
        slots = [str(arguments.get(name) or "").strip() for name in self._METSTAT_SLOTS]
        if not any(slots):
            return {
                "status": "error",
                "error": (
                    "At least one filter is required for METSTAT. Provide one or more of: "
                    + ", ".join(self._METSTAT_SLOTS)
                ),
            }

        # URL-encode each slot value (e.g. spaces) but keep the ';' separators literal.
        encoded = ";".join(self._encode_segment(slot) for slot in slots)
        result = self._make_request(f"metstat/{encoded}")
        return self._rows_to_list(result)

    def _query_gene(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Look up a Metabolomics Workbench gene (MGP) record.

        The lookup namespace is ``arguments["id_type"]`` when given, otherwise
        the configured ``fields.input_item`` (default ``"gene_symbol"``).
        """
        input_item = arguments.get("id_type") or self._fields().get(
            "input_item", "gene_symbol"
        )
        input_value = arguments.get("input_value", "")

        if not input_value:
            return {
                "status": "error",
                "error": "input_value parameter is required (gene symbol, gene_id, or mgp_id)",
            }

        encoded = self._encode_segment(input_value)
        result = self._make_request(f"gene/{input_item}/{encoded}/all")
        return self._rows_to_list(result)

    def _query_protein(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Look up a Metabolomics Workbench protein (MGP) record.

        The lookup namespace is ``arguments["id_type"]`` when given, otherwise
        the configured ``fields.input_item`` (default ``"uniprot_id"``).
        """
        input_item = arguments.get("id_type") or self._fields().get(
            "input_item", "uniprot_id"
        )
        input_value = arguments.get("input_value", "")

        if not input_value:
            return {
                "status": "error",
                "error": "input_value parameter is required (uniprot_id, gene_symbol, mgp_id, or refseq_id)",
            }

        encoded = self._encode_segment(input_value)
        result = self._make_request(f"protein/{input_item}/{encoded}/all")
        return self._rows_to_list(result)

    def _query_gene_protein(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Combined gene/protein MGP lookup; routes on the 'entity' argument.

        entity='gene' (default) queries the gene endpoint; entity='protein'
        queries the protein endpoint. id_type selects the lookup namespace.
        """
        entity = str(arguments.get("entity") or "gene").strip().lower()
        if entity == "protein":
            return self._query_protein(arguments)
        if entity == "gene":
            return self._query_gene(arguments)
        return {
            "status": "error",
            "error": "entity must be 'gene' or 'protein'",
        }

    @staticmethod
    def _rows_to_list(result: Dict[str, Any]) -> Dict[str, Any]:
        """Flatten the Workbench 'Row1','Row2',... dict into a list under data.

        Multi-result Workbench endpoints return {"Row1": {...}, "Row2": {...}}.
        Single-result endpoints return a bare object. Normalize both to a list
        so consuming agents get a consistent shape. Non-success results and
        non-dict data are returned unchanged; the input dict is never mutated.
        """
        if result.get("status") != "success":
            return result

        data = result.get("data")
        if not isinstance(data, dict):
            return result

        rows = [value for key, value in data.items() if key.lower().startswith("row")]
        if not rows:
            # No RowN keys: the dict itself is the single record.
            rows = [data]

        flattened = dict(result)
        flattened["data"] = rows
        flattened["count"] = len(rows)
        return flattened