# Source code for tooluniverse.arrayexpress_tool

"""
ArrayExpress Database Tool (Original Source)

This tool provides access to the ORIGINAL ArrayExpress database for functional
genomics experiments including microarray and RNA-seq data.

ArrayExpress is the authoritative source for functional genomics data. While the
underlying infrastructure has migrated to BioStudies, this tool specifically
accesses the ArrayExpress collection, maintaining the original ArrayExpress
interface and data structure.
"""

from typing import Any, Dict, Iterator, Optional
from urllib.parse import quote

import requests

from .base_tool import BaseTool
from .tool_registry import register_tool


@register_tool("ArrayExpressRESTTool")
class ArrayExpressRESTTool(BaseTool):
    """
    ArrayExpress REST API tool - Original ArrayExpress Database.

    Accesses the official ArrayExpress functional genomics database.
    ArrayExpress is the authoritative source for gene expression data,
    microarray experiments, and RNA-seq studies from EBI.

    The database infrastructure uses the BioStudies backend, but this tool
    specifically queries the ArrayExpress collection to maintain the original
    data source.

    The concrete operation performed by an instance is selected by the
    ``"name"`` entry of ``tool_config``:

    * ``arrayexpress_search_experiments`` - free-text search in the collection
    * ``arrayexpress_get_experiment`` - raw study record for one accession
    * ``arrayexpress_get_experiment_files`` - flattened file list of a study
    * ``arrayexpress_get_experiment_samples`` - sample attributes of a study

    Any other name is sent to the search endpoint without parameters and the
    JSON payload is returned unchanged.
    """

    # Name of the tool that maps onto the ``/search`` endpoint.
    _SEARCH_TOOL = "arrayexpress_search_experiments"

    # Tools that address a single study and therefore need ``experiment_id``.
    _STUDY_TOOLS = frozenset(
        {
            "arrayexpress_get_experiment",
            "arrayexpress_get_experiment_files",
            "arrayexpress_get_experiment_samples",
        }
    )

    # Paging bounds applied to the caller-supplied ``limit``.
    _DEFAULT_PAGE_SIZE = 10
    _MAX_PAGE_SIZE = 100

    def __init__(self, tool_config: Dict):
        """Store the tool configuration and prepare a reusable HTTP session.

        Args:
            tool_config: Tool definition; its ``"name"`` entry selects the
                operation performed by :meth:`run`.
        """
        super().__init__(tool_config)
        # ArrayExpress collection via BioStudies API
        # This IS the original ArrayExpress - just modern infrastructure
        self.base_url = "https://www.ebi.ac.uk/biostudies/api/v1"
        self.collection = "arrayexpress"  # Original ArrayExpress data
        self.session = requests.Session()
        self.session.headers.update(
            {
                "Accept": "application/json",
                "User-Agent": "ToolUniverse/ArrayExpress/1.0",
            }
        )
        self.timeout = 30

    # ------------------------------------------------------------------
    # Small private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _clean_experiment_id(args: Dict[str, Any]) -> str:
        """Return ``args["experiment_id"]`` as a stripped string ('' if absent)."""
        value = args.get("experiment_id")
        return "" if value is None else str(value).strip()

    @staticmethod
    def _coerce_int(value: Any, default: int) -> int:
        """Convert *value* to ``int``, falling back to *default* if impossible."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return default

    @staticmethod
    def _iter_dicts(items: Any) -> Iterator[Dict[str, Any]]:
        """Yield every dict found in *items*, looking one list level deep.

        *items* may be a list of dicts, a list of lists of dicts, or a mix of
        both. Anything that is not a list yields nothing; non-dict leaves are
        skipped.
        """
        if not isinstance(items, list):
            return
        for entry in items:
            group = entry if isinstance(entry, list) else [entry]
            for candidate in group:
                if isinstance(candidate, dict):
                    yield candidate

    # ------------------------------------------------------------------
    # Request construction
    # ------------------------------------------------------------------
    def _build_url(self, args: Dict[str, Any]) -> str:
        """Build the endpoint URL for the configured tool (BioStudies API).

        Study tools resolve to ``<base_url>/studies/<experiment_id>`` when an
        ``experiment_id`` is supplied. Everything else, including a study tool
        called without an id, resolves to ``<base_url>/search``.

        Args:
            args: Tool arguments; only ``experiment_id`` is consulted.

        Returns:
            The absolute URL to request.
        """
        tool_name = self.tool_config.get("name", "")
        if tool_name in self._STUDY_TOOLS:
            experiment_id = self._clean_experiment_id(args)
            if experiment_id:
                # Percent-encode so characters such as "/" or "?" in the
                # accession cannot change the request path or add a query.
                return f"{self.base_url}/studies/{quote(experiment_id, safe='')}"
        return f"{self.base_url}/search"

    def _build_params(self, args: Dict[str, Any]) -> Dict[str, Any]:
        """Build query parameters for the BioStudies API.

        Only the search tool produces parameters; every other tool gets an
        empty dict.

        Args:
            args: Tool arguments. Recognised keys are ``keywords``,
                ``species``, ``array`` (joined with spaces into ``query``),
                ``limit`` (default 10, clamped to 1..100) and ``offset``
                (default 0, negative values treated as 0).

        Returns:
            Dict with ``query``, ``collection``, ``pageSize`` and ``page`` for
            the search tool, otherwise ``{}``.
        """
        params: Dict[str, Any] = {}
        if self.tool_config.get("name", "") != self._SEARCH_TOOL:
            return params

        # Build search query for BioStudies; ignore missing/None/blank terms
        # so they can neither break the join nor produce an empty query.
        query_parts = []
        for key in ("keywords", "species", "array"):
            value = args.get(key)
            if value is None:
                continue
            term = str(value).strip()
            if term:
                query_parts.append(term)
        params["query"] = " ".join(query_parts) if query_parts else "*"

        # CRITICAL: Always filter to the ArrayExpress collection so that only
        # ArrayExpress studies are returned, not the broader BioStudies
        # repository.
        params["collection"] = self.collection

        # Map limit to pageSize. The lower clamp keeps the page computation
        # below from dividing by zero.
        limit = self._coerce_int(args.get("limit"), self._DEFAULT_PAGE_SIZE)
        page_size = max(1, min(limit, self._MAX_PAGE_SIZE))
        params["pageSize"] = page_size

        # Map offset to page number (BioStudies uses 1-based page numbers).
        # An offset that is not a multiple of pageSize is rounded down to the
        # start of its page.
        offset = max(0, self._coerce_int(args.get("offset"), 0))
        params["page"] = (offset // page_size) + 1

        return params

    # ------------------------------------------------------------------
    # Execution
    # ------------------------------------------------------------------
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the BioStudies API call for ArrayExpress data.

        Args:
            arguments: Tool arguments (see :meth:`_build_url` and
                :meth:`_build_params` for the recognised keys).

        Returns:
            A dict with ``"status"`` set to ``"success"`` or ``"error"``.
            Successful results carry ``"data"`` and ``"url"`` (plus
            ``"count"`` for search, files and samples); errors carry
            ``"error"`` and ``"url"``. No exception is propagated to the
            caller.
        """
        arguments = arguments or {}
        url = None  # known up front so the error handlers can report it
        try:
            tool_name = self.tool_config.get("name", "")

            # A study tool without an accession would otherwise fall back to
            # an unfiltered search and report it as a success.
            if tool_name in self._STUDY_TOOLS and not self._clean_experiment_id(
                arguments
            ):
                return {
                    "status": "error",
                    "error": "Missing required parameter: experiment_id",
                    "url": None,
                }

            url = self._build_url(arguments)
            params = self._build_params(arguments)

            response = self.session.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()

            # Check content type
            content_type = response.headers.get("content-type", "")
            if "json" not in content_type.lower():
                return {
                    "status": "error",
                    "error": f"API returned non-JSON content: {content_type}",
                    "url": response.url,
                }

            data = response.json()
            # The transforming branches below expect a JSON object; anything
            # else is treated as an empty one.
            payload = data if isinstance(data, dict) else {}

            # Transform BioStudies response to match expected ArrayExpress format
            if tool_name == self._SEARCH_TOOL:
                hits = payload.get("hits", [])
                if not isinstance(hits, list):
                    hits = []
                return {
                    "status": "success",
                    "data": {
                        "experiments": hits,
                        "totalHits": payload.get("totalHits", 0),
                        "page": payload.get("page", 1),
                        "pageSize": payload.get("pageSize", len(hits)),
                    },
                    "count": len(hits),
                    "url": response.url,
                }

            if tool_name == "arrayexpress_get_experiment_files":
                section = payload.get("section")
                files = (
                    self._extract_files_from_section(section)
                    if isinstance(section, dict)
                    else []
                )
                return {
                    "status": "success",
                    "data": files,
                    "count": len(files),
                    "url": response.url,
                }

            if tool_name == "arrayexpress_get_experiment_samples":
                section = payload.get("section")
                samples = (
                    self._extract_samples_from_section(section)
                    if isinstance(section, dict)
                    else []
                )
                return {
                    "status": "success",
                    "data": samples,
                    "count": len(samples),
                    "url": response.url,
                }

            # arrayexpress_get_experiment and any unrecognised tool name:
            # hand back the JSON payload unchanged.
            return {
                "status": "success",
                "data": data,
                "url": response.url,
            }

        except requests.exceptions.RequestException as e:
            return {
                "status": "error",
                "error": f"BioStudies API error: {str(e)}",
                "url": url,
            }
        except Exception as e:
            # Top-level boundary: the tool contract is to return an error
            # dict rather than raise.
            return {
                "status": "error",
                "error": f"Unexpected error: {str(e)}",
                "url": url,
            }

    # ------------------------------------------------------------------
    # Response post-processing
    # ------------------------------------------------------------------
    def _extract_files_from_section(self, section: Dict[str, Any]) -> list:
        """Extract files from a BioStudies section, recursing into subsections.

        Args:
            section: A section object from a study record.

        Returns:
            A list of ``{"name", "size", "type"}`` dicts, where ``name`` is
            the file's ``path`` if present, else its ``name``. The current
            section's files come first, followed by those of each subsection
            in order.
        """
        if not isinstance(section, dict):
            return []

        # NOTE(review): file entries are flattened the same way as
        # subsections, on the assumption that BioStudies may group files into
        # nested lists too - confirm against the API.
        files = [
            {
                "name": file_obj.get("path", file_obj.get("name", "")),
                "size": file_obj.get("size", 0),
                "type": file_obj.get("type", ""),
            }
            for file_obj in self._iter_dicts(section.get("files"))
        ]

        # BioStudies subsections can be a list of dicts or a list of lists.
        for subsection in self._iter_dicts(section.get("subsections")):
            files.extend(self._extract_files_from_section(subsection))

        return files

    def _extract_samples_from_section(self, section: Dict[str, Any]) -> list:
        """Extract sample information from a BioStudies section.

        A section counts as a sample when its ``type`` equals ``"sample"`` or
        ``"samples"`` (case-insensitive). Its ``attributes`` list is then
        folded into one ``{name: value}`` dict. Subsections are searched
        recursively.

        Args:
            section: A section object from a study record.

        Returns:
            A list with one attribute dict per sample section found; sample
            sections without any attributes contribute nothing.
        """
        if not isinstance(section, dict):
            return []

        samples = []

        # ``type`` may be absent, None or not a string; normalise before
        # lower-casing.
        section_type = str(section.get("type") or "").lower()
        if section_type in ("samples", "sample"):
            attributes = section.get("attributes")
            if isinstance(attributes, list):
                sample_data = {
                    attr.get("name", ""): attr.get("value", "")
                    for attr in attributes
                    if isinstance(attr, dict)
                }
                if sample_data:
                    samples.append(sample_data)

        # BioStudies subsections can be a list of dicts or a list of lists.
        for subsection in self._iter_dicts(section.get("subsections")):
            samples.extend(self._extract_samples_from_section(subsection))

        return samples