# Source code for tooluniverse.arrayexpress_tool
"""
ArrayExpress Database Tool (Original Source)
This tool provides access to the ORIGINAL ArrayExpress database for functional
genomics experiments including microarray and RNA-seq data.
ArrayExpress is the authoritative source for functional genomics data. While the
underlying infrastructure has migrated to BioStudies, this tool specifically
accesses the ArrayExpress collection, maintaining the original ArrayExpress
interface and data structure.
"""
from typing import Any, Dict, Optional
from urllib.parse import quote

import requests

from .base_tool import BaseTool
from .tool_registry import register_tool
@register_tool("ArrayExpressRESTTool")
class ArrayExpressRESTTool(BaseTool):
    """
    ArrayExpress REST API tool - Original ArrayExpress Database.

    Queries the ArrayExpress functional genomics collection (gene expression,
    microarray and RNA-seq studies) through the BioStudies REST API, which
    hosts the ArrayExpress data.

    The operation performed by :meth:`run` is selected by the ``name`` entry
    of the tool configuration:

    * ``arrayexpress_search_experiments`` - search the collection.
    * ``arrayexpress_get_experiment`` - return one study as delivered by the API.
    * ``arrayexpress_get_experiment_files`` - list the files of one study.
    * ``arrayexpress_get_experiment_samples`` - list the samples of one study.

    Any other name results in an unfiltered call to the search endpoint whose
    JSON payload is returned unchanged.
    """

    _SEARCH_TOOL = "arrayexpress_search_experiments"
    _EXPERIMENT_TOOL = "arrayexpress_get_experiment"
    _FILES_TOOL = "arrayexpress_get_experiment_files"
    _SAMPLES_TOOL = "arrayexpress_get_experiment_samples"
    # Tools that address a single study and therefore need ``experiment_id``.
    _STUDY_TOOLS = frozenset({_EXPERIMENT_TOOL, _FILES_TOOL, _SAMPLES_TOOL})

    _DEFAULT_PAGE_SIZE = 10
    _MAX_PAGE_SIZE = 100

    def __init__(self, tool_config: Dict):
        """Store the configuration and prepare a reusable HTTP session.

        Args:
            tool_config: Tool configuration, forwarded to ``BaseTool``. Its
                ``name`` entry is read later (via ``self.tool_config``) to
                select the operation.
        """
        super().__init__(tool_config)
        # ArrayExpress collection via BioStudies API
        # This IS the original ArrayExpress - just modern infrastructure
        self.base_url = "https://www.ebi.ac.uk/biostudies/api/v1"
        self.collection = "arrayexpress"  # Original ArrayExpress data
        self.session = requests.Session()
        self.session.headers.update(
            {
                "Accept": "application/json",
                "User-Agent": "ToolUniverse/ArrayExpress/1.0",
            }
        )
        self.timeout = 30  # passed as ``timeout`` to every request

    @staticmethod
    def _coerce_int(value: Any, default: int) -> int:
        """Return ``value`` as an int, or ``default`` if it cannot be converted."""
        if value is None:
            return default
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return default

    @staticmethod
    def _experiment_id(args: Dict[str, Any]) -> str:
        """Return the trimmed ``experiment_id`` from ``args`` ("" when absent)."""
        return str(args.get("experiment_id") or "").strip()

    @staticmethod
    def _iter_subsections(section: Dict[str, Any]):
        """Yield every dict found in ``section["subsections"]``.

        BioStudies subsections can be a list of dicts or a list of lists of
        dicts; both layouts are flattened one level. Non-dict members are
        skipped.
        """
        subsections = section.get("subsections")
        if not isinstance(subsections, list):
            return
        for group in subsections:
            members = group if isinstance(group, list) else [group]
            for member in members:
                if isinstance(member, dict):
                    yield member

    def _build_url(self, args: Dict[str, Any]) -> str:
        """Build the endpoint URL for the configured tool (BioStudies API).

        Args:
            args: Call arguments; ``experiment_id`` is read for study tools.

        Returns:
            ``<base_url>/studies/<experiment_id>`` for a study tool with a
            non-empty ``experiment_id``, otherwise ``<base_url>/search``.
        """
        tool_name = self.tool_config.get("name", "")
        if tool_name in self._STUDY_TOOLS:
            experiment_id = self._experiment_id(args)
            if experiment_id:
                # Escape the identifier so characters such as "/", "?" or "#"
                # cannot change the path or query of the request.
                encoded_id = quote(experiment_id, safe="")
                return f"{self.base_url}/studies/{encoded_id}"
        return f"{self.base_url}/search"

    def _build_params(self, args: Dict[str, Any]) -> Dict[str, Any]:
        """Build query parameters for the BioStudies API.

        Only the search tool sends parameters; every other tool gets ``{}``.

        Args:
            args: Call arguments. Recognised keys: ``keywords``, ``species``,
                ``array`` (joined with spaces into the query), ``limit`` and
                ``offset``.

        Returns:
            Dict with ``query``, ``collection``, ``pageSize`` and ``page`` for
            the search tool, otherwise an empty dict.
        """
        params: Dict[str, Any] = {}
        if self.tool_config.get("name", "") != self._SEARCH_TOOL:
            return params

        # Skip missing, None and blank filters so they neither break the join
        # nor suppress the "match everything" default.
        query_parts = []
        for key in ("keywords", "species", "array"):
            value = args.get(key)
            if value is None:
                continue
            text = str(value).strip()
            if text:
                query_parts.append(text)
        params["query"] = " ".join(query_parts) if query_parts else "*"

        # CRITICAL: Always filter to the ArrayExpress collection so that the
        # broader BioStudies repository is not searched.
        params["collection"] = self.collection

        # Map limit to pageSize, clamped to 1..100 so the division below can
        # never hit zero.
        limit = self._coerce_int(args.get("limit"), self._DEFAULT_PAGE_SIZE)
        page_size = max(1, min(limit, self._MAX_PAGE_SIZE))
        params["pageSize"] = page_size

        # Map offset to page number (BioStudies uses 1-based page numbers).
        # An offset that is not a multiple of the page size is rounded down
        # to the start of the page that contains it.
        offset = max(0, self._coerce_int(args.get("offset"), 0))
        params["page"] = (offset // page_size) + 1
        return params

    def _shape_response(self, tool_name: str, data: Any, url: str) -> Dict[str, Any]:
        """Convert a decoded API payload into this tool's result dict.

        Args:
            tool_name: Name of the configured tool.
            data: Decoded JSON payload.
            url: Final request URL, echoed back in the result.

        Returns:
            Dict with ``status`` ``"success"``, ``data`` and ``url``; list
            results additionally carry ``count``.
        """
        payload = data if isinstance(data, dict) else {}

        if tool_name == self._SEARCH_TOOL:
            hits = payload.get("hits") or []
            return {
                "status": "success",
                "data": {
                    "experiments": hits,
                    "totalHits": payload.get("totalHits", 0),
                    "page": payload.get("page", 1),
                    "pageSize": payload.get("pageSize", len(hits)),
                },
                "count": len(hits),
                "url": url,
            }

        if tool_name in (self._FILES_TOOL, self._SAMPLES_TOOL):
            section = payload.get("section")
            items: list = []
            if isinstance(section, dict):
                if tool_name == self._FILES_TOOL:
                    items = self._extract_files_from_section(section)
                else:
                    items = self._extract_samples_from_section(section)
            return {
                "status": "success",
                "data": items,
                "count": len(items),
                "url": url,
            }

        # Single-experiment lookup and unknown tool names: pass the payload through.
        return {"status": "success", "data": data, "url": url}

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the BioStudies API call for ArrayExpress data.

        Args:
            arguments: Call arguments (see :meth:`_build_url` and
                :meth:`_build_params`). ``None`` is treated as no arguments.

        Returns:
            On success a dict with ``status`` ``"success"``, ``data``, ``url``
            and, for list results, ``count``. On failure a dict with
            ``status`` ``"error"``, ``error`` and ``url``. Errors are
            reported in the result rather than raised.
        """
        arguments = arguments or {}
        tool_name = self.tool_config.get("name", "")
        url: Optional[str] = None
        try:
            # Without an identifier the URL builder falls back to the search
            # endpoint, which would report unrelated data as a success.
            if tool_name in self._STUDY_TOOLS and not self._experiment_id(arguments):
                return {
                    "status": "error",
                    "error": "Missing required parameter: experiment_id",
                    "url": None,
                }

            url = self._build_url(arguments)
            params = self._build_params(arguments)
            response = self.session.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()

            # Check content type
            content_type = response.headers.get("content-type", "")
            if "json" not in content_type.lower():
                return {
                    "status": "error",
                    "error": f"API returned non-JSON content: {content_type}",
                    "url": response.url,
                }

            return self._shape_response(tool_name, response.json(), response.url)
        except requests.exceptions.RequestException as e:
            return {
                "status": "error",
                "error": f"BioStudies API error: {str(e)}",
                "url": url,
            }
        except Exception as e:
            # Tool boundary: callers receive an error result, not an exception.
            return {
                "status": "error",
                "error": f"Unexpected error: {str(e)}",
                "url": url,
            }

    def _extract_files_from_section(self, section: Dict[str, Any]) -> list:
        """Extract files from a BioStudies section and all nested subsections.

        Args:
            section: A section dict as found in a study payload.

        Returns:
            List of ``{"name", "size", "type"}`` dicts; ``name`` is the file's
            ``path`` when present, else its ``name``. Empty when ``section``
            is not a dict.
        """
        if not isinstance(section, dict):
            return []

        files = []
        # Add files from current section
        entries = section.get("files")
        if isinstance(entries, list):
            for file_obj in entries:
                if isinstance(file_obj, dict):
                    files.append(
                        {
                            "name": file_obj.get("path", file_obj.get("name", "")),
                            "size": file_obj.get("size", 0),
                            "type": file_obj.get("type", ""),
                        }
                    )

        # Recursively extract from subsections
        for subsection in self._iter_subsections(section):
            files.extend(self._extract_files_from_section(subsection))
        return files

    def _extract_samples_from_section(self, section: Dict[str, Any]) -> list:
        """Extract sample information from a BioStudies section tree.

        A section counts as a sample when its ``type`` is ``"sample"`` or
        ``"samples"`` (case-insensitive); its ``attributes`` list is then
        collapsed into one ``{name: value}`` dict.

        Args:
            section: A section dict as found in a study payload.

        Returns:
            List of attribute dicts, one per sample section that has at least
            one attribute. Empty when ``section`` is not a dict.
        """
        if not isinstance(section, dict):
            return []

        samples = []
        # ``type`` may be missing or null; normalise before lower-casing.
        section_type = str(section.get("type") or "").lower()
        if section_type in ("samples", "sample"):
            attributes = section.get("attributes")
            if isinstance(attributes, list):
                sample_data = {
                    attr.get("name", ""): attr.get("value", "")
                    for attr in attributes
                    if isinstance(attr, dict)
                }
                if sample_data:
                    samples.append(sample_data)

        # Look for subsections that might contain sample data
        for subsection in self._iter_subsections(section):
            samples.extend(self._extract_samples_from_section(subsection))
        return samples