# Source code for tooluniverse.archs4_tool

"""
ARCHS4 Tool - All RNA-seq and ChIP-seq Sample and Signature Search

Provides access to ARCHS4 APIs for querying pre-computed gene expression
across tissues and cell lines, and gene co-expression correlations from
300K+ uniformly processed RNA-seq samples.

API base: https://maayanlab.cloud/archs4/
Correlation API: https://maayanlab.cloud/matrixapi/
No authentication required.

Reference: Lachmann et al., Nature Communications 2018
"""

import csv
import io
import requests
from typing import Dict, Any

from .base_tool import BaseTool
from .tool_registry import register_tool


# Root of the ARCHS4 web API; the expression lookup endpoint
# (search/loadExpressionTissue.php) is resolved relative to this URL.
ARCHS4_BASE_URL = "https://maayanlab.cloud/archs4"
# Root of the matrix (correlation) API; the "coltop" endpoint used for
# co-expression queries is resolved relative to this URL.
MATRIX_API_URL = "https://maayanlab.cloud/matrixapi"


@register_tool("ARCHS4Tool")
class ARCHS4Tool(BaseTool):
    """
    Tool for querying the ARCHS4 gene expression database.

    ARCHS4 provides uniformly processed RNA-seq data from GEO, covering
    300K+ human and mouse samples with pre-computed expression levels
    and gene co-expression correlations.

    Supported operations:
    - get_gene_expression: Get expression across tissues/cell lines for a gene
    - get_gene_correlations: Get co-expressed genes (Pearson correlation)

    Every operation returns a dict whose ``status`` key is either
    ``"success"`` (with ``data`` and ``metadata``) or ``"error"`` (with a
    human-readable ``error`` message). Errors are reported through the
    return value rather than raised.
    """

    # Default and bounds applied to the ``count`` argument of
    # ``get_gene_correlations``.
    _DEFAULT_CORRELATION_COUNT = 20
    _MIN_CORRELATION_COUNT = 2
    _MAX_CORRELATION_COUNT = 200

    def __init__(self, tool_config: Dict[str, Any]):
        """Store the tool configuration and create a reusable HTTP session.

        Args:
            tool_config: Tool configuration dict; its optional
                ``"parameter"`` entry (and that entry's ``"required"``
                list) are cached on the instance.
        """
        super().__init__(tool_config)
        self.parameter = tool_config.get("parameter", {})
        self.required = self.parameter.get("required", [])
        self.session = requests.Session()
        # Per-request timeout in seconds, passed to every HTTP call.
        self.timeout = 30

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Dispatch to the handler for the requested operation.

        The operation name is taken from ``arguments["operation"]``, then
        from the tool config's ``fields.operation``, then from
        ``get_schema_const_operation()``.

        Args:
            arguments: Call arguments, including the operation-specific
                parameters.

        Returns:
            The handler's result dict, or an error dict when the operation
            is missing/unknown or the handler raised.
        """
        operation = (
            arguments.get("operation")
            or self.tool_config.get("fields", {}).get("operation")
            or self.get_schema_const_operation()
        )
        if not operation:
            return {"status": "error", "error": "Missing required parameter: operation"}

        handlers = {
            "get_gene_expression": self._get_gene_expression,
            "get_gene_correlations": self._get_gene_correlations,
        }
        handler = handlers.get(operation)
        if not handler:
            return {
                "status": "error",
                "error": "Unknown operation: {}. Available: {}".format(
                    operation, list(handlers.keys())
                ),
            }

        # Handlers let exceptions propagate; they are converted to error
        # dicts here so callers always receive a structured response.
        try:
            return handler(arguments)
        except requests.exceptions.Timeout:
            return {"status": "error", "error": "ARCHS4 API request timed out"}
        except requests.exceptions.ConnectionError:
            return {"status": "error", "error": "Failed to connect to ARCHS4 API"}
        except Exception as e:
            return {"status": "error", "error": "ARCHS4 error: {}".format(str(e))}

    @staticmethod
    def _parse_float(value: str):
        """Return ``float(value)``, or ``None`` when *value* is empty.

        A non-empty value that is not a valid number still raises
        ``ValueError`` so malformed responses are not silently accepted.
        """
        return float(value) if value else None

    def _get_gene_expression(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get gene expression across tissues or cell lines from ARCHS4.

        Args:
            arguments: Must contain ``gene`` (or ``search``). Optional
                ``species`` (default ``"human"``) and ``type``
                (default ``"tissue"``) are forwarded to the API unchanged.

        Returns:
            On success, ``data`` is a list of per-tissue dicts with
            ``tissue``, ``min``, ``q1``, ``median``, ``q3`` and ``max``,
            sorted by median descending. Otherwise an error dict.
        """
        gene = arguments.get("gene") or arguments.get("search")
        if not gene:
            return {"status": "error", "error": "Missing required parameter: gene"}

        species = arguments.get("species", "human")
        expression_type = arguments.get("type", "tissue")

        params = {
            "search": gene,
            "species": species,
            "type": expression_type,
        }
        url = "{}/search/loadExpressionTissue.php".format(ARCHS4_BASE_URL)
        response = self.session.get(url, params=params, timeout=self.timeout)

        if response.status_code != 200:
            return {
                "status": "error",
                "error": "ARCHS4 expression API returned HTTP {}".format(
                    response.status_code
                ),
            }

        text = response.text.strip()
        if not text:
            return {
                "status": "error",
                "error": "No expression data found for gene: {}".format(gene),
            }

        # The response is parsed as CSV; the first row is a header and is
        # discarded.
        reader = csv.reader(io.StringIO(text))
        header = next(reader, None)
        if not header:
            return {"status": "error", "error": "Empty response from ARCHS4"}

        tissues = []
        for row in reader:
            if len(row) < 6:
                continue
            tissue_id, min_val, q1, median, q3, max_val = row[:6]

            # Rows without a median are category headers, not measurements.
            if not median:
                continue

            tissues.append(
                {
                    "tissue": tissue_id,
                    "min": self._parse_float(min_val),
                    "q1": self._parse_float(q1),
                    "median": self._parse_float(median),
                    "q3": self._parse_float(q3),
                    "max": self._parse_float(max_val),
                }
            )

        # Highest median expression first.
        tissues.sort(key=lambda entry: entry.get("median") or 0, reverse=True)

        return {
            "status": "success",
            "data": tissues,
            "metadata": {
                "gene": gene,
                "species": species,
                "type": expression_type,
                "tissue_count": len(tissues),
                "unit": "log2(TPM+1)",
            },
        }

    def _get_gene_correlations(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get co-expressed genes for a query gene from ARCHS4.

        Args:
            arguments: Must contain ``gene`` (or ``id``). Optional ``count``
                (default 20) is coerced to an integer and clamped to the
                range 2..200.

        Returns:
            On success, ``data`` is a list of at most ``count`` dicts with
            ``gene``, ``pearson_correlation`` (rounded to 6 decimals) and a
            1-based ``rank``, excluding the query gene itself. Otherwise an
            error dict.
        """
        gene = arguments.get("gene") or arguments.get("id")
        if not gene:
            return {"status": "error", "error": "Missing required parameter: gene"}

        raw_count = arguments.get("count")
        if raw_count is None:
            raw_count = self._DEFAULT_CORRELATION_COUNT
        # Accept numeric strings and floats; reject anything non-numeric
        # explicitly instead of failing on the comparison below.
        try:
            count = int(raw_count)
        except (TypeError, ValueError, OverflowError):
            return {
                "status": "error",
                "error": "Invalid parameter: count must be an integer",
            }
        count = max(
            self._MIN_CORRELATION_COUNT, min(count, self._MAX_CORRELATION_COUNT)
        )

        # Use the matrixapi POST endpoint
        url = "{}/coltop".format(MATRIX_API_URL)
        payload = {"id": gene, "count": count + 1}  # +1 because first result is self

        response = self.session.post(url, json=payload, timeout=self.timeout)

        if response.status_code != 200:
            return {
                "status": "error",
                "error": "ARCHS4 correlation API returned HTTP {}".format(
                    response.status_code
                ),
            }

        data = response.json()
        if not isinstance(data, dict):
            return {
                "status": "error",
                "error": "Unexpected response format from ARCHS4 correlation API",
            }
        gene_ids = data.get("rowids", [])
        values = data.get("values", [])

        query_upper = str(gene).upper()
        correlations = []
        for gid, val in zip(gene_ids, values):
            # One extra row was requested to absorb the self-hit; if the
            # query gene is absent from the result, stop at ``count`` so the
            # caller never gets more rows than it asked for.
            if len(correlations) >= count:
                break
            # Skip self-correlation
            if str(gid).upper() == query_upper:
                continue
            correlations.append(
                {
                    "gene": gid,
                    "pearson_correlation": round(val, 6),
                    "rank": len(correlations) + 1,
                }
            )

        return {
            "status": "success",
            "data": correlations,
            "metadata": {
                "query_gene": gene,
                "returned": len(correlations),
                "note": "Pearson correlation computed across 300K+ uniformly processed RNA-seq samples",
            },
        }