Source code for tooluniverse.chipatlas_tool
"""
ChIP-Atlas API Tool
This tool provides access to ChIP-Atlas, a data-mining suite for exploring
epigenomic landscapes with 433,000+ ChIP-seq, ATAC-seq, and Bisulfite-seq experiments.
"""
import requests
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool
# Root of the ChIP-Atlas web site. Used in this module for the
# enrichment-analysis and search page links and for the exp_metadata.json lookup.
CHIPATLAS_BASE_URL = "https://chip-atlas.org"
# Root of the static data tree. Used in this module for the metadata tables,
# per-experiment peak/signal files, and the colocalization / target-gene TSVs.
CHIPATLAS_DATA_URL = "https://chip-atlas.dbcls.jp/data"
[docs]
@register_tool("ChIPAtlasTool")
class ChIPAtlasTool(BaseTool):
    """
    ChIP-Atlas API tool for accessing chromatin data.

    ``run`` dispatches on the ``operation`` argument to one of:
    ``enrichment_analysis``, ``get_experiment_list``, ``get_peak_data``,
    ``search_datasets``, ``get_colocalization``, ``get_target_genes`` and
    ``get_experiment_metadata``.

    Every operation returns a dict of the form
    ``{"status": "success" | "error", "data": {...}}``; failures are reported
    through that dict rather than raised.
    """
[docs]
def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Route a request to the handler registered for its ``operation``.

    The operation name is read from ``arguments["operation"]``. When that key
    is absent, the default declared under
    ``tool_config["parameter"]["properties"]["operation"]["default"]`` is
    used, falling back to ``"enrichment_analysis"`` when there is no usable
    tool config.

    Args:
        arguments: Request parameters; passed unchanged to the handler.

    Returns:
        The handler's result, or ``{"status": "error", "data": {"error": ...}}``
        for an unknown operation or any exception raised while dispatching.
    """
    # (operation name, handler attribute). Kept as an ordered sequence and
    # compared with ``==`` so that odd (e.g. unhashable) operation values still
    # end up in the "Unknown operation" branch instead of raising.
    routes = (
        ("enrichment_analysis", "_enrichment_analysis"),
        ("get_experiment_list", "_get_experiment_list"),
        ("get_peak_data", "_get_peak_data"),
        ("search_datasets", "_search_datasets"),
        ("get_colocalization", "_get_colocalization"),
        ("get_target_genes", "_get_target_genes"),
        ("get_experiment_metadata", "_get_experiment_metadata"),
    )
    try:
        fallback_op = "enrichment_analysis"
        if hasattr(self, "tool_config") and self.tool_config:
            schema = self.tool_config.get("parameter", {})
            operation_spec = schema.get("properties", {}).get("operation", {})
            fallback_op = operation_spec.get("default", "enrichment_analysis")

        requested = arguments.get("operation", fallback_op)
        for op_name, handler_name in routes:
            if requested == op_name:
                # Handler is looked up only once the operation has matched.
                return getattr(self, handler_name)(arguments)

        return {
            "status": "error",
            "data": {"error": f"Unknown operation: {requested}"},
        }
    except Exception as exc:
        return {"status": "error", "data": {"error": str(exc)}}
[docs]
def _enrichment_analysis(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""
Perform enrichment analysis on genomic regions, motifs, or gene lists.
Identifies proteins bound to input regions more often than expected.
"""
try:
# Prepare enrichment analysis parameters
bed_data = arguments.get("bed_data")
motif = arguments.get("motif")
gene_list = arguments.get("gene_list")
genome = arguments.get("genome", "hg38")
antigen_class = arguments.get("antigen_class", "")
cell_type_class = arguments.get("cell_type_class", "")
threshold = arguments.get("threshold", "05")
distance = arguments.get("distance", "5000")
# Build API request
# Note: The actual API endpoint needs to be discovered from ChIP-Atlas documentation
# For now, we provide information about how to use it
result_data = {}
if bed_data:
result_data = {
"message": "ChIP-Atlas Enrichment Analysis requires web form submission",
"instruction": f"Submit BED data to: {CHIPATLAS_BASE_URL}/enrichment_analysis",
"parameters": {
"genome": genome,
"antigen_class": antigen_class,
"cell_type_class": cell_type_class,
"threshold": threshold,
},
"note": "ChIP-Atlas enrichment API requires form-based submission. Use Python 'requests' library for programmatic access.",
}
return {"status": "success", "data": result_data}
elif motif:
result_data = {
"message": "Submit motif for enrichment analysis",
"motif": motif,
"url": f"{CHIPATLAS_BASE_URL}/enrichment_analysis",
"note": "Motif should be in IUPAC nucleic acid notation (ATGCWSMKRYBDHVN)",
}
return {"status": "success", "data": result_data}
elif gene_list:
result_data = {
"message": "Submit gene list for enrichment analysis",
"genes": gene_list if isinstance(gene_list, list) else [gene_list],
"distance_from_tss": distance,
"url": f"{CHIPATLAS_BASE_URL}/enrichment_analysis",
"note": "Use official gene symbols (HGNC, MGI, RGD, FlyBase, WormBase, SGD)",
}
return {"status": "success", "data": result_data}
else:
return {
"status": "error",
"data": {
"error": "One of bed_data, motif, or gene_list must be provided"
},
}
except Exception as e:
return {"status": "error", "data": {"error": str(e)}}
[docs]
def _get_experiment_list(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Get metadata for all ChIP-Atlas experiments.
Note: The experimentList.tab file is 344MB+ (433k+ experiments).
This method provides guidance on accessing the data rather than downloading.
"""
try:
genome = arguments.get("genome")
antigen = arguments.get("antigen")
cell_type = arguments.get("cell_type")
arguments.get("limit", 100)
# The file is too large (344MB) to download efficiently
# Provide guidance instead
metadata_url = f"{CHIPATLAS_DATA_URL}/metadata/experimentList.tab"
web_search_url = f"{CHIPATLAS_BASE_URL}/search"
message = (
"ChIP-Atlas experimentList.tab is 344MB+ with 433,000+ experiments. "
"For efficient searching, use: (1) ChIPAtlas_search_datasets tool "
"for antigen/cell-type search, or (2) Download the file directly for "
"local analysis."
)
filters = {}
if genome:
filters["genome"] = genome
if antigen:
filters["antigen"] = antigen
if cell_type:
filters["cell_type"] = cell_type
result_data = {
"message": message,
"metadata_file_url": metadata_url,
"web_search_url": web_search_url,
"file_size": "344MB",
"total_experiments": "433,000+",
"filters_requested": filters if filters else "none",
"recommendation": (
"Use ChIPAtlas_search_datasets tool for filtered searches by "
"antigen or cell type, or download the metadata file for local analysis."
),
}
return {"status": "success", "data": result_data}
except Exception as e:
return {"status": "error", "data": {"error": str(e)}}
[docs]
def _get_peak_data(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Get URL for peak-call data (BigWig or BED format)."""
try:
experiment_id = arguments.get("experiment_id")
genome = arguments.get("genome", "hg38")
threshold = arguments.get("threshold", "05")
format_type = arguments.get("format", "bigwig")
if not experiment_id:
return {
"status": "error",
"data": {"error": "experiment_id is required"},
}
if format_type.lower() == "bigwig":
url = f"{CHIPATLAS_DATA_URL}/{genome}/eachData/bw/{experiment_id}.bw"
elif format_type.lower() == "bed":
url = f"{CHIPATLAS_DATA_URL}/{genome}/eachData/bed{threshold}/{experiment_id}.{threshold}.bed"
elif format_type.lower() == "bigbed":
url = f"{CHIPATLAS_DATA_URL}/{genome}/eachData/bb{threshold}/{experiment_id}.{threshold}.bb"
else:
return {
"status": "error",
"data": {
"error": f"Invalid format: {format_type}. Use 'bigwig', 'bed', or 'bigbed'"
},
}
result_data = {
"experiment_id": experiment_id,
"genome": genome,
"format": format_type,
"url": url,
"message": "Use this URL to download peak data",
}
return {"status": "success", "data": result_data}
except Exception as e:
return {"status": "error", "data": {"error": str(e)}}
[docs]
def _search_datasets(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Search for datasets by antigen or cell type."""
try:
antigen = arguments.get("antigen")
cell_type = arguments.get("cell_type")
genome = arguments.get("genome", "hg38")
if not antigen and not cell_type:
return {
"status": "error",
"data": {"error": "Either antigen or cell_type must be provided"},
}
# Download antigenList.tab or celltypeList.tab
# These files are ~10MB each - reasonable to download fully
if antigen:
url = f"{CHIPATLAS_DATA_URL}/metadata/antigenList.tab"
search_key = "antigen"
search_value = antigen
else:
url = f"{CHIPATLAS_DATA_URL}/metadata/celltypeList.tab"
search_key = "cell_type"
search_value = cell_type
# Download file (10MB, should complete in seconds)
response = requests.get(url, timeout=30)
response.raise_for_status()
# Parse TSV
lines = response.text.strip().split("\n")
results = []
for line in lines:
fields = line.split("\t")
if len(fields) >= 5:
if antigen:
# Check genome match and search value
# Format: Genome | Antigen_class | Antigen | Num_data | ID
if (
fields[0] == genome
and search_value.lower() in fields[2].lower()
):
results.append(
{
"genome": fields[0],
"class": fields[1],
"name": fields[2],
"num_experiments": fields[3],
"experiment_ids": fields[4].split(",")[
:10
], # Show first 10 IDs
}
)
else:
# Check genome match and search value for cell type
# Format: Genome | Cell_type_class | Cell_type | Num_data | ID
if (
fields[0] == genome
and search_value.lower() in fields[2].lower()
):
results.append(
{
"genome": fields[0],
"cell_type_class": fields[1],
"cell_type": fields[2],
"num_experiments": fields[3],
"experiment_ids": fields[4].split(",")[
:10
], # Show first 10 IDs
}
)
result_data = {
"search_key": search_key,
"search_value": search_value,
"genome": genome,
"num_results": len(results),
"results": results,
}
return {"status": "success", "data": result_data}
except requests.exceptions.Timeout:
return {
"status": "error",
"data": {
"error": "Request timeout - ChIP-Atlas server may be slow. Try again later."
},
}
except Exception as e:
return {"status": "error", "data": {"error": str(e)}}
[docs]
def _get_colocalization(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Colocalization: proteins whose ChIP-seq peaks co-occur with an antigen.
Reads the per-antigen/tissue co-binding matrix (one row per
co-occurring experiment, scored in the '<antigen>|Average' column) and
returns the partner proteins ranked by their best overlap score.
"""
try:
antigen = arguments.get("antigen") or arguments.get("protein")
cell_type_class = arguments.get("cell_type_class") or arguments.get(
"tissue"
)
genome = arguments.get("genome", "hg38")
limit = int(arguments.get("limit", 50))
if not antigen:
return {
"status": "error",
"data": {"error": "antigen (e.g. 'CTCF') is required"},
}
if not cell_type_class:
return {
"status": "error",
"data": {
"error": "cell_type_class is required (e.g. 'Blood'). "
"Query ChIPAtlas_get_experiments first to find tissue classes for an antigen."
},
}
# Tissue class uses spaces in the index but underscores in the path.
tissue_path = str(cell_type_class).replace(" ", "_")
url = f"{CHIPATLAS_DATA_URL}/{genome}/colo/{antigen}.{tissue_path}.tsv"
response = requests.get(url, timeout=30)
if response.status_code == 404:
return {
"status": "error",
"data": {
"error": f"No colocalization data for antigen='{antigen}', "
f"cell_type_class='{cell_type_class}', genome='{genome}'. "
"Check available tissue classes via colo_analysis.json.",
"url": url,
},
}
response.raise_for_status()
lines = response.text.split("\n")
if not lines or not lines[0].strip():
return {
"status": "success",
"data": {
"antigen": antigen,
"cell_type_class": cell_type_class,
"genome": genome,
"partner_count": 0,
"partners": [],
"url": url,
"note": "Colocalization matrix was empty.",
},
}
# header: Experiment | Cell_subclass | Protein | <antigen>|Average | ...
best: Dict[str, Dict[str, Any]] = {}
for line in lines[1:]:
if not line.strip():
continue
cols = line.split("\t")
if len(cols) < 4:
continue
protein = cols[2]
try:
score = float(cols[3])
except (ValueError, IndexError):
continue
prev = best.get(protein)
if prev is None or score > prev["best_overlap_score"]:
best[protein] = {
"protein": protein,
"best_overlap_score": score,
"best_experiment": cols[0],
"best_cell_subclass": cols[1],
}
partners = sorted(
best.values(), key=lambda d: d["best_overlap_score"], reverse=True
)
return {
"status": "success",
"data": {
"antigen": antigen,
"cell_type_class": cell_type_class,
"genome": genome,
"partner_count": len(partners),
"partners": partners[:limit],
"url": url,
},
}
except requests.exceptions.Timeout:
return {
"status": "error",
"data": {"error": "Request timeout - ChIP-Atlas server may be slow."},
}
except Exception as e:
return {"status": "error", "data": {"error": str(e)}}
[docs]
def _get_target_genes(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Target genes: genes bound by a TF near their TSS, ranked by score.
Reads the per-TF target-gene matrix (rows = target genes, col2 =
'<TF>|Average' binding score) at a chosen TSS distance (1/5/10 kb).
"""
try:
antigen = arguments.get("antigen") or arguments.get("tf")
genome = arguments.get("genome", "hg38")
distance = str(arguments.get("distance", "5"))
limit = int(arguments.get("limit", 100))
if not antigen:
return {
"status": "error",
"data": {"error": "antigen / tf (e.g. 'GATA1') is required"},
}
if distance not in ("1", "5", "10"):
return {
"status": "error",
"data": {
"error": f"distance must be '1', '5', or '10' (kb), got '{distance}'"
},
}
url = f"{CHIPATLAS_DATA_URL}/{genome}/target/{antigen}.{distance}.tsv"
response = requests.get(url, timeout=30)
if response.status_code == 404:
return {
"status": "error",
"data": {
"error": f"No target-gene data for antigen='{antigen}', "
f"distance='{distance}', genome='{genome}'. "
"Check availability via target_genes_analysis.json.",
"url": url,
},
}
response.raise_for_status()
lines = response.text.split("\n")
if not lines or not lines[0].strip():
return {
"status": "error",
"data": {"error": "Empty target-gene file.", "url": url},
}
# header: Target_genes | <antigen>|Average | <SRX>|<cell> ...
genes = []
for line in lines[1:]:
if not line.strip():
continue
cols = line.split("\t")
if len(cols) < 2:
continue
try:
avg = float(cols[1])
except (ValueError, IndexError):
continue
genes.append({"gene": cols[0], "average_binding_score": avg})
genes.sort(key=lambda d: d["average_binding_score"], reverse=True)
note = None
if not genes:
note = (
"The target-gene matrix for this antigen currently contains only "
"a header (no scored rows on the server). Try another antigen or distance."
)
return {
"status": "success",
"data": {
"antigen": antigen,
"genome": genome,
"distance_kb": distance,
"gene_count": len(genes),
"target_genes": genes[:limit],
"url": url,
**({"note": note} if note else {}),
},
}
except requests.exceptions.Timeout:
return {
"status": "error",
"data": {"error": "Request timeout - ChIP-Atlas server may be slow."},
}
except Exception as e:
return {"status": "error", "data": {"error": str(e)}}
[docs]
def _get_experiment_metadata(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Structured metadata for a single ChIP-Atlas experiment (SRX/DRX/ERX)."""
try:
experiment_id = arguments.get("experiment_id") or arguments.get("expid")
if not experiment_id:
return {
"status": "error",
"data": {"error": "experiment_id is required (e.g. 'SRX080331')"},
}
url = f"{CHIPATLAS_BASE_URL}/data/exp_metadata.json"
response = requests.get(url, params={"expid": experiment_id}, timeout=30)
response.raise_for_status()
records = response.json()
if not isinstance(records, list) or len(records) == 0:
return {
"status": "error",
"data": {
"error": f"No metadata found for experiment '{experiment_id}'.",
"url": response.url,
},
}
genome_filter = arguments.get("genome")
experiments = []
for rec in records:
if genome_filter and rec.get("genome") != genome_filter:
continue
# attributes is a tab-delimited "key=value" string; parse to dict.
attr_raw = rec.get("attributes", "") or ""
attributes = {}
for field in attr_raw.split("\t"):
if "=" in field:
key, _, val = field.partition("=")
attributes[key.strip()] = val.strip()
experiments.append(
{
"expid": rec.get("expid"),
"genome": rec.get("genome"),
"antigen_class": rec.get("agClass"),
"antigen": rec.get("agSubClass"),
"cell_type_class": rec.get("clClass"),
"cell_type": rec.get("clSubClass"),
"title": rec.get("title"),
"attributes": attributes,
}
)
if not experiments:
return {
"status": "error",
"data": {
"error": f"No metadata for experiment '{experiment_id}' "
f"with genome='{genome_filter}'.",
"url": response.url,
},
}
return {
"status": "success",
"data": {
"experiment_id": experiment_id,
"count": len(experiments),
"experiments": experiments,
"url": response.url,
},
}
except requests.exceptions.Timeout:
return {
"status": "error",
"data": {"error": "Request timeout - ChIP-Atlas server may be slow."},
}
except Exception as e:
return {"status": "error", "data": {"error": str(e)}}