Source code for tooluniverse.chipatlas_tool

"""
ChIP-Atlas API Tool

This tool provides access to ChIP-Atlas, a data-mining suite for exploring
epigenomic landscapes with 433,000+ ChIP-seq, ATAC-seq, and Bisulfite-seq experiments.
"""

import requests
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool


CHIPATLAS_BASE_URL = "https://chip-atlas.org"
CHIPATLAS_DATA_URL = "https://chip-atlas.dbcls.jp/data"


[docs] @register_tool("ChIPAtlasTool") class ChIPAtlasTool(BaseTool): """ ChIP-Atlas API tool for accessing chromatin data. Provides enrichment analysis, peak browsing, and dataset search. """
[docs] def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Execute the tool with given arguments.""" try: # Get default operation from tool config schema if not provided default_op = "enrichment_analysis" if hasattr(self, "tool_config") and self.tool_config: param_schema = self.tool_config.get("parameter", {}) default_op = ( param_schema.get("properties", {}) .get("operation", {}) .get("default", "enrichment_analysis") ) operation = arguments.get("operation", default_op) if operation == "enrichment_analysis": return self._enrichment_analysis(arguments) elif operation == "get_experiment_list": return self._get_experiment_list(arguments) elif operation == "get_peak_data": return self._get_peak_data(arguments) elif operation == "search_datasets": return self._search_datasets(arguments) elif operation == "get_colocalization": return self._get_colocalization(arguments) elif operation == "get_target_genes": return self._get_target_genes(arguments) elif operation == "get_experiment_metadata": return self._get_experiment_metadata(arguments) else: return { "status": "error", "data": {"error": f"Unknown operation: {operation}"}, } except Exception as e: return {"status": "error", "data": {"error": str(e)}}
[docs] def _enrichment_analysis(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """ Perform enrichment analysis on genomic regions, motifs, or gene lists. Identifies proteins bound to input regions more often than expected. """ try: # Prepare enrichment analysis parameters bed_data = arguments.get("bed_data") motif = arguments.get("motif") gene_list = arguments.get("gene_list") genome = arguments.get("genome", "hg38") antigen_class = arguments.get("antigen_class", "") cell_type_class = arguments.get("cell_type_class", "") threshold = arguments.get("threshold", "05") distance = arguments.get("distance", "5000") # Build API request # Note: The actual API endpoint needs to be discovered from ChIP-Atlas documentation # For now, we provide information about how to use it result_data = {} if bed_data: result_data = { "message": "ChIP-Atlas Enrichment Analysis requires web form submission", "instruction": f"Submit BED data to: {CHIPATLAS_BASE_URL}/enrichment_analysis", "parameters": { "genome": genome, "antigen_class": antigen_class, "cell_type_class": cell_type_class, "threshold": threshold, }, "note": "ChIP-Atlas enrichment API requires form-based submission. Use Python 'requests' library for programmatic access.", } return {"status": "success", "data": result_data} elif motif: result_data = { "message": "Submit motif for enrichment analysis", "motif": motif, "url": f"{CHIPATLAS_BASE_URL}/enrichment_analysis", "note": "Motif should be in IUPAC nucleic acid notation (ATGCWSMKRYBDHVN)", } return {"status": "success", "data": result_data} elif gene_list: result_data = { "message": "Submit gene list for enrichment analysis", "genes": gene_list if isinstance(gene_list, list) else [gene_list], "distance_from_tss": distance, "url": f"{CHIPATLAS_BASE_URL}/enrichment_analysis", "note": "Use official gene symbols (HGNC, MGI, RGD, FlyBase, WormBase, SGD)", } return {"status": "success", "data": result_data} else: return { "status": "error", "data": { "error": "One of bed_data, motif, or gene_list must be provided" }, } except Exception as e: return {"status": "error", "data": {"error": str(e)}}
[docs] def _get_experiment_list(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get metadata for all ChIP-Atlas experiments. Note: The experimentList.tab file is 344MB+ (433k+ experiments). This method provides guidance on accessing the data rather than downloading. """ try: genome = arguments.get("genome") antigen = arguments.get("antigen") cell_type = arguments.get("cell_type") arguments.get("limit", 100) # The file is too large (344MB) to download efficiently # Provide guidance instead metadata_url = f"{CHIPATLAS_DATA_URL}/metadata/experimentList.tab" web_search_url = f"{CHIPATLAS_BASE_URL}/search" message = ( "ChIP-Atlas experimentList.tab is 344MB+ with 433,000+ experiments. " "For efficient searching, use: (1) ChIPAtlas_search_datasets tool " "for antigen/cell-type search, or (2) Download the file directly for " "local analysis." ) filters = {} if genome: filters["genome"] = genome if antigen: filters["antigen"] = antigen if cell_type: filters["cell_type"] = cell_type result_data = { "message": message, "metadata_file_url": metadata_url, "web_search_url": web_search_url, "file_size": "344MB", "total_experiments": "433,000+", "filters_requested": filters if filters else "none", "recommendation": ( "Use ChIPAtlas_search_datasets tool for filtered searches by " "antigen or cell type, or download the metadata file for local analysis." ), } return {"status": "success", "data": result_data} except Exception as e: return {"status": "error", "data": {"error": str(e)}}
[docs] def _get_peak_data(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get URL for peak-call data (BigWig or BED format).""" try: experiment_id = arguments.get("experiment_id") genome = arguments.get("genome", "hg38") threshold = arguments.get("threshold", "05") format_type = arguments.get("format", "bigwig") if not experiment_id: return { "status": "error", "data": {"error": "experiment_id is required"}, } if format_type.lower() == "bigwig": url = f"{CHIPATLAS_DATA_URL}/{genome}/eachData/bw/{experiment_id}.bw" elif format_type.lower() == "bed": url = f"{CHIPATLAS_DATA_URL}/{genome}/eachData/bed{threshold}/{experiment_id}.{threshold}.bed" elif format_type.lower() == "bigbed": url = f"{CHIPATLAS_DATA_URL}/{genome}/eachData/bb{threshold}/{experiment_id}.{threshold}.bb" else: return { "status": "error", "data": { "error": f"Invalid format: {format_type}. Use 'bigwig', 'bed', or 'bigbed'" }, } result_data = { "experiment_id": experiment_id, "genome": genome, "format": format_type, "url": url, "message": "Use this URL to download peak data", } return {"status": "success", "data": result_data} except Exception as e: return {"status": "error", "data": {"error": str(e)}}
[docs] def _search_datasets(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Search for datasets by antigen or cell type.""" try: antigen = arguments.get("antigen") cell_type = arguments.get("cell_type") genome = arguments.get("genome", "hg38") if not antigen and not cell_type: return { "status": "error", "data": {"error": "Either antigen or cell_type must be provided"}, } # Download antigenList.tab or celltypeList.tab # These files are ~10MB each - reasonable to download fully if antigen: url = f"{CHIPATLAS_DATA_URL}/metadata/antigenList.tab" search_key = "antigen" search_value = antigen else: url = f"{CHIPATLAS_DATA_URL}/metadata/celltypeList.tab" search_key = "cell_type" search_value = cell_type # Download file (10MB, should complete in seconds) response = requests.get(url, timeout=30) response.raise_for_status() # Parse TSV lines = response.text.strip().split("\n") results = [] for line in lines: fields = line.split("\t") if len(fields) >= 5: if antigen: # Check genome match and search value # Format: Genome | Antigen_class | Antigen | Num_data | ID if ( fields[0] == genome and search_value.lower() in fields[2].lower() ): results.append( { "genome": fields[0], "class": fields[1], "name": fields[2], "num_experiments": fields[3], "experiment_ids": fields[4].split(",")[ :10 ], # Show first 10 IDs } ) else: # Check genome match and search value for cell type # Format: Genome | Cell_type_class | Cell_type | Num_data | ID if ( fields[0] == genome and search_value.lower() in fields[2].lower() ): results.append( { "genome": fields[0], "cell_type_class": fields[1], "cell_type": fields[2], "num_experiments": fields[3], "experiment_ids": fields[4].split(",")[ :10 ], # Show first 10 IDs } ) result_data = { "search_key": search_key, "search_value": search_value, "genome": genome, "num_results": len(results), "results": results, } return {"status": "success", "data": result_data} except requests.exceptions.Timeout: return { "status": "error", "data": { "error": "Request timeout - ChIP-Atlas server may be slow. Try again later." }, } except Exception as e: return {"status": "error", "data": {"error": str(e)}}
[docs] def _get_colocalization(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Colocalization: proteins whose ChIP-seq peaks co-occur with an antigen. Reads the per-antigen/tissue co-binding matrix (one row per co-occurring experiment, scored in the '<antigen>|Average' column) and returns the partner proteins ranked by their best overlap score. """ try: antigen = arguments.get("antigen") or arguments.get("protein") cell_type_class = arguments.get("cell_type_class") or arguments.get( "tissue" ) genome = arguments.get("genome", "hg38") limit = int(arguments.get("limit", 50)) if not antigen: return { "status": "error", "data": {"error": "antigen (e.g. 'CTCF') is required"}, } if not cell_type_class: return { "status": "error", "data": { "error": "cell_type_class is required (e.g. 'Blood'). " "Query ChIPAtlas_get_experiments first to find tissue classes for an antigen." }, } # Tissue class uses spaces in the index but underscores in the path. tissue_path = str(cell_type_class).replace(" ", "_") url = f"{CHIPATLAS_DATA_URL}/{genome}/colo/{antigen}.{tissue_path}.tsv" response = requests.get(url, timeout=30) if response.status_code == 404: return { "status": "error", "data": { "error": f"No colocalization data for antigen='{antigen}', " f"cell_type_class='{cell_type_class}', genome='{genome}'. " "Check available tissue classes via colo_analysis.json.", "url": url, }, } response.raise_for_status() lines = response.text.split("\n") if not lines or not lines[0].strip(): return { "status": "success", "data": { "antigen": antigen, "cell_type_class": cell_type_class, "genome": genome, "partner_count": 0, "partners": [], "url": url, "note": "Colocalization matrix was empty.", }, } # header: Experiment | Cell_subclass | Protein | <antigen>|Average | ... best: Dict[str, Dict[str, Any]] = {} for line in lines[1:]: if not line.strip(): continue cols = line.split("\t") if len(cols) < 4: continue protein = cols[2] try: score = float(cols[3]) except (ValueError, IndexError): continue prev = best.get(protein) if prev is None or score > prev["best_overlap_score"]: best[protein] = { "protein": protein, "best_overlap_score": score, "best_experiment": cols[0], "best_cell_subclass": cols[1], } partners = sorted( best.values(), key=lambda d: d["best_overlap_score"], reverse=True ) return { "status": "success", "data": { "antigen": antigen, "cell_type_class": cell_type_class, "genome": genome, "partner_count": len(partners), "partners": partners[:limit], "url": url, }, } except requests.exceptions.Timeout: return { "status": "error", "data": {"error": "Request timeout - ChIP-Atlas server may be slow."}, } except Exception as e: return {"status": "error", "data": {"error": str(e)}}
[docs] def _get_target_genes(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Target genes: genes bound by a TF near their TSS, ranked by score. Reads the per-TF target-gene matrix (rows = target genes, col2 = '<TF>|Average' binding score) at a chosen TSS distance (1/5/10 kb). """ try: antigen = arguments.get("antigen") or arguments.get("tf") genome = arguments.get("genome", "hg38") distance = str(arguments.get("distance", "5")) limit = int(arguments.get("limit", 100)) if not antigen: return { "status": "error", "data": {"error": "antigen / tf (e.g. 'GATA1') is required"}, } if distance not in ("1", "5", "10"): return { "status": "error", "data": { "error": f"distance must be '1', '5', or '10' (kb), got '{distance}'" }, } url = f"{CHIPATLAS_DATA_URL}/{genome}/target/{antigen}.{distance}.tsv" response = requests.get(url, timeout=30) if response.status_code == 404: return { "status": "error", "data": { "error": f"No target-gene data for antigen='{antigen}', " f"distance='{distance}', genome='{genome}'. " "Check availability via target_genes_analysis.json.", "url": url, }, } response.raise_for_status() lines = response.text.split("\n") if not lines or not lines[0].strip(): return { "status": "error", "data": {"error": "Empty target-gene file.", "url": url}, } # header: Target_genes | <antigen>|Average | <SRX>|<cell> ... genes = [] for line in lines[1:]: if not line.strip(): continue cols = line.split("\t") if len(cols) < 2: continue try: avg = float(cols[1]) except (ValueError, IndexError): continue genes.append({"gene": cols[0], "average_binding_score": avg}) genes.sort(key=lambda d: d["average_binding_score"], reverse=True) note = None if not genes: note = ( "The target-gene matrix for this antigen currently contains only " "a header (no scored rows on the server). Try another antigen or distance." ) return { "status": "success", "data": { "antigen": antigen, "genome": genome, "distance_kb": distance, "gene_count": len(genes), "target_genes": genes[:limit], "url": url, **({"note": note} if note else {}), }, } except requests.exceptions.Timeout: return { "status": "error", "data": {"error": "Request timeout - ChIP-Atlas server may be slow."}, } except Exception as e: return {"status": "error", "data": {"error": str(e)}}
[docs] def _get_experiment_metadata(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Structured metadata for a single ChIP-Atlas experiment (SRX/DRX/ERX).""" try: experiment_id = arguments.get("experiment_id") or arguments.get("expid") if not experiment_id: return { "status": "error", "data": {"error": "experiment_id is required (e.g. 'SRX080331')"}, } url = f"{CHIPATLAS_BASE_URL}/data/exp_metadata.json" response = requests.get(url, params={"expid": experiment_id}, timeout=30) response.raise_for_status() records = response.json() if not isinstance(records, list) or len(records) == 0: return { "status": "error", "data": { "error": f"No metadata found for experiment '{experiment_id}'.", "url": response.url, }, } genome_filter = arguments.get("genome") experiments = [] for rec in records: if genome_filter and rec.get("genome") != genome_filter: continue # attributes is a tab-delimited "key=value" string; parse to dict. attr_raw = rec.get("attributes", "") or "" attributes = {} for field in attr_raw.split("\t"): if "=" in field: key, _, val = field.partition("=") attributes[key.strip()] = val.strip() experiments.append( { "expid": rec.get("expid"), "genome": rec.get("genome"), "antigen_class": rec.get("agClass"), "antigen": rec.get("agSubClass"), "cell_type_class": rec.get("clClass"), "cell_type": rec.get("clSubClass"), "title": rec.get("title"), "attributes": attributes, } ) if not experiments: return { "status": "error", "data": { "error": f"No metadata for experiment '{experiment_id}' " f"with genome='{genome_filter}'.", "url": response.url, }, } return { "status": "success", "data": { "experiment_id": experiment_id, "count": len(experiments), "experiments": experiments, "url": response.url, }, } except requests.exceptions.Timeout: return { "status": "error", "data": {"error": "Request timeout - ChIP-Atlas server may be slow."}, } except Exception as e: return {"status": "error", "data": {"error": str(e)}}