# Source code for tooluniverse.chipatlas_tool

"""
ChIP-Atlas API Tool

This tool provides access to ChIP-Atlas, a data-mining suite for exploring
epigenomic landscapes with 433,000+ ChIP-seq, ATAC-seq, and Bisulfite-seq experiments.
"""

import requests
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool


CHIPATLAS_BASE_URL = "https://chip-atlas.org"
CHIPATLAS_DATA_URL = "https://chip-atlas.dbcls.jp/data"



# [docs]  (Sphinx viewcode link residue from the rendered HTML; not code)
@register_tool("ChIPAtlasTool")
class ChIPAtlasTool(BaseTool):
    """
    ChIP-Atlas API tool for accessing chromatin data.
    Provides enrichment analysis, peak browsing, and dataset search.
    """


    # [docs]  (Sphinx viewcode link residue from the rendered HTML; not code)
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Route a request to the handler selected by ``arguments["operation"]``.

        When the ``operation`` key is absent, ``"enrichment_analysis"`` is
        used. The same ``arguments`` mapping is forwarded unchanged to the
        chosen handler and the handler's return value is returned as-is.

        An unrecognised operation, or any exception raised while dispatching,
        is reported as ``{"status": "error", "data": {"error": <message>}}``
        rather than being propagated to the caller.
        """
        # (operation name, handler method name) pairs, checked in this order.
        routes = (
            ("enrichment_analysis", "_enrichment_analysis"),
            ("get_experiment_list", "_get_experiment_list"),
            ("get_peak_data", "_get_peak_data"),
            ("search_datasets", "_search_datasets"),
        )

        try:
            requested = arguments.get("operation", "enrichment_analysis")

            for op_name, handler_name in routes:
                if requested == op_name:
                    return getattr(self, handler_name)(arguments)

            # Fell through every known route.
            return {
                "status": "error",
                "data": {"error": f"Unknown operation: {requested}"},
            }
        except Exception as exc:
            return {"status": "error", "data": {"error": str(exc)}}



    # [docs]  (Sphinx viewcode link residue from the rendered HTML; not code)
    def _enrichment_analysis(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Perform enrichment analysis on genomic regions, motifs, or gene lists.
        Identifies proteins bound to input regions more often than expected.
        """
        try:
            # Prepare enrichment analysis parameters
            bed_data = arguments.get("bed_data")
            motif = arguments.get("motif")
            gene_list = arguments.get("gene_list")
            genome = arguments.get("genome", "hg38")
            antigen_class = arguments.get("antigen_class", "")
            cell_type_class = arguments.get("cell_type_class", "")
            threshold = arguments.get("threshold", "05")
            distance = arguments.get("distance", "5000")

            # Build API request
            # Note: The actual API endpoint needs to be discovered from ChIP-Atlas documentation
            # For now, we provide information about how to use it

            result_data = {}

            if bed_data:
                result_data = {
                    "message": "ChIP-Atlas Enrichment Analysis requires web form submission",
                    "instruction": f"Submit BED data to: {CHIPATLAS_BASE_URL}/enrichment_analysis",
                    "parameters": {
                        "genome": genome,
                        "antigen_class": antigen_class,
                        "cell_type_class": cell_type_class,
                        "threshold": threshold,
                    },
                    "note": "ChIP-Atlas enrichment API requires form-based submission. Use Python 'requests' library for programmatic access.",
                }
                return {"status": "success", "data": result_data}
            elif motif:
                result_data = {
                    "message": "Submit motif for enrichment analysis",
                    "motif": motif,
                    "url": f"{CHIPATLAS_BASE_URL}/enrichment_analysis",
                    "note": "Motif should be in IUPAC nucleic acid notation (ATGCWSMKRYBDHVN)",
                }
                return {"status": "success", "data": result_data}
            elif gene_list:
                result_data = {
                    "message": "Submit gene list for enrichment analysis",
                    "genes": gene_list if isinstance(gene_list, list) else [gene_list],
                    "distance_from_tss": distance,
                    "url": f"{CHIPATLAS_BASE_URL}/enrichment_analysis",
                    "note": "Use official gene symbols (HGNC, MGI, RGD, FlyBase, WormBase, SGD)",
                }
                return {"status": "success", "data": result_data}
            else:
                return {
                    "status": "error",
                    "data": {
                        "error": "One of bed_data, motif, or gene_list must be provided"
                    },
                }

        except Exception as e:
            return {"status": "error", "data": {"error": str(e)}}



    # [docs]  (Sphinx viewcode link residue from the rendered HTML; not code)
    def _get_experiment_list(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get metadata for all ChIP-Atlas experiments.

        Note: The experimentList.tab file is 344MB+ (433k+ experiments).
        This method provides guidance on accessing the data rather than downloading.
        """
        try:
            genome = arguments.get("genome")
            antigen = arguments.get("antigen")
            cell_type = arguments.get("cell_type")
            arguments.get("limit", 100)

            # The file is too large (344MB) to download efficiently
            # Provide guidance instead
            metadata_url = f"{CHIPATLAS_DATA_URL}/metadata/experimentList.tab"
            web_search_url = f"{CHIPATLAS_BASE_URL}/search"

            message = (
                "ChIP-Atlas experimentList.tab is 344MB+ with 433,000+ experiments. "
                "For efficient searching, use: (1) ChIPAtlas_search_datasets tool "
                "for antigen/cell-type search, or (2) Download the file directly for "
                "local analysis."
            )

            filters = {}
            if genome:
                filters["genome"] = genome
            if antigen:
                filters["antigen"] = antigen
            if cell_type:
                filters["cell_type"] = cell_type

            result_data = {
                "message": message,
                "metadata_file_url": metadata_url,
                "web_search_url": web_search_url,
                "file_size": "344MB",
                "total_experiments": "433,000+",
                "filters_requested": filters if filters else "none",
                "recommendation": (
                    "Use ChIPAtlas_search_datasets tool for filtered searches by "
                    "antigen or cell type, or download the metadata file for local analysis."
                ),
            }

            return {"status": "success", "data": result_data}

        except Exception as e:
            return {"status": "error", "data": {"error": str(e)}}



    # [docs]  (Sphinx viewcode link residue from the rendered HTML; not code)
    def _get_peak_data(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get URL for peak-call data (BigWig or BED format)."""
        try:
            experiment_id = arguments.get("experiment_id")
            genome = arguments.get("genome", "hg38")
            threshold = arguments.get("threshold", "05")
            format_type = arguments.get("format", "bigwig")

            if not experiment_id:
                return {
                    "status": "error",
                    "data": {"error": "experiment_id is required"},
                }

            if format_type.lower() == "bigwig":
                url = f"{CHIPATLAS_DATA_URL}/{genome}/eachData/bw/{experiment_id}.bw"
            elif format_type.lower() == "bed":
                url = f"{CHIPATLAS_DATA_URL}/{genome}/eachData/bed{threshold}/{experiment_id}.{threshold}.bed"
            elif format_type.lower() == "bigbed":
                url = f"{CHIPATLAS_DATA_URL}/{genome}/eachData/bb{threshold}/{experiment_id}.{threshold}.bb"
            else:
                return {
                    "status": "error",
                    "data": {
                        "error": f"Invalid format: {format_type}. Use 'bigwig', 'bed', or 'bigbed'"
                    },
                }

            result_data = {
                "experiment_id": experiment_id,
                "genome": genome,
                "format": format_type,
                "url": url,
                "message": "Use this URL to download peak data",
            }

            return {"status": "success", "data": result_data}

        except Exception as e:
            return {"status": "error", "data": {"error": str(e)}}



    # [docs]  (Sphinx viewcode link residue from the rendered HTML; not code)
    def _search_datasets(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search for datasets by antigen or cell type."""
        try:
            antigen = arguments.get("antigen")
            cell_type = arguments.get("cell_type")
            genome = arguments.get("genome", "hg38")

            if not antigen and not cell_type:
                return {
                    "status": "error",
                    "data": {"error": "Either antigen or cell_type must be provided"},
                }

            # Download antigenList.tab or celltypeList.tab
            # These files are ~10MB each - reasonable to download fully
            if antigen:
                url = f"{CHIPATLAS_DATA_URL}/metadata/antigenList.tab"
                search_key = "antigen"
                search_value = antigen
            else:
                url = f"{CHIPATLAS_DATA_URL}/metadata/celltypeList.tab"
                search_key = "cell_type"
                search_value = cell_type

            # Download file (10MB, should complete in seconds)
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            # Parse TSV
            lines = response.text.strip().split("\n")
            results = []

            for line in lines:
                fields = line.split("\t")
                if len(fields) >= 5:
                    if antigen:
                        # Check genome match and search value
                        # Format: Genome | Antigen_class | Antigen | Num_data | ID
                        if (
                            fields[0] == genome
                            and search_value.lower() in fields[2].lower()
                        ):
                            results.append(
                                {
                                    "genome": fields[0],
                                    "class": fields[1],
                                    "name": fields[2],
                                    "num_experiments": fields[3],
                                    "experiment_ids": fields[4].split(",")[
                                        :10
                                    ],  # Show first 10 IDs
                                }
                            )
                    else:
                        # Check genome match and search value for cell type
                        # Format: Genome | Cell_type_class | Cell_type | Num_data | ID
                        if (
                            fields[0] == genome
                            and search_value.lower() in fields[2].lower()
                        ):
                            results.append(
                                {
                                    "genome": fields[0],
                                    "cell_type_class": fields[1],
                                    "cell_type": fields[2],
                                    "num_experiments": fields[3],
                                    "experiment_ids": fields[4].split(",")[
                                        :10
                                    ],  # Show first 10 IDs
                                }
                            )

            result_data = {
                "search_key": search_key,
                "search_value": search_value,
                "genome": genome,
                "num_results": len(results),
                "results": results,
            }

            return {"status": "success", "data": result_data}

        except requests.exceptions.Timeout:
            return {
                "status": "error",
                "data": {
                    "error": "Request timeout - ChIP-Atlas server may be slow. Try again later."
                },
            }
        except Exception as e:
            return {"status": "error", "data": {"error": str(e)}}