tooluniverse.igsr_tool 源代码

"""
IGSR Tool - International Genome Sample Resource (1000 Genomes Project)

IGSR hosts the 1000 Genomes Project data and related datasets that provide
a comprehensive catalog of human genetic variation across global populations.
The resource includes 4,989 samples from 212 populations grouped into
superpopulations (AFR, AMR, EAS, EUR, SAS).

API base: https://www.internationalgenome.org/api/beta
No authentication required. Elasticsearch-based API.

Reference: Byrska-Bishop et al., Cell 2022, 185(18):3426-3440
"""

import requests
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool


IGSR_BASE_URL = "https://www.internationalgenome.org/api/beta"



[文档]
@register_tool("IGSRTool")
class IGSRTool(BaseTool):
    """
    Tool for querying the International Genome Sample Resource (1000 Genomes).

    Provides access to population, sample, and data collection metadata
    from the 1000 Genomes Project and related studies.

    Supported operations:
    - search_populations: Search/list populations with superpopulation filtering
    - search_samples: Search samples by population, data collection
    - list_data_collections: List available data collections/studies
    """


[文档]
    def __init__(self, tool_config: Dict[str, Any]):
        """Set up the tool from its configuration dict.

        Args:
            tool_config: Tool configuration; its optional ``"parameter"``
                entry (and that entry's ``"required"`` list) are cached on
                the instance.
        """
        super().__init__(tool_config)

        # Cache the parameter schema and its list of required argument names.
        parameter_schema = tool_config.get("parameter", {})
        self.parameter = parameter_schema
        self.required = parameter_schema.get("required", [])

        # One shared HTTP session for all requests; timeout is in seconds.
        self.session = requests.Session()
        self.timeout = 30



[文档]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the IGSR API tool with given arguments."""
        operation = arguments.get("operation")
        if not operation:
            # Fall back to schema default (set per tool in JSON config)
            props = self.tool_config.get("parameter", {}).get("properties", {})
            operation = props.get("operation", {}).get("default")

        operation_handlers = {
            "search_populations": self._search_populations,
            "search_samples": self._search_samples,
            "list_data_collections": self._list_data_collections,
        }

        handler = operation_handlers.get(operation)
        if not handler:
            return {
                "status": "error",
                "error": "Unknown operation: {}. Available: {}".format(
                    operation, list(operation_handlers.keys())
                ),
            }

        try:
            return handler(arguments)
        except requests.exceptions.Timeout:
            return {"status": "error", "error": "IGSR API request timed out"}
        except requests.exceptions.ConnectionError:
            return {"status": "error", "error": "Failed to connect to IGSR API"}
        except Exception as e:
            return {"status": "error", "error": f"IGSR API error: {str(e)}"}



[文档]
    def _es_search(self, index: str, body: Dict[str, Any]) -> Dict[str, Any]:
        """POST a search body to ``<base>/<index>/_search`` and return the JSON.

        Args:
            index: Name of the IGSR index to query (the path segment before
                ``_search``).
            body: Search request body, sent as JSON.

        Returns:
            The decoded JSON response.

        Raises:
            requests.exceptions.HTTPError: If the response status indicates
                an error (via ``raise_for_status``).
        """
        endpoint = "{}/{}/_search".format(IGSR_BASE_URL, index)
        json_headers = {"Content-Type": "application/json"}

        reply = self.session.post(
            endpoint, json=body, headers=json_headers, timeout=self.timeout
        )
        reply.raise_for_status()
        return reply.json()



[文档]
    def _search_populations(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search 1000 Genomes populations by superpopulation or name."""
        superpopulation = arguments.get("superpopulation")
        query_text = arguments.get("query")
        limit = min(int(arguments.get("limit", 25)), 100)

        # Fetch all populations (212 total) when superpopulation filter is needed,
        # since the superpopulation.code field is not keyword-indexed in Elasticsearch.
        fetch_limit = 300 if superpopulation else limit
        body: Dict[str, Any] = {"size": fetch_limit}

        filters = []
        if query_text:
            filters.append(
                {
                    "bool": {
                        "should": [
                            {"match": {"name": query_text}},
                            {"match": {"description": query_text}},
                            {"match": {"code": query_text.upper()}},
                        ],
                        "minimum_should_match": 1,
                    }
                }
            )

        if filters:
            body["query"] = {"bool": {"filter": filters}}

        raw = self._es_search("population", body)

        populations = []
        for hit in raw.get("hits", {}).get("hits", []):
            src = hit["_source"]
            superpop = src.get("superpopulation", {})
            # Client-side superpopulation filter (field not keyword-indexed in ES)
            if (
                superpopulation
                and (superpop.get("code") or "").upper() != superpopulation.upper()
            ):
                continue
            populations.append(
                {
                    "code": src.get("code", ""),
                    "name": src.get("name", ""),
                    "description": src.get("description", ""),
                    "sample_count": src.get("samples", {}).get("count", 0),
                    "superpopulation_code": superpop.get("code", ""),
                    "superpopulation_name": superpop.get("name", ""),
                    "latitude": src.get("latitude"),
                    "longitude": src.get("longitude"),
                }
            )
        populations = populations[:limit]

        return {
            "status": "success",
            "data": {
                "total": len(populations),
                "populations": populations,
            },
            "metadata": {
                "source": "IGSR / 1000 Genomes Project (internationalgenome.org)",
                "filter_superpopulation": superpopulation,
                "filter_query": query_text,
            },
        }



[文档]
    def _search_samples(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search 1000 Genomes samples by population or data collection."""
        population = arguments.get("population")
        data_collection = arguments.get("data_collection")
        sample_name = arguments.get("sample_name")
        limit = min(int(arguments.get("limit", 25)), 100)

        body: Dict[str, Any] = {"size": limit}

        filters = []
        if population:
            filters.append({"term": {"populations.code": population.upper()}})
        if data_collection:
            filters.append({"match": {"dataCollections.title": data_collection}})
        if sample_name:
            filters.append({"match": {"name": sample_name}})

        if filters:
            body["query"] = {"bool": {"filter": filters}}

        raw = self._es_search("sample", body)

        samples = []
        for hit in raw.get("hits", {}).get("hits", []):
            src = hit["_source"]
            pops = src.get("populations", [])
            pop_info = [
                {
                    "code": p.get("code", ""),
                    "name": p.get("name", ""),
                    "superpopulation": p.get("superpopulationCode", ""),
                }
                for p in pops
            ]
            dc_titles = [dc.get("title", "") for dc in src.get("dataCollections", [])]
            samples.append(
                {
                    "name": src.get("name", ""),
                    "sex": src.get("sex", ""),
                    "biosample_id": src.get("biosampleId", ""),
                    "populations": pop_info,
                    "data_collections": dc_titles,
                }
            )

        return {
            "status": "success",
            "data": {
                "total": raw.get("hits", {}).get("total", 0),
                "samples": samples,
            },
            "metadata": {
                "source": "IGSR / 1000 Genomes Project (internationalgenome.org)",
                "filter_population": population,
                "filter_data_collection": data_collection,
            },
        }



[文档]
    def _list_data_collections(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """List available 1000 Genomes data collections and studies."""
        limit = min(int(arguments.get("limit", 50)), 100)

        body: Dict[str, Any] = {"size": limit}
        raw = self._es_search("data-collection", body)

        collections = []
        for hit in raw.get("hits", {}).get("hits", []):
            src = hit["_source"]
            collections.append(
                {
                    "code": src.get("code", hit.get("_id", "")),
                    "title": src.get("title", ""),
                    "short_title": src.get("shortTitle", ""),
                    "sample_count": src.get("samples", {}).get("count", 0),
                    "population_count": src.get("populations", {}).get("count", 0),
                    "data_types": src.get("dataTypes", []),
                    "website": src.get("website"),
                }
            )

        return {
            "status": "success",
            "data": {
                "total": raw.get("hits", {}).get("total", 0),
                "collections": collections,
            },
            "metadata": {
                "source": "IGSR / 1000 Genomes Project (internationalgenome.org)",
            },
        }