tooluniverse.gwas_tool 源代码

import re
import requests
from typing import Dict, Any, Optional
from .base_tool import BaseTool
from .tool_registry import register_tool

_EFO_ID_RE = re.compile(r"^[A-Z]+[_:]\d+")



[文档]
class GWASRESTTool(BaseTool):
    """Base class for GWAS Catalog REST API tools."""


[文档]
    def __init__(self, tool_config):
        super().__init__(tool_config)
        self.base_url = "https://www.ebi.ac.uk/gwas/rest/api"
        self.endpoint = ""  # Will be set by subclasses



[文档]
    def _make_request(
        self, endpoint: str, params: Optional[Dict] = None
    ) -> Dict[str, Any]:
        """Make a request to the GWAS Catalog API."""
        url = f"{self.base_url}{endpoint}"
        try:
            response = requests.get(url, params=params, timeout=60)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            return {"status": "error", "error": f"Request failed: {str(e)}"}



[文档]
    def _coerce_str(self, value: Any) -> Optional[str]:
        """Return a stripped string, or None."""
        if not isinstance(value, str):
            return None
        s = value.strip()
        return s or None



[文档]
    def _coerce_int(self, value: Any) -> Optional[int]:
        if value is None:
            return None
        try:
            return int(value)
        except (TypeError, ValueError):
            return None



[文档]
    def _efo_id_from_uri_or_id(self, value: Any) -> Optional[str]:
        """
        Best-effort normalize an EFO/OBA/etc identifier.

        Accepts either a full URI (e.g., 'http://www.ebi.ac.uk/efo/OBA_2050062')
        or a bare ID (e.g., 'OBA_2050062' or 'OBA:2050062').

        Note: The GWAS Catalog v2 REST API supports filtering by `efo_id` (and
        sometimes `efo_trait`) on associations/studies endpoints. Passing a full
        URI via `efo_uri` is not consistently supported; we normalize to `efo_id`.
        """
        s = self._coerce_str(value)
        if not s:
            return None
        if s.startswith(("http://", "https://")):
            s = s.rstrip("/").rsplit("/", 1)[-1]
        # Support CURIE-style IDs like "EFO:0001645" or "OBA:2050062" by converting
        # ":" to "_" (GWAS Catalog expects underscore form, e.g., "EFO_0001645").
        if ":" in s and "/" not in s and " " not in s:
            left, right = s.split(":", 1)
            if left and right:
                s = f"{left}_{right}"
        s = s.strip()
        return s or None



[文档]
    def _resolve_trait_to_efo_id(self, disease_trait: str) -> Optional[str]:
        """Resolve a disease trait name to an EFO ID.

        Tries the GWAS Catalog efoTraits endpoint first, then falls back to
        a study-based resolution. The /v2/associations endpoint ignores the
        disease_trait query parameter, so we must resolve to an EFO ID.
        """
        # Primary: GWAS Catalog efoTraits endpoint (v1)
        try:
            resp = requests.get(
                f"{self.base_url}/efoTraits/search/findByEfoTrait",
                params={"trait": disease_trait},
                timeout=15,
            )
            if resp.status_code == 200:
                traits = resp.json().get("_embedded", {}).get("efoTraits", [])
                if traits:
                    short_name = traits[0].get("shortForm")
                    if short_name:
                        return short_name
        except Exception:
            pass

        # Fallback: search studies by disease_trait, extract efo_id from first result
        try:
            resp = requests.get(
                f"{self.base_url}/v2/studies",
                params={"disease_trait": disease_trait, "size": 1},
                timeout=15,
            )
            if resp.status_code == 200:
                studies = resp.json().get("_embedded", {}).get("studies", [])
                if studies:
                    efo_traits = studies[0].get("efo_traits", [])
                    if efo_traits:
                        efo_id = efo_traits[0].get("efo_id")
                        if efo_id:
                            return efo_id
        except Exception:
            pass

        return None



[文档]
    def _resolve_trait_or_error(
        self, disease_trait: Optional[str], efo_id: Optional[str]
    ) -> Dict[str, Any]:
        """Resolve disease_trait to efo_id if needed.

        Returns {"efo_id": <str>} on success, or {"error": <dict>} when
        resolution fails and would produce an unfiltered query.
        Callers check ``"error" in result`` and return ``result["error"]``.
        """
        if disease_trait and not efo_id:
            resolved = self._resolve_trait_to_efo_id(disease_trait)
            if resolved:
                return {"efo_id": resolved}
            return {
                "error": {
                    "status": "error",
                    "error": (
                        f"Could not resolve trait '{disease_trait}' to an EFO ID. "
                        "GWAS Catalog uses specific EFO/MONDO terms. "
                        "For drug response traits, use the underlying disease instead "
                        "(e.g., 'depression' or 'major depressive disorder' instead of "
                        "'antidepressant response'). Or provide efo_id directly "
                        "(e.g., 'MONDO_0002009' for major depressive disorder, "
                        "'EFO_0000305' for breast carcinoma)."
                    ),
                },
            }
        return {"efo_id": efo_id}



[文档]
    @staticmethod
    def _empty_result_note(efo_id: str) -> str:
        """Return a suggestion note when no associations are found for an EFO ID."""
        return (
            f"No associations found for EFO ID '{efo_id}'. "
            "GWAS Catalog may use a broader parent term — try disease_trait "
            "with a text query (e.g., 'colorectal cancer') to find related associations."
        )



[文档]
    def _add_empty_result_note(
        self, result: Dict[str, Any], efo_id: Optional[str]
    ) -> None:
        """Add a suggestion note to result if the data list is empty."""
        if efo_id and isinstance(result.get("data"), list) and not result["data"]:
            result["note"] = self._empty_result_note(efo_id)



[文档]
    def _extract_embedded_data(
        self, data: Dict[str, Any], data_type: str
    ) -> Dict[str, Any]:
        """Extract data from the _embedded structure and add metadata."""
        if "error" in data:
            if "status" not in data:
                return {"status": "error", **data}
            return data

        result: Dict[str, Any] = {"status": "success", "data": [], "metadata": {}}
        metadata: Dict[str, Any] = {}

        # Extract the main data from _embedded
        if "_embedded" in data and data_type in data["_embedded"]:
            result["data"] = data["_embedded"][data_type]

        # Extract pagination metadata
        if "page" in data:
            metadata["pagination"] = data["page"]

        # Extract links metadata
        if "_links" in data:
            metadata["links"] = data["_links"]

        if metadata:
            result["metadata"] = metadata

        # If no _embedded structure and no array was extracted, keep data as empty array
        # This handles the case where API returns pagination metadata but no results

        return result



[文档]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the tool with given arguments."""
        return self._make_request(self.endpoint, arguments)





[文档]
@register_tool("GWASAssociationSearch")
class GWASAssociationSearch(GWASRESTTool):
    """Search for GWAS associations by various criteria."""


[文档]
    def __init__(self, tool_config):
        super().__init__(tool_config)
        self.endpoint = "/v2/associations"



[文档]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search for associations with optional filters."""
        params = {}

        # Handle various search parameters
        # accept 'query' and 'trait' as aliases for 'disease_trait'
        disease_trait = self._coerce_str(
            arguments.get("disease_trait")
            or arguments.get("query")
            or arguments.get("trait")
        )

        # Prefer efo_id filtering. If user provided efo_uri, normalize to efo_id.
        efo_id = self._efo_id_from_uri_or_id(arguments.get("efo_id"))
        if not efo_id:
            efo_id = self._efo_id_from_uri_or_id(arguments.get("efo_uri"))

        # Feature-111A-004: if disease_trait looks like an EFO/OBA/HP ID, treat as efo_id
        if disease_trait and not efo_id and _EFO_ID_RE.match(disease_trait):
            efo_id = self._efo_id_from_uri_or_id(disease_trait)
            disease_trait = None

        # Feature-79C: /v2/associations ignores disease_trait param server-side.
        # Auto-resolve trait name to efo_id for reliable filtering.
        # Feature-81B-008: if resolution fails, return error instead of silently
        # running an unfiltered search that returns 1M+ unrelated associations.
        if disease_trait and not efo_id:
            resolved = self._resolve_trait_to_efo_id(disease_trait)
            if resolved:
                efo_id = resolved
            else:
                return {
                    "status": "error",
                    "error": (
                        f"Could not resolve trait '{disease_trait}' to an EFO ID. "
                        "GWAS Catalog uses specific EFO/MONDO disease terms. "
                        "For drug response traits, use the underlying disease "
                        "(e.g., 'depression' instead of 'antidepressant response', "
                        "'coronary artery disease' instead of 'statin response'). "
                        "Or provide efo_id directly (e.g., 'MONDO_0002009' for "
                        "major depressive disorder, 'EFO_0001645' for myocardial infarction)."
                    ),
                }

        if efo_id:
            params["efo_id"] = efo_id

        efo_trait = self._coerce_str(arguments.get("efo_trait"))
        if efo_trait:
            params["efo_trait"] = efo_trait

        rs_id = self._coerce_str(arguments.get("rs_id"))
        if rs_id:
            params["rs_id"] = rs_id

        accession_id = self._coerce_str(arguments.get("accession_id"))
        if accession_id:
            params["accession_id"] = accession_id

        sort = self._coerce_str(arguments.get("sort"))
        if sort:
            params["sort"] = sort

        direction = self._coerce_str(arguments.get("direction"))
        if direction:
            params["direction"] = direction

        size = self._coerce_int(arguments.get("size") or arguments.get("limit"))
        if size is not None:
            params["size"] = size

        page = self._coerce_int(arguments.get("page"))
        if page is not None:
            params["page"] = page

        # Feature-81B-008: require at least one filter to prevent returning 1M+ results
        filter_keys = {"efo_id", "efo_trait", "rs_id", "accession_id"}
        if not filter_keys.intersection(params):
            return {
                "status": "error",
                "error": (
                    "At least one filter is required: disease_trait, efo_id, "
                    "efo_trait, rs_id, or accession_id."
                ),
            }

        data = self._make_request(self.endpoint, params)
        result = self._extract_embedded_data(data, "associations")

        # Client-side p_value filter (GWAS Catalog API does not support server-side p-value filtering)
        p_threshold = arguments.get("p_value") or arguments.get("p_value_threshold")
        if p_threshold is not None and result.get("status") == "success":
            try:
                p_threshold = float(p_threshold)
                assocs = result.get("data", [])
                if isinstance(assocs, list):
                    filtered = [
                        a
                        for a in assocs
                        if a.get("p_value") is not None
                        and float(a["p_value"]) <= p_threshold
                    ]
                    result["data"] = filtered
                    result.setdefault("metadata", {})["p_value_filter"] = p_threshold
                    result["metadata"]["filtered_count"] = len(filtered)
                    result["metadata"]["total_before_filter"] = len(assocs)
            except (ValueError, TypeError):
                pass

        self._add_empty_result_note(result, efo_id)
        return result





[文档]
@register_tool("GWASStudySearch")
class GWASStudySearch(GWASRESTTool):
    """Search for GWAS studies by various criteria."""


[文档]
    def __init__(self, tool_config):
        super().__init__(tool_config)
        self.endpoint = "/v2/studies"



[文档]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search for studies with optional filters."""
        params = {}

        disease_trait = self._coerce_str(arguments.get("disease_trait"))
        if disease_trait:
            params["disease_trait"] = disease_trait

        efo_id = self._efo_id_from_uri_or_id(arguments.get("efo_id"))
        if not efo_id:
            efo_id = self._efo_id_from_uri_or_id(arguments.get("efo_uri"))
        if efo_id:
            params["efo_id"] = efo_id

        efo_trait = self._coerce_str(arguments.get("efo_trait"))
        if efo_trait:
            params["efo_trait"] = efo_trait

        cohort = self._coerce_str(arguments.get("cohort"))
        if cohort:
            params["cohort"] = cohort

        if arguments.get("gxe") is not None:
            params["gxe"] = bool(arguments.get("gxe"))
        if arguments.get("full_pvalue_set") is not None:
            params["full_pvalue_set"] = bool(arguments.get("full_pvalue_set"))

        size = self._coerce_int(arguments.get("size"))
        if size is not None:
            params["size"] = size
        page = self._coerce_int(arguments.get("page"))
        if page is not None:
            params["page"] = page

        data = self._make_request(self.endpoint, params)
        return self._extract_embedded_data(data, "studies")





[文档]
@register_tool("GWASSNPSearch")
class GWASSNPSearch(GWASRESTTool):
    """Search for GWAS single nucleotide polymorphisms (SNPs)."""


[文档]
    def __init__(self, tool_config):
        super().__init__(tool_config)
        self.endpoint = "/v2/single-nucleotide-polymorphisms"



[文档]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search for SNPs with optional filters."""
        params = {}

        rs_id = arguments.get("rs_id") or arguments.get("rsid")
        if rs_id:
            params["rs_id"] = rs_id
        if "mapped_gene" in arguments:
            params["mapped_gene"] = arguments["mapped_gene"]
        if "size" in arguments:
            params["size"] = arguments["size"]
        if "page" in arguments:
            params["page"] = arguments["page"]

        data = self._make_request(self.endpoint, params)
        return self._extract_embedded_data(data, "snps")




# Get by ID tools

[文档]
@register_tool("GWASAssociationByID")
class GWASAssociationByID(GWASRESTTool):
    """Get a specific GWAS association by its ID."""


[文档]
    def __init__(self, tool_config):
        super().__init__(tool_config)
        self.endpoint = "/v2/associations"



[文档]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get association by ID."""
        if "association_id" not in arguments:
            return {"status": "error", "error": "association_id is required"}

        association_id = arguments["association_id"]
        return self._make_request(f"{self.endpoint}/{association_id}")





[文档]
@register_tool("GWASStudyByID")
class GWASStudyByID(GWASRESTTool):
    """Get a specific GWAS study by its ID."""


[文档]
    def __init__(self, tool_config):
        super().__init__(tool_config)
        self.endpoint = "/v2/studies"



[文档]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get study by ID."""
        if "study_id" not in arguments:
            return {"status": "error", "error": "study_id is required"}

        study_id = arguments["study_id"]
        return self._make_request(f"{self.endpoint}/{study_id}")





[文档]
@register_tool("GWASSNPByID")
class GWASSNPByID(GWASRESTTool):
    """Get a specific GWAS SNP by its rs ID."""


[文档]
    def __init__(self, tool_config):
        super().__init__(tool_config)
        self.endpoint = "/v2/single-nucleotide-polymorphisms"



[文档]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get SNP by rs ID."""
        if "rs_id" not in arguments:
            return {"status": "error", "error": "rs_id is required"}

        rs_id = arguments["rs_id"]
        return self._make_request(f"{self.endpoint}/{rs_id}")




# Specialized search tools based on common use cases from examples

[文档]
@register_tool("GWASVariantsForTrait")
class GWASVariantsForTrait(GWASRESTTool):
    """Get all variants associated with a specific trait."""


[文档]
    def __init__(self, tool_config):
        super().__init__(tool_config)
        self.endpoint = "/v2/associations"



[文档]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get variants for a trait with pagination support."""
        disease_trait = self._coerce_str(
            arguments.get("disease_trait") or arguments.get("trait")
        )
        efo_id = self._efo_id_from_uri_or_id(
            arguments.get("efo_id")
        ) or self._efo_id_from_uri_or_id(arguments.get("efo_uri"))
        efo_trait = self._coerce_str(arguments.get("efo_trait"))

        if disease_trait and not efo_id and _EFO_ID_RE.match(disease_trait):
            efo_id = self._efo_id_from_uri_or_id(disease_trait)
            disease_trait = None

        # /v2/associations ignores disease_trait — resolve to efo_id
        resolution = self._resolve_trait_or_error(disease_trait, efo_id)
        if "error" in resolution:
            return resolution["error"]
        efo_id = resolution["efo_id"]

        if not disease_trait and not efo_id and not efo_trait:
            return {
                "status": "error",
                "error": "Provide at least one of: disease_trait, efo_id (or efo_uri), efo_trait.",
            }

        page_size = (
            self._coerce_int(arguments.get("size") or arguments.get("limit")) or 200
        )
        params: Dict[str, Any] = {
            "size": page_size,
            "page": self._coerce_int(arguments.get("page")) or 0,
        }
        if efo_id:
            params["efo_id"] = efo_id
        elif efo_trait:
            params["efo_trait"] = efo_trait

        data = self._make_request(self.endpoint, params)
        result = self._extract_embedded_data(data, "associations")
        if efo_id and disease_trait:
            result["resolved_efo_id"] = efo_id
        self._add_empty_result_note(result, efo_id)
        return result





[文档]
@register_tool("GWASAssociationsForTrait")
class GWASAssociationsForTrait(GWASRESTTool):
    """Get all associations for a specific trait, sorted by p-value."""


[文档]
    def __init__(self, tool_config):
        super().__init__(tool_config)
        self.endpoint = "/v2/associations"



[文档]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get associations for a trait, sorted by significance."""
        disease_trait = self._coerce_str(
            arguments.get("disease_trait") or arguments.get("trait")
        )
        efo_id = self._efo_id_from_uri_or_id(
            arguments.get("efo_id")
        ) or self._efo_id_from_uri_or_id(arguments.get("efo_uri"))
        efo_trait = self._coerce_str(arguments.get("efo_trait"))

        if disease_trait and not efo_id and _EFO_ID_RE.match(disease_trait):
            efo_id = self._efo_id_from_uri_or_id(disease_trait)
            disease_trait = None

        # /v2/associations ignores disease_trait — resolve to efo_id
        resolution = self._resolve_trait_or_error(disease_trait, efo_id)
        if "error" in resolution:
            return resolution["error"]
        efo_id = resolution["efo_id"]

        if not disease_trait and not efo_id and not efo_trait:
            return {
                "status": "error",
                "error": "Provide at least one of: disease_trait, efo_id (or efo_uri), efo_trait.",
            }

        params: Dict[str, Any] = {
            "sort": "p_value",
            "direction": "asc",
            "size": arguments.get("size", 40),
            "page": arguments.get("page", 0),
        }
        if efo_id:
            params["efo_id"] = efo_id
        elif efo_trait:
            params["efo_trait"] = efo_trait

        data = self._make_request(self.endpoint, params)
        result = self._extract_embedded_data(data, "associations")
        if efo_id and disease_trait:
            result["resolved_efo_id"] = efo_id
        self._add_empty_result_note(result, efo_id)
        return result





[文档]
@register_tool("GWASAssociationsForSNP")
class GWASAssociationsForSNP(GWASRESTTool):
    """Get all associations for a specific SNP."""


[文档]
    def __init__(self, tool_config):
        super().__init__(tool_config)
        self.endpoint = "/v2/associations"



[文档]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get associations for a SNP."""
        rs_id = self._coerce_str(arguments.get("rs_id"))
        if not rs_id:
            return {"status": "error", "error": "rs_id is required"}

        params = {
            "rs_id": rs_id,
            "size": self._coerce_int(arguments.get("size")) or 200,
            "page": self._coerce_int(arguments.get("page")) or 0,
        }

        sort = self._coerce_str(arguments.get("sort"))
        if sort:
            params["sort"] = sort
        direction = self._coerce_str(arguments.get("direction"))
        if direction:
            params["direction"] = direction

        data = self._make_request(self.endpoint, params)
        return self._extract_embedded_data(data, "associations")





[文档]
@register_tool("GWASStudiesForTrait")
class GWASStudiesForTrait(GWASRESTTool):
    """Get studies for a specific trait with optional filters."""


[文档]
    def __init__(self, tool_config):
        super().__init__(tool_config)
        self.endpoint = "/v2/studies"



[文档]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get studies for a trait with optional filters."""
        disease_trait = self._coerce_str(arguments.get("disease_trait"))
        efo_id = self._efo_id_from_uri_or_id(
            arguments.get("efo_id")
        ) or self._efo_id_from_uri_or_id(arguments.get("efo_uri"))
        efo_trait = self._coerce_str(arguments.get("efo_trait"))
        if not disease_trait and not efo_id and not efo_trait:
            return {
                "status": "error",
                "error": "Provide at least one of: disease_trait, efo_id (or efo_uri), efo_trait.",
            }

        params = {
            "size": self._coerce_int(arguments.get("size")) or 200,
            "page": self._coerce_int(arguments.get("page")) or 0,
        }

        if disease_trait:
            params["disease_trait"] = disease_trait
        if efo_id:
            params["efo_id"] = efo_id
        if efo_trait:
            params["efo_trait"] = efo_trait

        cohort = self._coerce_str(arguments.get("cohort"))
        if cohort:
            params["cohort"] = cohort
        if arguments.get("gxe") is not None:
            params["gxe"] = bool(arguments.get("gxe"))
        if arguments.get("full_pvalue_set") is not None:
            params["full_pvalue_set"] = bool(arguments.get("full_pvalue_set"))

        data = self._make_request(self.endpoint, params)
        return self._extract_embedded_data(data, "studies")





[文档]
@register_tool("GWASSNPsForGene")
class GWASSNPsForGene(GWASRESTTool):
    """Get SNPs mapped to a specific gene."""


[文档]
    def __init__(self, tool_config):
        super().__init__(tool_config)
        # Feature-83B-001: v2 /single-nucleotide-polymorphisms?mapped_gene= returns
        # HTTP 500 for all gene queries. The v1 endpoint
        # /singleNucleotidePolymorphisms/search/findByGene?geneName= works correctly.
        self.endpoint = "/singleNucleotidePolymorphisms/search/findByGene"



[文档]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get SNPs for a gene."""
        gene = (
            arguments.get("gene_symbol")
            or arguments.get("mapped_gene")
            or arguments.get("gene")
        )
        if not gene:
            return {"status": "error", "error": "gene_symbol is required"}

        params = {
            "geneName": gene,
            "size": arguments.get("size", 50),
            "page": arguments.get("page", 0),
        }

        data = self._make_request(self.endpoint, params)
        # v1 endpoint returns key "singleNucleotidePolymorphisms", not "snps"
        return self._extract_embedded_data(data, "singleNucleotidePolymorphisms")





[文档]
@register_tool("GWASAssociationsForStudy")
class GWASAssociationsForStudy(GWASRESTTool):
    """Get all associations for a specific study."""


[文档]
    def __init__(self, tool_config):
        super().__init__(tool_config)
        self.endpoint = "/v2/associations"



[文档]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get associations for a study."""
        if "accession_id" not in arguments:
            return {"status": "error", "error": "accession_id is required"}

        params = {
            "accession_id": arguments["accession_id"],
            "sort": "p_value",
            "direction": "asc",
            "size": arguments.get("size", 200),
            "page": arguments.get("page", 0),
        }

        data = self._make_request(self.endpoint, params)
        return self._extract_embedded_data(data, "associations")