Source code for tooluniverse.ensembl_ld_tool
# ensembl_ld_tool.py
"""
Ensembl REST API Linkage Disequilibrium (LD) tool for ToolUniverse.
Provides linkage disequilibrium data from the Ensembl REST API using
1000 Genomes Phase 3 population data. LD measures the non-random
association of alleles at different genetic loci and is essential
for GWAS interpretation, fine-mapping, and population genetics.
API: https://rest.ensembl.org/
Endpoints: /ld/:species/:id/:population_name
           /ld/:species/pairwise/:id1/:id2
No authentication required. Rate limit: 15 requests/second.
"""
import requests
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool
# Root URL of the Ensembl REST service; endpoint paths are appended to it.
ENSEMBL_BASE_URL = "https://rest.ensembl.org"
# Headers sent with every request in this module: identify the client and
# ask the server for a JSON response.
ENSEMBL_HEADERS = {"User-Agent": "ToolUniverse/1.0", "Accept": "application/json"}
[docs]
@register_tool("EnsemblLDTool")
class EnsemblLDTool(BaseTool):
    """
    Tool for querying linkage disequilibrium data from Ensembl REST API.

    Provides LD statistics (r2, D') between variants using 1000 Genomes
    Phase 3 population data across 26 populations.
    No authentication required.

    The ``fields.endpoint_type`` entry of the tool config selects the mode:

    * ``"ld_variants"`` (default) -- variants in LD with one query variant
      in one population.
    * ``"ld_pairwise"`` -- LD between two variants, one row per population.

    Every call returns a plain dict: ``{"data": ..., "metadata": ...}`` on
    success, or ``{"error": "<message>"}`` on failure.
    """

    # Maximum number of LD entries returned by the "ld_variants" endpoint.
    _MAX_LD_VARIANTS = 200

    def __init__(self, tool_config: Dict[str, Any]):
        """
        Read the request timeout and endpoint type from *tool_config*.

        Args:
            tool_config: Tool configuration. ``timeout`` (seconds, default
                30) and ``fields.endpoint_type`` (default ``"ld_variants"``)
                are read here; the rest is passed to the base class.
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 30)
        self.endpoint_type = tool_config.get("fields", {}).get(
            "endpoint_type", "ld_variants"
        )

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Execute the Ensembl LD API call.

        Args:
            arguments: Endpoint-specific arguments (see ``_ld_variants`` and
                ``_ld_pairwise``).

        Returns:
            The endpoint result, or ``{"error": ...}`` if the request timed
            out, could not connect, returned an HTTP error status, or failed
            in any other way. No exception propagates to the caller.
        """
        try:
            return self._dispatch(arguments)
        except requests.exceptions.Timeout:
            return {
                "error": f"Ensembl LD API request timed out after {self.timeout} seconds"
            }
        except requests.exceptions.ConnectionError:
            return {
                "error": "Failed to connect to Ensembl REST API. Check network connectivity."
            }
        except requests.exceptions.HTTPError as e:
            # Compare against None explicitly: a requests Response is falsy
            # for 4xx/5xx status codes, so a truthiness test would always
            # report "unknown" for exactly the responses handled here.
            status = e.response.status_code if e.response is not None else "unknown"
            return {"error": f"Ensembl REST API HTTP error: {status}"}
        except Exception as e:
            # Boundary handler: callers expect an error dict, never a raise.
            return {"error": f"Unexpected error querying Ensembl LD: {str(e)}"}

    def _dispatch(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Route to the handler matching ``self.endpoint_type``."""
        if self.endpoint_type == "ld_variants":
            return self._ld_variants(arguments)
        if self.endpoint_type == "ld_pairwise":
            return self._ld_pairwise(arguments)
        return {"error": f"Unknown endpoint_type: {self.endpoint_type}"}

    @staticmethod
    def _to_float(value: Any) -> float:
        """Coerce one LD statistic to float; unparsable values become 0.0."""
        try:
            return float(value)
        except (ValueError, TypeError):
            return 0.0

    def _fetch_entries(self, url: str, params: Dict[str, Any]) -> list:
        """
        GET *url* and return the dict elements of its JSON list payload.

        A payload that is not a list yields an empty list, and list elements
        that are not dicts are skipped so one malformed element cannot fail
        the whole call.

        Raises:
            requests.exceptions.HTTPError: On a 4xx/5xx response.
            requests.exceptions.RequestException: On transport failures.
        """
        response = requests.get(
            url, params=params, headers=ENSEMBL_HEADERS, timeout=self.timeout
        )
        response.raise_for_status()
        payload = response.json()
        if not isinstance(payload, list):
            return []
        return [entry for entry in payload if isinstance(entry, dict)]

    def _ld_variants(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Get all variants in LD with a query variant in a population.

        Args:
            arguments: ``variant_id`` and ``population`` are required;
                ``r2_threshold`` and ``d_prime_threshold`` are optional and
                forwarded as the ``r2`` / ``d_prime`` query parameters.

        Returns:
            ``{"data": ..., "metadata": ...}`` where ``data["ld_variants"]``
            is sorted by r2 descending and capped at ``_MAX_LD_VARIANTS``
            entries (``ld_count`` is the number of entries returned), or
            ``{"error": ...}`` when a required argument is missing.
        """
        variant_id = arguments.get("variant_id", "")
        population = arguments.get("population", "")
        r2_threshold = arguments.get("r2_threshold", None)
        d_prime_threshold = arguments.get("d_prime_threshold", None)

        if not variant_id:
            return {"error": "variant_id parameter is required (e.g., 'rs1042779')"}
        if not population:
            return {
                "error": "population parameter is required (e.g., '1000GENOMES:phase_3:CEU')"
            }

        url = f"{ENSEMBL_BASE_URL}/ld/human/{variant_id}/{population}"
        params: Dict[str, Any] = {"content-type": "application/json"}
        if r2_threshold is not None:
            params["r2"] = r2_threshold
        if d_prime_threshold is not None:
            params["d_prime"] = d_prime_threshold

        # r2 and d_prime are parsed independently so that one malformed
        # value does not discard the other.
        ld_variants = [
            {
                "variant1": entry.get("variation1", ""),
                "variant2": entry.get("variation2", ""),
                "r2": self._to_float(entry.get("r2", 0)),
                "d_prime": self._to_float(entry.get("d_prime", 0)),
                "population_name": entry.get("population_name", population),
            }
            for entry in self._fetch_entries(url, params)
        ]

        # Strongest associations first, then keep only the top entries.
        ld_variants.sort(key=lambda item: item["r2"], reverse=True)
        ld_variants = ld_variants[: self._MAX_LD_VARIANTS]

        result = {
            "query_variant": variant_id,
            "population": population,
            "ld_count": len(ld_variants),
            "ld_variants": ld_variants,
        }
        return {
            "data": result,
            "metadata": {
                "source": "Ensembl REST API",
                "query": f"{variant_id} in {population}",
                "endpoint": "ld",
            },
        }

    def _ld_pairwise(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Get pairwise LD statistics between two variants across populations.

        Args:
            arguments: ``variant1`` and ``variant2`` are both required.

        Returns:
            ``{"data": ..., "metadata": ...}`` where
            ``data["ld_by_population"]`` holds one r2 / d_prime row per
            population, sorted by population name, or ``{"error": ...}``
            when a required argument is missing.
        """
        variant1 = arguments.get("variant1", "")
        variant2 = arguments.get("variant2", "")

        if not variant1:
            return {"error": "variant1 parameter is required (e.g., 'rs6792369')"}
        if not variant2:
            return {"error": "variant2 parameter is required (e.g., 'rs1042779')"}

        url = f"{ENSEMBL_BASE_URL}/ld/human/pairwise/{variant1}/{variant2}"
        params = {"content-type": "application/json"}

        ld_by_pop = [
            {
                # A null name is normalised to "" so the sort below never
                # has to compare None with str.
                "population_name": entry.get("population_name") or "",
                "r2": self._to_float(entry.get("r2", 0)),
                "d_prime": self._to_float(entry.get("d_prime", 0)),
            }
            for entry in self._fetch_entries(url, params)
        ]
        ld_by_pop.sort(key=lambda item: item["population_name"])

        result = {
            "variant1": variant1,
            "variant2": variant2,
            "population_count": len(ld_by_pop),
            "ld_by_population": ld_by_pop,
        }
        return {
            "data": result,
            "metadata": {
                "source": "Ensembl REST API",
                "query": f"{variant1} vs {variant2}",
                "endpoint": "ld/pairwise",
            },
        }