Source code for tooluniverse.impc_tool
# impc_tool.py
"""
IMPC (International Mouse Phenotyping Consortium) Solr API tool for ToolUniverse.
IMPC provides standardized phenotyping data for knockout mouse lines,
covering all protein-coding genes in the mouse genome.
Data includes:
- Gene summaries with production/phenotyping status
- Mammalian Phenotype (MP) ontology annotations from knockout mice
- Statistical results from phenotyping pipelines
- Viability and fertility assessments
API Documentation: https://www.mousephenotype.org/help/programmatic-data-access/
Base URL: https://www.ebi.ac.uk/mi/impc/solr/
No authentication required.
"""
import requests
from typing import Dict, Any, Optional
from .base_tool import BaseTool
from .tool_registry import register_tool
# Root URL of the IMPC Solr service (no trailing slash). Request URLs are built
# from it as "<base>/<core>/select", where <core> is the Solr core being queried.
IMPC_SOLR_BASE = "https://www.ebi.ac.uk/mi/impc/solr"
[docs]
@register_tool("IMPCTool")
class IMPCTool(BaseTool):
    """
    Tool for querying IMPC mouse phenotyping data via the Solr API.

    A tool instance performs a single operation, chosen by the
    ``fields.operation`` entry of its configuration (``get_gene_summary``
    when the entry is absent). ``run`` dispatches on these operation names:

    - ``get_gene_summary``: gene information and phenotyping status
    - ``get_phenotypes_by_gene``: knockout phenotype associations (MP terms)
    - ``search_genes``: gene lookup by symbol, name, synonym or MGI ID
    - ``get_gene_phenotype_hits``: statistical results from standardized
      phenotyping

    Handlers return a dict whose ``status`` key is ``"success"`` or
    ``"error"``.

    No authentication required. Data freely available (CC-BY 4.0).
    """
[docs]
def __init__(self, tool_config: Dict[str, Any]):
    """
    Initialise the tool from its configuration dict.

    Reads two optional settings from ``tool_config``:
    - ``timeout``: HTTP timeout in seconds (30 when absent).
    - ``fields.operation``: name of the operation this instance performs
      (``"get_gene_summary"`` when absent).
    """
    super().__init__(tool_config)
    configured_timeout = tool_config.get("timeout", 30)
    field_settings = tool_config.get("fields", {})
    self.timeout = configured_timeout
    self.operation = field_settings.get("operation", "get_gene_summary")
[docs]
def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """
    Execute the configured IMPC operation.

    The handler for operation ``<name>`` is the method ``_<name>``; it
    receives ``arguments`` unchanged and its result is returned as-is.
    An unrecognised operation yields an error dict instead of raising.
    """
    known_operations = (
        "get_gene_summary",
        "get_phenotypes_by_gene",
        "search_genes",
        "get_gene_phenotype_hits",
    )
    selected = self.operation
    for name in known_operations:
        if selected == name:
            # Look the handler up only once the operation has matched.
            handler = getattr(self, f"_{name}")
            return handler(arguments)
    return {"status": "error", "error": f"Unknown operation: {selected}"}
[docs]
def _solr_query(
    self,
    core: str,
    query: str,
    rows: int = 50,
    fields: Optional[str] = None,
    filter_query: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Execute a Solr ``select`` query against one IMPC core.

    Args:
        core: Name of the Solr core; inserted into the request URL.
        query: Solr query string, sent as the ``q`` parameter.
        rows: Maximum number of documents to request. Values above 500
            are capped at 500; values that are not non-negative integers
            are rejected with an error result.
        fields: Optional comma-separated field list, sent as ``fl``.
        filter_query: Optional filter query, sent as ``fq``.

    Returns:
        ``{"status": "success", "response": <dict>}`` where ``response`` is
        the ``response`` object of the JSON payload (empty dict if absent),
        or ``{"status": "error", "error": <message>}``. Validation and
        request failures are reported in the result rather than raised.
    """
    # Validate up front: callers forward user-supplied limits, and a None or
    # non-numeric value would otherwise raise before the error handling below.
    try:
        row_limit = int(rows)
    except (TypeError, ValueError):
        row_limit = -1
    if row_limit < 0:
        return {
            "status": "error",
            "error": f"rows must be a non-negative integer, got {rows!r}",
        }

    url = f"{IMPC_SOLR_BASE}/{core}/select"
    params = {
        "q": query,
        "rows": min(row_limit, 500),
        "wt": "json",
    }
    if fields:
        params["fl"] = fields
    if filter_query:
        params["fq"] = filter_query

    try:
        response = requests.get(url, params=params, timeout=self.timeout)
        response.raise_for_status()
        data = response.json()
        return {
            "status": "success",
            "response": data.get("response", {}),
        }
    except requests.exceptions.Timeout:
        return {
            "status": "error",
            "error": f"IMPC Solr API timeout after {self.timeout}s",
        }
    except requests.exceptions.HTTPError as e:
        # An exception raised inside this handler would escape the method,
        # so do not assume the error carries a response object.
        status_code = getattr(e.response, "status_code", "unknown")
        return {
            "status": "error",
            "error": f"IMPC API HTTP error: {status_code}",
        }
    except requests.exceptions.RequestException as e:
        return {"status": "error", "error": f"IMPC API request failed: {str(e)}"}
    except Exception as e:
        return {"status": "error", "error": f"Unexpected error: {str(e)}"}
[docs]
def _get_gene_summary(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """
    Get a gene summary from the IMPC 'gene' core.

    Args:
        arguments: Dict with ``gene_symbol`` and/or ``mgi_id``. When both
            are given, ``mgi_id`` is used for the lookup.

    Returns:
        An error dict when neither identifier is supplied or the query
        fails. Otherwise a success dict whose ``data`` is built from the
        first matching document, or ``None`` (with an explanatory
        ``message``) when nothing matched.
    """
    gene_symbol = arguments.get("gene_symbol", "")
    mgi_id = arguments.get("mgi_id", "")
    if not gene_symbol and not mgi_id:
        return {
            "status": "error",
            "error": "Either gene_symbol or mgi_id is required",
        }

    def quoted(value: Any) -> str:
        # Escape backslashes and double quotes so the value cannot terminate
        # the Solr phrase early and inject extra query clauses.
        escaped = str(value).replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'

    if mgi_id:
        query = f"mgi_accession_id:{quoted(mgi_id)}"
    else:
        query = f"marker_symbol:{quoted(gene_symbol)}"

    fields = (
        "mgi_accession_id,marker_symbol,marker_name,marker_synonym,"
        "marker_type,human_gene_symbol,status,imits_phenotype_started,"
        "imits_phenotype_complete,imits_phenotype_status,"
        "latest_phenotype_status,latest_production_status,"
        "latest_phenotyping_centre,latest_production_centre,"
        "allele_name,es_cell_status,mouse_status,phenotype_status,"
        "production_centre,phenotyping_centre,p_value,mp_id,mp_term,"
        "mp_term_synonym,top_level_mp_id,top_level_mp_term,"
        "hp_id,hp_term,disease_id,disease_term,disease_source,"
        "human_curated,mouse_curated,mgi_predicted,impc_predicted,"
        "has_qc,legacy_phenotype_status"
    )
    result = self._solr_query(core="gene", query=query, rows=5, fields=fields)
    if result["status"] != "success":
        return result

    docs = result["response"].get("docs", [])
    num_found = result["response"].get("numFound", 0)
    # Also check docs itself: a response that reports hits but carries no
    # documents must not reach docs[0].
    if num_found == 0 or not docs:
        return {
            "status": "success",
            "data": None,
            "message": f"Gene not found in IMPC: {gene_symbol or mgi_id}. "
            "The gene may not yet be phenotyped by IMPC.",
        }

    gene_data = docs[0]
    # Pair disease IDs with terms positionally; zip stops at the shorter list.
    disease_ids = gene_data.get("disease_id") or []
    disease_terms = gene_data.get("disease_term") or []
    return {
        "status": "success",
        "data": {
            "mgi_id": gene_data.get("mgi_accession_id"),
            "symbol": gene_data.get("marker_symbol"),
            "name": gene_data.get("marker_name"),
            "synonyms": gene_data.get("marker_synonym", []),
            "human_ortholog": gene_data.get("human_gene_symbol", []),
            "marker_type": gene_data.get("marker_type"),
            "phenotype_status": gene_data.get("latest_phenotype_status"),
            "production_status": gene_data.get("latest_production_status"),
            "phenotyping_centre": gene_data.get("latest_phenotyping_centre"),
            "production_centre": gene_data.get("latest_production_centre"),
            "has_phenotype_data": gene_data.get("imits_phenotype_complete") == "1"
            or bool(gene_data.get("mp_id")),
            "mp_terms": gene_data.get("mp_term", []),
            "mp_ids": gene_data.get("mp_id", []),
            "top_level_mp_terms": gene_data.get("top_level_mp_term", []),
            "hp_terms": gene_data.get("hp_term", []),
            "disease_associations": [
                {"disease_id": did, "disease_term": dt}
                for did, dt in zip(disease_ids, disease_terms)
            ],
        },
        "source": "IMPC (International Mouse Phenotyping Consortium)",
    }
[docs]
def _get_phenotypes_by_gene(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """
    Get phenotype annotations for a gene from the 'genotype-phenotype' core.

    Args:
        arguments: Dict with ``gene_symbol`` and/or ``mgi_id`` (``mgi_id``
            wins when both are given) and an optional ``limit`` on the
            number of documents requested (100 when absent or ``None``).

    Returns:
        An error dict when neither identifier is supplied or the query
        fails. Otherwise a success dict whose ``data`` holds one entry per
        distinct ``mp_term_id`` (taken from the first document seen for
        that term) plus the term names grouped by top-level MP term.
        An empty result set yields empty collections, not an exception.
    """
    gene_symbol = arguments.get("gene_symbol", "")
    mgi_id = arguments.get("mgi_id", "")
    limit = arguments.get("limit")
    rows = 100 if limit is None else limit
    if not gene_symbol and not mgi_id:
        return {
            "status": "error",
            "error": "Either gene_symbol or mgi_id is required",
        }

    def quoted(value: Any) -> str:
        # Escape backslashes and double quotes so the value cannot terminate
        # the Solr phrase early and inject extra query clauses.
        escaped = str(value).replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'

    if mgi_id:
        query = f"marker_accession_id:{quoted(mgi_id)}"
    else:
        query = f"marker_symbol:{quoted(gene_symbol)}"

    fields = (
        "marker_symbol,marker_accession_id,allele_symbol,allele_accession_id,"
        "mp_term_id,mp_term_name,top_level_mp_term_id,top_level_mp_term_name,"
        "zygosity,sex,life_stage_name,parameter_name,procedure_name,"
        "pipeline_name,phenotyping_center,p_value,effect_size,"
        "statistical_method,resource_name"
    )
    result = self._solr_query(
        core="genotype-phenotype",
        query=query,
        rows=rows,
        fields=fields,
    )
    if result["status"] != "success":
        return result

    docs = result["response"].get("docs", [])
    num_found = result["response"].get("numFound", 0)

    # Deduplicate by MP term: only the first document per term is kept.
    phenotype_map: Dict[str, Dict[str, Any]] = {}
    for doc in docs:
        mp_id = doc.get("mp_term_id")
        if not mp_id or mp_id in phenotype_map:
            continue
        phenotype_map[mp_id] = {
            "mp_term_id": mp_id,
            "mp_term_name": doc.get("mp_term_name"),
            "top_level_mp_term_id": doc.get("top_level_mp_term_id"),
            "top_level_mp_term_name": doc.get("top_level_mp_term_name"),
            "zygosity": doc.get("zygosity"),
            "sex": doc.get("sex"),
            "life_stage": doc.get("life_stage_name"),
            "procedure": doc.get("procedure_name"),
            "parameter": doc.get("parameter_name"),
            "p_value": doc.get("p_value"),
            "effect_size": doc.get("effect_size"),
            "phenotyping_center": doc.get("phenotyping_center"),
            "allele_symbol": doc.get("allele_symbol"),
        }
    phenotypes = list(phenotype_map.values())

    # Group term names by top-level MP term; the value may arrive as a single
    # string or a list, and terms without one go under "Uncategorized".
    top_level_groups: Dict[str, list] = {}
    for phenotype in phenotypes:
        top_names = phenotype.get("top_level_mp_term_name") or ["Uncategorized"]
        if isinstance(top_names, str):
            top_names = [top_names]
        for top_name in top_names:
            top_level_groups.setdefault(top_name, []).append(
                phenotype["mp_term_name"]
            )

    # Fall back to the first document for identifiers the caller did not
    # supply; with no documents there is nothing to fall back to.
    first_doc = docs[0] if docs else {}
    return {
        "status": "success",
        "data": {
            "gene_symbol": gene_symbol or first_doc.get("marker_symbol", ""),
            "mgi_id": mgi_id or first_doc.get("marker_accession_id", ""),
            "total_phenotype_calls": num_found,
            "unique_phenotypes": len(phenotypes),
            "phenotypes": phenotypes,
            "phenotype_summary_by_system": {
                system: top_level_groups[system]
                for system in sorted(top_level_groups)
            },
        },
        "source": "IMPC genotype-phenotype associations",
    }
[docs]
def _search_genes(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """
    Search the IMPC 'gene' core for genes matching a query string.

    The query is matched as a prefix of ``marker_symbol``, as a substring
    of ``marker_name`` and ``marker_synonym``, and as an exact
    ``mgi_accession_id``.

    Args:
        arguments: Dict with a required ``query`` string and an optional
            ``limit`` on the number of documents requested (20 when absent
            or ``None``).

    Returns:
        An error dict when ``query`` is missing/empty or the request fails.
        Otherwise a success dict whose ``data`` contains the matched genes,
        the number returned (``count``) and the total reported by Solr
        (``total_found``).
    """
    query_str = arguments.get("query", "")
    limit = arguments.get("limit")
    rows = 20 if limit is None else limit
    if not query_str:
        return {"status": "error", "error": "query parameter is required"}

    raw = str(query_str)
    # Backslash-escape query-parser syntax characters so inputs such as
    # "MGI:104537" are treated as literal text instead of producing a
    # malformed query. '*', '?' and whitespace are deliberately left alone so
    # existing wildcard and multi-word inputs behave as before.
    syntax_chars = '\\+-!(){}[]^"~:/&|'
    term = raw.translate(str.maketrans({ch: "\\" + ch for ch in syntax_chars}))
    # Inside a quoted phrase only the backslash and the quote need escaping.
    phrase = raw.replace("\\", "\\\\").replace('"', '\\"')

    # Search across multiple fields
    query = (
        f"marker_symbol:{term}* OR "
        f"marker_name:*{term}* OR "
        f"marker_synonym:*{term}* OR "
        f'mgi_accession_id:"{phrase}"'
    )
    fields = (
        "mgi_accession_id,marker_symbol,marker_name,marker_synonym,"
        "marker_type,human_gene_symbol,latest_phenotype_status,"
        "latest_production_status"
    )
    result = self._solr_query(core="gene", query=query, rows=rows, fields=fields)
    if result["status"] != "success":
        return result

    docs = result["response"].get("docs", [])
    num_found = result["response"].get("numFound", 0)
    genes = [
        {
            "mgi_id": doc.get("mgi_accession_id"),
            "symbol": doc.get("marker_symbol"),
            "name": doc.get("marker_name"),
            "synonyms": doc.get("marker_synonym", []),
            "human_ortholog": doc.get("human_gene_symbol", []),
            "phenotype_status": doc.get("latest_phenotype_status"),
            "production_status": doc.get("latest_production_status"),
        }
        for doc in docs
    ]
    return {
        "status": "success",
        "data": {
            "query": query_str,
            "genes": genes,
            "count": len(genes),
            "total_found": num_found,
        },
        "source": "IMPC gene search",
    }
[docs]
def _get_gene_phenotype_hits(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """
    Get statistical results for a gene from the 'statistical-result' core.

    Args:
        arguments: Dict with ``gene_symbol`` and/or ``mgi_id`` (``mgi_id``
            wins when both are given), an optional ``limit`` on the number
            of documents requested (100 when absent or ``None``) and an
            optional ``significant_only`` flag (default ``True``) that adds
            the filter ``significant:true`` to the query.

    Returns:
        An error dict when neither identifier is supplied or the query
        fails. Otherwise a success dict whose ``data`` lists one hit per
        returned document, with p-value, effect size and the per-sex
        estimates and percentage changes, together with the total reported
        by Solr.
    """
    gene_symbol = arguments.get("gene_symbol", "")
    mgi_id = arguments.get("mgi_id", "")
    limit = arguments.get("limit")
    rows = 100 if limit is None else limit
    significant_only = arguments.get("significant_only", True)
    if not gene_symbol and not mgi_id:
        return {
            "status": "error",
            "error": "Either gene_symbol or mgi_id is required",
        }

    def quoted(value: Any) -> str:
        # Escape backslashes and double quotes so the value cannot terminate
        # the Solr phrase early and inject extra query clauses.
        escaped = str(value).replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'

    if mgi_id:
        query = f"marker_accession_id:{quoted(mgi_id)}"
    else:
        query = f"marker_symbol:{quoted(gene_symbol)}"
    filter_query = "significant:true" if significant_only else None

    fields = (
        "marker_symbol,marker_accession_id,allele_symbol,"
        "mp_term_id,mp_term_name,top_level_mp_term_id,top_level_mp_term_name,"
        "parameter_name,procedure_name,pipeline_name,"
        "zygosity,sex,phenotyping_center,"
        "p_value,effect_size,statistical_method,significant,"
        "female_ko_parameter_estimate,male_ko_parameter_estimate,"
        "female_percentage_change,male_percentage_change,"
        "classification_tag,life_stage_name"
    )
    result = self._solr_query(
        core="statistical-result",
        query=query,
        rows=rows,
        fields=fields,
        filter_query=filter_query,
    )
    if result["status"] != "success":
        return result

    docs = result["response"].get("docs", [])
    num_found = result["response"].get("numFound", 0)
    hits = [
        {
            "parameter": doc.get("parameter_name"),
            "procedure": doc.get("procedure_name"),
            "mp_term_id": doc.get("mp_term_id"),
            "mp_term_name": doc.get("mp_term_name"),
            "top_level_mp_term": doc.get("top_level_mp_term_name"),
            "p_value": doc.get("p_value"),
            "effect_size": doc.get("effect_size"),
            "significant": doc.get("significant"),
            "zygosity": doc.get("zygosity"),
            "sex": doc.get("sex"),
            "life_stage": doc.get("life_stage_name"),
            "statistical_method": doc.get("statistical_method"),
            "classification_tag": doc.get("classification_tag"),
            "female_ko_estimate": doc.get("female_ko_parameter_estimate"),
            "male_ko_estimate": doc.get("male_ko_parameter_estimate"),
            # These two fields are requested from Solr above; surface them
            # instead of discarding them.
            "female_percentage_change": doc.get("female_percentage_change"),
            "male_percentage_change": doc.get("male_percentage_change"),
            "phenotyping_center": doc.get("phenotyping_center"),
            "allele_symbol": doc.get("allele_symbol"),
        }
        for doc in docs
    ]
    return {
        "status": "success",
        "data": {
            "gene_symbol": gene_symbol
            or (docs[0].get("marker_symbol", "") if docs else ""),
            "total_results": num_found,
            "results_returned": len(hits),
            "significant_only": significant_only,
            "hits": hits,
        },
        "source": "IMPC statistical results",
    }