# Source code for tooluniverse.alphamissense_tool

# alphamissense_tool.py
"""
AlphaMissense API tool for ToolUniverse.

AlphaMissense is DeepMind's deep learning model for predicting the pathogenicity
of missense variants. It provides pathogenicity classifications for ~71 million
possible single amino acid substitutions in the human proteome.

Classifications:
- Pathogenic: score > 0.564
- Ambiguous: 0.34 <= score <= 0.564
- Benign: score < 0.34

API Documentation: https://alphamissense.hegelab.org/
Data Source: Cheng et al., Science 2023
"""

from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from typing import Dict, Any, List, Optional
from .base_tool import BaseTool
from .tool_registry import register_tool

# Base URL for AlphaMissense API (hegelab.org)
ALPHAMISSENSE_BASE_URL = "https://alphamissense.hegelab.org"
UNIPROT_FASTA_URL = "https://rest.uniprot.org/uniprotkb/{accession}.fasta"


@register_tool("AlphaMissenseTool")
class AlphaMissenseTool(BaseTool):
    """
    Tool for querying AlphaMissense pathogenicity predictions.

    AlphaMissense uses deep learning trained on evolutionary data to predict
    the pathogenicity of all possible single amino acid substitutions in
    human proteins.

    Classification thresholds:
    - Pathogenic: score > 0.564
    - Ambiguous: 0.34 <= score <= 0.564
    - Benign: score < 0.34

    No authentication required. Free for academic/research use.

    The operation performed by ``run`` is selected once, at construction
    time, from ``tool_config["fields"]["operation"]``.
    """

    # Classification thresholds from the AlphaMissense paper
    PATHOGENIC_THRESHOLD = 0.564
    BENIGN_THRESHOLD = 0.34

    def __init__(self, tool_config: Dict[str, Any]):
        """
        Read the request timeout and the operation name from the tool config.

        Args:
            tool_config: Tool configuration. ``timeout`` (seconds, default 30)
                and ``fields.operation`` (default ``"get_protein_scores"``)
                are the only keys read here.
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 30)
        self.operation = tool_config.get("fields", {}).get(
            "operation", "get_protein_scores"
        )

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Execute the configured AlphaMissense operation.

        Args:
            arguments: Operation arguments. ``None`` is treated as an empty
                dict so that the per-operation "parameter is required" error
                is returned instead of raising ``AttributeError``.

        Returns:
            A dict with ``status`` set to ``"success"`` or ``"error"``.
        """
        arguments = arguments or {}
        operation = self.operation

        if operation == "get_protein_scores":
            return self._get_protein_scores(arguments)
        elif operation == "get_variant_score":
            return self._get_variant_score(arguments)
        elif operation == "get_residue_scores":
            return self._get_residue_scores(arguments)
        else:
            return {"status": "error", "error": f"Unknown operation: {operation}"}

    def _classify_score(self, score: float) -> str:
        """Classify pathogenicity based on AlphaMissense thresholds."""
        if score > self.PATHOGENIC_THRESHOLD:
            return "pathogenic"
        elif score < self.BENIGN_THRESHOLD:
            return "benign"
        else:
            return "ambiguous"

    def _threshold_summary(self) -> Dict[str, str]:
        """Return the human-readable threshold description used in results."""
        return {
            "pathogenic": f"> {self.PATHOGENIC_THRESHOLD}",
            "ambiguous": f"{self.BENIGN_THRESHOLD} - {self.PATHOGENIC_THRESHOLD}",
            "benign": f"< {self.BENIGN_THRESHOLD}",
        }

    def _fetch_protein_length(self, uniprot_id: str) -> Optional[int]:
        """
        Get protein length from UniProt FASTA.

        Returns:
            The number of sequence characters after the FASTA header line, or
            ``None`` when the request fails, the status is not 200, the body
            does not start with a ``>`` header, or the sequence is empty.
        """
        try:
            r = requests.get(UNIPROT_FASTA_URL.format(accession=uniprot_id), timeout=15)
            if r.status_code != 200:
                return None
        except requests.exceptions.RequestException:
            return None

        lines = [ln.strip() for ln in r.text.splitlines() if ln.strip()]
        # An empty or non-FASTA body carries no usable sequence. Reporting a
        # length of 0 would make the caller "try 0 positions" and emit a
        # misleading error, so signal failure instead.
        if not lines or not lines[0].startswith(">"):
            return None
        length = len("".join(lines[1:]))
        return length if length > 0 else None

    def _fetch_single_residue(
        self, uniprot_id: str, position: int
    ) -> Optional[Dict[str, Any]]:
        """Fetch one residue's AlphaMissense scores; returns None on error."""
        try:
            r = requests.get(
                f"{ALPHAMISSENSE_BASE_URL}/hotspotapi",
                params={"uid": uniprot_id, "resi": position},
                timeout=self.timeout,
            )
            if r.status_code != 200:
                return None
            return r.json()
        except (requests.exceptions.RequestException, ValueError):
            # ValueError covers a 200 response whose body is not valid JSON.
            return None

    def _get_protein_scores(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Get AlphaMissense scores for every residue of a protein.

        The hegelab AlphaMissense API only supports per-residue queries —
        there is no whole-protein dump endpoint. This method abstracts that by:
          1. Looking up the protein length from UniProt FASTA
          2. Fetching all residues concurrently via a thread pool
          3. Returning the aggregated per-position records

        Set max_residues to cap the loop if you only want a partial protein
        (e.g. for fast diagnostic calls). max_residues=0 (the default) means
        no cap — fetch every residue. Unparsable or negative values are also
        treated as "no cap".
        """
        uniprot_id = arguments.get("uniprot_id")
        if not uniprot_id:
            return {"status": "error", "error": "uniprot_id parameter is required"}

        max_residues = arguments.get("max_residues", 0)
        try:
            max_residues = int(max_residues) if max_residues else 0
        except (TypeError, ValueError):
            max_residues = 0
        if max_residues < 0:
            # A negative cap used as a slice bound would silently drop
            # residues from the end of the protein.
            max_residues = 0

        # 1. Protein length from UniProt
        protein_length = self._fetch_protein_length(uniprot_id)
        if protein_length is None:
            return {
                "status": "error",
                "error": (
                    f"Could not fetch UniProt sequence length for '{uniprot_id}'. "
                    f"Verify the accession is correct."
                ),
            }

        positions = list(range(1, protein_length + 1))
        if max_residues and max_residues < len(positions):
            positions = positions[:max_residues]

        # 2. Concurrent per-residue fetch
        scores: List[Optional[Dict[str, Any]]] = [None] * len(positions)
        with ThreadPoolExecutor(max_workers=20) as executor:
            future_to_idx = {
                executor.submit(self._fetch_single_residue, uniprot_id, p): i
                for i, p in enumerate(positions)
            }
            for fut in as_completed(future_to_idx):
                idx = future_to_idx[fut]
                try:
                    scores[idx] = fut.result()
                except Exception:
                    # Best effort: one failed position must not abort the rest.
                    scores[idx] = None

        n_fetched = sum(1 for s in scores if s is not None)
        if n_fetched == 0:
            pdb_url = f"{ALPHAMISSENSE_BASE_URL}/pdb/AF-{uniprot_id}-F1-AM_v4.pdb"
            return {
                "status": "error",
                "error": (
                    f"No AlphaMissense data found for '{uniprot_id}' "
                    f"(tried {len(positions)} positions, all returned non-200). "
                    f"The protein may not be in the AlphaMissense database."
                ),
                "pdb_download": pdb_url,
            }

        # 3. Aggregate into per-position list (drop Nones from failed positions)
        per_position = []
        for p, s in zip(positions, scores):
            if s is None:
                continue
            if isinstance(s, dict):
                per_position.append({"position": p, **s})
            else:
                # A non-dict JSON payload cannot be merged with **; keep it
                # under its own key rather than raising TypeError.
                per_position.append({"position": p, "scores": s})

        return {
            "status": "success",
            "data": {
                "uniprot_id": uniprot_id,
                "protein_length": protein_length,
                "scores": per_position,
                "n_positions_returned": n_fetched,
                "n_positions_attempted": len(positions),
                "max_residues_cap": max_residues if max_residues else None,
                "thresholds": self._threshold_summary(),
                "pdb_download": f"{ALPHAMISSENSE_BASE_URL}/pdb/AF-{uniprot_id}-F1-AM_v4.pdb",
            },
        }

    def _get_variant_score(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Get AlphaMissense pathogenicity score for a specific variant.

        Variant format: p.X123Y where X is reference amino acid, 123 is
        position, and Y is the variant amino acid. Only single-letter amino
        acid codes are accepted; the whole string must match, so input such
        as ``R123Gln`` is rejected rather than being read as ``R123G``.
        """
        import re

        uniprot_id = arguments.get("uniprot_id")
        variant = arguments.get("variant")

        if not uniprot_id:
            return {"status": "error", "error": "uniprot_id parameter is required"}
        if not variant:
            return {
                "status": "error",
                "error": "variant parameter is required (e.g., 'p.R123H' or 'R123H')",
            }

        # Parse variant notation: strip only a leading "p." and normalize case.
        variant_clean = str(variant).strip()
        if variant_clean[:2].lower() == "p.":
            variant_clean = variant_clean[2:]
        match = re.fullmatch(r"([A-Z])(\d+)([A-Z])", variant_clean.upper())
        if not match:
            return {
                "status": "error",
                "error": f"Invalid variant format: {variant}. Expected format: p.X123Y or X123Y (e.g., p.R123H)",
            }

        ref_aa = match.group(1)
        position = int(match.group(2))
        alt_aa = match.group(3)
        if position < 1:
            return {
                "status": "error",
                "error": f"Invalid variant position in {variant}: residue positions start at 1",
            }

        try:
            # Query the API
            url = f"{ALPHAMISSENSE_BASE_URL}/hotspotapi"
            params = {"uid": uniprot_id, "resi": position}

            response = requests.get(url, params=params, timeout=self.timeout)

            if response.status_code == 404:
                return {
                    "status": "success",
                    "data": None,
                    "message": f"No AlphaMissense data found for {uniprot_id} position {position}",
                }

            response.raise_for_status()
            data = response.json()

            # Look for the specific variant in the response
            score = None
            if isinstance(data, dict):
                # API may return different formats
                scores = data.get("scores", data.get("data", {}))
                if isinstance(scores, dict):
                    score = scores.get(alt_aa)
                elif isinstance(scores, list):
                    for item in scores:
                        if not isinstance(item, dict):
                            continue
                        if item.get("aa") == alt_aa or item.get("variant") == alt_aa:
                            score = item.get("score", item.get("am_pathogenicity"))
                            break

            if score is not None:
                # float() lets a numeric string from the API be classified.
                classification = self._classify_score(float(score))
                return {
                    "status": "success",
                    "data": {
                        "uniprot_id": uniprot_id,
                        "variant": f"p.{ref_aa}{position}{alt_aa}",
                        "position": position,
                        "reference_aa": ref_aa,
                        "variant_aa": alt_aa,
                        "pathogenicity_score": score,
                        "classification": classification,
                        "thresholds": self._threshold_summary(),
                    },
                }
            else:
                return {
                    "status": "success",
                    "data": {
                        "uniprot_id": uniprot_id,
                        "variant": f"p.{ref_aa}{position}{alt_aa}",
                        "raw_response": data,
                        "message": "Score extraction requires parsing API response format",
                    },
                }

        except requests.exceptions.Timeout:
            return {
                "status": "error",
                "error": f"AlphaMissense API timeout after {self.timeout}s",
            }
        except requests.exceptions.RequestException as e:
            return {
                "status": "error",
                "error": f"AlphaMissense API request failed: {str(e)}",
            }
        except Exception as e:
            return {"status": "error", "error": f"Unexpected error: {str(e)}"}

    def _get_residue_scores(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Get AlphaMissense scores for all possible substitutions at a specific residue.

        Requires ``uniprot_id`` and a positive integer ``position``. The API's
        JSON payload is returned unmodified under ``data["scores"]``.
        """
        uniprot_id = arguments.get("uniprot_id")
        position = arguments.get("position")

        if not uniprot_id:
            return {"status": "error", "error": "uniprot_id parameter is required"}
        if position is None or position == "":
            return {"status": "error", "error": "position parameter is required"}

        try:
            position = int(position)
        except (ValueError, TypeError):
            return {"status": "error", "error": "position must be an integer"}
        if position < 1:
            # Residue numbering in this tool starts at 1 (see the position
            # range built in _get_protein_scores).
            return {"status": "error", "error": "position must be a positive integer"}

        try:
            url = f"{ALPHAMISSENSE_BASE_URL}/hotspotapi"
            params = {"uid": uniprot_id, "resi": position}

            response = requests.get(url, params=params, timeout=self.timeout)

            if response.status_code == 404:
                return {
                    "status": "success",
                    "data": None,
                    "message": f"No AlphaMissense data found for {uniprot_id} position {position}",
                }

            response.raise_for_status()
            data = response.json()

            return {
                "status": "success",
                "data": {
                    "uniprot_id": uniprot_id,
                    "position": position,
                    "scores": data,
                    "thresholds": self._threshold_summary(),
                },
            }

        except requests.exceptions.Timeout:
            return {
                "status": "error",
                "error": f"AlphaMissense API timeout after {self.timeout}s",
            }
        except requests.exceptions.RequestException as e:
            return {
                "status": "error",
                "error": f"AlphaMissense API request failed: {str(e)}",
            }
        except Exception as e:
            return {"status": "error", "error": f"Unexpected error: {str(e)}"}