# Source code for tooluniverse.kegg_ext_tool

# kegg_ext_tool.py
"""
KEGG Extended API tool for ToolUniverse.

Provides access to additional KEGG REST API endpoints for gene-pathway links,
pathway gene lists, and compound/metabolite information. Complements the
existing KEGG tools (search, gene info, pathway info, list organisms).

API: https://rest.kegg.jp/
No authentication required. Free public access.
Note: KEGG REST returns tab-separated text, not JSON. This tool parses
the text into structured JSON responses.
"""

import requests
from typing import Dict, Any
from .base_tool import BaseTool


KEGG_BASE_URL = "https://rest.kegg.jp"


def _parse_tsv_column(text: str, column: int = 0) -> list:
    """Extract a single column from KEGG tab-separated text output."""
    results = []
    for line in text.strip().split("\n"):
        parts = line.strip().split("\t")
        if len(parts) > column:
            results.append(parts[column])
    return results


def _fetch_pathway_names(organism: str, timeout: int) -> Dict[str, str]:
    """Return a mapping of pathway ID to pathway name for one organism.

    Issues a GET to ``/list/pathway/<organism>`` and reads the first two
    tab-separated fields of each line as ``id`` and ``name``.

    Args:
        organism: Organism code placed in the URL (e.g. ``hsa``).
        timeout: Timeout value passed to ``requests.get``.

    Returns:
        ``{pathway_id: pathway_name}``; an empty dict when the response
        status is not 200. Lines with fewer than two fields are ignored.
    """
    listing = requests.get(
        f"{KEGG_BASE_URL}/list/pathway/{organism}", timeout=timeout
    )
    # Best-effort lookup: a failed listing simply means "no names known".
    if listing.status_code != 200:
        return {}

    names: Dict[str, str] = {}
    for row in listing.text.strip().split("\n"):
        fields = row.strip().split("\t")
        if len(fields) < 2:
            continue
        names[fields[0]] = fields[1]
    return names


[文档] class KEGGExtTool(BaseTool): """ Tool for KEGG REST API extended endpoints providing gene-pathway links, pathway gene lists, and compound details. No authentication required. """
def __init__(self, tool_config: Dict[str, Any]):
    """Initialise the tool from its configuration mapping.

    Args:
        tool_config: Tool configuration. ``timeout`` (default 30) is used
            as the request timeout; ``fields["endpoint"]`` (default
            ``"get_gene_pathways"``) selects which handler ``_query`` runs.
    """
    super().__init__(tool_config)
    self.timeout = tool_config.get("timeout", 30)
    # The endpoint name lives under the optional "fields" sub-mapping.
    self.endpoint = tool_config.get("fields", {}).get(
        "endpoint", "get_gene_pathways"
    )
def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Execute the configured KEGG call and convert failures to error dicts.

    Args:
        arguments: Endpoint arguments, forwarded unchanged to ``_query``.

    Returns:
        Whatever ``_query`` returns on success; otherwise a dict of the
        form ``{"status": "error", "error": <message>}``. No exception
        derived from ``Exception`` propagates to the caller.
    """

    def failure(message: str) -> Dict[str, Any]:
        return {"status": "error", "error": message}

    try:
        return self._query(arguments)
    # Handler order is significant: the specific requests exceptions must be
    # tried before the catch-all at the bottom.
    except requests.exceptions.Timeout:
        return failure(f"KEGG API timed out after {self.timeout}s")
    except requests.exceptions.ConnectionError:
        return failure("Failed to connect to KEGG REST API")
    except requests.exceptions.HTTPError as http_err:
        if http_err.response is None:
            return failure("KEGG API HTTP error: unknown")
        status = http_err.response.status_code
        if status == 404:
            return failure("KEGG entry not found")
        return failure(f"KEGG API HTTP error: {status}")
    except Exception as exc:
        return failure(f"Unexpected error querying KEGG: {str(exc)}")
[文档] def _query(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Route to appropriate endpoint.""" if self.endpoint == "get_gene_pathways": return self._get_gene_pathways(arguments) elif self.endpoint == "get_pathway_genes": return self._get_pathway_genes(arguments) elif self.endpoint == "get_compound": return self._get_compound(arguments) elif self.endpoint == "list_brite": return self._list_brite(arguments) elif self.endpoint == "get_brite_hierarchy": return self._get_brite_hierarchy(arguments) elif self.endpoint == "search_disease": return self._search_disease(arguments) elif self.endpoint == "get_disease": return self._get_disease(arguments) elif self.endpoint == "get_disease_genes": return self._get_disease_genes(arguments) elif self.endpoint == "search_drug": return self._search_drug(arguments) elif self.endpoint == "get_drug": return self._get_drug(arguments) elif self.endpoint == "get_drug_targets": return self._get_drug_targets(arguments) elif self.endpoint == "search_network": return self._search_network(arguments) elif self.endpoint == "get_network": return self._get_network(arguments) elif self.endpoint == "search_variant": return self._search_variant(arguments) elif self.endpoint == "get_variant": return self._get_variant(arguments) elif self.endpoint == "conv_ids": return self._conv_ids(arguments) elif self.endpoint == "link_entries": return self._link_entries(arguments) else: return {"status": "error", "error": f"Unknown endpoint: {self.endpoint}"}
def _resolve_gene_symbol(self, symbol: str, organism: str = "hsa") -> str:
    """Resolve a gene symbol (e.g. TP53) to a KEGG gene ID (e.g. hsa:7157).

    Queries ``/find/<organism>/<symbol>`` and returns the ID of the first
    result line whose symbol list contains ``symbol`` (case-insensitive
    exact match).

    Args:
        symbol: Gene symbol to look up.
        organism: Organism code used in the URL.

    Returns:
        The matching KEGG gene ID, or ``""`` when the request does not
        return status 200, the body is blank, or no line matches.
    """
    found = requests.get(
        f"{KEGG_BASE_URL}/find/{organism}/{symbol}", timeout=self.timeout
    )
    body = found.text.strip() if found.status_code == 200 else ""
    if not body:
        return ""

    wanted = symbol.upper()
    for row in body.split("\n"):
        columns = row.split("\t")
        if len(columns) < 2:
            continue
        # Annotation format: "SYMBOL1, SYN2, SYN3; description"
        aliases = columns[1].split(";")[0].split(",")
        if any(alias.strip().upper() == wanted for alias in aliases):
            return columns[0].strip()
    return ""
[文档] def _get_gene_pathways(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get all KEGG pathways that a gene participates in.""" gene_id = ( arguments.get("gene_id") or arguments.get("gene_symbol") or arguments.get("gene") or "" ) if not gene_id: return { "status": "error", "error": "gene_id is required (e.g., 'hsa:7157' for human TP53). You may also pass gene_symbol='TP53'.", } # Auto-resolve gene symbol to KEGG ID (e.g. TP53 → hsa:7157) organism = arguments.get("organism", "hsa") if ":" not in gene_id: resolved = self._resolve_gene_symbol(gene_id, organism) if not resolved: return { "status": "error", "error": f"Could not resolve gene symbol '{gene_id}' to a KEGG gene ID for organism '{organism}'. Use KEGG format 'hsa:7157' directly, or verify the gene symbol.", } gene_id = resolved # Get pathway links for this gene url = f"{KEGG_BASE_URL}/link/pathway/{gene_id}" response = requests.get(url, timeout=self.timeout) response.raise_for_status() pathway_ids = _parse_tsv_column(response.text, column=1) if not pathway_ids: return { "status": "success", "data": { "gene_id": gene_id, "pathway_count": 0, "pathways": [], }, "metadata": {"source": "KEGG REST API", "gene_id": gene_id}, } # Batch-fetch pathway names for this organism org = gene_id.split(":")[0] pw_names = _fetch_pathway_names(org, self.timeout) pathway_details = [] for pw_id in pathway_ids: name = pw_names.get(pw_id) or pw_names.get(pw_id.replace("path:", ""), "") pathway_details.append({"pathway_id": pw_id, "pathway_name": name}) return { "status": "success", "data": { "gene_id": gene_id, "pathway_count": len(pathway_details), "pathways": pathway_details, }, "metadata": { "source": "KEGG REST API", "gene_id": gene_id, }, }
[文档] def _get_pathway_genes(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get all genes in a KEGG pathway.""" pathway_id = arguments.get("pathway_id", "") if not pathway_id: return { "status": "error", "error": "pathway_id is required (e.g., 'hsa04115' for p53 signaling)", } # Determine organism prefix from pathway ID # hsa04115 -> hsa org = "" for i, ch in enumerate(pathway_id): if ch.isdigit(): org = pathway_id[:i] break if not org: return { "status": "error", "error": "Cannot determine organism from pathway_id. Use format like 'hsa04115'", } # Get gene links for pathway url = f"{KEGG_BASE_URL}/link/{org}/{pathway_id}" response = requests.get(url, timeout=self.timeout) response.raise_for_status() genes = [] for line in response.text.strip().split("\n"): if line.strip(): parts = line.strip().split("\t") if len(parts) >= 2: genes.append(parts[1]) # Get pathway name pw_name = "" info_url = f"{KEGG_BASE_URL}/list/pathway/{org}" info_response = requests.get(info_url, timeout=self.timeout) if info_response.status_code == 200: for line in info_response.text.strip().split("\n"): if pathway_id in line: parts = line.strip().split("\t") if len(parts) >= 2: pw_name = parts[1] break return { "status": "success", "data": { "pathway_id": pathway_id, "pathway_name": pw_name, "gene_count": len(genes), "genes": genes, }, "metadata": { "source": "KEGG REST API", "pathway_id": pathway_id, "organism": org, }, }
[文档] def _get_compound(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get KEGG compound/metabolite details.""" compound_id = arguments.get("compound_id", "") if not compound_id: return { "status": "error", "error": "compound_id is required (e.g., 'C00002' for ATP)", } # Ensure proper KEGG compound format if not compound_id.startswith("C"): compound_id = f"C{compound_id}" url = f"{KEGG_BASE_URL}/get/{compound_id}" response = requests.get(url, timeout=self.timeout) response.raise_for_status() text = response.text if not text.strip(): return {"status": "error", "error": f"Compound not found: {compound_id}"} # Parse KEGG flat file format result = { "compound_id": compound_id, "names": [], "formula": None, "exact_mass": None, "mol_weight": None, "pathways": {}, "enzymes": [], "dblinks": {}, } current_field = None for line in text.split("\n"): if line.startswith("NAME"): current_field = "NAME" name = line[12:].strip().rstrip(";") if name: result["names"].append(name) elif line.startswith("FORMULA"): result["formula"] = line[12:].strip() current_field = None elif line.startswith("EXACT_MASS"): try: result["exact_mass"] = float(line[12:].strip()) except ValueError: result["exact_mass"] = line[12:].strip() current_field = None elif line.startswith("MOL_WEIGHT"): try: result["mol_weight"] = float(line[12:].strip()) except ValueError: result["mol_weight"] = line[12:].strip() current_field = None elif line.startswith("PATHWAY"): current_field = "PATHWAY" parts = line[12:].strip().split(" ", 1) if len(parts) >= 2: result["pathways"][parts[0].strip()] = parts[1].strip() elif parts: result["pathways"][parts[0].strip()] = "" elif line.startswith("ENZYME"): current_field = "ENZYME" enzymes = line[12:].strip().split() result["enzymes"].extend(enzymes) elif line.startswith("DBLINKS"): current_field = "DBLINKS" parts = line[12:].strip().split(": ", 1) if len(parts) == 2: result["dblinks"][parts[0].strip()] = parts[1].strip() elif line.startswith("REMARK"): result["remark"] = 
line[12:].strip() current_field = None elif line.startswith("///"): break elif line.startswith(" "): content = line[12:].strip() if current_field == "NAME": name = content.rstrip(";") if name: result["names"].append(name) elif current_field == "PATHWAY": parts = content.split(" ", 1) if len(parts) >= 2: result["pathways"][parts[0].strip()] = parts[1].strip() elif current_field == "ENZYME": result["enzymes"].extend(content.split()) elif current_field == "DBLINKS": parts = content.split(": ", 1) if len(parts) == 2: result["dblinks"][parts[0].strip()] = parts[1].strip() else: # New field we don't specifically handle current_field = None return { "status": "success", "data": result, "metadata": { "source": "KEGG REST API", "compound_id": compound_id, }, }
def _list_brite(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """List all available KEGG BRITE hierarchy classifications.

    Args:
        arguments: Not read by this endpoint.

    Returns:
        A success dict with ``hierarchy_count`` and ``hierarchies``, one
        ``hierarchy_id`` / ``name`` pair per non-blank line of
        ``/list/brite``. ``name`` is ``""`` for a line without a tab.

    Raises:
        requests.exceptions.HTTPError: From ``raise_for_status``.
    """
    response = requests.get(f"{KEGG_BASE_URL}/list/brite", timeout=self.timeout)
    response.raise_for_status()

    hierarchies = []
    for row in response.text.strip().split("\n"):
        record = row.strip()
        if not record:
            continue
        # partition leaves the name empty when the line has no tab.
        identifier, _, label = record.partition("\t")
        hierarchies.append(
            {"hierarchy_id": identifier.strip(), "name": label.strip()}
        )

    return {
        "status": "success",
        "data": {
            "hierarchy_count": len(hierarchies),
            "hierarchies": hierarchies,
        },
        "metadata": {
            "source": "KEGG BRITE",
        },
    }
[文档] def _get_brite_hierarchy(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get a specific KEGG BRITE hierarchy as a JSON tree.""" hierarchy_id = arguments.get("hierarchy_id", "") if not hierarchy_id: return { "status": "error", "error": "hierarchy_id is required (e.g., 'ko01000' for Enzymes)", } # KEGG BRITE JSON endpoint requires br: prefix url = f"{KEGG_BASE_URL}/get/br:{hierarchy_id}/json" response = requests.get(url, timeout=self.timeout) response.raise_for_status() if not response.text.strip(): return { "status": "error", "error": f"BRITE hierarchy not found: {hierarchy_id}", } tree = response.json() return { "status": "success", "data": tree, "metadata": { "source": "KEGG BRITE", "hierarchy_id": hierarchy_id, }, }
# --- KEGG Disease endpoints ---
[文档] def _search_disease(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Search KEGG disease database by keyword.""" keyword = arguments.get("keyword") or arguments.get("query", "") if not keyword: return { "status": "error", "error": "keyword or query is required (e.g., 'leukemia', 'diabetes')", } url = f"{KEGG_BASE_URL}/find/disease/{keyword}" response = requests.get(url, timeout=self.timeout) response.raise_for_status() diseases = [] for line in response.text.strip().split("\n"): if line.strip(): parts = line.strip().split("\t", 1) if len(parts) >= 2: disease_id = parts[0].strip().replace("ds:", "") diseases.append( {"disease_id": disease_id, "name": parts[1].strip()} ) max_results = arguments.get("max_results", 25) diseases = diseases[:max_results] return { "status": "success", "data": diseases, "metadata": { "source": "KEGG Disease", "keyword": keyword, "total": len(diseases), }, }
[文档] def _get_disease(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get detailed KEGG disease information.""" disease_id = arguments.get("disease_id", "") if not disease_id: return { "status": "error", "error": "disease_id is required (e.g., 'H00001')", } url = f"{KEGG_BASE_URL}/get/{disease_id}" response = requests.get(url, timeout=self.timeout) response.raise_for_status() text = response.text if not text.strip(): return {"status": "error", "error": f"Disease not found: {disease_id}"} result = { "disease_id": disease_id, "names": [], "category": None, "description": None, "genes": [], "pathways": {}, "drugs": [], "dblinks": {}, "references": [], } current_field = None for line in text.split("\n"): if line.startswith("NAME"): current_field = "NAME" name = line[12:].strip().rstrip(";") if name: result["names"].append(name) elif line.startswith("DESCRIPTION"): current_field = "DESCRIPTION" result["description"] = line[12:].strip() elif line.startswith("CATEGORY"): result["category"] = line[12:].strip() current_field = None elif line.startswith("GENE"): current_field = "GENE" gene_line = line[12:].strip() if gene_line: result["genes"].append(gene_line) elif line.startswith("PATHWAY"): current_field = "PATHWAY" parts = line[12:].strip().split(" ", 1) if len(parts) >= 2: result["pathways"][parts[0].strip()] = parts[1].strip() elif line.startswith("DRUG"): current_field = "DRUG" drug_line = line[12:].strip() if drug_line: result["drugs"].append(drug_line) elif line.startswith("DBLINKS"): current_field = "DBLINKS" parts = line[12:].strip().split(": ", 1) if len(parts) == 2: result["dblinks"][parts[0].strip()] = parts[1].strip() elif line.startswith("REFERENCE"): current_field = "REFERENCE" ref = line[12:].strip() if ref: result["references"].append(ref) elif line.startswith("///"): break elif line.startswith(" "): content = line[12:].strip() if current_field == "NAME": name = content.rstrip(";") if name: result["names"].append(name) elif current_field == 
"DESCRIPTION": result["description"] = ( (result["description"] or "") + " " + content ) elif current_field == "GENE": result["genes"].append(content) elif current_field == "PATHWAY": parts = content.split(" ", 1) if len(parts) >= 2: result["pathways"][parts[0].strip()] = parts[1].strip() elif current_field == "DRUG": result["drugs"].append(content) elif current_field == "DBLINKS": parts = content.split(": ", 1) if len(parts) == 2: result["dblinks"][parts[0].strip()] = parts[1].strip() elif current_field == "REFERENCE": result["references"].append(content) else: current_field = None return { "status": "success", "data": result, "metadata": {"source": "KEGG Disease", "disease_id": disease_id}, }
[文档] def _get_disease_genes(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get genes linked to a KEGG disease.""" disease_id = arguments.get("disease_id", "") if not disease_id: return { "status": "error", "error": "disease_id is required (e.g., 'H00001')", } url = f"{KEGG_BASE_URL}/link/hsa/{disease_id}" organism = arguments.get("organism", "hsa") if organism != "hsa": url = f"{KEGG_BASE_URL}/link/{organism}/{disease_id}" response = requests.get(url, timeout=self.timeout) response.raise_for_status() genes = [] for line in response.text.strip().split("\n"): if line.strip(): parts = line.strip().split("\t") if len(parts) >= 2: genes.append(parts[1]) return { "status": "success", "data": { "disease_id": disease_id, "gene_count": len(genes), "genes": genes, }, "metadata": { "source": "KEGG Disease", "disease_id": disease_id, "organism": organism, }, }
# --- KEGG Drug endpoints ---
[文档] def _search_drug(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Search KEGG drug database by keyword.""" keyword = arguments.get("keyword", "") if not keyword: return { "status": "error", "error": "keyword is required (e.g., 'aspirin', 'imatinib')", } url = f"{KEGG_BASE_URL}/find/drug/{keyword}" response = requests.get(url, timeout=self.timeout) response.raise_for_status() drugs = [] for line in response.text.strip().split("\n"): if line.strip(): parts = line.strip().split("\t", 1) if len(parts) >= 2: drug_id = parts[0].strip().replace("dr:", "") drugs.append({"drug_id": drug_id, "name": parts[1].strip()}) max_results = arguments.get("max_results", 25) drugs = drugs[:max_results] return { "status": "success", "data": drugs, "metadata": { "source": "KEGG Drug", "keyword": keyword, "total": len(drugs), }, }
[文档] def _get_drug(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get detailed KEGG drug information.""" drug_id = arguments.get("drug_id", "") if not drug_id: return { "status": "error", "error": "drug_id is required (e.g., 'D00109' for aspirin)", } url = f"{KEGG_BASE_URL}/get/{drug_id}" response = requests.get(url, timeout=self.timeout) response.raise_for_status() text = response.text if not text.strip(): return {"status": "error", "error": f"Drug not found: {drug_id}"} result = { "drug_id": drug_id, "names": [], "formula": None, "exact_mass": None, "mol_weight": None, "targets": [], "pathways": {}, "diseases": [], "dblinks": {}, "efficacy": None, "product": [], } current_field = None for line in text.split("\n"): if line.startswith("NAME"): current_field = "NAME" name = line[12:].strip().rstrip(";") if name: result["names"].append(name) elif line.startswith("FORMULA"): result["formula"] = line[12:].strip() current_field = None elif line.startswith("EXACT_MASS"): try: result["exact_mass"] = float(line[12:].strip()) except ValueError: result["exact_mass"] = line[12:].strip() current_field = None elif line.startswith("MOL_WEIGHT"): try: result["mol_weight"] = float(line[12:].strip()) except ValueError: result["mol_weight"] = line[12:].strip() current_field = None elif line.startswith("TARGET"): current_field = "TARGET" target_line = line[12:].strip() if target_line: result["targets"].append(target_line) elif line.startswith("PATHWAY"): current_field = "PATHWAY" parts = line[12:].strip().split(" ", 1) if len(parts) >= 2: result["pathways"][parts[0].strip()] = parts[1].strip() elif line.startswith("DISEASE"): current_field = "DISEASE" disease_line = line[12:].strip() if disease_line: result["diseases"].append(disease_line) elif line.startswith("DBLINKS"): current_field = "DBLINKS" parts = line[12:].strip().split(": ", 1) if len(parts) == 2: result["dblinks"][parts[0].strip()] = parts[1].strip() elif line.startswith("EFFICACY"): current_field = "EFFICACY" 
result["efficacy"] = line[12:].strip() elif line.startswith("PRODUCT"): current_field = "PRODUCT" prod = line[12:].strip() if prod: result["product"].append(prod) elif line.startswith("///"): break elif line.startswith(" "): content = line[12:].strip() if current_field == "NAME": name = content.rstrip(";") if name: result["names"].append(name) elif current_field == "TARGET": result["targets"].append(content) elif current_field == "PATHWAY": parts = content.split(" ", 1) if len(parts) >= 2: result["pathways"][parts[0].strip()] = parts[1].strip() elif current_field == "DISEASE": result["diseases"].append(content) elif current_field == "DBLINKS": parts = content.split(": ", 1) if len(parts) == 2: result["dblinks"][parts[0].strip()] = parts[1].strip() elif current_field == "EFFICACY": result["efficacy"] = (result["efficacy"] or "") + " " + content elif current_field == "PRODUCT": result["product"].append(content) else: current_field = None return { "status": "success", "data": result, "metadata": {"source": "KEGG Drug", "drug_id": drug_id}, }
[文档] def _get_drug_targets(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get gene targets linked to a KEGG drug.""" drug_id = arguments.get("drug_id", "") if not drug_id: return {"status": "error", "error": "drug_id is required (e.g., 'D00109')"} # KEGG link: drug -> target genes url = f"{KEGG_BASE_URL}/link/hsa/{drug_id}" response = requests.get(url, timeout=self.timeout) response.raise_for_status() targets = [] for line in response.text.strip().split("\n"): if line.strip(): parts = line.strip().split("\t") if len(parts) >= 2: targets.append(parts[1]) return { "status": "success", "data": { "drug_id": drug_id, "target_count": len(targets), "targets": targets, }, "metadata": {"source": "KEGG Drug", "drug_id": drug_id}, }
# --- KEGG Network endpoints ---
[文档] def _search_network(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Search KEGG NETWORK database by keyword.""" keyword = arguments.get("keyword", "") if not keyword: return { "status": "error", "error": "keyword is required (e.g., 'EGFR', 'RAS', 'p53')", } url = f"{KEGG_BASE_URL}/find/network/{keyword}" response = requests.get(url, timeout=self.timeout) response.raise_for_status() networks = [] for line in response.text.strip().split("\n"): if line.strip(): parts = line.strip().split("\t", 1) if len(parts) >= 2: net_id = parts[0].strip().replace("ne:", "") networks.append({"network_id": net_id, "name": parts[1].strip()}) max_results = arguments.get("max_results", 25) networks = networks[:max_results] return { "status": "success", "data": networks, "metadata": { "source": "KEGG Network", "keyword": keyword, "total": len(networks), }, }
[文档] def _get_network(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get detailed KEGG network information.""" network_id = arguments.get("network_id", "") if not network_id: return { "status": "error", "error": "network_id is required (e.g., 'N00001')", } url = f"{KEGG_BASE_URL}/get/{network_id}" response = requests.get(url, timeout=self.timeout) response.raise_for_status() text = response.text if not text.strip(): return {"status": "error", "error": f"Network not found: {network_id}"} result = { "network_id": network_id, "name": None, "definition": None, "expanded": None, "classes": [], "diseases": [], "drugs": [], "elements": [], } current_field = None for line in text.split("\n"): if line.startswith("NAME"): result["name"] = line[12:].strip() current_field = None elif line.startswith("DEFINITION"): current_field = "DEFINITION" result["definition"] = line[12:].strip() elif line.startswith(" EXPANDED"): result["expanded"] = line[12:].strip() current_field = None elif line.startswith("CLASS"): current_field = "CLASS" result["classes"].append(line[12:].strip()) elif line.startswith("DISEASE"): current_field = "DISEASE" result["diseases"].append(line[12:].strip()) elif line.startswith("DRUG_TARGET") or line.startswith("DRUG"): current_field = "DRUG" drug_line = line[12:].strip() if drug_line: result["drugs"].append(drug_line) elif line.startswith("ELEMENT"): current_field = "ELEMENT" result["elements"].append(line[12:].strip()) elif line.startswith("///"): break elif line.startswith(" "): content = line[12:].strip() if current_field == "CLASS": result["classes"].append(content) elif current_field == "DISEASE": result["diseases"].append(content) elif current_field == "DRUG": result["drugs"].append(content) elif current_field == "ELEMENT": result["elements"].append(content) elif current_field == "DEFINITION": result["definition"] = (result["definition"] or "") + " " + content else: current_field = None return { "status": "success", "data": result, "metadata": 
{"source": "KEGG Network", "network_id": network_id}, }
# --- KEGG Variant endpoints ---
[文档] def _search_variant(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Search KEGG VARIANT database by gene name.""" keyword = arguments.get("keyword", "") if not keyword: return { "status": "error", "error": "keyword is required (e.g., 'BRAF', 'TP53', 'EGFR')", } url = f"{KEGG_BASE_URL}/find/variant/{keyword}" response = requests.get(url, timeout=self.timeout) response.raise_for_status() variants = [] for line in response.text.strip().split("\n"): if line.strip(): parts = line.strip().split("\t", 1) if len(parts) >= 2: var_id = parts[0].strip().replace("hsa_var:", "") variants.append( {"variant_id": var_id, "description": parts[1].strip()} ) max_results = arguments.get("max_results", 25) variants = variants[:max_results] return { "status": "success", "data": variants, "metadata": { "source": "KEGG Variant", "keyword": keyword, "total": len(variants), }, }
[文档] def _get_variant(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get detailed KEGG variant information.""" variant_id = arguments.get("variant_id", "") if not variant_id: return { "status": "error", "error": "variant_id is required (e.g., 'hsa_var:673v1' for BRAF V600E)", } # Ensure proper KEGG format if not variant_id.startswith("hsa_var:"): variant_id = f"hsa_var:{variant_id}" url = f"{KEGG_BASE_URL}/get/{variant_id}" response = requests.get(url, timeout=self.timeout) response.raise_for_status() text = response.text if not text.strip(): return {"status": "error", "error": f"Variant not found: {variant_id}"} result = { "variant_id": variant_id, "name": None, "type": None, "gene": None, "organism": None, "variations": [], "networks": [], "diseases": [], "drugs": [], } current_field = None current_variation = None for line in text.split("\n"): if line.startswith("NAME"): result["name"] = line[12:].strip() current_field = None elif line.startswith("TYPE"): result["type"] = line[12:].strip() current_field = None elif line.startswith("GENE"): result["gene"] = line[12:].strip() current_field = None elif line.startswith("ORGANISM"): result["organism"] = line[12:].strip() current_field = None elif line.startswith("VARIATION"): current_field = "VARIATION" content = line[12:].strip() current_variation = { "mutation": content, "clinvar": [], "dbsnp": [], "cosmic": [], } result["variations"].append(current_variation) elif line.startswith("NETWORK"): current_field = "NETWORK" result["networks"].append(line[12:].strip()) elif line.startswith("DISEASE"): current_field = "DISEASE" result["diseases"].append(line[12:].strip()) elif line.startswith("DRUG_TARGET"): current_field = "DRUG" result["drugs"].append(line[12:].strip()) elif line.startswith("///"): break elif line.startswith(" "): content = line[12:].strip() if current_field == "VARIATION" and current_variation: if content.startswith("mutation "): current_variation = { "mutation": content.replace("mutation ", 
""), "clinvar": [], "dbsnp": [], "cosmic": [], } result["variations"].append(current_variation) elif content.startswith("ClinVar:"): current_variation["clinvar"] = content.replace( "ClinVar: ", "" ).split() elif content.startswith("dbSNP:"): current_variation["dbsnp"] = content.replace( "dbSNP: ", "" ).split() elif content.startswith("COSM:"): current_variation["cosmic"] = content.replace( "COSM: ", "" ).split() elif current_field == "NETWORK": result["networks"].append(content) elif current_field == "DISEASE": result["diseases"].append(content) elif current_field == "DRUG": result["drugs"].append(content) else: current_field = None return { "status": "success", "data": result, "metadata": {"source": "KEGG Variant", "variant_id": variant_id}, }
[文档] def _conv_ids(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Convert between KEGG identifiers and external database IDs. The KEGG /conv endpoint maps KEGG gene IDs to/from external databases: ncbi-geneid, ncbi-proteinid, uniprot, chebi, pubchem. """ kegg_id = arguments.get("kegg_id", "") target_db = arguments.get("target_db", "") if not kegg_id: return { "status": "error", "error": "kegg_id is required (e.g., 'hsa:7157' for TP53)", } if not target_db: return { "status": "error", "error": "target_db is required: uniprot, ncbi-geneid, ncbi-proteinid, chebi, or pubchem", } valid_dbs = {"uniprot", "ncbi-geneid", "ncbi-proteinid", "chebi", "pubchem"} if target_db not in valid_dbs: return { "status": "error", "error": f"target_db must be one of: {', '.join(sorted(valid_dbs))}", } url = f"{KEGG_BASE_URL}/conv/{target_db}/{kegg_id}" response = requests.get(url, timeout=self.timeout) response.raise_for_status() text = response.text.strip() if not text: return { "status": "error", "error": f"No {target_db} mapping found for {kegg_id}", } mappings = [] for line in text.split("\n"): parts = line.strip().split("\t") if len(parts) == 2: mappings.append({"kegg_id": parts[0], "external_id": parts[1]}) return { "status": "success", "data": mappings, "metadata": { "source": "KEGG conv", "query_kegg_id": kegg_id, "target_db": target_db, "total": len(mappings), }, }