Source code for tooluniverse.kegg_ext_tool
# kegg_ext_tool.py
"""
KEGG Extended API tool for ToolUniverse.
Provides access to additional KEGG REST API endpoints for gene-pathway links,
pathway gene lists, compound/metabolite information, and BRITE hierarchies
(listing and tree retrieval). Complements the existing KEGG tools (search,
gene info, pathway info, list organisms).
API: https://rest.kegg.jp/
No authentication required. Free public access.
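Note: the KEGG REST operations used here mostly return tab-separated or
flat-file text, not JSON; this tool parses that text into structured JSON
responses. The BRITE hierarchy tree is the exception: it is requested in
KEGG's JSON form and returned as decoded.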
"""
import requests
from typing import Dict, Any
from .base_tool import BaseTool
# Root URL of the KEGG REST service; every request path in this module is
# appended to it.
KEGG_BASE_URL = "https://rest.kegg.jp"
[docs]
class KEGGExtTool(BaseTool):
    """
    Tool for KEGG REST API extended endpoints providing gene-pathway links,
    pathway gene lists, compound details, and BRITE hierarchy listings/trees.

    Each instance serves a single endpoint, read from the ``endpoint`` entry
    of the ``fields`` mapping in the tool configuration. Recognised values
    are ``get_gene_pathways`` (the default), ``get_pathway_genes``,
    ``get_compound``, ``list_brite`` and ``get_brite_hierarchy``.

    No authentication required.
    """
[docs]
def __init__(self, tool_config: Dict[str, Any]):
    """Initialise the tool from its configuration mapping.

    Args:
        tool_config: Tool configuration. ``timeout`` (default 30) is passed
            as the timeout of every HTTP request; ``fields.endpoint``
            (default ``"get_gene_pathways"``) selects which endpoint this
            instance serves.
    """
    super().__init__(tool_config)
    self.timeout = tool_config.get("timeout", 30)
    endpoint_settings = tool_config.get("fields", {})
    self.endpoint = endpoint_settings.get("endpoint", "get_gene_pathways")
[docs]
def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Run the configured endpoint and turn any failure into an error dict.

    Args:
        arguments: Endpoint-specific arguments, forwarded to ``_query``.

    Returns:
        Whatever ``_query`` returns on success. If it raises, a dict with a
        single ``"error"`` message is returned instead; no exception is
        propagated to the caller.
    """
    try:
        outcome = self._query(arguments)
    except requests.exceptions.Timeout:
        return {"error": f"KEGG API timed out after {self.timeout}s"}
    except requests.exceptions.ConnectionError:
        return {"error": "Failed to connect to KEGG REST API"}
    except requests.exceptions.HTTPError as http_err:
        reply = http_err.response
        status = "unknown" if reply is None else reply.status_code
        if status == 404:
            message = "KEGG entry not found"
        else:
            message = f"KEGG API HTTP error: {status}"
        return {"error": message}
    except Exception as exc:
        # Last-resort boundary: report anything else as a generic failure.
        return {"error": f"Unexpected error querying KEGG: {exc!s}"}
    else:
        return outcome
[docs]
def _query(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Dispatch ``arguments`` to the handler for the configured endpoint.

    Args:
        arguments: Endpoint-specific arguments, passed through unchanged.

    Returns:
        The handler's result, or ``{"error": ...}`` when ``self.endpoint``
        is not one of the supported endpoint names.
    """
    supported = (
        "get_gene_pathways",
        "get_pathway_genes",
        "get_compound",
        "list_brite",
        "get_brite_hierarchy",
    )
    if self.endpoint not in supported:
        return {"error": f"Unknown endpoint: {self.endpoint}"}
    # Each handler is the method named after its endpoint with a leading
    # underscore, e.g. "get_compound" -> self._get_compound.
    handler = getattr(self, f"_{self.endpoint}")
    return handler(arguments)
[docs]
def _get_gene_pathways(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Get all KEGG pathways that a gene participates in.

    Args:
        arguments: Must contain ``gene_id`` in KEGG ``organism:id`` form
            (e.g. ``'hsa:7157'``).

    Returns:
        ``{"error": ...}`` when ``gene_id`` is missing or has no ``:``.
        Otherwise a dict with ``data`` (``gene_id``, ``pathway_count`` and
        ``pathways``) and ``metadata``. Every item of ``pathways`` has both
        ``pathway_id`` and ``pathway_name``; the name is ``""`` when it
        could not be resolved, including when the name listing request
        does not return HTTP 200.

    Raises:
        requests.exceptions.RequestException: On network failure, or on a
            non-success status of the link request.
    """
    gene_id = arguments.get("gene_id", "")
    if not gene_id:
        return {"error": "gene_id is required (e.g., 'hsa:7157' for human TP53)"}
    # Ensure proper KEGG format (org:id)
    if ":" not in gene_id:
        return {
            "error": "gene_id must be in KEGG format 'organism:id' (e.g., 'hsa:7157')"
        }

    # Each non-empty link line is "<gene>\t<pathway>"; keep the pathway column.
    link_url = f"{KEGG_BASE_URL}/link/pathway/{gene_id}"
    link_response = requests.get(link_url, timeout=self.timeout)
    link_response.raise_for_status()
    pathway_ids = []
    for line in link_response.text.splitlines():
        columns = line.strip().split("\t")
        if len(columns) >= 2:
            pathway_ids.append(columns[1])

    # Resolve names with one listing request for the gene's organism. This
    # is best-effort: a non-200 reply simply leaves every name empty. The
    # request is skipped entirely when there is nothing to name.
    names_by_bare_id = {}
    if pathway_ids:
        org = gene_id.split(":")[0]
        list_url = f"{KEGG_BASE_URL}/list/pathway/{org}"
        list_response = requests.get(list_url, timeout=self.timeout)
        if list_response.status_code == 200:
            for line in list_response.text.splitlines():
                columns = line.strip().split("\t")
                if len(columns) >= 2:
                    # Index by the ID without any "path:" prefix so the
                    # lookup works whichever side carries the prefix.
                    names_by_bare_id[columns[0].replace("path:", "")] = columns[1]

    pathways = [
        {
            "pathway_id": pw_id,
            "pathway_name": names_by_bare_id.get(pw_id.replace("path:", ""), ""),
        }
        for pw_id in pathway_ids
    ]
    return {
        "data": {
            "gene_id": gene_id,
            "pathway_count": len(pathways),
            "pathways": pathways,
        },
        "metadata": {
            "source": "KEGG REST API",
            "gene_id": gene_id,
        },
    }
[docs]
def _get_pathway_genes(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Get all genes in a KEGG pathway.

    Args:
        arguments: Must contain ``pathway_id`` such as ``'hsa04115'``. A
            leading ``path:`` prefix is accepted and removed.

    Returns:
        ``{"error": ...}`` when ``pathway_id`` is missing or no organism
        code precedes its first digit. Otherwise a dict with ``data``
        (``pathway_id`` without prefix, ``pathway_name``, ``gene_count``,
        ``genes``) and ``metadata``. ``pathway_name`` is ``""`` when the
        listing request fails or has no row for this exact pathway ID.

    Raises:
        requests.exceptions.RequestException: On network failure, or on a
            non-success status of the link request.
    """
    pathway_id = arguments.get("pathway_id", "")
    if not pathway_id:
        return {
            "error": "pathway_id is required (e.g., 'hsa04115' for p53 signaling)"
        }
    # Tolerate the prefixed form ('path:hsa04115'); otherwise the prefix
    # would end up inside the organism code and the request URLs.
    if pathway_id.startswith("path:"):
        pathway_id = pathway_id[len("path:"):]

    # The organism code is everything before the first digit: hsa04115 -> hsa
    org = ""
    for position, char in enumerate(pathway_id):
        if char.isdigit():
            org = pathway_id[:position]
            break
    if not org:
        return {
            "error": "Cannot determine organism from pathway_id. Use format like 'hsa04115'"
        }

    # Each non-empty link line is "<pathway>\t<gene>"; keep the gene column.
    url = f"{KEGG_BASE_URL}/link/{org}/{pathway_id}"
    response = requests.get(url, timeout=self.timeout)
    response.raise_for_status()
    genes = []
    for line in response.text.splitlines():
        columns = line.strip().split("\t")
        if len(columns) >= 2:
            genes.append(columns[1])

    # Best-effort name lookup. Compare the ID column exactly (ignoring an
    # optional "path:" prefix): a substring test against the whole line
    # could pick up a different pathway whose ID or description merely
    # contains the requested ID.
    pathway_name = ""
    info_url = f"{KEGG_BASE_URL}/list/pathway/{org}"
    info_response = requests.get(info_url, timeout=self.timeout)
    if info_response.status_code == 200:
        for line in info_response.text.splitlines():
            columns = line.strip().split("\t")
            if len(columns) >= 2 and columns[0].replace("path:", "") == pathway_id:
                pathway_name = columns[1]
                break

    return {
        "data": {
            "pathway_id": pathway_id,
            "pathway_name": pathway_name,
            "gene_count": len(genes),
            "genes": genes,
        },
        "metadata": {
            "source": "KEGG REST API",
            "pathway_id": pathway_id,
            "organism": org,
        },
    }
[docs]
def _get_compound(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Get KEGG compound/metabolite details.

    Args:
        arguments: Must contain ``compound_id``. Accepted spellings are the
            plain ID (``'C00002'``), a lower-case initial (``'c00002'``),
            the bare number (``'00002'``) and the database-prefixed form
            (``'cpd:C00002'``); surrounding whitespace is ignored.

    Returns:
        ``{"error": ...}`` when ``compound_id`` is missing or empty after
        normalisation, or when KEGG answers with an empty body. Otherwise
        a dict with ``data`` and ``metadata``. ``data`` holds
        ``compound_id``, ``names``, ``formula``, ``exact_mass``,
        ``mol_weight``, ``pathways``, ``enzymes`` and ``dblinks``, plus
        ``remark`` when the entry has a non-empty REMARK line. Masses are
        floats when parseable, else the raw text. Fields with no value on
        a line are skipped, so scalars stay ``None``.

    Raises:
        requests.exceptions.RequestException: On network failure or a
            non-success HTTP status.
    """
    missing = {"error": "compound_id is required (e.g., 'C00002' for ATP)"}
    compound_id = str(arguments.get("compound_id") or "").strip()
    # Drop the database prefix first; otherwise 'cpd:C00002' would be
    # mistaken for an ID that merely starts with 'c'.
    if compound_id.lower().startswith("cpd:"):
        compound_id = compound_id[4:].strip()
    if not compound_id:
        return missing
    # Ensure proper KEGG compound format: exactly one upper-case 'C' up front.
    if compound_id[0] in "cC":
        compound_id = "C" + compound_id[1:]
    else:
        compound_id = f"C{compound_id}"

    url = f"{KEGG_BASE_URL}/get/{compound_id}"
    response = requests.get(url, timeout=self.timeout)
    response.raise_for_status()
    text = response.text
    if not text.strip():
        return {"error": f"Compound not found: {compound_id}"}

    result = {
        "compound_id": compound_id,
        "names": [],
        "formula": None,
        "exact_mass": None,
        "mol_weight": None,
        "pathways": {},
        "enzymes": [],
        "dblinks": {},
    }

    # Parse the KEGG flat file. The field keyword starts a line, the value
    # is read from column 12 onwards, and lines that start with a space
    # continue the previous field. Only multi-line fields are remembered
    # for continuation; continuation lines of any other field are ignored.
    multi_line_fields = ("NAME", "PATHWAY", "ENZYME", "DBLINKS")
    single_line_fields = ("FORMULA", "EXACT_MASS", "MOL_WEIGHT", "REMARK")
    handled_fields = multi_line_fields + single_line_fields

    current_field = None
    for line in text.split("\n"):
        if line.startswith("///"):
            # End-of-entry marker.
            break
        if line.startswith(" "):
            field = current_field
        else:
            field = next(
                (key for key in handled_fields if line.startswith(key)), None
            )
            current_field = field if field in multi_line_fields else None

        value = line[12:].strip()
        if field is None or not value:
            continue

        if field == "NAME":
            # Alternative names are ';'-terminated, one per line.
            name = value.rstrip(";")
            if name:
                result["names"].append(name)
        elif field == "PATHWAY":
            pieces = value.split(None, 1)
            result["pathways"][pieces[0]] = pieces[1] if len(pieces) > 1 else ""
        elif field == "ENZYME":
            result["enzymes"].extend(value.split())
        elif field == "DBLINKS":
            database, separator, accession = value.partition(": ")
            if separator:
                result["dblinks"][database.strip()] = accession.strip()
        elif field == "FORMULA":
            result["formula"] = value
        elif field in ("EXACT_MASS", "MOL_WEIGHT"):
            key = field.lower()
            try:
                result[key] = float(value)
            except ValueError:
                # Keep the raw text rather than lose an unparseable value.
                result[key] = value
        elif field == "REMARK":
            result["remark"] = value

    return {
        "data": result,
        "metadata": {
            "source": "KEGG REST API",
            "compound_id": compound_id,
        },
    }
[docs]
def _list_brite(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """List all available KEGG BRITE hierarchy classifications.

    Args:
        arguments: Not used by this endpoint.

    Returns:
        A dict with ``data`` (``hierarchy_count`` and ``hierarchies``, a
        list of ``{"hierarchy_id", "name"}`` items) and ``metadata``. A
        line without a tab yields an item whose ``name`` is ``""``.

    Raises:
        requests.exceptions.RequestException: On network failure or a
            non-success HTTP status.
    """
    response = requests.get(f"{KEGG_BASE_URL}/list/brite", timeout=self.timeout)
    response.raise_for_status()

    entries = []
    for raw_line in response.text.strip().split("\n"):
        record = raw_line.strip()
        if not record:
            continue
        # Everything before the first tab is the ID, the rest is the name.
        identifier, _, title = record.partition("\t")
        entries.append({"hierarchy_id": identifier.strip(), "name": title.strip()})

    payload = {"hierarchy_count": len(entries), "hierarchies": entries}
    return {"data": payload, "metadata": {"source": "KEGG BRITE"}}
[docs]
def _get_brite_hierarchy(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Get a specific KEGG BRITE hierarchy as a JSON tree.

    Args:
        arguments: Must contain ``hierarchy_id`` (e.g. ``'ko01000'``).
            Surrounding whitespace and one leading ``br:`` prefix are
            removed, so an already-prefixed ID is not prefixed twice.

    Returns:
        ``{"error": ...}`` when ``hierarchy_id`` is missing or empty after
        normalisation, or when KEGG answers with an empty body. Otherwise
        a dict whose ``data`` is the decoded JSON document and whose
        ``metadata`` carries the normalised ``hierarchy_id``.

    Raises:
        requests.exceptions.RequestException: On network failure or a
            non-success HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    hierarchy_id = arguments.get("hierarchy_id", "")
    if isinstance(hierarchy_id, str):
        hierarchy_id = hierarchy_id.strip()
        # The URL below adds "br:" itself; strip a caller-supplied one so
        # the request does not become "br:br:<id>".
        if hierarchy_id.startswith("br:"):
            hierarchy_id = hierarchy_id[3:].strip()
    if not hierarchy_id:
        return {"error": "hierarchy_id is required (e.g., 'ko01000' for Enzymes)"}

    # KEGG BRITE JSON endpoint requires br: prefix
    url = f"{KEGG_BASE_URL}/get/br:{hierarchy_id}/json"
    response = requests.get(url, timeout=self.timeout)
    response.raise_for_status()
    if not response.text.strip():
        return {"error": f"BRITE hierarchy not found: {hierarchy_id}"}

    tree = response.json()
    return {
        "data": tree,
        "metadata": {
            "source": "KEGG BRITE",
            "hierarchy_id": hierarchy_id,
        },
    }