Source code for tooluniverse.chem_tool

"""
ChEMBL API Tools

This module provides tools for accessing the ChEMBL database:
- ChEMBLTool: Specialized tool for similarity search
- ChEMBLRESTTool: Generic REST API tool for ChEMBL endpoints
"""

import requests
from urllib.parse import quote
from typing import Any, Dict

# from rdkit import Chem
from .base_tool import BaseTool
from .tool_registry import register_tool
from .http_utils import request_with_retry
from indigo import Indigo


@register_tool("ChEMBLRESTTool")
class ChEMBLRESTTool(BaseTool):
    """
    Generic ChEMBL REST API tool.

    Wrapper for ChEMBL API endpoints defined in chembl_tools.json.
    Supports all ChEMBL data resources: molecules, targets, assays,
    activities, drugs, etc.

    The request URL comes either from the ``fields.endpoint`` template in the
    tool configuration (``{placeholder}`` segments are filled from the call
    arguments) or, when no template is configured, from the tool name.
    """

    # Arguments that are either mapped to dedicated query parameters or used
    # to build the URL path, and therefore never forwarded as filters.
    _RESERVED_ARGS = frozenset(
        {
            "limit",
            "offset",
            "format",
            "fields",
            "only",
            "ordering",
            "chembl_id",
            "target_chembl_id",
            "assay_chembl_id",
            "activity_id",
            "drug_chembl_id",
        }
    )

    # (tool-name prefix, resource path segment, argument holding the ID),
    # used only when the tool configuration has no endpoint template.
    _NAME_ROUTES = (
        ("ChEMBL_get_molecule", "molecule", "chembl_id"),
        ("ChEMBL_get_target", "target", "target_chembl_id"),
        ("ChEMBL_get_assay", "assay", "assay_chembl_id"),
        ("ChEMBL_get_activity", "activity", "activity_id"),
        ("ChEMBL_get_drug", "drug", "drug_chembl_id"),
    )

    # Keys under which a response dict may carry its list of results.
    _COLLECTION_KEYS = (
        "molecules",
        "targets",
        "assays",
        "activities",
        "drugs",
        "mechanisms",
        "indications",
        "binding_sites",
    )

    def __init__(self, tool_config: Dict):
        """Store the configuration and prepare a reusable HTTP session."""
        super().__init__(tool_config)
        self.base_url = "https://www.ebi.ac.uk/chembl/api/data"
        self.session = requests.Session()
        self.session.headers.update(
            {"Accept": "application/json", "User-Agent": "ToolUniverse/1.0"}
        )
        self.timeout = 30

    def _endpoint_template(self) -> str:
        """Return the configured endpoint template, or "" when there is none."""
        return (self.tool_config.get("fields") or {}).get("endpoint", "")

    def _build_url(self, args: Dict[str, Any]) -> str:
        """Build URL from endpoint template and arguments.

        With a configured template, every ``{name}`` placeholder is replaced
        by ``str(args[name])`` and ``base_url`` is prepended unless the result
        already starts with ``http``. Without a template the URL is derived
        from the tool name; if no route matches, or the required ID argument
        is missing/empty, the bare ``base_url`` is returned.
        """
        endpoint_template = self._endpoint_template()
        if endpoint_template:
            url = endpoint_template
            for key, value in args.items():
                url = url.replace(f"{{{key}}}", str(value))
            if not url.startswith("http"):
                url = self.base_url + url
            return url

        tool_name = self.tool_config.get("name", "")
        for prefix, resource, id_arg in self._NAME_ROUTES:
            if tool_name.startswith(prefix):
                resource_id = args.get(id_arg, "")
                if resource_id:
                    return f"{self.base_url}/{resource}/{resource_id}.json"
                # Only the first matching route is considered.
                break
        return self.base_url

    def _build_params(self, args: Dict[str, Any]) -> Dict[str, Any]:
        """Build query parameters for ChEMBL API.

        ``limit``, ``offset`` and ``ordering`` are passed through when
        present; ``format`` defaults to ``"json"``. ``only`` (or its
        ToolUniverse alias ``fields``) is sent as the ``only`` projection
        parameter, with lists/tuples joined by commas. Every remaining
        non-``None`` argument is forwarded as a filter (ChEMBL uses
        ``field__filter`` names such as ``pref_name__contains``), except
        arguments that were already substituted into the endpoint path.
        """
        params: Dict[str, Any] = {}

        for name in ("limit", "offset"):
            if name in args:
                params[name] = args[name]
        params["format"] = args["format"] if "format" in args else "json"

        # `only` wins over `fields` when both are supplied.
        only_value = args.get("only", None)
        projection = only_value if only_value is not None else args.get("fields", None)
        if projection is not None:
            if isinstance(projection, (list, tuple)):
                params["only"] = ",".join(str(f) for f in projection)
            else:
                params["only"] = str(projection)

        if "ordering" in args:
            params["ordering"] = args["ordering"]

        endpoint_template = self._endpoint_template()
        for key, value in args.items():
            if value is None or key in self._RESERVED_ARGS:
                continue
            # A value already placed in the URL path must not be repeated as
            # a query-string filter.
            if f"{{{key}}}" in endpoint_template:
                continue
            params[key] = value

        return params

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the ChEMBL API call.

        Returns a dict with ``status`` set to ``"success"`` or ``"error"``.
        On success it carries ``data`` and the final ``url``, plus
        ``page_meta`` / ``pagination`` / ``count`` when they can be derived
        from the payload. Image endpoints return metadata about the image
        instead of the binary content. Failures are reported in the returned
        dict rather than raised.
        """
        # Initialised up front so the error handlers can always report it.
        url = None
        try:
            url = self._build_url(arguments)
            params = self._build_params(arguments)
            tool_name = self.tool_config.get("name", "")

            is_image_endpoint = (
                "get_molecule_image" in tool_name.lower() or "/image/" in url
            )

            response = request_with_retry(
                self.session,
                "GET",
                url,
                params=params,
                timeout=self.timeout,
                max_attempts=3,
                backoff_seconds=0.5,
            )
            response.raise_for_status()

            if is_image_endpoint:
                content_type = response.headers.get("Content-Type", "")
                if "image" in content_type or "svg" in content_type:
                    # Describe the image rather than returning binary data.
                    return {
                        "status": "success",
                        "data": f"Image data available at URL (Content-Type: {content_type})",
                        "url": response.url,
                        "content_type": content_type,
                        "image_size_bytes": len(response.content),
                    }

            try:
                data = response.json()
            except ValueError as e:
                # Report a non-JSON body as what it is instead of letting it
                # surface as a generic request failure.
                return {
                    "status": "error",
                    "error": "ChEMBL API returned a response that is not valid JSON",
                    "url": response.url,
                    "detail": repr(e),
                }

            response_data = {
                "status": "success",
                "data": data,
                "url": response.url,
            }

            if isinstance(data, list):
                response_data["count"] = len(data)
            elif isinstance(data, dict):
                if "page_meta" in data:
                    response_data["page_meta"] = data["page_meta"]
                if "page" in data:
                    response_data["pagination"] = data["page"]
                # Count the first known collection key that holds a list.
                for key in self._COLLECTION_KEYS:
                    if key in data and isinstance(data[key], list):
                        response_data["count"] = len(data[key])
                        break

            return response_data

        except requests.exceptions.HTTPError as e:
            resp = e.response
            status_code = getattr(resp, "status_code", None)
            detail = None
            if getattr(resp, "text", None):
                # Include a short preview of the response body for debugging,
                # but avoid returning huge payloads.
                detail = resp.text[:500]
            return {
                "status": "error",
                "error": f"ChEMBL API returned HTTP {status_code}",
                "url": getattr(resp, "url", url),
                "status_code": status_code,
                "detail": detail,
            }
        except requests.exceptions.RequestException as e:
            return {
                "status": "error",
                "error": f"ChEMBL API request failed: {str(e)}",
                "url": url,
                "detail": repr(e),
            }
        except Exception as e:
            return {
                "status": "error",
                "error": f"Unexpected error: {str(e)}",
                "url": url,
                "detail": repr(e),
            }
@register_tool("ChEMBLTool")
class ChEMBLTool(BaseTool):
    """
    Tool to search for molecules similar to a given compound name or SMILES
    using the ChEMBL Web Services API.

    Note: This tool is designed for small molecule compounds only.
    Biologics (antibodies, proteins, oligonucleotides, etc.) do not have
    SMILES structures and cannot be used for structure-based similarity
    search. The tool will provide detailed error messages when biologics are
    queried, explaining the reason and suggesting alternative tools.
    """

    # ChEMBL `molecule_type` values reported to the user as biologics.
    _BIOLOGIC_TYPES = frozenset(
        {"Antibody", "Protein", "Oligonucleotide", "Oligosaccharide"}
    )

    # Seconds to wait for each HTTP request; without it a stalled connection
    # would block the caller indefinitely.
    timeout = 30

    def __init__(self, tool_config, base_url="https://www.ebi.ac.uk/chembl/api/data"):
        """Store the configuration and base URL and create an Indigo instance."""
        super().__init__(tool_config)
        self.base_url = base_url
        self.indigo = Indigo()

    def _get_json(self, url):
        """GET *url* and return the decoded JSON body.

        Raises ``requests`` exceptions on network failures and non-2xx
        responses.
        """
        response = requests.get(
            url, headers={"Accept": "application/json"}, timeout=self.timeout
        )
        response.raise_for_status()
        return response.json()

    def _can_load_with_indigo(self, smiles):
        """Return True when Indigo can load *smiles* as a molecule."""
        try:
            return self.indigo.loadMolecule(smiles) is not None
        except Exception:
            # NOTE(review): Indigo is expected to raise on unparsable input
            # (exact exception type to be confirmed); treated as "not loadable".
            return False

    def run(self, arguments):
        """Run a similarity search.

        Reads ``query`` (required), ``similarity_threshold`` (default 80) and
        ``max_results`` (default 20) from *arguments*. Returns a list of
        result dicts on success, or ``{"error": ...}`` on failure; HTTP and
        network failures are reported the same way instead of being raised.
        """
        query = arguments.get("query")
        similarity_threshold = arguments.get("similarity_threshold", 80)
        max_results = arguments.get("max_results", 20)
        if not query:
            return {"error": "`query` parameter is required."}
        try:
            return self._search_similar_molecules(
                query, similarity_threshold, max_results
            )
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a response body that is not valid JSON.
            return {"error": f"ChEMBL API request failed: {str(e)}"}

    def get_chembl_id_by_name(self, compound_name):
        """
        Search ChEMBL for a compound by name and return the ChEMBL IDs of up
        to the first three matches as ``{"chembl_ids": [...]}``, or
        ``{"error": ...}`` when nothing usable is found.
        """
        search_url = f"{self.base_url}/molecule/search.json?q={quote(compound_name)}"
        results = self._get_json(search_url).get("molecules", [])
        if not results or not isinstance(results, list):
            return {"error": "No valid results found for the compound name."}
        chembl_ids = [
            molecule.get("molecule_chembl_id")
            for molecule in results[:3]
            if molecule.get("molecule_chembl_id")
        ]
        if not chembl_ids:
            return {"error": "No ChEMBL IDs found for the compound name."}
        return {"chembl_ids": chembl_ids}

    def get_smiles_pref_name_by_chembl_id(self, query):
        """
        Given a ChEMBL ID, return a dict with canonical SMILES and preferred
        name, or ``{"error": ...}`` when the record has no usable structure.
        Returns ``None`` when *query* does not look like a ChEMBL ID.
        """
        if not isinstance(query, str) or not query.upper().startswith("CHEMBL"):
            return None

        molecule = self._get_json(f"{self.base_url}/molecule/{quote(query)}.json")
        if not molecule or not isinstance(molecule, dict):
            return {"error": "No valid molecule found for the given ChEMBL ID."}

        molecule_structures = molecule.get("molecule_structures")
        if not molecule_structures or not isinstance(molecule_structures, dict):
            return {
                "error": "Molecule structures not found or invalid for the ChEMBL ID."
            }

        smiles = molecule_structures.get("canonical_smiles")
        if not smiles:
            return {"error": "SMILES not found for the given ChEMBL ID."}
        return {"smiles": smiles, "pref_name": molecule.get("pref_name")}

    def get_chembl_smiles_pref_name_id_by_name(self, compound_name):
        """
        Search ChEMBL for a compound by name and return a list of dicts with
        ChEMBL ID, canonical SMILES, and preferred name for the top 5 matches.

        Matches whose search record has no SMILES are looked up individually
        by ChEMBL ID. When no match yields a SMILES, ``{"error": ...}`` is
        returned with a message explaining why and naming alternative tools.
        """
        search_url = f"{self.base_url}/molecule/search.json?q={quote(compound_name)}"
        results = self._get_json(search_url).get("molecules", [])
        if not results or not isinstance(results, list):
            return {"error": "No valid results found for the compound name."}

        output = []
        molecules_without_smiles = []
        for molecule in results[:5]:
            chembl_id = molecule.get("molecule_chembl_id", None)
            if not chembl_id:
                continue
            molecule_structures = molecule.get("molecule_structures", {})
            smiles = (
                molecule_structures.get("canonical_smiles", None)
                if molecule_structures is not None
                else None
            )
            pref_name = molecule.get("pref_name")

            if smiles:
                output.append(
                    {"chembl_id": chembl_id, "smiles": smiles, "pref_name": pref_name}
                )
                continue

            # The search record had no SMILES; try the full molecule record.
            fetched = self.get_smiles_pref_name_by_chembl_id(chembl_id)
            if isinstance(fetched, dict) and "error" not in fetched:
                output.append(
                    {
                        "chembl_id": chembl_id,
                        "smiles": fetched["smiles"],
                        "pref_name": fetched.get("pref_name"),
                    }
                )
            else:
                # Store info about molecules found but without SMILES
                molecules_without_smiles.append(
                    {
                        "chembl_id": chembl_id,
                        "pref_name": pref_name,
                        "molecule_type": molecule.get("molecule_type", "Unknown"),
                    }
                )

        if output:
            return output

        error_msg = "No ChEMBL IDs or SMILES found for the compound name."
        if molecules_without_smiles:
            # Sorted so the message is deterministic.
            molecule_types = sorted(
                {
                    m.get("molecule_type")
                    for m in molecules_without_smiles
                    if m.get("molecule_type")
                }
            )
            if any(mt in self._BIOLOGIC_TYPES for mt in molecule_types):
                type_list = ", ".join(molecule_types)
                error_msg = (
                    f"The compound '{compound_name}' was found in ChEMBL but does not have a SMILES structure. "
                    f"This tool is designed for small molecule compounds only. "
                    f"The found molecule(s) are of type(s): {type_list}. "
                    f"Biologics (antibodies, proteins, etc.) do not have SMILES representations. "
                    f"For searching similar biologics, consider using: "
                    f"PDB_search_similar_structures (for structure/sequence similarity search using PDB ID or sequence), "
                    f"BLAST_protein_search (for protein/antibody sequence similarity search, requires amino acid sequence), "
                    f"or UniProt_search (for searching proteins in UniProt database). "
                    f"For small molecule similarity search, use: PubChem_search_compounds_by_similarity (requires SMILES input)."
                )
            else:
                id_list = ", ".join(
                    m.get("chembl_id") for m in molecules_without_smiles[:3]
                )
                error_msg = (
                    f"The compound '{compound_name}' was found in ChEMBL (ChEMBL ID(s): "
                    f"{id_list}) "
                    f"but does not have a SMILES structure available. "
                    f"This tool requires SMILES for similarity search. "
                    f"For searching similar small molecules, consider using: "
                    f"PubChem_search_compounds_by_similarity (requires SMILES input)."
                )
        return {"error": error_msg}

    def _search_similar_molecules(self, query, similarity_threshold, max_results):
        """Resolve *query* to one or more SMILES and run similarity searches.

        *query* is interpreted, in order, as a ChEMBL ID, as a compound name,
        and finally - only if the name lookup fails - as a SMILES string.
        Returns a list with one entry per seed molecule that produced at
        least one similar molecule, or ``{"error": ...}``.
        """
        if not isinstance(query, str):
            return {
                "error": (
                    f"SMILES representation not found for the compound '{query}'. "
                    f"This tool requires SMILES structure for similarity search. "
                    f"If you have a SMILES string, you can use it directly as the query. "
                    f"Alternatively, consider using PubChem_search_compounds_by_similarity "
                    f"(requires SMILES input) for similarity search."
                )
            }

        if query.upper().startswith("CHEMBL"):
            result = self.get_smiles_pref_name_by_chembl_id(query)
            if isinstance(result, dict) and "error" in result:
                return result
            seeds = [
                {
                    "chembl_id": query,
                    "smiles": result["smiles"],
                    "pref_name": result.get("pref_name"),
                }
            ]
        else:
            lookup = self.get_chembl_smiles_pref_name_id_by_name(query)
            if isinstance(lookup, dict) and "error" in lookup:
                # The name lookup failed. The query may itself be a SMILES
                # string; otherwise keep the detailed lookup error.
                if not self._can_load_with_indigo(query):
                    return lookup
                seeds = [{"chembl_id": None, "smiles": query, "pref_name": None}]
            else:
                seeds = list(lookup)

        results_list = []
        for seed in seeds:
            smiles = seed["smiles"]
            if not self._can_load_with_indigo(smiles):
                return {"error": "Failed to load molecule with Indigo."}

            similarity_url = (
                f"{self.base_url}/similarity/{quote(smiles)}/"
                f"{similarity_threshold}.json?limit={max_results}"
            )
            hits = self._get_json(similarity_url).get("molecules", []) or []

            similar_molecules = []
            for hit in hits:
                structures = hit.get("molecule_structures", {})
                if structures is None:
                    # No structure record, so there is no SMILES to report.
                    continue
                similar_molecules.append(
                    {
                        "chembl_id": hit.get("molecule_chembl_id"),
                        "pref_name": hit.get("pref_name", "N/A"),
                        "smiles": structures.get("canonical_smiles", "N/A"),
                        "similarity": hit.get("similarity", "N/A"),
                    }
                )

            if not similar_molecules:
                continue
            results_list.append(
                {
                    "chembl_id": seed.get("chembl_id"),
                    "pref_name": seed.get("pref_name"),
                    "smiles": smiles,
                    "similar_molecules": similar_molecules,
                }
            )

        return results_list