Source code for tooluniverse.chem_tool

import requests
from urllib.parse import quote

# from rdkit import Chem
from .base_tool import BaseTool
from .tool_registry import register_tool
from indigo import Indigo


@register_tool("ChEMBLTool")
class ChEMBLTool(BaseTool):
    """
    Tool to search for molecules similar to a given compound using the
    ChEMBL Web Services API.

    The compound is identified by a ChEMBL ID (any string starting with
    "CHEMBL", case-insensitive) or by a free-text name.  Any other string,
    including a raw SMILES, is sent to the ChEMBL name search as-is.
    """

    def __init__(self, tool_config, base_url="https://www.ebi.ac.uk/chembl/api/data"):
        """
        Store the API base URL and create the Indigo instance used to parse
        SMILES strings before they are sent to the similarity endpoint.
        """
        super().__init__(tool_config)
        self.base_url = base_url
        self.indigo = Indigo()

    def run(self, arguments):
        """
        Entry point: run a similarity search for ``arguments["query"]``.

        Keys read from ``arguments``:
            query: compound name or ChEMBL ID (required).
            similarity_threshold: value placed in the similarity URL
                (default 80).
            max_results: value of the ``limit`` query parameter (default 20).

        Returns the result of ``_search_similar_molecules``, or an
        ``{"error": ...}`` dict when ``query`` is missing or empty.
        """
        query = arguments.get("query")
        similarity_threshold = arguments.get("similarity_threshold", 80)
        max_results = arguments.get("max_results", 20)
        if not query:
            return {"error": "`query` parameter is required."}
        return self._search_similar_molecules(query, similarity_threshold, max_results)

    def _get_json(self, url):
        """
        GET ``url`` asking for JSON and return the decoded response body.

        Raises whatever ``requests`` raises for connection problems, and
        ``requests.HTTPError`` (via ``raise_for_status``) for 4xx/5xx replies.
        """
        # NOTE(review): no `timeout` is passed, so a stalled connection blocks
        # indefinitely.  Consider adding one once an acceptable upper bound
        # for slow similarity searches is agreed.
        response = requests.get(url, headers={"Accept": "application/json"})
        response.raise_for_status()
        return response.json()

    def get_chembl_id_by_name(self, compound_name):
        """
        Search ChEMBL for a compound by name and return the ChEMBL IDs of the
        top 3 matches as ``{"chembl_ids": [...]}``.

        Returns an ``{"error": ...}`` dict when the search yields no usable
        result or none of the top matches carries a ChEMBL ID.
        """
        search_url = f"{self.base_url}/molecule/search.json?q={quote(compound_name)}"
        results = self._get_json(search_url).get("molecules", [])
        if not results or not isinstance(results, list):
            return {"error": "No valid results found for the compound name."}

        chembl_ids = [
            molecule.get("molecule_chembl_id")
            for molecule in results[:3]  # only the top 3 matches are considered
            if molecule.get("molecule_chembl_id")
        ]
        if not chembl_ids:
            return {"error": "No ChEMBL IDs found for the compound name."}
        return {"chembl_ids": chembl_ids}

    def get_smiles_pref_name_by_chembl_id(self, query):
        """
        Given a ChEMBL ID, return a dict with canonical SMILES and preferred
        name: ``{"smiles": ..., "pref_name": ...}``.

        Returns ``None`` when ``query`` does not start with "CHEMBL"
        (case-insensitive), and an ``{"error": ...}`` dict when the molecule
        record or its SMILES is missing.
        """
        if not query.upper().startswith("CHEMBL"):
            return None

        molecule = self._get_json(f"{self.base_url}/molecule/{quote(query)}.json")
        if not molecule or not isinstance(molecule, dict):
            return {"error": "No valid molecule found for the given ChEMBL ID."}

        molecule_structures = molecule.get("molecule_structures")
        if not molecule_structures or not isinstance(molecule_structures, dict):
            return {
                "error": "Molecule structures not found or invalid for the ChEMBL ID."
            }

        smiles = molecule_structures.get("canonical_smiles")
        if not smiles:
            return {"error": "SMILES not found for the given ChEMBL ID."}
        return {"smiles": smiles, "pref_name": molecule.get("pref_name")}

    def get_chembl_smiles_pref_name_id_by_name(self, compound_name):
        """
        Search ChEMBL for a compound by name and return a list of dicts with
        ChEMBL ID, canonical SMILES, and preferred name for the top 5 matches.

        Matches without a ChEMBL ID are skipped.  When a match has an ID but
        no SMILES in the search response, the molecule record is fetched by
        ID to fill it in; if that also fails the match is skipped.  Returns an
        ``{"error": ...}`` dict when nothing usable is found.
        """
        search_url = f"{self.base_url}/molecule/search.json?q={quote(compound_name)}"
        results = self._get_json(search_url).get("molecules", [])
        if not results or not isinstance(results, list):
            return {"error": "No valid results found for the compound name."}

        output = []
        for molecule in results[:5]:
            chembl_id = molecule.get("molecule_chembl_id", None)
            # The API may return an explicit null for `molecule_structures`.
            molecule_structures = molecule.get("molecule_structures", {})
            if molecule_structures is not None:
                smiles = molecule_structures.get("canonical_smiles", None)
            else:
                smiles = None
            pref_name = molecule.get("pref_name")

            if not chembl_id:
                continue
            if smiles:
                output.append(
                    {"chembl_id": chembl_id, "smiles": smiles, "pref_name": pref_name}
                )
                continue

            # Search hit lacks a SMILES: fall back to the per-molecule record.
            fetched = self.get_smiles_pref_name_by_chembl_id(chembl_id)
            if isinstance(fetched, dict) and "error" not in fetched:
                output.append(
                    {
                        "chembl_id": chembl_id,
                        "smiles": fetched["smiles"],
                        "pref_name": fetched.get("pref_name"),
                    }
                )

        if not output:
            return {"error": "No ChEMBL IDs or SMILES found for the compound name."}
        return output

    def _search_similar_molecules(self, query, similarity_threshold, max_results):
        """
        Resolve ``query`` to one or more SMILES strings and run a ChEMBL
        similarity search for each of them.

        Returns a list with one entry per resolved compound that has at least
        one similar molecule.  Each entry holds ``chembl_id``, ``pref_name``,
        ``smiles`` and ``similar_molecules`` (dicts with ``chembl_id``,
        ``pref_name``, ``smiles``, ``similarity``).  The list may be empty.
        Returns an ``{"error": ...}`` dict when the query cannot be resolved
        or a SMILES cannot be loaded.
        """
        smiles_info_list = []

        if isinstance(query, str) and query.upper().startswith("CHEMBL"):
            # Query looks like a ChEMBL ID: fetch its SMILES and pref_name.
            result = self.get_smiles_pref_name_by_chembl_id(query)
            if isinstance(result, dict) and "error" in result:
                return result
            smiles_info_list.append(
                {
                    "chembl_id": query,
                    "smiles": result["smiles"],
                    "pref_name": result.get("pref_name"),
                }
            )
        elif isinstance(query, str):
            # Not a ChEMBL ID: resolve through the name search instead.
            results = self.get_chembl_smiles_pref_name_id_by_name(query)
            if isinstance(results, dict) and "error" in results:
                return results
            smiles_info_list.extend(results)

        if not smiles_info_list:
            return {"error": "SMILES representation not found for the compound."}

        results_list = []
        for info in smiles_info_list:
            smiles = info["smiles"]

            # Parse the SMILES with Indigo before querying the API.
            # NOTE(review): Indigo may raise on unparsable input rather than
            # return None, in which case this check never fires - confirm.
            parsed = self.indigo.loadMolecule(smiles)
            if parsed is None:
                return {"error": "Failed to load molecule with Indigo."}

            # NOTE(review): quote() leaves "/" unescaped by default, and "/"
            # is a legal SMILES character; such SMILES would alter the URL
            # path.  Verify how the ChEMBL endpoint expects these encoded.
            encoded_smiles = quote(smiles)
            similarity_url = f"{self.base_url}/similarity/{encoded_smiles}/{similarity_threshold}.json?limit={max_results}"
            sim_results = self._get_json(similarity_url).get("molecules", [])

            similar_molecules = []
            for hit in sim_results:
                hit_structures = hit.get("molecule_structures", {})
                if hit_structures is None:
                    # Hits without structure data are dropped.
                    continue
                similar_molecules.append(
                    {
                        "chembl_id": hit.get("molecule_chembl_id"),
                        "pref_name": hit.get("pref_name", "N/A"),
                        "smiles": hit_structures.get("canonical_smiles", "N/A"),
                        "similarity": hit.get("similarity", "N/A"),
                    }
                )

            if not similar_molecules:
                continue
            results_list.append(
                {
                    "chembl_id": info.get("chembl_id"),
                    "pref_name": info.get("pref_name"),
                    "smiles": smiles,
                    "similar_molecules": similar_molecules,
                }
            )
        return results_list