Source code for tooluniverse.chem_tool

"""
ChEMBL API Tools

This module provides tools for accessing the ChEMBL database:
- ChEMBLTool: Specialized tool for similarity search
- ChEMBLRESTTool: Generic REST API tool for ChEMBL endpoints
"""

import requests
from urllib.parse import quote
from typing import Any, Dict

# from rdkit import Chem
from .base_tool import BaseTool
from .tool_registry import register_tool
from .http_utils import request_with_retry
from indigo import Indigo



[docs]
@register_tool("ChEMBLRESTTool")
class ChEMBLRESTTool(BaseTool):
    """
    Generic ChEMBL REST API tool.
    Wrapper for ChEMBL API endpoints defined in chembl_tools.json.
    Supports all ChEMBL data resources: molecules, targets, assays, activities, drugs, etc.
    """


[docs]
    def __init__(self, tool_config: Dict):
        """Initialise the tool with the ChEMBL base URL and a shared HTTP session.

        Args:
            tool_config: Tool configuration, forwarded unchanged to ``BaseTool``.
        """
        super().__init__(tool_config)
        self.base_url = "https://www.ebi.ac.uk/chembl/api/data"

        # One session is kept per tool instance so the default headers below
        # are attached to every request issued through it.
        http_session = requests.Session()
        http_session.headers.update(
            {"Accept": "application/json", "User-Agent": "ToolUniverse/1.0"}
        )
        self.session = http_session

        # Passed as ``timeout=`` to request_with_retry in run().
        self.timeout = 30



[docs]
    def _build_url(self, args: Dict[str, Any]) -> str:
        """Build URL from endpoint template and arguments"""
        endpoint_template = self.tool_config.get("fields", {}).get("endpoint", "")
        tool_name = self.tool_config.get("name", "")

        if endpoint_template:
            url = endpoint_template
            # Replace placeholders in URL
            for k, v in args.items():
                url = url.replace(f"{{{k}}}", str(v))
            # If URL doesn't start with http, prepend base_url
            if not url.startswith("http"):
                url = self.base_url + url
            return url

        # Build URL based on tool name patterns
        if tool_name.startswith("ChEMBL_get_molecule"):
            chembl_id = args.get("chembl_id", "")
            if chembl_id:
                return f"{self.base_url}/molecule/{chembl_id}.json"
        elif tool_name.startswith("ChEMBL_get_target"):
            target_id = args.get("target_chembl_id", "")
            if target_id:
                return f"{self.base_url}/target/{target_id}.json"
        elif tool_name.startswith("ChEMBL_get_assay"):
            assay_id = args.get("assay_chembl_id", "")
            if assay_id:
                return f"{self.base_url}/assay/{assay_id}.json"
        elif tool_name.startswith("ChEMBL_get_activity"):
            activity_id = args.get("activity_id", "")
            if activity_id:
                return f"{self.base_url}/activity/{activity_id}.json"
        elif tool_name.startswith("ChEMBL_get_drug"):
            drug_id = args.get("drug_chembl_id", "")
            if drug_id:
                return f"{self.base_url}/drug/{drug_id}.json"

        return self.base_url



[docs]
    def _build_params(self, args: Dict[str, Any]) -> Dict[str, Any]:
        """Build query parameters for ChEMBL API"""
        params = {}
        self.tool_config.get("name", "")

        # ChEMBL API uses query parameters for filtering
        # Common parameters: limit, offset, format, ordering
        if "limit" in args:
            params["limit"] = args["limit"]
        if "offset" in args:
            params["offset"] = args["offset"]
        if "format" in args:
            params["format"] = args["format"]
        else:
            params["format"] = "json"
        # Optional field projection to reduce payload size on heavy endpoints.
        # ChEMBL supports projection via the `only` query parameter.
        # We accept ToolUniverse argument name `fields` and map it to `only`.
        # Power users can also pass `only` directly.
        only_value = args.get("only", None)
        fields_value = args.get("fields", None)
        projection_value = only_value if only_value is not None else fields_value
        if projection_value is not None:
            if isinstance(projection_value, (list, tuple)):
                params["only"] = ",".join(str(f) for f in projection_value)
            else:
                params["only"] = str(projection_value)
        if "ordering" in args:
            params["ordering"] = args["ordering"]

        # Add any filter parameters (ChEMBL uses field__filter syntax)
        # e.g., molecule_chembl_id__exact, pref_name__contains
        for key, value in args.items():
            if (
                key
                not in [
                    "limit",
                    "offset",
                    "format",
                    "fields",
                    "only",
                    "ordering",
                    "chembl_id",
                    "target_chembl_id",
                    "assay_chembl_id",
                    "activity_id",
                    "drug_chembl_id",
                ]
                and value is not None
            ):
                params[key] = value

        return params



[docs]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Perform the configured ChEMBL GET request and wrap the outcome.

        Args:
            arguments: Tool arguments; used to build both URL and query params.

        Returns:
            On success a dict with ``status="success"``, the decoded ``data``
            and the final ``url`` (plus ``page_meta``/``pagination``/``count``
            when they can be derived). Image responses are summarised rather
            than returned. On failure a dict with ``status="error"`` and
            diagnostic fields; exceptions are not propagated.
        """
        # Pre-initialised so the error handlers can report it even when URL
        # construction itself is what failed.
        url = None
        try:
            url = self._build_url(arguments)
            params = self._build_params(arguments)
            tool_name = self.tool_config.get("name", "")

            # Image endpoints are recognised by tool name or by URL path.
            is_image_endpoint = (
                "get_molecule_image" in tool_name.lower() or "/image/" in url
            )

            response = request_with_retry(
                self.session,
                "GET",
                url,
                params=params,
                timeout=self.timeout,
                max_attempts=3,
                backoff_seconds=0.5,
            )
            response.raise_for_status()

            if is_image_endpoint:
                content_type = response.headers.get("Content-Type", "")
                if any(marker in content_type for marker in ("image", "svg")):
                    # Binary payload: report where it is and how big, not the bytes.
                    return {
                        "status": "success",
                        "data": f"Image data available at URL (Content-Type: {content_type})",
                        "url": response.url,
                        "content_type": content_type,
                        "image_size_bytes": len(response.content),
                    }

            payload = response.json()
            result = {
                "status": "success",
                "data": payload,
                "url": response.url,
            }

            if isinstance(payload, list):
                result["count"] = len(payload)
            elif isinstance(payload, dict):
                # Surface ChEMBL pagination metadata at the top level.
                if "page_meta" in payload:
                    result["page_meta"] = payload["page_meta"]
                if "page" in payload:
                    result["pagination"] = payload["page"]

                # ChEMBL often returns records under a key named after the
                # resource; the first such list found provides the count.
                resource_keys = (
                    "molecules",
                    "targets",
                    "assays",
                    "activities",
                    "drugs",
                    "mechanisms",
                    "indications",
                    "binding_sites",
                )
                records = next(
                    (
                        payload[key]
                        for key in resource_keys
                        if isinstance(payload.get(key), list)
                    ),
                    None,
                )
                if records is not None:
                    result["count"] = len(records)

            return result

        except requests.exceptions.HTTPError as http_err:
            failed = http_err.response
            status_code = getattr(failed, "status_code", None)
            # Short preview of the body for debugging; avoids huge payloads.
            body_text = getattr(failed, "text", None)
            detail = body_text[:500] if body_text else None
            return {
                "status": "error",
                "error": f"ChEMBL API returned HTTP {status_code}",
                "url": getattr(failed, "url", url),
                "status_code": status_code,
                "detail": detail,
            }
        except requests.exceptions.RequestException as req_err:
            return {
                "status": "error",
                "error": f"ChEMBL API request failed: {str(req_err)}",
                "url": url,
                "detail": repr(req_err),
            }
        except Exception as unexpected:
            return {
                "status": "error",
                "error": f"Unexpected error: {str(unexpected)}",
                "url": url,
                "detail": repr(unexpected),
            }





[docs]
@register_tool("ChEMBLTool")
class ChEMBLTool(BaseTool):
    """
    Tool to search for molecules similar to a given compound name or SMILES using the ChEMBL Web Services API.

    Note: This tool is designed for small molecule compounds only. Biologics (antibodies, proteins,
    oligonucleotides, etc.) do not have SMILES structures and cannot be used for structure-based
    similarity search. The tool will provide detailed error messages when biologics are queried,
    explaining the reason and suggesting alternative tools.
    """


[docs]
    def __init__(self, tool_config, base_url="https://www.ebi.ac.uk/chembl/api/data"):
        """Store the ChEMBL base URL and create the Indigo toolkit instance.

        Args:
            tool_config: Tool configuration, forwarded unchanged to ``BaseTool``.
            base_url: Root URL of the ChEMBL web services.
        """
        super().__init__(tool_config)
        # Indigo is used in _search_similar_molecules to load each SMILES
        # before the similarity request is made.
        self.indigo = Indigo()
        self.base_url = base_url



[docs]
    def run(self, arguments):
        query = arguments.get("query")
        similarity_threshold = arguments.get("similarity_threshold", 80)
        max_results = arguments.get("max_results", 20)

        if not query:
            return {"error": "`query` parameter is required."}
        return self._search_similar_molecules(query, similarity_threshold, max_results)



[docs]
    def get_chembl_id_by_name(self, compound_name):
        """
        Search ChEMBL for a compound by name and return the ChEMBL IDs of the top matches.

        Args:
            compound_name: Free-text compound name; it is percent-encoded
                before being placed in the search URL.

        Returns:
            ``{"chembl_ids": [...]}`` with the IDs of up to the first three
            search hits, or ``{"error": "..."}`` when the search yields no
            usable result.

        Raises:
            requests.exceptions.RequestException: On connection problems,
                timeouts, or a non-2xx HTTP status.
        """
        search_url = f"{self.base_url}/molecule/search.json?q={quote(compound_name)}"
        # requests has no default timeout; without one a stalled connection
        # would block the caller indefinitely. 30 s matches ChEMBLRESTTool.
        response = requests.get(
            search_url, headers={"Accept": "application/json"}, timeout=30
        )
        response.raise_for_status()

        results = response.json().get("molecules", [])
        if not results or not isinstance(results, list):
            return {"error": "No valid results found for the compound name."}

        # Only the top 3 hits are considered; hits lacking an ID are skipped.
        chembl_ids = [
            molecule["molecule_chembl_id"]
            for molecule in results[:3]
            if molecule.get("molecule_chembl_id")
        ]
        if not chembl_ids:
            return {"error": "No ChEMBL IDs found for the compound name."}
        return {"chembl_ids": chembl_ids}



[docs]
    def get_smiles_pref_name_by_chembl_id(self, query):
        """
        Given a ChEMBL ID, return a dict with canonical SMILES and preferred name.
        """
        headers = {"Accept": "application/json"}
        if query.upper().startswith("CHEMBL"):
            molecule_url = f"{self.base_url}/molecule/{quote(query)}.json"
            response = requests.get(molecule_url, headers=headers)
            response.raise_for_status()
            molecule = response.json()
            if not molecule or not isinstance(molecule, dict):
                return {"error": "No valid molecule found for the given ChEMBL ID."}
            molecule_structures = molecule.get("molecule_structures")
            if not molecule_structures or not isinstance(molecule_structures, dict):
                return {
                    "error": "Molecule structures not found or invalid for the ChEMBL ID."
                }
            smiles = molecule_structures.get("canonical_smiles")
            pref_name = molecule.get("pref_name")
            if not smiles:
                return {"error": "SMILES not found for the given ChEMBL ID."}
            return {"smiles": smiles, "pref_name": pref_name}
        else:
            return None



[docs]
    def get_chembl_smiles_pref_name_id_by_name(self, compound_name):
        """
        Search ChEMBL for a compound by name and return a list of dicts with ChEMBL ID, canonical SMILES, and preferred name for the top 5 matches.

        For hits whose search record carries no SMILES, the molecule record
        is fetched individually via ``get_smiles_pref_name_by_chembl_id``.

        Args:
            compound_name: Free-text compound name; it is percent-encoded
                before being placed in the search URL.

        Returns:
            A non-empty list of ``{"chembl_id", "smiles", "pref_name"}``
            dicts, or ``{"error": "..."}`` when no hit with a SMILES could
            be found. The error text explains whether the hits were
            biologics and suggests alternative tools.

        Raises:
            requests.exceptions.RequestException: On connection problems,
                timeouts, or a non-2xx HTTP status.
        """
        biologic_types = ("Antibody", "Protein", "Oligonucleotide", "Oligosaccharide")

        search_url = f"{self.base_url}/molecule/search.json?q={quote(compound_name)}"
        # requests has no default timeout; without one a stalled connection
        # would block the caller indefinitely. 30 s matches ChEMBLRESTTool.
        response = requests.get(
            search_url, headers={"Accept": "application/json"}, timeout=30
        )
        response.raise_for_status()
        results = response.json().get("molecules", [])
        if not results or not isinstance(results, list):
            return {"error": "No valid results found for the compound name."}

        output = []
        molecules_without_smiles = []
        for molecule in results[:5]:
            chembl_id = molecule.get("molecule_chembl_id")
            if not chembl_id:
                continue
            pref_name = molecule.get("pref_name")

            # The search record may have `molecule_structures` missing or null.
            molecule_structures = molecule.get("molecule_structures")
            smiles = (
                molecule_structures.get("canonical_smiles")
                if molecule_structures is not None
                else None
            )
            if smiles:
                output.append(
                    {"chembl_id": chembl_id, "smiles": smiles, "pref_name": pref_name}
                )
                continue

            # Fall back to the full molecule record for this ID.
            details = self.get_smiles_pref_name_by_chembl_id(chembl_id)
            if isinstance(details, dict) and "error" not in details:
                output.append(
                    {
                        "chembl_id": chembl_id,
                        "smiles": details["smiles"],
                        "pref_name": details.get("pref_name"),
                    }
                )
            else:
                # Remember hits without SMILES to explain the failure later.
                molecules_without_smiles.append(
                    {
                        "chembl_id": chembl_id,
                        "pref_name": pref_name,
                        "molecule_type": molecule.get("molecule_type", "Unknown"),
                    }
                )

        if output:
            return output

        # Nothing usable: build an error with the reason and alternative tools.
        error_msg = "No ChEMBL IDs or SMILES found for the compound name."
        if molecules_without_smiles:
            # Sorted so the message is deterministic across runs.
            molecule_types = sorted(
                {
                    m.get("molecule_type")
                    for m in molecules_without_smiles
                    if m.get("molecule_type")
                }
            )
            if any(mt in biologic_types for mt in molecule_types):
                error_msg = (
                    f"The compound '{compound_name}' was found in ChEMBL but does not have a SMILES structure. "
                    f"This tool is designed for small molecule compounds only. "
                    f"The found molecule(s) are of type(s): {', '.join(molecule_types)}. "
                    f"Biologics (antibodies, proteins, etc.) do not have SMILES representations. "
                    f"For searching similar biologics, consider using: "
                    f"PDB_search_similar_structures (for structure/sequence similarity search using PDB ID or sequence), "
                    f"BLAST_protein_search (for protein/antibody sequence similarity search, requires amino acid sequence), "
                    f"or UniProt_search (for searching proteins in UniProt database). "
                    f"For small molecule similarity search, use: PubChem_search_compounds_by_similarity (requires SMILES input)."
                )
            else:
                error_msg = (
                    f"The compound '{compound_name}' was found in ChEMBL (ChEMBL ID(s): "
                    f"{', '.join([m.get('chembl_id') for m in molecules_without_smiles[:3]])}) "
                    f"but does not have a SMILES structure available. "
                    f"This tool requires SMILES for similarity search. "
                    f"For searching similar small molecules, consider using: "
                    f"PubChem_search_compounds_by_similarity (requires SMILES input)."
                )
        return {"error": error_msg}



[docs]
    def _search_similar_molecules(self, query, similarity_threshold, max_results):
        headers = {"Accept": "application/json"}

        smiles_info_list = []

        # If the query looks like a ChEMBL ID, fetch its SMILES and pref_name
        if isinstance(query, str) and query.upper().startswith("CHEMBL"):
            result = self.get_smiles_pref_name_by_chembl_id(query)
            if isinstance(result, dict) and "error" in result:
                return result
            smiles_info_list.append(
                {
                    "chembl_id": query,
                    "smiles": result["smiles"],
                    "pref_name": result.get("pref_name"),
                }
            )

        # If not a ChEMBL ID, use get_chembl_smiles_pref_name_id_by_name to get info
        if len(smiles_info_list) == 0 and isinstance(query, str):
            results = self.get_chembl_smiles_pref_name_id_by_name(query)
            if isinstance(results, dict) and "error" in results:
                return results
            for item in results:
                smiles_info_list.append(item)

        if len(smiles_info_list) == 0:
            # Check if the compound exists in ChEMBL but without SMILES
            if isinstance(query, str) and not query.upper().startswith("CHEMBL"):
                # Try to get molecule info to provide better error message
                headers = {"Accept": "application/json"}
                search_url = f"{self.base_url}/molecule/search.json?q={quote(query)}"
                try:
                    response = requests.get(search_url, headers=headers)
                    response.raise_for_status()
                    results = response.json().get("molecules", [])
                    if results and len(results) > 0:
                        molecule = results[0]
                        molecule_type = molecule.get("molecule_type", "Unknown")
                        chembl_id = molecule.get("molecule_chembl_id")
                        if molecule_type in [
                            "Antibody",
                            "Protein",
                            "Oligonucleotide",
                            "Oligosaccharide",
                        ]:
                            return {
                                "error": (
                                    f"The compound '{query}' was found in ChEMBL (ChEMBL ID: {chembl_id}) "
                                    f"but is a {molecule_type.lower()}, not a small molecule. "
                                    f"This tool is designed for small molecule compounds only. "
                                    f"Biologics (antibodies, proteins, etc.) do not have SMILES representations "
                                    f"and cannot be used for structure-based similarity search. "
                                    f"For searching similar biologics, consider using: "
                                    f"PDB_search_similar_structures (for structure/sequence similarity search using PDB ID or sequence), "
                                    f"BLAST_protein_search (for protein/antibody sequence similarity search, requires amino acid sequence), "
                                    f"or UniProt_search (for searching proteins in UniProt database). "
                                    f"For small molecule similarity search, use: PubChem_search_compounds_by_similarity (requires SMILES input)."
                                )
                            }
                except Exception:
                    pass
            return {
                "error": (
                    f"SMILES representation not found for the compound '{query}'. "
                    f"This tool requires SMILES structure for similarity search. "
                    f"If you have a SMILES string, you can use it directly as the query. "
                    f"Alternatively, consider using PubChem_search_compounds_by_similarity "
                    f"(requires SMILES input) for similarity search."
                )
            }

        results_list = []
        for info in smiles_info_list:
            smiles = info["smiles"]
            pref_name = info.get("pref_name")
            chembl_id = info.get("chembl_id")
            mol = self.indigo.loadMolecule(smiles)
            if mol is None:
                return {"error": "Failed to load molecule with Indigo."}

            encoded_smiles = quote(smiles)
            similarity_url = f"{self.base_url}/similarity/{encoded_smiles}/{similarity_threshold}.json?limit={max_results}"
            sim_response = requests.get(similarity_url, headers=headers)
            sim_response.raise_for_status()
            sim_results = sim_response.json().get("molecules", [])
            similar_molecules = []
            for mol in sim_results:
                sim_chembl_id = mol.get("molecule_chembl_id")
                sim_pref_name = mol.get("pref_name", "N/A")
                mol_structures = mol.get("molecule_structures", {})
                if mol_structures is None:
                    continue
                mol_smiles = mol_structures.get("canonical_smiles", "N/A")
                similarity = mol.get("similarity", "N/A")
                similar_molecules.append(
                    {
                        "chembl_id": sim_chembl_id,
                        "pref_name": sim_pref_name,
                        "smiles": mol_smiles,
                        "similarity": similarity,
                    }
                )
            if len(similar_molecules) == 0:
                continue
            results_list.append(
                {
                    "chembl_id": chembl_id,
                    "pref_name": pref_name,
                    "smiles": smiles,
                    "similar_molecules": similar_molecules,
                }
            )

        return results_list