Source code for tooluniverse.rdkit_cheminfo_tool

# rdkit_cheminfo_tool.py
"""
RDKit Cheminformatics Tools: Pharmacophore Features and Matched Molecular Pairs

Implements:
1. Pharmacophore feature extraction from SMILES
   - Feature types: HBD, HBA, Aromatic, Hydrophobic, PosIonizable, NegIonizable
   - Optional 3D feature center coordinates from MMFF conformer
2. Matched Molecular Pairs (MMP) analysis
   - Identifies common core and transforming fragments between two compounds
   - Computes property deltas and Tanimoto similarity

No external API calls. Requires RDKit.

References:
- Pharmacophore: Leach et al., Drug Discovery Today (2010)
- MMP: Hussain & Rea, J. Chem. Inf. Model. (2010)
"""

from typing import Dict, Any

from .base_tool import BaseTool
from .tool_registry import register_tool

try:
    from rdkit import Chem, DataStructs
    from rdkit.Chem import Descriptors, rdMolDescriptors, AllChem

    HAS_RDKIT = True
except Exception:
    HAS_RDKIT = False

try:
    from rdkit.Chem import rdMMPA

    HAS_MMPA = True
except Exception:
    HAS_MMPA = False

# SMARTS patterns for pharmacophore features.
# Each dict value is a LIST of complete SMARTS strings (not comma-joined),
# because complex patterns (Hydrophobic, PosIonizable, NegIonizable) contain
# internal commas as OR operators that must not be used as separators.
_PHARM_SMARTS: Dict[str, list] = {
    # Three separate SMARTS: N–H donors, O–H donors, S–H donors
    "HBD": ["[#7;!H0;v3,v4&+1]", "[#8;!H0]", "[#16;!H0]"],
    # Four separate SMARTS: tertiary N acceptors, ether/carbonyl O, aromatic O, S acceptors
    "HBA": [
        "[#7;H0;v3,$([N;H0;+0;$(N-a)])]",
        "[#8;H0;+0]",
        "[o;H0]",
        "[#16;H0;v2]",
    ],
    "Aromatic": ["a"],
    # Hydrophobic: split into simple, valid SMARTS to avoid nested-SMARTS parse errors.
    # Aromatic/heteroaromatic + heavy halogens + non-polar S; aliphatic sp3 C (CH3/CH2/CH)
    "Hydrophobic": [
        "[c,s,Br,I,S&H0&v2]",  # aromatic C/S, heavy halogens, thioether S
        "[CX4H3]",  # methyl carbons
        "[CX4H2]",  # methylene carbons
        "[CX4H1]",  # methine carbons
    ],
    # Single recursive SMARTS — must not be split
    "PosIonizable": [
        "[$([NH2;+0;$(N-[!$([OH,OX1])])]),"
        "$([NH;+0;$(N(-[!$([OH,OX1])])-[!$([OH,OX1])])]),"
        "$([N;H0;+0;$(N(-[!$([OH,OX1])])"
        "(-[!$([OH,OX1])])-[!$([OH,OX1])])]),"
        "$([n;H0;+0]),$([NH;+]),$([NH2;+])]"
    ],
    # Single recursive SMARTS — must not be split
    "NegIonizable": [
        "[$([C,S](=[O,S,P])-[O;H1,H0&-1]),$([#7;v5;$(N(-[O;H0&-1])-[OX2;H0])])]"
    ],
}


[docs] @register_tool("RDKitCheminfoTool") class RDKitCheminfoTool(BaseTool): """ RDKit-based cheminformatics tools for pharmacophore and MMP analysis. Endpoints: - pharmacophore_features: Extract pharmacophore feature centers from SMILES - matched_molecular_pair: Find molecular transformation between two SMILES """
[docs] def __init__(self, tool_config: Dict[str, Any]): super().__init__(tool_config) fields = tool_config.get("fields", {}) self.endpoint = fields.get("endpoint", "pharmacophore_features")
[docs] def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: if not HAS_RDKIT: return { "status": "error", "error": "RDKit is required. Install with: pip install rdkit", } try: if self.endpoint == "pharmacophore_features": return self._pharmacophore_features(arguments) elif self.endpoint == "matched_molecular_pair": return self._matched_molecular_pair(arguments) else: return { "status": "error", "error": f"Unknown endpoint: {self.endpoint}", } except Exception as e: return { "status": "error", "error": f"RDKit cheminformatics error: {str(e)}", }
[docs] def _parse_smiles(self, smiles: str): if not smiles or not smiles.strip(): return None, "smiles is required" mol = Chem.MolFromSmiles(smiles.strip()) if mol is None: return None, f"Invalid SMILES: {smiles}" return mol, None
[docs] def _get_descriptors(self, mol) -> Dict[str, Any]: return { "MW": round(Descriptors.ExactMolWt(mol), 2), "cLogP": round(Descriptors.MolLogP(mol), 3), "HBD": int(rdMolDescriptors.CalcNumHBD(mol)), "HBA": int(rdMolDescriptors.CalcNumHBA(mol)), "TPSA": round(Descriptors.TPSA(mol), 2), "rotatable_bonds": int(rdMolDescriptors.CalcNumRotatableBonds(mol)), "heavy_atoms": int(mol.GetNumHeavyAtoms()), "rings": int(rdMolDescriptors.CalcNumRings(mol)), }
[docs] def _pharmacophore_features(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Extract pharmacophore features from SMILES using SMARTS matching.""" smiles = arguments.get("smiles", "").strip() mol, err = self._parse_smiles(smiles) if err: return {"status": "error", "error": err} include_features = arguments.get("include_features") if include_features is None: include_features = list(_PHARM_SMARTS.keys()) features_found: Dict[str, Any] = {} feature_counts: Dict[str, int] = {} for feature_name in include_features: if feature_name not in _PHARM_SMARTS: continue atom_indices: set = set() for smarts in _PHARM_SMARTS[feature_name]: # already a list smarts = smarts.strip() if not smarts: continue try: pattern = Chem.MolFromSmarts(smarts) if pattern is None: continue for match in mol.GetSubstructMatches(pattern): atom_indices.update(match) except Exception: continue features_found[feature_name] = { "count": len(atom_indices), "atom_indices": sorted(list(atom_indices)), } feature_counts[feature_name] = len(atom_indices) # Attempt 3D conformer generation for feature center coordinates feature_centers_3d = None try: mol_h = Chem.AddHs(mol) embed_result = AllChem.EmbedMolecule(mol_h, randomSeed=42) if embed_result == 0: AllChem.MMFFOptimizeMolecule(mol_h) conf = mol_h.GetConformer() centers = {} for fname, fdata in features_found.items(): indices = fdata["atom_indices"] if indices: try: positions = [conf.GetAtomPosition(idx) for idx in indices] n = len(positions) center = [ round(sum(p.x for p in positions) / n, 3), round(sum(p.y for p in positions) / n, 3), round(sum(p.z for p in positions) / n, 3), ] centers[fname] = center except Exception: pass feature_centers_3d = centers if centers else None except Exception: pass n_hbd = feature_counts.get("HBD", 0) n_hba = feature_counts.get("HBA", 0) n_arom = feature_counts.get("Aromatic", 0) n_hydro = feature_counts.get("Hydrophobic", 0) return { "status": "success", "data": { "smiles": smiles, "pharmacophore_features": features_found, "feature_counts": feature_counts, "feature_centers_3d": feature_centers_3d, "interpretation": { "HBD_count": n_hbd, "HBA_count": n_hba, "aromatic_atom_count": n_arom, "hydrophobic_atom_count": n_hydro, "hbd_hba_ratio": round(n_hbd / n_hba, 2) if n_hba > 0 else None, "3d_available": feature_centers_3d is not None, "note": ( "High hydrophobic + low HBA may indicate poor aqueous solubility. " "HBD/HBA ratio influences membrane permeability (Ro5). " + ( "3D feature centers computed from MMFF conformer." if feature_centers_3d else "3D conformer generation failed; 2D feature counts provided." ) ), }, }, "metadata": { "source": "RDKit local computation", "method": "SMARTS-based pharmacophore features", "3d_method": "MMFF optimization via AllChem.EmbedMolecule", "reference": "Leach et al. (2010); RDKit MolChemicalFeatures", }, }
[docs] def _matched_molecular_pair(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Find MMP transformation between two SMILES compounds.""" smiles_a = arguments.get("smiles_a", "").strip() smiles_b = arguments.get("smiles_b", "").strip() mol_a, err = self._parse_smiles(smiles_a) if err: return {"status": "error", "error": f"smiles_a: {err}"} mol_b, err = self._parse_smiles(smiles_b) if err: return {"status": "error", "error": f"smiles_b: {err}"} canonical_a = Chem.MolToSmiles(mol_a) canonical_b = Chem.MolToSmiles(mol_b) desc_a = self._get_descriptors(mol_a) desc_b = self._get_descriptors(mol_b) deltas = {k: round(desc_b[k] - desc_a[k], 3) for k in desc_a} # Tanimoto similarity (Morgan r=2 fingerprints) tanimoto = None try: fp_a = AllChem.GetMorganFingerprintAsBitVect(mol_a, 2, nBits=2048) fp_b = AllChem.GetMorganFingerprintAsBitVect(mol_b, 2, nBits=2048) tanimoto = round(float(DataStructs.TanimotoSimilarity(fp_a, fp_b)), 4) except Exception: pass # MMP analysis via rdMMPA fragmentation mmp_result: Dict[str, Any] = {} if HAS_MMPA: try: frags_a = rdMMPA.FragmentMol( mol_a, minCuts=1, maxCuts=1, maxCutBonds=100 ) frags_b = rdMMPA.FragmentMol( mol_b, minCuts=1, maxCuts=1, maxCutBonds=100 ) # rdMMPA.FragmentMol returns (None, combined_mol) tuples where # combined_mol contains both sidechain and core as disconnected # fragments joined by '.' in the SMILES (both carry [*:1] attachment). # Use GetMolFrags to separate them; larger fragment = core. from rdkit.Chem import rdmolops def _split_frag_pair(frag_pair): """Return (core_smi, sc_smi) or (None, None) on failure.""" if len(frag_pair) < 2 or frag_pair[1] is None: return None, None combined = frag_pair[1] parts = rdmolops.GetMolFrags(combined, asMols=True) if len(parts) != 2: return None, None # Larger fragment = core; smaller = sidechain parts_sorted = sorted(parts, key=lambda m: m.GetNumHeavyAtoms()) sc_mol, core_mol = parts_sorted[0], parts_sorted[1] return Chem.MolToSmiles(core_mol), Chem.MolToSmiles(sc_mol) cores_a: Dict[str, str] = {} for frag_pair in frags_a: core_smi, sc_smi = _split_frag_pair(frag_pair) if core_smi: cores_a[core_smi] = sc_smi or "" cores_b: Dict[str, str] = {} for frag_pair in frags_b: core_smi, sc_smi = _split_frag_pair(frag_pair) if core_smi: cores_b[core_smi] = sc_smi or "" common = set(cores_a.keys()) & set(cores_b.keys()) if common: # Pick largest common core by heavy atom count best_core = max( common, key=lambda s: Chem.MolFromSmiles(s).GetNumHeavyAtoms() if Chem.MolFromSmiles(s) else 0, ) sc_a = cores_a.get(best_core, "") sc_b = cores_b.get(best_core, "") mmp_result = { "is_mmp": True, "common_core_smiles": best_core, "side_chain_a": sc_a, "side_chain_b": sc_b, "transformation": f"{sc_a}{sc_b}", "n_common_cores_found": len(common), } else: mmp_result = { "is_mmp": False, "note": ( "No common core found with single-cut fragmentation. " "Compounds may differ by multiple structural changes." ), } except Exception as e: mmp_result = { "is_mmp": None, "error": f"MMP fragmentation failed: {str(e)}", } else: mmp_result = { "is_mmp": None, "note": "rdMMPA module not available in this RDKit build.", } if tanimoto is not None: if tanimoto >= 0.85: sim_label = "High (≥0.85) — likely MMP" elif tanimoto >= 0.5: sim_label = "Moderate (0.5–0.85)" else: sim_label = "Low (<0.5) — structurally distinct" else: sim_label = None return { "status": "success", "data": { "smiles_a": canonical_a, "smiles_b": canonical_b, "tanimoto_similarity": tanimoto, "similarity_label": sim_label, "mmp_analysis": mmp_result, "property_deltas": { "description": "Change in properties (B minus A). Positive = increase in compound B.", "deltas": deltas, "formatted": { "ΔMW": f"{deltas['MW']:+.1f} Da", "ΔcLogP": f"{deltas['cLogP']:+.3f}", "ΔHBD": f"{deltas['HBD']:+d}", "ΔHBA": f"{deltas['HBA']:+d}", "ΔTPSA": f"{deltas['TPSA']:+.1f} Ų", }, }, "descriptors": { "compound_a": desc_a, "compound_b": desc_b, }, }, "metadata": { "source": "RDKit local computation", "method": "rdMMPA single-cut fragmentation + Morgan r=2 Tanimoto", "reference": "Hussain & Rea, J. Chem. Inf. Model. (2010)", }, }