Source code for tooluniverse.boltz_tool
import os
import pprint
import subprocess
import tempfile
import yaml
import json
import shutil
from .base_tool import BaseTool
from .tool_registry import register_tool
[docs]
@register_tool("Boltz2DockingTool")
class Boltz2DockingTool(BaseTool):
    """
    Tool to perform protein-ligand docking and affinity prediction using the local Boltz-2 model.

    This tool constructs a YAML input file, runs the `boltz predict` command,
    and parses the output to return the predicted structure and affinity.
    """

    # Optional `boltz predict` flags that take a value; forwarded as `--<name> <value>`.
    _VALUE_FLAGS = (
        "recycling_steps",
        "diffusion_samples",
        "sampling_steps",
        "step_scale",
    )
    # Optional top-level sections copied from the arguments into the input YAML.
    _OPTIONAL_YAML_SECTIONS = ("constraints", "templates")

    def __init__(self, tool_config: dict):
        """
        Initializes the Boltz2DockingTool.

        Checks if the 'boltz' command is available in the system's PATH.

        Args:
            tool_config (dict): Configuration forwarded to ``BaseTool.__init__``.

        Raises:
            EnvironmentError: If no 'boltz' executable can be found on PATH.
        """
        super().__init__(tool_config)
        if not shutil.which("boltz"):
            raise EnvironmentError(
                "The 'boltz' command is not found. "
                "Please ensure the 'boltz' package is installed and accessible in the system's PATH. "
                "Installation guide: https://github.com/jwohlwend/boltz"
            )

    def _build_yaml_input(self, arguments: dict) -> dict:
        """
        Constructs the YAML data structure for the Boltz input.

        The protein is always written as chain "A"; every ligand is written
        under its own 'id'. The first ligand is used as the binder for the
        affinity property.

        Args:
            arguments (dict): Tool arguments; reads 'protein_sequence',
                'ligands' and the optional 'constraints' / 'templates'.

        Returns:
            dict: A mapping with 'version', 'sequences', 'properties' and any
            optional sections that were supplied with a non-None value.

        Raises:
            ValueError: If no ligand is given, a ligand has no 'id', or a
                ligand has neither a 'smiles' nor a 'ccd' key.
        """
        protein_sequence = arguments.get("protein_sequence")
        ligands = arguments.get("ligands") or []

        # The first ligand is assumed to be the binder for affinity prediction
        if not ligands:
            raise ValueError(
                "At least one ligand must be provided in the 'ligands' list."
            )
        binder_id = ligands[0].get("id")
        if not binder_id:
            raise ValueError("The first ligand in the list must have a valid 'id'.")

        # --- Sequences Section ---
        sequences = [{"protein": {"id": "A", "sequence": protein_sequence}}]
        for i, ligand_data in enumerate(ligands):
            chain_id = ligand_data.get("id")
            if not chain_id:
                raise ValueError(f"Ligand at index {i} must have an 'id' key.")
            entry = {"id": chain_id}
            # 'smiles' takes precedence when both keys are present.
            if "smiles" in ligand_data:
                entry["smiles"] = ligand_data["smiles"]
            elif "ccd" in ligand_data:
                entry["ccd"] = ligand_data["ccd"]
            else:
                raise ValueError(
                    f"Ligand at index {i} must have a 'smiles' or 'ccd' key."
                )
            sequences.append({"ligand": entry})

        # --- Final YAML Structure (affinity is always requested for the binder) ---
        yaml_input = {
            "version": 1,
            "sequences": sequences,
            "properties": [{"affinity": {"binder": binder_id}}],
        }

        # Optional sections: an explicit None means "not supplied", so it is
        # skipped rather than written out as a null section.
        for section in self._OPTIONAL_YAML_SECTIONS:
            if arguments.get(section) is not None:
                yaml_input[section] = arguments[section]
        return yaml_input

    def _build_command(
        self, arguments: dict, input_yaml_path: str, output_dir: str
    ) -> list:
        """Assembles the `boltz predict` argument list for one run."""
        command = [
            "boltz",
            "predict",
            input_yaml_path,
            "--out_dir",
            output_dir,
            "--use_msa_server",
            "--override",  # Override existing results if any
        ]
        for key in self._VALUE_FLAGS:
            value = arguments.get(key)
            # Skip unset/None values so the literal string "None" is never
            # passed to the CLI.
            if value is not None:
                command.extend([f"--{key}", str(value)])
        if arguments.get("use_potentials", False):
            command.append("--use_potentials")
        return command

    def _collect_results(
        self, arguments: dict, output_dir: str, input_filename: str
    ) -> dict:
        """Reads the structure (optional) and affinity files written under output_dir."""
        # 1. locate the Boltz run folder under out_dir
        root_dirs = [
            d
            for d in os.listdir(output_dir)
            if os.path.isdir(os.path.join(output_dir, d))
        ]
        if not root_dirs:
            return {"error": "No Boltz run folder found under out_dir"}
        # With several candidates, take the lexicographically last name.
        run_root = os.path.join(output_dir, sorted(root_dirs)[-1])

        # 2. results live in predictions/<input_filename>
        prediction_folder = os.path.join(run_root, "predictions", input_filename)
        results = {}

        # 3. structure .cif (only the first model, only when requested)
        if arguments.get("return_structure", False):
            structure_file = os.path.join(
                prediction_folder, f"{input_filename}_model_0.cif"
            )
            if os.path.exists(structure_file):
                with open(structure_file, "r", encoding="utf-8") as f:
                    results["predicted_structure"] = f.read()
                results["structure_format"] = "cif"
            else:
                results["structure_error"] = (
                    f"Missing {os.path.basename(structure_file)}"
                )

        # 4. affinity .json
        affinity_file = os.path.join(
            prediction_folder, f"affinity_{input_filename}.json"
        )
        if os.path.exists(affinity_file):
            with open(affinity_file, "r", encoding="utf-8") as f:
                results["affinity_prediction"] = json.load(f)
        else:
            results["affinity_error"] = f"Missing {os.path.basename(affinity_file)}"
        return results

    def run(self, arguments: dict | None = None, timeout: int = 1200) -> dict:
        """
        Executes the Boltz prediction.

        Args:
            arguments (dict): A dictionary containing the necessary inputs.
                - protein_sequence (str): The amino acid sequence of the protein.
                - ligands (list[dict]): A list of ligands, each with an 'id' and
                  a 'smiles' or 'ccd' key. The first one is the affinity binder.
                - constraints (list[dict], optional): Covalent bonds or other constraints.
                - templates (list[dict], optional): Structural templates.
                - return_structure (bool, optional): When true, include the
                  predicted structure file contents in the result.
                - use_potentials (bool, optional): When true, pass `--use_potentials`.
                - other optional boltz CLI flags: 'recycling_steps',
                  'diffusion_samples', 'sampling_steps', 'step_scale'.
            timeout (int): The maximum time in seconds to wait for the Boltz command to complete.

        Returns:
            dict: Either {'error': ...}, or a dictionary with
            'affinity_prediction' (or 'affinity_error') and, when
            'return_structure' is set, 'predicted_structure' and
            'structure_format' (or 'structure_error').

        Raises:
            ValueError: If the ligand list is missing or malformed.
            subprocess.CalledProcessError: If `boltz predict` exits non-zero.
            subprocess.TimeoutExpired: If the command exceeds `timeout`.
        """
        arguments = arguments or {}
        if not arguments.get("protein_sequence"):
            return {"error": "The 'protein_sequence' parameter is required."}

        # Build (and thereby validate) the input first so malformed arguments
        # fail before any temporary files are created.
        yaml_data = self._build_yaml_input(arguments)

        input_filename = "boltz_input"
        # Everything lives in a temporary directory, so results must be read
        # into memory before the `with` block exits and the directory is removed.
        with tempfile.TemporaryDirectory() as temp_dir:
            input_yaml_path = os.path.join(temp_dir, f"{input_filename}.yaml")
            output_dir = os.path.join(temp_dir, "results")
            os.makedirs(output_dir, exist_ok=True)

            # safe_dump emits only standard YAML tags (tuples become plain
            # lists), so caller-supplied constraints/templates never produce
            # Python-specific tags in the input file.
            with open(input_yaml_path, "w", encoding="utf-8") as f:
                yaml.safe_dump(yaml_data, f, sort_keys=False)

            command = self._build_command(arguments, input_yaml_path, output_dir)

            # Execute the Boltz command
            subprocess.run(
                command,
                capture_output=True,
                text=True,
                timeout=timeout,
                check=True,  # Will raise CalledProcessError on non-zero exit codes
            )

            return self._collect_results(arguments, output_dir, input_filename)
if __name__ == "__main__":
    # Demo run: a 20-residue sequence (each standard amino acid once) with a
    # single benzene ligand given as SMILES; the structure file is not returned.
    demo_arguments = {
        "protein_sequence": "ACDEFGHIKLMNPQRSTVWY",
        "ligands": [{"id": "LIG1", "smiles": "C1=CC=CC=C1"}],
        "use_potentials": False,
        "diffusion_samples": 1,
        "return_structure": False,
    }
    docking_tool = Boltz2DockingTool(tool_config={})
    pprint.pprint(docking_tool.run(demo_arguments))