Source code for tooluniverse.dna_tools

"""
DNA Design Tools

Provides local computational tools for DNA sequence analysis and design,
including restriction site detection, ORF finding, GC content calculation,
reverse complement, sequence translation, codon optimization, virtual digest,
primer design, Gibson assembly design, and Golden Gate assembly design.

No external API calls required. Pure Python implementation.
"""

import math
import re
from typing import Dict, Any, List, Optional
from .base_tool import BaseTool
from .tool_registry import register_tool

# Standard genetic codon table (NCBI Standard Code 1)
STANDARD_CODON_TABLE = {
    "TTT": "F",
    "TTC": "F",
    "TTA": "L",
    "TTG": "L",
    "CTT": "L",
    "CTC": "L",
    "CTA": "L",
    "CTG": "L",
    "ATT": "I",
    "ATC": "I",
    "ATA": "I",
    "ATG": "M",
    "GTT": "V",
    "GTC": "V",
    "GTA": "V",
    "GTG": "V",
    "TCT": "S",
    "TCC": "S",
    "TCA": "S",
    "TCG": "S",
    "CCT": "P",
    "CCC": "P",
    "CCA": "P",
    "CCG": "P",
    "ACT": "T",
    "ACC": "T",
    "ACA": "T",
    "ACG": "T",
    "GCT": "A",
    "GCC": "A",
    "GCA": "A",
    "GCG": "A",
    "TAT": "Y",
    "TAC": "Y",
    "TAA": "*",
    "TAG": "*",
    "CAT": "H",
    "CAC": "H",
    "CAA": "Q",
    "CAG": "Q",
    "AAT": "N",
    "AAC": "N",
    "AAA": "K",
    "AAG": "K",
    "GAT": "D",
    "GAC": "D",
    "GAA": "E",
    "GAG": "E",
    "TGT": "C",
    "TGC": "C",
    "TGA": "*",
    "TGG": "W",
    "CGT": "R",
    "CGC": "R",
    "CGA": "R",
    "CGG": "R",
    "AGT": "S",
    "AGC": "S",
    "AGA": "R",
    "AGG": "R",
    "GGT": "G",
    "GGC": "G",
    "GGA": "G",
    "GGG": "G",
}

COMPLEMENT = str.maketrans("ATGCatgc", "TACGtacg")

# Codon frequency tables: species -> amino_acid -> preferred codon
# Source: Codon Usage Database (highest-frequency codon per amino acid per species)
CODON_FREQ_TABLES = {
    "human": {
        "A": "GCC",
        "R": "AGG",
        "N": "AAC",
        "D": "GAC",
        "C": "TGC",
        "Q": "CAG",
        "E": "GAG",
        "G": "GGC",
        "H": "CAC",
        "I": "ATC",
        "L": "CTG",
        "K": "AAG",
        "M": "ATG",
        "F": "TTC",
        "P": "CCC",
        "S": "AGC",
        "T": "ACC",
        "W": "TGG",
        "Y": "TAC",
        "V": "GTG",
        "*": "TGA",
    },
    "ecoli": {
        "A": "GCG",
        "R": "CGT",
        "N": "AAC",
        "D": "GAT",
        "C": "TGC",
        "Q": "CAG",
        "E": "GAA",
        "G": "GGC",
        "H": "CAC",
        "I": "ATC",
        "L": "CTG",
        "K": "AAA",
        "M": "ATG",
        "F": "TTT",
        "P": "CCG",
        "S": "AGC",
        "T": "ACC",
        "W": "TGG",
        "Y": "TAT",
        "V": "GTG",
        "*": "TAA",
    },
    "mouse": {
        "A": "GCC",
        "R": "AGG",
        "N": "AAC",
        "D": "GAC",
        "C": "TGC",
        "Q": "CAG",
        "E": "GAG",
        "G": "GGC",
        "H": "CAC",
        "I": "ATC",
        "L": "CTG",
        "K": "AAG",
        "M": "ATG",
        "F": "TTC",
        "P": "CCC",
        "S": "AGC",
        "T": "ACC",
        "W": "TGG",
        "Y": "TAC",
        "V": "GTG",
        "*": "TGA",
    },
    "yeast": {
        "A": "GCT",
        "R": "AGA",
        "N": "AAT",
        "D": "GAT",
        "C": "TGT",
        "Q": "CAA",
        "E": "GAA",
        "G": "GGT",
        "H": "CAT",
        "I": "ATT",
        "L": "TTG",
        "K": "AAG",
        "M": "ATG",
        "F": "TTT",
        "P": "CCA",
        "S": "TCT",
        "T": "ACT",
        "W": "TGG",
        "Y": "TAT",
        "V": "GTT",
        "*": "TAA",
    },
}

# Codon adaptation index reference values for each species (relative adaptiveness)
# Simplified: ratio of codon usage frequency to max frequency in synonymous group
CAI_REFERENCE = {
    "human": {
        "GCC": 1.0,
        "GCT": 0.71,
        "GCA": 0.54,
        "GCG": 0.11,
        "AGG": 1.0,
        "AGA": 0.84,
        "CGC": 0.77,
        "CGG": 0.64,
        "CGT": 0.44,
        "CGA": 0.34,
        "AAC": 1.0,
        "AAT": 0.75,
        "GAC": 1.0,
        "GAT": 0.81,
        "TGC": 1.0,
        "TGT": 0.72,
        "CAG": 1.0,
        "CAA": 0.37,
        "GAG": 1.0,
        "GAA": 0.69,
        "GGC": 1.0,
        "GGG": 0.77,
        "GGA": 0.74,
        "GGT": 0.56,
        "CAC": 1.0,
        "CAT": 0.69,
        "ATC": 1.0,
        "ATT": 0.70,
        "ATA": 0.34,
        "CTG": 1.0,
        "CTC": 0.58,
        "TTG": 0.39,
        "CTT": 0.36,
        "CTA": 0.19,
        "TTA": 0.12,
        "AAG": 1.0,
        "AAA": 0.74,
        "ATG": 1.0,
        "TTC": 1.0,
        "TTT": 0.72,
        "CCC": 1.0,
        "CCT": 0.78,
        "CCA": 0.73,
        "CCG": 0.20,
        "AGC": 1.0,
        "TCC": 0.87,
        "TCT": 0.76,
        "AGT": 0.65,
        "TCA": 0.58,
        "TCG": 0.18,
        "ACC": 1.0,
        "ACA": 0.79,
        "ACT": 0.73,
        "ACG": 0.25,
        "TGG": 1.0,
        "TAC": 1.0,
        "TAT": 0.72,
        "GTG": 1.0,
        "GTC": 0.62,
        "GTT": 0.56,
        "GTA": 0.28,
    },
    "ecoli": {
        "GCG": 1.0,
        "GCC": 0.53,
        "GCT": 0.49,
        "GCA": 0.40,
        "CGT": 1.0,
        "CGC": 0.97,
        "CGA": 0.38,
        "CGG": 0.35,
        "AGA": 0.10,
        "AGG": 0.06,
        "AAC": 1.0,
        "AAT": 0.49,
        "GAT": 1.0,
        "GAC": 0.53,
        "TGC": 1.0,
        "TGT": 0.52,
        "CAG": 1.0,
        "CAA": 0.35,
        "GAA": 1.0,
        "GAG": 0.50,
        "GGC": 1.0,
        "GGT": 0.75,
        "GGA": 0.28,
        "GGG": 0.24,
        "CAC": 1.0,
        "CAT": 0.69,
        "ATC": 1.0,
        "ATT": 0.92,
        "ATA": 0.11,
        "CTG": 1.0,
        "TTA": 0.20,
        "TTG": 0.18,
        "CTT": 0.15,
        "CTC": 0.12,
        "CTA": 0.06,
        "AAA": 1.0,
        "AAG": 0.25,
        "ATG": 1.0,
        "TTT": 1.0,
        "TTC": 0.56,
        "CCG": 1.0,
        "CCA": 0.29,
        "CCT": 0.24,
        "CCC": 0.16,
        "AGC": 1.0,
        "TCT": 0.85,
        "TCC": 0.66,
        "TCA": 0.58,
        "TCG": 0.54,
        "AGT": 0.42,
        "ACC": 1.0,
        "ACT": 0.69,
        "ACA": 0.40,
        "ACG": 0.37,
        "TGG": 1.0,
        "TAT": 1.0,
        "TAC": 0.57,
        "GTG": 1.0,
        "GTT": 0.68,
        "GTC": 0.38,
        "GTA": 0.33,
    },
    "mouse": {
        "GCC": 1.0,
        "GCT": 0.73,
        "GCA": 0.51,
        "GCG": 0.10,
        "AGG": 1.0,
        "AGA": 0.83,
        "CGC": 0.74,
        "CGG": 0.63,
        "CGT": 0.42,
        "CGA": 0.32,
        "AAC": 1.0,
        "AAT": 0.73,
        "GAC": 1.0,
        "GAT": 0.80,
        "TGC": 1.0,
        "TGT": 0.70,
        "CAG": 1.0,
        "CAA": 0.36,
        "GAG": 1.0,
        "GAA": 0.67,
        "GGC": 1.0,
        "GGG": 0.75,
        "GGA": 0.72,
        "GGT": 0.54,
        "CAC": 1.0,
        "CAT": 0.67,
        "ATC": 1.0,
        "ATT": 0.68,
        "ATA": 0.32,
        "CTG": 1.0,
        "CTC": 0.56,
        "TTG": 0.38,
        "CTT": 0.34,
        "CTA": 0.18,
        "TTA": 0.11,
        "AAG": 1.0,
        "AAA": 0.72,
        "ATG": 1.0,
        "TTC": 1.0,
        "TTT": 0.70,
        "CCC": 1.0,
        "CCT": 0.76,
        "CCA": 0.71,
        "CCG": 0.18,
        "AGC": 1.0,
        "TCC": 0.85,
        "TCT": 0.74,
        "AGT": 0.63,
        "TCA": 0.56,
        "TCG": 0.17,
        "ACC": 1.0,
        "ACA": 0.77,
        "ACT": 0.71,
        "ACG": 0.23,
        "TGG": 1.0,
        "TAC": 1.0,
        "TAT": 0.70,
        "GTG": 1.0,
        "GTC": 0.60,
        "GTT": 0.54,
        "GTA": 0.26,
    },
    "yeast": {
        "GCT": 1.0,
        "GCC": 0.62,
        "GCA": 0.55,
        "GCG": 0.12,
        "AGA": 1.0,
        "AGG": 0.34,
        "CGT": 0.25,
        "CGC": 0.10,
        "CGA": 0.09,
        "CGG": 0.06,
        "AAT": 1.0,
        "AAC": 0.75,
        "GAT": 1.0,
        "GAC": 0.65,
        "TGT": 1.0,
        "TGC": 0.77,
        "CAA": 1.0,
        "CAG": 0.69,
        "GAA": 1.0,
        "GAG": 0.69,
        "GGT": 1.0,
        "GGA": 0.62,
        "GGG": 0.21,
        "GGC": 0.20,
        "CAT": 1.0,
        "CAC": 0.64,
        "ATT": 1.0,
        "ATC": 0.71,
        "ATA": 0.27,
        "TTG": 1.0,
        "TTA": 0.50,
        "CTT": 0.23,
        "CTC": 0.07,
        "CTG": 0.06,
        "CTA": 0.06,
        "AAG": 1.0,
        "AAA": 0.79,
        "ATG": 1.0,
        "TTT": 1.0,
        "TTC": 0.80,
        "CCA": 1.0,
        "CCT": 0.62,
        "CCC": 0.34,
        "CCG": 0.16,
        "TCT": 1.0,
        "AGT": 0.78,
        "TCA": 0.74,
        "TCC": 0.62,
        "AGC": 0.27,
        "TCG": 0.23,
        "ACT": 1.0,
        "ACA": 0.78,
        "ACC": 0.60,
        "ACG": 0.22,
        "TGG": 1.0,
        "TAT": 1.0,
        "TAC": 0.76,
        "GTT": 1.0,
        "GTC": 0.54,
        "GTG": 0.46,
        "GTA": 0.40,
    },
}

# SantaLucia 1998 nearest-neighbor parameters: dinucleotide -> (dH kcal/mol, dS cal/mol/K)
NN_PARAMS = {
    "AA": (-7.9, -22.2),
    "AT": (-7.2, -20.4),
    "TA": (-7.2, -21.3),
    "CA": (-8.5, -22.7),
    "GT": (-8.4, -22.4),
    "CT": (-7.8, -21.0),
    "GA": (-8.2, -22.2),
    "CG": (-10.6, -27.2),
    "GC": (-9.8, -24.4),
    "GG": (-8.0, -19.9),
}

# NEB common restriction enzymes: enzyme name -> recognition sequence
NEB_ENZYMES = {
    "EcoRI": "GAATTC",
    "BamHI": "GGATCC",
    "HindIII": "AAGCTT",
    "NcoI": "CCATGG",
    "NdeI": "CATATG",
    "XhoI": "CTCGAG",
    "XbaI": "TCTAGA",
    "SalI": "GTCGAC",
    "SmaI": "CCCGGG",
    "KpnI": "GGTACC",
    "SacI": "GAGCTC",
    "ClaI": "ATCGAT",
    "SpeI": "ACTAGT",
    "NotI": "GCGGCCGC",
    "PstI": "CTGCAG",
    "EcoRV": "GATATC",
    "NheI": "GCTAGC",
    "MluI": "ACGCGT",
    "ApaI": "GGGCCC",
    "SphI": "GCATGC",
    "BglII": "AGATCT",
    "AgeI": "ACCGGT",
    "AscI": "GGCGCGCC",
    "PacI": "TTAATTAA",
    "SfiI": "GGCCNNNNNGGCC",
}


[docs] @register_tool("DNATool") class DNATool(BaseTool): """ Local DNA sequence analysis and design tools. No external API calls. Provides: - Restriction site detection (NEB enzyme library) - Open reading frame (ORF) finding - GC content calculation - Reverse complement generation - DNA to protein translation """
[docs] def __init__(self, tool_config: Dict[str, Any]): super().__init__(tool_config) self.parameter = tool_config.get("parameter", {}) self.required = self.parameter.get("required", [])
[docs] def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Execute DNA analysis tool with given arguments.""" operation = arguments.get("operation") if not operation: return {"status": "error", "error": "Missing required parameter: operation"} operation_handlers = { "find_restriction_sites": self._find_restriction_sites, "find_orfs": self._find_orfs, "calculate_gc_content": self._calculate_gc_content, "reverse_complement": self._reverse_complement, "translate_sequence": self._translate_sequence, "codon_optimize": self._codon_optimize, "virtual_digest": self._virtual_digest, "primer_design": self._primer_design, "gibson_design": self._gibson_design, "golden_gate_design": self._golden_gate_design, } handler = operation_handlers.get(operation) if not handler: return { "status": "error", "error": f"Unknown operation: {operation}", "available_operations": list(operation_handlers.keys()), } try: return handler(arguments) except Exception as e: return {"status": "error", "error": f"Operation failed: {str(e)}"}
[docs] def _validate_dna_sequence(self, seq: str) -> Optional[str]: """Validate DNA sequence, returns error message or None if valid.""" valid_chars = set("ATGCNatgcn") invalid = set(seq) - valid_chars if invalid: return f"Invalid DNA characters: {invalid}. Only A, T, G, C, N allowed." return None
[docs] def _find_restriction_sites(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Find restriction enzyme recognition sites in a DNA sequence.""" sequence = arguments.get("sequence", "") if not sequence: return {"status": "error", "error": "sequence is required"} sequence = sequence.upper().replace(" ", "").replace("\n", "") error = self._validate_dna_sequence(sequence) if error: return {"status": "error", "error": error} enzymes_requested = arguments.get("enzymes") if enzymes_requested: if isinstance(enzymes_requested, str): enzymes_requested = [enzymes_requested] enzyme_dict = { name: seq for name, seq in NEB_ENZYMES.items() if name in enzymes_requested } not_found = [e for e in enzymes_requested if e not in NEB_ENZYMES] if not_found: return { "status": "error", "error": f"Unknown enzymes: {not_found}. Available: {sorted(NEB_ENZYMES.keys())}", } else: enzyme_dict = NEB_ENZYMES results = {} for enzyme_name, recognition_seq in enzyme_dict.items(): if "N" in recognition_seq: pattern = recognition_seq.replace("N", "[ATGC]") positions = [ m.start() + 1 for m in re.finditer(f"(?={pattern})", sequence) ] else: positions = [] start = 0 while True: pos = sequence.find(recognition_seq, start) if pos == -1: break positions.append(pos + 1) # 1-based start = pos + 1 if positions: results[enzyme_name] = { "recognition_sequence": recognition_seq, "cut_sites": positions, "num_cuts": len(positions), } return { "status": "success", "data": { "sequence_length": len(sequence), "enzymes_with_sites": results, "enzymes_cutting": sorted(results.keys()), "num_enzymes_cutting": len(results), }, }
[docs] def _find_orfs(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Find open reading frames (ORFs) in a DNA sequence.""" sequence = arguments.get("sequence", "") if not sequence: return {"status": "error", "error": "sequence is required"} sequence = sequence.upper().replace(" ", "").replace("\n", "") error = self._validate_dna_sequence(sequence) if error: return {"status": "error", "error": error} min_length = arguments.get("min_length", 100) # minimum nt length strand = arguments.get("strand", "both") # "forward", "reverse", "both" _STOP_SET = {"TAA", "TAG", "TGA"} def find_orfs_in_sequence(seq: str, is_reverse: bool = False) -> List[Dict]: """Scan all three reading frames using a state-machine (open/closed).""" orfs = [] seq_len = len(seq) strand_label = "-" if is_reverse else "+" for frame_offset in (0, 1, 2): orf_open = False orf_start_idx = 0 pos = frame_offset while pos + 3 <= seq_len: codon = seq[pos : pos + 3] if not orf_open: if codon == "ATG": orf_open = True orf_start_idx = pos else: if codon in _STOP_SET: orf_nt_len = pos + 3 - orf_start_idx if orf_nt_len >= min_length: if is_reverse: coord_start = seq_len - (pos + 3) + 1 coord_end = seq_len - orf_start_idx else: coord_start = orf_start_idx + 1 # 1-based coord_end = pos + 3 orfs.append( { "start": coord_start, "end": coord_end, "length_nt": orf_nt_len, "length_aa": orf_nt_len // 3 - 1, "frame": frame_offset + 1, "strand": strand_label, "sequence": seq[orf_start_idx : pos + 3], } ) orf_open = False pos += 3 return orfs all_orfs = [] if strand in ("forward", "both"): all_orfs.extend(find_orfs_in_sequence(sequence, is_reverse=False)) if strand in ("reverse", "both"): rev_comp = sequence.translate(COMPLEMENT)[::-1] all_orfs.extend(find_orfs_in_sequence(rev_comp, is_reverse=True)) all_orfs.sort(key=lambda x: x["length_nt"], reverse=True) return { "status": "success", "data": { "sequence_length": len(sequence), "min_length_nt": min_length, "num_orfs_found": len(all_orfs), "orfs": all_orfs[:50], }, }
[docs] def _calculate_gc_content(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Calculate GC content and nucleotide composition of a DNA sequence.""" sequence = arguments.get("sequence", "") if not sequence: return {"status": "error", "error": "sequence is required"} sequence = sequence.upper().replace(" ", "").replace("\n", "") error = self._validate_dna_sequence(sequence) if error: return {"status": "error", "error": error} total = len(sequence) if total == 0: return {"status": "error", "error": "Empty sequence"} counts = { "A": sequence.count("A"), "T": sequence.count("T"), "G": sequence.count("G"), "C": sequence.count("C"), "N": sequence.count("N"), } gc_count = counts["G"] + counts["C"] at_count = counts["A"] + counts["T"] effective_total = total - counts["N"] gc_content = (gc_count / effective_total * 100) if effective_total > 0 else 0 return { "status": "success", "data": { "gc_content_percent": round(gc_content, 2), "at_content_percent": round( (at_count / effective_total * 100) if effective_total > 0 else 0, 2 ), "nucleotide_counts": counts, "sequence_length": total, "effective_length": effective_total, "interpretation": ( "High GC" if gc_content > 60 else "Low GC" if gc_content < 40 else "Normal GC" ), }, }
[docs] def _reverse_complement(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Generate the reverse complement of a DNA sequence.""" sequence = arguments.get("sequence", "") if not sequence: return {"status": "error", "error": "sequence is required"} sequence = sequence.upper().replace(" ", "").replace("\n", "") error = self._validate_dna_sequence(sequence) if error: return {"status": "error", "error": error} complement_map = str.maketrans("ATGCNatgcn", "TACGNtacgn") rev_comp = sequence.translate(complement_map)[::-1] return { "status": "success", "data": { "original": sequence, "reverse_complement": rev_comp, "length": len(sequence), }, }
[docs] def _translate_sequence(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Translate a DNA sequence to protein using the standard codon table.""" sequence = arguments.get("sequence", "") if not sequence: return {"status": "error", "error": "sequence is required"} sequence = sequence.upper().replace(" ", "").replace("\n", "") error = self._validate_dna_sequence(sequence) if error: return {"status": "error", "error": error} codon_table_name = arguments.get("codon_table", "standard") if codon_table_name != "standard": return { "status": "error", "error": "Only 'standard' codon table is currently supported", } if len(sequence) % 3 != 0: trimmed_len = len(sequence) - (len(sequence) % 3) sequence_trimmed = sequence[:trimmed_len] warning = f"Sequence length {len(sequence)} is not divisible by 3; trimmed to {trimmed_len} nt" else: sequence_trimmed = sequence warning = None protein = [] stop_positions = [] for i in range(0, len(sequence_trimmed), 3): codon = sequence_trimmed[i : i + 3] aa = STANDARD_CODON_TABLE.get(codon, "X") if aa == "*": stop_positions.append(i // 3 + 1) protein.append("*") else: protein.append(aa) protein_seq = "".join(protein) if "*" in protein_seq: first_stop = protein_seq.index("*") protein_seq_no_stop = protein_seq[:first_stop] else: protein_seq_no_stop = protein_seq result = { "status": "success", "data": { "dna_sequence": sequence, "protein_sequence": protein_seq_no_stop, "full_translation": protein_seq, "protein_length_aa": len(protein_seq_no_stop), "stop_codons_found": len(stop_positions), "stop_codon_positions": stop_positions, "codon_table": codon_table_name, }, } if warning: result["data"]["warning"] = warning return result
[docs] def _codon_optimize(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Codon-optimize an amino acid sequence for expression in a target species.""" sequence = arguments.get("sequence", "") if not sequence: return {"status": "error", "error": "sequence is required"} sequence = sequence.upper().strip() species = (arguments.get("species") or "human").lower() if species not in CODON_FREQ_TABLES: return { "status": "error", "error": f"Unknown species: {species}. Available: {sorted(CODON_FREQ_TABLES.keys())}", } codon_table = CODON_FREQ_TABLES[species] cai_table = CAI_REFERENCE[species] valid_aa = set("ACDEFGHIKLMNPQRSTVWY*") invalid = set(sequence) - valid_aa if invalid: return { "status": "error", "error": f"Invalid amino acid characters: {invalid}. Use single-letter codes.", } dna_codons = [] cai_values = [] for aa in sequence: if aa == "*": codon = codon_table.get("*", "TAA") else: codon = codon_table.get(aa) if codon is None: return { "status": "error", "error": f"No codon found for amino acid: {aa}", } dna_codons.append(codon) cai_values.append(cai_table.get(codon, 1.0)) optimized_dna = "".join(dna_codons) length_bp = len(optimized_dna) gc = sum(1 for b in optimized_dna if b in "GC") gc_content = round(gc / length_bp * 100, 2) if length_bp > 0 else 0.0 if cai_values: log_sum = sum(math.log(v) for v in cai_values if v > 0) cai = round(math.exp(log_sum / len(cai_values)), 4) else: cai = 0.0 return { "status": "success", "data": { "optimized_dna": optimized_dna, "gc_content": gc_content, "cai": cai, "length_bp": length_bp, }, }
[docs] def _virtual_digest(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Perform a virtual restriction digest of a DNA sequence.""" sequence = arguments.get("sequence", "") if not sequence: return {"status": "error", "error": "sequence is required"} sequence = sequence.upper().replace(" ", "").replace("\n", "") error = self._validate_dna_sequence(sequence) if error: return {"status": "error", "error": error} enzymes_requested = arguments.get("enzymes") circular = bool(arguments.get("circular", False)) if enzymes_requested: if isinstance(enzymes_requested, str): enzymes_requested = [enzymes_requested] not_found = [e for e in enzymes_requested if e not in NEB_ENZYMES] if not_found: return { "status": "error", "error": f"Unknown enzymes: {not_found}. Available: {sorted(NEB_ENZYMES.keys())}", } enzyme_dict = {name: NEB_ENZYMES[name] for name in enzymes_requested} else: enzyme_dict = NEB_ENZYMES cut_sites_list = [] enzymes_used = [] for enzyme_name, recognition_seq in enzyme_dict.items(): if "N" in recognition_seq: pattern = recognition_seq.replace("N", "[ATGC]") positions = [m.start() for m in re.finditer(f"(?={pattern})", sequence)] else: positions = [] start = 0 while True: pos = sequence.find(recognition_seq, start) if pos == -1: break positions.append(pos) start = pos + 1 for pos in positions: cut_pos = pos + len(recognition_seq) cut_sites_list.append({"enzyme": enzyme_name, "position": cut_pos}) if positions: enzymes_used.append(enzyme_name) cut_sites_list.sort(key=lambda x: x["position"]) seq_len = len(sequence) fragments = [] if not cut_sites_list: fragments.append( { "sequence": sequence, "length": seq_len, "start": 0, "end": seq_len, } ) else: cut_positions = sorted(set(cs["position"] for cs in cut_sites_list)) if circular: boundaries = cut_positions for i in range(len(boundaries)): start = boundaries[i] end = boundaries[(i + 1) % len(boundaries)] if end > start: frag_seq = sequence[start:end] else: frag_seq = sequence[start:] + sequence[:end] fragments.append( { "sequence": frag_seq, "length": len(frag_seq), "start": start, "end": end, } ) else: starts = [0] + cut_positions ends = cut_positions + [seq_len] for s, e in zip(starts, ends): frag_seq = sequence[s:e] if frag_seq: fragments.append( { "sequence": frag_seq, "length": len(frag_seq), "start": s, "end": e, } ) return { "status": "success", "data": { "fragments": fragments, "n_fragments": len(fragments), "enzymes_used": sorted(enzymes_used), "cut_sites": cut_sites_list, }, }
[docs] def _calc_tm_nn(self, primer: str) -> float: """Calculate melting temperature using SantaLucia 1998 nearest-neighbor model.""" seq = primer.upper() n = len(seq) if n < 2: return 0.0 dH = 0.0 # kcal/mol dS = 0.0 # cal/mol/K # Initiation parameters (SantaLucia 1998) dH += 0.2 dS += -5.7 for i in range(n - 1): dinuc = seq[i : i + 2] params = NN_PARAMS.get(dinuc) if params is None: rev_dinuc = dinuc.translate(COMPLEMENT)[::-1] params = NN_PARAMS.get(rev_dinuc) if params: dH += params[0] dS += params[1] # Convert: Tm = dH*1000 / (dS + R*ln(CT/4)) - 273.15 # R = 1.987 cal/mol/K, CT = 250e-9 M (250 nM typical) R = 1.987 CT = 250e-9 dS_total = dS + R * math.log(CT / 4) if dS_total == 0: return 0.0 tm = (dH * 1000) / dS_total - 273.15 return round(tm, 1)
[docs] def _primer_design(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Design PCR primers for a target region using nearest-neighbor Tm calculation.""" sequence = arguments.get("sequence", "") if not sequence: return {"status": "error", "error": "sequence is required"} sequence = sequence.upper().replace(" ", "").replace("\n", "") error = self._validate_dna_sequence(sequence) if error: return {"status": "error", "error": error} seq_len = len(sequence) target_start = arguments.get("target_start") target_end = arguments.get("target_end") tm_target = float(arguments.get("tm_target") or 60.0) product_size_min = int(arguments.get("product_size_min") or 100) product_size_max = int(arguments.get("product_size_max") or 1000) if target_start is None: target_start = 0 if target_end is None: target_end = seq_len target_start = max(0, int(target_start)) target_end = min(seq_len, int(target_end)) if target_end - target_start < product_size_min: return { "status": "error", "error": f"Target region ({target_end - target_start} bp) is smaller than product_size_min ({product_size_min} bp)", } fwd_search_start = max(0, target_start - 50) fwd_search_end = min(seq_len - 18, target_start + 50) rev_search_start = max(18, target_end - 50) rev_search_end = min(seq_len, target_end + 50) complement_map = str.maketrans("ATGCNatgcn", "TACGNtacgn") def has_3prime_repeat(seq: str, max_repeat: int = 3) -> bool: """Check if 3' end has too many identical bases.""" if len(seq) < max_repeat + 1: return False tail = seq[-max_repeat:] return len(set(tail)) == 1 def gc_content(seq: str) -> float: gc = sum(1 for b in seq if b in "GC") return gc / len(seq) * 100 if len(seq) > 0 else 0 def score_primer(primer: str, tm: float) -> float: """Lower is better.""" return abs(tm - tm_target) best_fwd = None best_fwd_score = float("inf") for start in range(fwd_search_start, fwd_search_end + 1): for length in range(18, 26): end = start + length if end > seq_len: break primer = sequence[start:end] gc = gc_content(primer) if gc < 40 or gc > 60: continue if has_3prime_repeat(primer): continue tm = self._calc_tm_nn(primer) score = score_primer(primer, tm) if score < best_fwd_score: best_fwd_score = score best_fwd = { "sequence": primer, "tm": tm, "gc_content": round(gc, 1), "length": length, "start": start, } if not best_fwd: return { "status": "error", "error": "Could not design a suitable forward primer in the specified region", } best_rev = None best_rev_score = float("inf") for end in range(rev_search_end, rev_search_start - 1, -1): for length in range(18, 26): start = end - length if start < 0: break primer_template = sequence[start:end] # Reverse primer is reverse complement of template rev_primer = primer_template.translate(complement_map)[::-1] gc = gc_content(rev_primer) if gc < 40 or gc > 60: continue if has_3prime_repeat(rev_primer): continue tm = self._calc_tm_nn(rev_primer) score = score_primer(rev_primer, tm) if score < best_rev_score: best_rev_score = score best_rev = { "sequence": rev_primer, "tm": tm, "gc_content": round(gc, 1), "length": length, "end": end, } if not best_rev: return { "status": "error", "error": "Could not design a suitable reverse primer in the specified region", } product_size = best_rev["end"] - best_fwd["start"] if product_size < product_size_min or product_size > product_size_max: return { "status": "error", "error": f"Designed product size ({product_size} bp) is outside the range [{product_size_min}, {product_size_max}] bp", } return { "status": "success", "data": { "forward_primer": best_fwd, "reverse_primer": best_rev, "product_size": product_size, }, }
[docs] def _gibson_design(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Design Gibson Assembly overlaps for a set of DNA fragments.""" fragments = arguments.get("fragments") if not fragments or not isinstance(fragments, list) or len(fragments) < 2: return { "status": "error", "error": "fragments must be a list of at least 2 DNA sequences", } overlap_length = int(arguments.get("overlap_length") or 20) if overlap_length < 1: return {"status": "error", "error": "overlap_length must be at least 1"} for i, frag in enumerate(fragments): frag_upper = frag.upper().replace(" ", "").replace("\n", "") err = self._validate_dna_sequence(frag_upper) if err: return {"status": "error", "error": f"Fragment {i + 1}: {err}"} if len(frag_upper) <= overlap_length: return { "status": "error", "error": f"Fragment {i + 1} (length {len(frag_upper)}) must be longer than overlap_length ({overlap_length})", } fragments_clean = [ f.upper().replace(" ", "").replace("\n", "") for f in fragments ] n = len(fragments_clean) assembly_fragments = [] for i, frag in enumerate(fragments_clean): prev_frag = fragments_clean[(i - 1) % n] next_frag = fragments_clean[(i + 1) % n] left_overlap = prev_frag[-overlap_length:] right_overlap = next_frag[:overlap_length] with_overlaps = left_overlap + frag + right_overlap assembly_fragments.append( { "name": f"Fragment_{i + 1}", "original_sequence": frag, "with_overlaps": with_overlaps, "left_overlap": left_overlap, "right_overlap": right_overlap, } ) return { "status": "success", "data": { "assembly_fragments": assembly_fragments, "n_fragments": n, "overlap_length": overlap_length, }, }
[docs] def _golden_gate_design(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Design Golden Gate Assembly parts with BsaI or BbsI overhangs.""" parts = arguments.get("parts") if not parts or not isinstance(parts, list) or len(parts) < 2: return { "status": "error", "error": "parts must be a list of at least 2 DNA sequences", } enzyme = (arguments.get("enzyme") or "BsaI").upper() if enzyme not in ("BSAI", "BBSI"): return {"status": "error", "error": "enzyme must be 'BsaI' or 'BbsI'"} enzyme_display = "BsaI" if enzyme == "BSAI" else "BbsI" for i, part in enumerate(parts): part_upper = part.upper().replace(" ", "").replace("\n", "") err = self._validate_dna_sequence(part_upper) if err: return {"status": "error", "error": f"Part {i + 1}: {err}"} parts_clean = [p.upper().replace(" ", "").replace("\n", "") for p in parts] n_parts = len(parts_clean) # BsaI: recognition GGTCTC(1), cuts 1 nt away on top, 5 nt away on bottom # Creating: GGTCTCN[4bp overhang] -- part -- NGAGACC (reverse complement BsaI site) # BbsI: recognition GAAGAC(2), cuts 2 nt away on top, 6 nt away on bottom # Creating: GAAGACNN[4bp overhang] -- part -- NNGTCTTC complement_map = str.maketrans("ATGC", "TACG") def rev_comp(seq: str) -> str: return seq.translate(complement_map)[::-1] # Precomputed non-palindromic 4-mers (overhang != rev_comp(overhang)) candidate_overhangs = [ "AAAC", "AAAG", "AAAT", "AACG", "AACT", "AAGC", "AAGT", "AATC", "AATG", "ACAG", "ACAT", "ACCG", "ACCT", "ACGA", "ACGC", "ACGG", "ACGT", "ACTA", "ACTC", "ACTG", "AGAC", "AGAG", "AGAT", "AGCA", "AGCC", "AGCG", "AGCT", "AGGA", "AGGC", "AGGG", "AGGT", "AGTA", "AGTC", "AGTG", "ATAC", "ATAG", "ATCA", "ATCC", "ATCG", "ATGA", "ATGC", "ATGG", "ATGT", "ATTA", "ATTC", "ATTG", ] non_palindromic = [oh for oh in candidate_overhangs if oh != rev_comp(oh)] if len(non_palindromic) < n_parts + 1: return { "status": "error", "error": f"Cannot generate enough unique overhangs for {n_parts} parts", } overhangs = non_palindromic[: n_parts + 1] if enzyme == "BSAI": # BsaI site: GGTCTCN where N is 1 bp spacer before the overhang # Forward site: GGTCTCA[overhang] # Reverse site (at end of part): rev_comp([overhang]TGAGACC) = GGTCTCA[rev_comp(overhang)] fwd_site_prefix = "GGTCTCA" # BsaI recognition + A spacer else: # BbsI site: GAAGACNN where NN is 2 bp spacer fwd_site_prefix = "GAAGACAA" parts_with_overhangs = [] for i, part in enumerate(parts_clean): left_oh = overhangs[i] right_oh = overhangs[i + 1] rev_right_oh = rev_comp(right_oh) full_sequence = ( fwd_site_prefix + left_oh + part + rev_right_oh + rev_comp(fwd_site_prefix) ) parts_with_overhangs.append( { "name": f"Part_{i + 1}", "sequence": part, "left_overhang": left_oh, "right_overhang": right_oh, "full_sequence": full_sequence, } ) protocol_note = ( f"Digest with {enzyme_display} and T4 ligase. " f"Cycle 25-30 times: 37°C 1 min (digest) / 16°C 1 min (ligate). " f"Final: 50°C 5 min, 80°C 10 min. " f"Overhangs are 4 bp non-palindromic sequences ensuring directional assembly." ) return { "status": "success", "data": { "parts_with_overhangs": parts_with_overhangs, "overhangs": overhangs, "enzyme": enzyme_display, "protocol_note": protocol_note, }, }