Source code for tooluniverse.blast_tool

from typing import Any, Dict
from .base_tool import BaseTool
from .tool_registry import register_tool

# Optional dependency - Biopython
try:
    from Bio.Blast import NCBIWWW, NCBIXML
    from Bio.Seq import Seq

    BIOPYTHON_AVAILABLE = True
except ImportError:
    BIOPYTHON_AVAILABLE = False


[docs] @register_tool("NCBIBlastTool") class NCBIBlastTool(BaseTool):
[docs] def __init__(self, tool_config: Dict): super().__init__(tool_config) self.timeout = 300 # BLAST can take a long time self.max_wait_time = 600 # Maximum wait time for results
[docs] def _parse_blast_results(self, blast_xml: str) -> Dict[str, Any]: """Parse BLAST XML results into structured data""" try: from io import StringIO blast_record = NCBIXML.read(StringIO(blast_xml)) results = { "query_id": blast_record.query_id, "query_length": blast_record.query_length, "database": blast_record.database, "algorithm": blast_record.application, "alignments": [], } for alignment in blast_record.alignments: alignment_data = { "hit_id": getattr(alignment, "hit_id", "unknown"), "hit_def": getattr(alignment, "hit_def", "unknown"), "hit_length": getattr(alignment, "hit_length", 0), "hsps": [], } for hsp in alignment.hsps: hsp_data = { "score": getattr(hsp, "score", 0), "bits": getattr(hsp, "bits", 0), "expect": getattr(hsp, "expect", 0), "identities": getattr(hsp, "identities", 0), "positives": getattr(hsp, "positives", 0), "gaps": getattr(hsp, "gaps", 0), "align_length": getattr(hsp, "align_length", 0), "query_start": getattr(hsp, "query_start", 0), "query_end": getattr(hsp, "query_end", 0), "hit_start": getattr(hsp, "hit_start", 0), "hit_end": getattr(hsp, "hit_end", 0), "query": getattr(hsp, "query", ""), "match": getattr(hsp, "match", ""), "sbjct": getattr(hsp, "sbjct", ""), } alignment_data["hsps"].append(hsp_data) results["alignments"].append(alignment_data) return results except Exception as e: return { "error": f"Failed to parse BLAST results: {str(e)}", "raw_xml": ( blast_xml[:1000] + "..." if len(blast_xml) > 1000 else blast_xml ), }
[docs] def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Execute BLAST search using NCBI Web service""" # Check if Biopython is available if not BIOPYTHON_AVAILABLE: return { "status": "error", "error": "Biopython is required for BLAST tools. Install with: pip install biopython", } try: sequence = arguments.get("sequence", "") # Determine blast_type from tool name or arguments tool_name = self.tool_config.get("name", "") if "protein" in tool_name.lower(): default_blast_type = "blastp" default_database = "nr" else: default_blast_type = "blastn" default_database = "nt" blast_type = arguments.get("blast_type", default_blast_type) database = arguments.get("database", default_database) expect = arguments.get("expect", 10.0) hitlist_size = arguments.get("hitlist_size", 50) if not sequence: return { "status": "error", "error": "Missing required parameter: sequence", } # Validate sequence try: seq_obj = Seq(sequence) if len(seq_obj) < 10: return { "status": "error", "error": "Sequence too short (minimum 10 residues)", } except Exception as e: return { "status": "error", "error": f"Invalid sequence format: {str(e)}", } # Perform BLAST search result_handle = NCBIWWW.qblast( blast_type, database, sequence, expect=expect, hitlist_size=hitlist_size, format_type="XML", ) # Read results blast_xml = result_handle.read() result_handle.close() # Parse results parsed_results = self._parse_blast_results(blast_xml) if "error" in parsed_results: return { "status": "error", "error": parsed_results["error"], "raw_data": parsed_results.get("raw_xml", ""), } return { "status": "success", "data": parsed_results, "query_sequence": sequence, "blast_type": blast_type, "database": database, "hit_count": len(parsed_results["alignments"]), } except Exception as e: return { "status": "error", "error": f"BLAST search failed: {str(e)}", }