Source code for tooluniverse.ncbi_nucleotide_tool
"""
NCBI Nucleotide Search Tool
This tool provides search capabilities for NCBI's Nucleotide database (GenBank/EMBL/RefSeq)
using E-utilities (esearch + efetch). Allows searching by organism name, gene name, keywords,
and retrieving accession numbers.
"""
from typing import Dict, Any
from .ncbi_eutils_tool import NCBIEUtilsTool
from .tool_registry import register_tool
[docs]
@register_tool("NCBINucleotideSearchTool")
class NCBINucleotideSearchTool(NCBIEUtilsTool):
    """
    NCBI Nucleotide Database Search Tool using E-utilities.

    Searches GenBank/EMBL/RefSeq for DNA/RNA sequences by organism, gene, keywords.

    The tool is driven through ``run``, which dispatches on the ``operation``
    argument to one of three handlers: ``search`` (esearch), ``fetch_accession``
    (efetch, accession numbers only) and ``fetch_sequence`` (efetch, sequence
    text). Every handler returns a dict carrying a ``status`` key that is
    either ``"success"`` or ``"error"``; failures are reported through that
    dict rather than raised.

    HTTP access goes through ``self._make_request``, which is not defined in
    this class and is therefore expected to come from the base class. This
    code relies on it returning a dict with ``status``, ``data`` and ``url``
    keys.
    """

    # Maps the ``seq_type`` argument to the Entrez term appended to the query.
    _SEQ_TYPE_TERMS = {
        "complete_genome": "complete genome[Title]",
        "mrna": "mRNA[Filter]",
        "refseq": "RefSeq[Filter]",
    }

    def __init__(self, tool_config):
        """Initialise the base tool and target the ``nucleotide`` database."""
        super().__init__(tool_config)
        self.db = "nucleotide"  # Database name for E-utilities

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the nucleotide search with given arguments.

        Args:
            arguments: Request parameters. ``operation`` is required and must
                be ``"search"``, ``"fetch_accession"`` or ``"fetch_sequence"``;
                the remaining keys are interpreted by the selected handler.

        Returns:
            The handler's result dict, or an error dict when ``operation`` is
            missing or not recognised.
        """
        operation = arguments.get("operation")
        if not operation:
            return {"status": "error", "error": "Missing required parameter: operation"}

        if operation == "search":
            return self._search_nucleotide(arguments)
        if operation == "fetch_accession":
            return self._fetch_accession(arguments)
        if operation == "fetch_sequence":
            return self._fetch_sequence(arguments)

        return {
            "status": "error",
            "error": f"Unknown operation: {operation}",
        }

    def _build_search_term(self, arguments: Dict[str, Any]) -> str:
        """Build NCBI search term from arguments.

        Structured filters (``organism``, ``gene``, ``strain``, ``keywords``,
        ``seq_type``) are each turned into a field-tagged term and combined
        with ``AND``. The free-text ``query`` is used verbatim only when no
        structured filter produced a term; when filters are present, ``query``
        is ignored.

        Args:
            arguments: Request parameters as passed to ``run``.

        Returns:
            The search term, or an empty string when no criteria were given.
        """
        terms = []

        # Organism filter
        if arguments.get("organism"):
            terms.append(f"{arguments['organism']}[Organism]")
        # Gene name filter
        if arguments.get("gene"):
            terms.append(f"{arguments['gene']}[Gene]")
        # Strain filter
        if arguments.get("strain"):
            terms.append(f"{arguments['strain']}[Strain]")
        # Keywords/title search
        if arguments.get("keywords"):
            terms.append(f"{arguments['keywords']}[Title]")

        # Sequence type filter. Unrecognised values are ignored; the
        # isinstance check keeps an unhashable value from raising in the
        # dictionary lookup.
        seq_type = arguments.get("seq_type")
        if isinstance(seq_type, str) and seq_type in self._SEQ_TYPE_TERMS:
            terms.append(self._SEQ_TYPE_TERMS[seq_type])

        # Combine terms with AND
        if terms:
            return " AND ".join(f"({term})" for term in terms)

        # Free text query (only when no specific filters were provided).
        # ``or ""`` guarantees a string even if ``query`` is present but None.
        return arguments.get("query") or ""

    def _search_nucleotide(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Search NCBI nucleotide database and return UIDs.

        Uses esearch to find matching records.

        Args:
            arguments: Request parameters. Search criteria are read by
                ``_build_search_term``; ``limit`` (default 20) is sent as
                ``retmax`` and ``sort`` (default ``"relevance"``) as ``sort``.

        Returns:
            On success, a dict with ``data`` (``uids``, ``count``,
            ``returned``, ``search_term``), ``total_count`` and ``url``.
            Otherwise an error dict; any exception raised along the way is
            converted into one.
        """
        try:
            # Build search term
            search_term = self._build_search_term(arguments)
            if not search_term:
                return {
                    "status": "error",
                    "error": "No search criteria provided. Use organism, gene, keywords, or query.",
                }

            # Build esearch parameters
            params = {
                "db": self.db,
                "term": search_term,
                "retmode": "json",
                "retmax": arguments.get("limit", 20),
                "sort": arguments.get("sort", "relevance"),
            }

            # Make request
            result = self._make_request("/esearch.fcgi", params)
            if result["status"] == "error":
                return result

            # Extract UIDs from esearch response
            data = result.get("data", {})
            if not isinstance(data, dict):
                return {
                    "status": "error",
                    "error": "Unexpected response format from NCBI",
                }

            esearch_result = data.get("esearchresult", {})
            uids = esearch_result.get("idlist", [])
            # Total number of matches reported by NCBI, which can exceed the
            # number of UIDs actually returned in this page.
            count = int(esearch_result.get("count", 0))

            return {
                "status": "success",
                "data": {
                    "uids": uids,
                    "count": count,
                    "returned": len(uids),
                    "search_term": search_term,
                },
                "total_count": count,
                "url": result.get("url"),
            }
        except Exception as e:
            return {"status": "error", "error": f"Search failed: {str(e)}"}

    def _fetch_accession(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Fetch accession numbers for given UIDs.

        Uses efetch with rettype=acc to get just accession numbers.

        Args:
            arguments: Request parameters. ``uids`` is required and may be a
                single UID (string or integer) or an iterable of UIDs.

        Returns:
            On success, a dict with ``data`` (list of accession strings, one
            per non-blank response line), ``count`` and ``url``. Otherwise an
            error dict; any exception raised along the way is converted into
            one.
        """
        try:
            uids = arguments.get("uids", [])
            # A lone UID may arrive as a bare string or integer. Wrap it so
            # the join below iterates over UIDs rather than characters, and
            # does not fail on a non-iterable int.
            if isinstance(uids, (str, int)):
                uids = [uids]

            # Normalise to strings and drop blank entries so an empty string
            # is reported as a missing parameter instead of being sent to NCBI.
            uid_strings = [str(uid).strip() for uid in (uids or [])]
            uid_strings = [uid for uid in uid_strings if uid]
            if not uid_strings:
                return {"status": "error", "error": "Missing required parameter: uids"}

            # Build efetch parameters
            params = {
                "db": self.db,
                "id": ",".join(uid_strings),
                "rettype": "acc",
                "retmode": "text",
            }

            # Make request
            result = self._make_request("/efetch.fcgi", params)
            if result["status"] == "error":
                return result

            # Parse accession numbers from response
            data = result.get("data", "")
            if not isinstance(data, str):
                return {
                    "status": "error",
                    "error": "Unexpected response format from NCBI",
                }

            accessions = [
                line.strip() for line in data.strip().split("\n") if line.strip()
            ]
            return {
                "status": "success",
                "data": accessions,
                "count": len(accessions),
                "url": result.get("url"),
            }
        except Exception as e:
            return {"status": "error", "error": f"Fetch accessions failed: {str(e)}"}

    def _fetch_sequence(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Fetch sequence data for given accession(s).

        Uses efetch to retrieve sequences in specified format.

        Args:
            arguments: Request parameters. ``accession`` is required and may
                be a single identifier or a list/tuple of identifiers;
                ``format`` (default ``"fasta"``) is sent as ``rettype``.

        Returns:
            On success, a dict with ``data`` (the response payload),
            ``accession`` (exactly as supplied by the caller), ``format``,
            ``length`` (``len`` of the payload, i.e. the size of the returned
            text, not the number of bases) and ``url``. Otherwise an error
            dict; any exception raised along the way is converted into one.
        """
        try:
            accession = arguments.get("accession")
            if not accession:
                return {
                    "status": "error",
                    "error": "Missing required parameter: accession",
                }

            # efetch expects multiple ids as one comma-delimited string, so
            # collapse a list/tuple instead of passing the sequence through.
            if isinstance(accession, (list, tuple)):
                id_param = ",".join(str(item) for item in accession)
            else:
                id_param = accession

            # Get format (default to fasta)
            seq_format = arguments.get("format", "fasta")

            # Build efetch parameters
            params = {
                "db": self.db,
                "id": id_param,
                "rettype": seq_format,
                "retmode": "text",
            }

            # Make request
            result = self._make_request("/efetch.fcgi", params)
            if result["status"] == "error":
                return result

            # Return sequence data
            data = result.get("data", "")
            return {
                "status": "success",
                "data": data,
                "accession": accession,
                "format": seq_format,
                "length": len(data),
                "url": result.get("url"),
            }
        except Exception as e:
            return {"status": "error", "error": f"Fetch sequence failed: {str(e)}"}