Source code for tooluniverse.dfam_tool

# dfam_tool.py
"""
Dfam tool for ToolUniverse.

Dfam is a comprehensive database of transposable element (TE) and repetitive
DNA families with consensus sequences, profile HMMs, and genome annotations.
Maintained by the Institute for Systems Biology and partners.

API: https://www.dfam.org/api/
No authentication required.
"""

from typing import Dict, Any
from urllib.parse import quote

import requests

from .base_tool import BaseTool
from .tool_registry import register_tool

# Root of the Dfam REST API; endpoint paths such as "/families" are appended to it.
DFAM_BASE_URL = "https://www.dfam.org/api"


@register_tool("DfamTool")
class DfamTool(BaseTool):
    """
    Tool for querying Dfam transposable element / repeat element database.

    Supports:
    - Search TE families by name prefix, clade (taxon ID), and repeat type
    - Get detailed family info including consensus sequence and classification
    - Get TE annotation hits for genomic regions

    The operation an instance performs is fixed at construction time by
    ``tool_config["fields"]["endpoint"]``. Failures are reported to the
    caller as ``{"error": "<message>"}`` dictionaries instead of being raised.

    No authentication required.
    """

    # Upper bound on rows returned by family searches and annotation lookups.
    MAX_RESULTS = 50
    # Page size used when the caller does not supply a ``limit``.
    DEFAULT_LIMIT = 20

    # Keys copied verbatim from each annotation hit into the tool output.
    _HIT_FIELDS = (
        "accession",
        "query",
        "type",
        "strand",
        "bit_score",
        "e_value",
        "seq_start",
        "seq_end",
        "ali_start",
        "ali_end",
        "model_start",
        "model_end",
    )

    def __init__(self, tool_config: Dict[str, Any]):
        """Store the request timeout and the endpoint this instance serves.

        Args:
            tool_config: Tool configuration. ``timeout`` (default 30) and
                ``fields.endpoint`` (default ``"search_families"``) are read
                here; the whole mapping is also passed to ``BaseTool``.
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 30)
        # "fields" may be present but null in a config; treat that as absent.
        fields = tool_config.get("fields") or {}
        self.endpoint = fields.get("endpoint", "search_families")

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the Dfam API call.

        Args:
            arguments: Endpoint-specific arguments; ``None`` is treated as an
                empty mapping.

        Returns:
            The endpoint result (``{"data": ..., "metadata": ...}``) or an
            ``{"error": ...}`` dictionary. No exception escapes this method.
        """
        try:
            return self._query(arguments or {})
        except requests.exceptions.Timeout:
            return {"error": f"Dfam API timed out after {self.timeout}s"}
        except requests.exceptions.ConnectionError:
            return {"error": "Failed to connect to Dfam API"}
        except requests.exceptions.HTTPError as e:
            status = e.response.status_code if e.response is not None else "unknown"
            if status == 404:
                return {
                    "error": "Resource not found in Dfam. Check the accession or query."
                }
            return {"error": f"Dfam API HTTP {status}"}
        except Exception as e:
            # Tool boundary: convert anything else into an error payload.
            return {"error": f"Unexpected error: {str(e)}"}

    def _query(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Route to appropriate endpoint.

        Returns an ``{"error": ...}`` dictionary when ``self.endpoint`` is not
        one of ``search_families``, ``get_family`` or ``get_annotations``.
        """
        handlers = {
            "search_families": self._search_families,
            "get_family": self._get_family,
            "get_annotations": self._get_annotations,
        }
        handler = handlers.get(self.endpoint)
        if handler is None:
            return {"error": f"Unknown endpoint: {self.endpoint}"}
        return handler(arguments)

    def _fetch_json(self, url: str, params: Any = None) -> Any:
        """GET ``url`` with the configured timeout and return the decoded JSON.

        Raises:
            requests.exceptions.HTTPError: On a 4xx/5xx response.
            requests.exceptions.RequestException: On transport failures.
        """
        response = requests.get(url, params=params, timeout=self.timeout)
        response.raise_for_status()
        return response.json()

    def _search_families(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search Dfam TE families by name prefix, clade, or repeat type.

        Args:
            arguments: May contain ``name_prefix``, ``clade``, ``repeat_type``
                and ``limit``. Empty/missing filters are not sent. ``limit``
                defaults to ``DEFAULT_LIMIT`` and is clamped to
                ``1..MAX_RESULTS``.

        Returns:
            ``{"data": [family summaries], "metadata": {...}}`` or an
            ``{"error": ...}`` dictionary when ``limit`` is not an integer.
        """
        params: Dict[str, Any] = {"format": "summary"}

        # (tool argument name, Dfam query parameter name)
        filters = (
            ("name_prefix", "name_prefix"),
            ("clade", "clade"),
            ("repeat_type", "type"),
        )
        for arg_name, api_name in filters:
            value = arguments.get(arg_name)
            if value:
                params[api_name] = value

        raw_limit = arguments.get("limit")
        if raw_limit is None:
            raw_limit = self.DEFAULT_LIMIT
        try:
            limit = int(raw_limit)
        except (TypeError, ValueError):
            return {
                "error": f"limit must be an integer between 1 and {self.MAX_RESULTS}."
            }
        params["limit"] = max(1, min(limit, self.MAX_RESULTS))

        data = self._fetch_json(f"{DFAM_BASE_URL}/families", params=params)

        # "results" may be absent or null; both mean "no matches".
        families = [
            {
                "accession": fam.get("accession"),
                "name": fam.get("name"),
                "title": fam.get("title"),
                "description": fam.get("description"),
                "length": fam.get("length"),
                "repeat_type": fam.get("repeat_type_name"),
                "repeat_subtype": fam.get("repeat_subtype_name"),
                "classification": fam.get("classification"),
            }
            for fam in (data.get("results") or [])
        ]

        return {
            "data": families,
            "metadata": {
                "source": "Dfam (dfam.org)",
                "total_count": data.get("total_count", len(families)),
                "returned": len(families),
            },
        }

    def _get_family(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get detailed info for a specific Dfam TE family.

        Args:
            arguments: Must contain a non-empty ``accession``.

        Returns:
            ``{"data": {family details}, "metadata": {...}}`` or an
            ``{"error": ...}`` dictionary when ``accession`` is missing.
        """
        accession = str(arguments.get("accession") or "").strip()
        if not accession:
            return {"error": "accession is required (e.g., 'DF000000003' for AluSc)."}

        # Percent-encode so characters such as "/", "?" or "#" in caller input
        # cannot change which API path is requested.
        url = f"{DFAM_BASE_URL}/families/{quote(accession, safe='')}"
        data = self._fetch_json(url)

        citations = [
            {
                "pmid": c.get("pmid"),
                "title": c.get("title"),
                "authors": c.get("authors"),
            }
            for c in (data.get("citations") or [])
        ]

        return {
            "data": {
                "accession": data.get("accession"),
                "name": data.get("name"),
                "title": data.get("title"),
                "description": data.get("description"),
                "length": data.get("length"),
                "classification": data.get("classification"),
                "repeat_type": data.get("repeat_type_name"),
                "repeat_subtype": data.get("repeat_subtype_name"),
                "consensus_sequence": data.get("consensus_sequence"),
                "author": data.get("author"),
                "date_created": data.get("date_created"),
                "date_modified": data.get("date_modified"),
                "curation_state": data.get("curation_state_name"),
                "clades": data.get("clades") or [],
                "citations": citations,
                "aliases": data.get("aliases") or [],
            },
            "metadata": {
                "source": "Dfam (dfam.org)",
            },
        }

    def _get_annotations(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get TE annotation hits for a genomic region.

        Args:
            arguments: Requires ``chrom``, ``start`` and ``end``. Optional
                ``assembly`` (default ``"hg38"``) and ``nrph`` (default
                ``True``; the strings "false", "no", "0" and "" count as
                false).

        Returns:
            ``{"data": [hits], "metadata": {...}}`` with at most
            ``MAX_RESULTS`` hits (``metadata["total_hits"]`` holds the
            untruncated count), or an ``{"error": ...}`` dictionary when a
            required argument is missing or ``start``/``end`` is not an
            integer.
        """
        assembly = arguments.get("assembly") or "hg38"
        chrom = arguments.get("chrom", "")
        start = arguments.get("start")
        end = arguments.get("end")

        if not chrom or start is None or end is None:
            return {
                "error": "chrom, start, and end are required (e.g., chrom='chr1', start=10000, end=50000)."
            }

        try:
            start = int(start)
            end = int(end)
        except (TypeError, ValueError):
            return {"error": "start and end must be integers (e.g., start=10000, end=50000)."}

        nrph = arguments.get("nrph", True)
        if isinstance(nrph, str):
            # A string such as "false" is truthy; interpret its text instead.
            nrph = nrph.strip().lower() not in {"", "0", "false", "no"}

        params = {
            "assembly": assembly,
            "chrom": chrom,
            "start": start,
            "end": end,
            # Dfam API expects lowercase boolean strings
            "nrph": "true" if nrph else "false",
        }

        data = self._fetch_json(f"{DFAM_BASE_URL}/annotations", params=params)

        hits = data.get("hits") or []
        # Limit the number of hits to avoid huge responses.
        annotations = [
            {field: hit.get(field) for field in self._HIT_FIELDS}
            for hit in hits[: self.MAX_RESULTS]
        ]

        return {
            "data": annotations,
            "metadata": {
                "source": "Dfam (dfam.org)",
                "assembly": assembly,
                "region": f"{chrom}:{start}-{end}",
                "total_hits": len(hits),
                "returned": len(annotations),
            },
        }