Source code for tooluniverse.rfam_tool
"""
Rfam Database API Tool
This tool provides access to the Rfam database (v15.1, January 2026) containing
4,227 RNA families. Rfam provides multiple sequence alignments, consensus secondary
structures, covariance models, and annotations for non-coding RNA families.
"""
import requests
import time
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool
RFAM_BASE_URL = "https://rfam.org"
RFAM_BATCH_URL = "https://batch.rfam.org"
[docs]
@register_tool("RfamTool")
class RfamTool(BaseTool):
"""
Rfam Database API tool for RNA family data.
Provides access to:
- RNA family information and metadata
- Secondary structure diagrams
- Covariance models
- Sequence alignments (Stockholm, FASTA formats)
- Phylogenetic trees
- Sequence searches
"""
[docs]
def __init__(self, tool_config):
super().__init__(tool_config)
self.parameter = tool_config.get("parameter", {})
self.required = self.parameter.get("required", [])
[docs]
def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Execute the Rfam API tool with given arguments."""
# Validate required parameters
for param in self.required:
if param not in arguments or arguments[param] is None:
return {
"status": "error",
"error": f"Missing required parameter: {param}",
}
operation = arguments.get("operation")
if not operation:
return {"status": "error", "error": "Missing required parameter: operation"}
# Route to appropriate operation handler
operation_handlers = {
"get_family": self._get_family,
"get_family_accession": self._get_family_accession,
"get_family_id": self._get_family_id,
"get_covariance_model": self._get_covariance_model,
"get_alignment": self._get_alignment,
"get_tree_data": self._get_tree_data,
"get_sequence_regions": self._get_sequence_regions,
"get_structure_mapping": self._get_structure_mapping,
"search_sequence": self._search_sequence,
}
handler = operation_handlers.get(operation)
if not handler:
return {
"status": "error",
"error": f"Unknown operation: {operation}",
"available_operations": list(operation_handlers.keys()),
}
try:
return handler(arguments)
except Exception as e:
return {
"status": "error",
"error": f"Operation failed: {str(e)}",
"operation": operation,
}
[docs]
def _get_family(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Get RNA family information."""
family_id = arguments.get("family_id")
format_type = arguments.get("format", "json")
if not family_id:
return {
"status": "error",
"error": "family_id is required (RF accession or family name)",
}
# Build URL with content type
url = f"{RFAM_BASE_URL}/family/{family_id}"
headers = {}
if format_type == "json":
headers["Accept"] = "application/json"
elif format_type == "xml":
headers["Accept"] = "text/xml"
response = requests.get(url, headers=headers, timeout=30)
if response.status_code == 200:
if format_type == "json":
return {
"status": "success",
"data": response.json(),
"family_id": family_id,
}
else:
return {
"status": "success",
"data": response.text,
"format": format_type,
"family_id": family_id,
}
elif response.status_code == 404:
return {
"status": "error",
"error": f"Family {family_id} not found",
"message": "Check family ID/accession is correct",
}
else:
return {
"status": "error",
"error": f"API request failed with status {response.status_code}",
"detail": response.text[:500],
}
[docs]
def _get_family_accession(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Convert family ID to accession."""
family_id = arguments.get("family_id")
if not family_id:
return {"status": "error", "data": {"error": "family_id is required"}}
url = f"{RFAM_BASE_URL}/family/{family_id}/acc"
response = requests.get(url, timeout=30)
if response.status_code == 200:
result = {
"accession": response.text.strip(),
"family_id": family_id,
}
return {"status": "success", "data": result}
else:
return {
"status": "error",
"data": {
"error": f"Failed to get accession for {family_id}",
"detail": response.text[:500],
},
}
[docs]
def _get_family_id(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Convert family accession to ID."""
accession = arguments.get("accession")
if not accession:
return {
"status": "error",
"data": {"error": "accession is required (e.g., RF00360)"},
}
url = f"{RFAM_BASE_URL}/family/{accession}/id"
response = requests.get(url, timeout=30)
if response.status_code == 200:
result = {
"family_id": response.text.strip(),
"accession": accession,
}
return {"status": "success", "data": result}
else:
return {
"status": "error",
"data": {
"error": f"Failed to get ID for accession {accession}",
"detail": response.text[:500],
},
}
[docs]
def _get_covariance_model(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Get covariance model for RNA family."""
family_id = arguments.get("family_id")
if not family_id:
return {"status": "error", "data": {"error": "family_id is required"}}
url = f"{RFAM_BASE_URL}/family/{family_id}/cm"
response = requests.get(url, timeout=30)
if response.status_code == 200:
result = {
"covariance_model": response.text,
"family_id": family_id,
"format": "Infernal CM format",
}
return {"status": "success", "data": result}
else:
return {
"status": "error",
"data": {
"error": f"Failed to get covariance model for {family_id}",
"detail": response.text[:500],
},
}
[docs]
def _get_alignment(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Get sequence alignment for RNA family."""
family_id = arguments.get("family_id")
format_type = arguments.get("format", "stockholm")
gzip = arguments.get("gzip", False)
if not family_id:
return {"status": "error", "data": {"error": "family_id is required"}}
# Build URL
if format_type == "stockholm":
url = f"{RFAM_BASE_URL}/family/{family_id}/alignment"
else:
url = f"{RFAM_BASE_URL}/family/{family_id}/alignment/{format_type}"
if gzip:
url += "?gzip=1"
response = requests.get(url, timeout=30)
if response.status_code == 200:
result = {
"alignment": response.text
if not gzip
else response.content.decode("utf-8", errors="ignore"),
"format": format_type,
"family_id": family_id,
"compressed": gzip,
}
return {"status": "success", "data": result}
else:
return {
"status": "error",
"data": {
"error": f"Failed to get alignment for {family_id}",
"detail": response.text[:500],
},
}
[docs]
def _get_tree_data(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Get phylogenetic tree data in NHX format."""
family_id = arguments.get("family_id")
if not family_id:
return {"status": "error", "data": {"error": "family_id is required"}}
url = f"{RFAM_BASE_URL}/family/{family_id}/tree/"
response = requests.get(url, timeout=30)
if response.status_code == 200:
result = {
"tree_data": response.text,
"format": "NHX (New Hampshire eXtended)",
"family_id": family_id,
}
return {"status": "success", "data": result}
else:
return {
"status": "error",
"data": {
"error": f"Failed to get tree data for {family_id}",
"detail": response.text[:500],
},
}
[docs]
def _get_sequence_regions(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Get sequence regions for RNA family."""
family_id = arguments.get("family_id")
format_type = arguments.get("format", "text")
if not family_id:
return {"status": "error", "error": "family_id is required"}
url = f"{RFAM_BASE_URL}/family/{family_id}/regions"
headers = {}
if format_type == "json":
headers["Accept"] = "application/json"
elif format_type == "xml":
headers["Accept"] = "text/xml"
response = requests.get(url, headers=headers, timeout=30)
if response.status_code == 200:
if format_type == "json":
result = response.json()
result["family_id"] = family_id
return {"status": "success", "data": result}
else:
result = {
"regions": response.text,
"format": format_type,
"family_id": family_id,
}
return {"status": "success", "data": result}
elif response.status_code == 403:
return {
"status": "error",
"data": {
"error": "Too many regions to list for this family",
"message": "This family has a very large number of regions. Use API with pagination or download full data.",
},
}
else:
return {
"status": "error",
"data": {
"error": f"Failed to get regions for {family_id}",
"detail": response.text[:500],
},
}
[docs]
def _get_structure_mapping(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Get mapping between RNA family and PDB structures."""
family_id = arguments.get("family_id")
format_type = arguments.get("format", "json")
if not family_id:
return {"status": "error", "error": "family_id is required"}
url = f"{RFAM_BASE_URL}/family/{family_id}/structures"
headers = {}
if format_type == "json":
headers["Accept"] = "application/json"
elif format_type == "xml":
headers["Accept"] = "text/xml"
response = requests.get(url, headers=headers, timeout=30)
if response.status_code == 200:
if format_type == "json":
return {
"status": "success",
"data": response.json(),
"family_id": family_id,
}
else:
return {
"status": "success",
"structures": response.text,
"format": format_type,
"family_id": family_id,
}
else:
return {
"status": "error",
"error": f"Failed to get structure mapping for {family_id}",
"detail": response.text[:500],
}
[docs]
def _search_sequence(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Search RNA sequence against Rfam families."""
sequence = arguments.get("sequence")
if not sequence:
return {"status": "error", "error": "sequence is required"}
# Step 1: Submit search
try:
files = {"sequence_file": ("sequence.txt", sequence, "text/plain")}
headers = {"Accept": "application/json"}
submit_url = f"{RFAM_BATCH_URL}/submit-job"
submit_response = requests.post(
submit_url, files=files, headers=headers, timeout=30
)
if submit_response.status_code != 200:
return {
"status": "error",
"error": f"Failed to submit search: {submit_response.status_code}",
"detail": submit_response.text[:500],
}
submit_data = submit_response.json()
job_id = submit_data.get("jobId")
result_url = submit_data.get("resultURL")
if not job_id or not result_url:
return {
"status": "error",
"error": "Failed to get job ID from server",
"response": submit_data,
}
# Step 2: Poll for results
max_wait = arguments.get("max_wait_seconds", 120)
poll_interval = 5
elapsed = 0
while elapsed < max_wait:
time.sleep(poll_interval)
elapsed += poll_interval
result_response = requests.get(result_url, headers=headers, timeout=30)
if result_response.status_code == 200:
# Search complete
result = {
"job_id": job_id,
"results": result_response.json(),
"elapsed_seconds": elapsed,
}
return {"status": "success", "data": result}
elif result_response.status_code == 202:
# Still running
continue
elif result_response.status_code == 410:
return {
"status": "error",
"error": "Job was deleted",
"message": "Job may have had a problem. Contact Rfam help desk.",
}
elif result_response.status_code == 503:
return {
"status": "error",
"error": "Job is on hold",
"message": "Contact Rfam help desk for assistance.",
}
else:
return {
"status": "error",
"error": f"Unexpected status code: {result_response.status_code}",
"detail": result_response.text[:500],
}
# Timeout
return {
"status": "pending",
"message": f"Search is still running after {elapsed} seconds",
"job_id": job_id,
"result_url": result_url,
"instruction": f"Check {result_url} later for results",
}
except Exception as e:
return {"status": "error", "error": f"Search failed: {str(e)}"}