# Source code for tooluniverse.uniref_tool
# uniref_tool.py
"""
UniProt UniRef Clusters API tool for ToolUniverse.
UniRef provides clustered sets of protein sequences at different levels
of sequence identity: UniRef100, UniRef90, and UniRef50. These clusters
group related sequences to reduce redundancy and improve search speed.
API: https://rest.uniprot.org/uniref/
No authentication required. Free public access.
"""
import requests
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool
# Base URL of the UniProt UniRef REST API; request paths are appended to it.
UNIREF_BASE_URL = "https://rest.uniprot.org/uniref"
[docs]
@register_tool("UniRefTool")
class UniRefTool(BaseTool):
    """
    Tool for querying UniProt UniRef protein sequence clusters.

    Supports:
    - Get cluster details by ID (UniRef90_XXXXX, UniRef50_XXXXX, UniRef100_XXXXX)
    - Search clusters by protein name, gene, or organism

    The operation performed is selected by ``fields.endpoint`` in the tool
    configuration ("get_cluster" or "search_clusters").

    No authentication required.
    """

    def __init__(self, tool_config: Dict[str, Any]):
        """Read the request timeout and the endpoint selector from the config.

        Args:
            tool_config: Tool configuration. ``timeout`` (default 30, passed
                as the ``requests`` timeout) and ``fields.endpoint``
                (default "get_cluster") are read here.
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 30)
        # ``or {}`` tolerates an explicit ``"fields": None`` in the config.
        fields = tool_config.get("fields") or {}
        self.endpoint = fields.get("endpoint", "get_cluster")

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the UniRef API call.

        Args:
            arguments: Call arguments for the configured endpoint. ``None``
                is treated as an empty dict.

        Returns:
            The endpoint's result dict, or ``{"error": <message>}`` when the
            request times out, cannot connect, returns an HTTP error status,
            or any other ``Exception`` is raised while handling it.
        """
        try:
            return self._query(arguments or {})
        except requests.exceptions.Timeout:
            return {"error": f"UniRef API timed out after {self.timeout}s"}
        except requests.exceptions.ConnectionError:
            return {"error": "Failed to connect to UniRef API"}
        except requests.exceptions.HTTPError as e:
            # Compare against None explicitly: a requests Response is falsy
            # for 4xx/5xx statuses, so a plain truthiness test would hide it.
            response = getattr(e, "response", None)
            status = response.status_code if response is not None else "unknown"
            return {"error": f"UniRef API HTTP error: {status}"}
        except Exception as e:
            return {"error": f"Unexpected error: {str(e)}"}

    def _query(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Route to the handler matching ``self.endpoint``.

        Returns:
            The handler's result, or an error dict for an unknown endpoint.
        """
        if self.endpoint == "get_cluster":
            return self._get_cluster(arguments)
        elif self.endpoint == "search_clusters":
            return self._search_clusters(arguments)
        else:
            return {"error": f"Unknown endpoint: {self.endpoint}"}

    def _get_cluster(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get UniRef cluster details by cluster ID.

        Args:
            arguments: Must contain ``cluster_id`` (a non-blank string such
                as 'UniRef90_P04637').

        Returns:
            ``{"data": {...}, "metadata": {...}}`` with the cluster summary,
            its representative member and sequence, or an error dict when
            ``cluster_id`` is missing, blank, or not a string.

        Raises:
            requests.exceptions.RequestException: On transport failures or a
                non-2xx response (converted to an error dict by ``run``).
        """
        from urllib.parse import quote

        raw_id = arguments.get("cluster_id")
        # Strip BEFORE validating so a whitespace-only ID is rejected instead
        # of producing a request against the bare collection URL.
        cluster_id = raw_id.strip() if isinstance(raw_id, str) else ""
        if not cluster_id:
            return {
                "error": "cluster_id parameter is required (e.g., 'UniRef90_P04637')"
            }

        # Percent-encode the ID so characters such as '/', '?' or '#' cannot
        # change which path or query string is actually requested.
        url = f"{UNIREF_BASE_URL}/{quote(cluster_id, safe='')}"
        response = requests.get(
            url, headers={"Accept": "application/json"}, timeout=self.timeout
        )
        response.raise_for_status()
        data = response.json()

        # Parse representative member
        rep = data.get("representativeMember") or {}
        representative = {
            "member_id": rep.get("memberId"),
            "member_id_type": rep.get("memberIdType"),
            "organism_name": rep.get("organismName"),
            "organism_tax_id": rep.get("organismTaxId"),
            "sequence_length": rep.get("sequenceLength"),
            "protein_name": rep.get("proteinName"),
            "accessions": rep.get("accessions", []),
        }

        # Sequence info
        seq_obj = rep.get("sequence") or {}
        sequence = seq_obj.get("value")
        sequence_length = seq_obj.get("length")

        # Common taxon
        common_taxon = data.get("commonTaxon") or {}

        return {
            "data": {
                "cluster_id": data.get("id"),
                "name": data.get("name"),
                "entry_type": data.get("entryType"),
                "member_count": data.get("memberCount"),
                "updated": data.get("updated"),
                "seed_id": data.get("seedId"),
                "common_taxon": {
                    "scientific_name": common_taxon.get("scientificName"),
                    "taxon_id": common_taxon.get("taxonId"),
                },
                "representative_member": representative,
                "sequence": sequence,
                # Fall back to the member-level length when the sequence
                # object does not carry one.
                "sequence_length": sequence_length
                or representative.get("sequence_length"),
            },
            "metadata": {
                "source": "UniProt UniRef",
                "cluster_id": cluster_id,
            },
        }

    def _search_clusters(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search UniRef clusters by protein name, gene, or organism.

        Args:
            arguments: ``query`` (required, non-blank string); optional
                ``cluster_type`` ("UniRef100", "UniRef90" or "UniRef50",
                matched case-insensitively, default "UniRef90"); optional
                ``size`` (default 10, clamped to the range 1..25).

        Returns:
            ``{"data": [...], "metadata": {...}}`` with one summary dict per
            returned cluster, or an error dict when ``query`` is missing or
            blank, or ``size`` is not an integer.

        Raises:
            requests.exceptions.RequestException: On transport failures or a
                non-2xx response (converted to an error dict by ``run``).
        """
        raw_query = arguments.get("query")
        query = raw_query.strip() if isinstance(raw_query, str) else ""
        if not query:
            return {
                "error": "query parameter is required (e.g., 'p53', 'insulin', 'kinase Homo sapiens')"
            }

        try:
            size = int(arguments.get("size") or 10)
        except (TypeError, ValueError):
            return {"error": "size parameter must be an integer between 1 and 25"}
        # Clamp on both sides: zero or negative sizes are not meaningful.
        size = max(1, min(size, 25))

        # Build the search query with identity filter
        # UniRef API uses decimal identity: 1.0 (UniRef100), 0.9 (UniRef90), 0.5 (UniRef50)
        identity_map = {
            "UniRef100": "1.0",
            "UniRef90": "0.9",
            "UniRef50": "0.5",
        }
        cluster_type = arguments.get("cluster_type") or "UniRef90"
        # Accept any letter case ("uniref50") and report the canonical name.
        canonical_names = {name.lower(): name for name in identity_map}
        cluster_type = canonical_names.get(
            str(cluster_type).strip().lower(), cluster_type
        )

        search_query = query
        # An unrecognised cluster_type keeps the previous best-effort
        # behaviour: the search runs without an identity filter.
        identity_val = identity_map.get(cluster_type)
        if identity_val:
            # Parenthesise the user query so the identity filter applies to
            # all of it; otherwise "a OR b AND identity:x" would only
            # constrain "b" because AND binds tighter than OR.
            search_query = f"({query}) AND identity:{identity_val}"

        url = f"{UNIREF_BASE_URL}/search"
        params = {
            "query": search_query,
            "size": size,
        }
        response = requests.get(
            url,
            params=params,
            headers={"Accept": "application/json"},
            timeout=self.timeout,
        )
        response.raise_for_status()
        data = response.json()

        results = []
        # ``or []`` also covers an explicit JSON null for "results".
        for cluster in data.get("results") or []:
            rep = cluster.get("representativeMember") or {}
            common_taxon = cluster.get("commonTaxon") or {}
            results.append(
                {
                    "cluster_id": cluster.get("id"),
                    "name": cluster.get("name"),
                    "entry_type": cluster.get("entryType"),
                    "member_count": cluster.get("memberCount"),
                    "updated": cluster.get("updated"),
                    "representative_member_id": rep.get("memberId"),
                    "representative_organism": rep.get("organismName"),
                    "representative_protein": rep.get("proteinName"),
                    "representative_sequence_length": rep.get("sequenceLength"),
                    "common_taxon": common_taxon.get("scientificName"),
                }
            )

        return {
            "data": results,
            "metadata": {
                "source": "UniProt UniRef",
                "query": query,
                "cluster_type": cluster_type,
                "returned": len(results),
            },
        }