Source code for tooluniverse.uniprot_ref_tool
# uniprot_ref_tool.py
"""
UniProt Reference Datasets API tool for ToolUniverse.
Provides access to UniProt's controlled vocabularies and reference datasets:
- Diseases: Curated disease definitions with OMIM/MeSH cross-references
- Keywords: Standardized annotation terms for protein function/process/location
- Proteomes: Reference proteome data with chromosome-level protein counts
API: https://rest.uniprot.org/
No authentication required.
"""
import requests
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool
UNIPROT_BASE_URL = "https://rest.uniprot.org"
[docs]
@register_tool("UniProtRefTool")
class UniProtRefTool(BaseTool):
"""
Tool for querying UniProt reference datasets (diseases, keywords, proteomes).
UniProt maintains curated controlled vocabularies that standardize protein
annotations across the database. This tool provides access to:
- Disease vocabulary: 6K+ curated disease entries with cross-refs to OMIM, MeSH, MedGen
- Keyword vocabulary: 1.2K+ standardized terms for biological process, function, etc.
- Proteomes: Reference proteome summaries with assembly and component data
No authentication required.
"""
[docs]
def __init__(self, tool_config: Dict[str, Any]):
super().__init__(tool_config)
self.timeout = tool_config.get("timeout", 30)
fields = tool_config.get("fields", {})
self.endpoint = fields.get("endpoint", "search_diseases")
[docs]
def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Execute the UniProt reference dataset API call."""
try:
return self._query(arguments)
except requests.exceptions.Timeout:
return {"error": f"UniProt API timed out after {self.timeout}s"}
except requests.exceptions.ConnectionError:
return {"error": "Failed to connect to UniProt API"}
except requests.exceptions.HTTPError as e:
if e.response.status_code == 404:
return {"error": "Resource not found in UniProt"}
return {"error": f"UniProt API HTTP error: {e.response.status_code}"}
except Exception as e:
return {"error": f"Unexpected error querying UniProt: {str(e)}"}
[docs]
def _query(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Route to appropriate UniProt reference endpoint."""
if self.endpoint == "search_diseases":
return self._search_diseases(arguments)
elif self.endpoint == "get_disease":
return self._get_disease(arguments)
elif self.endpoint == "search_keywords":
return self._search_keywords(arguments)
elif self.endpoint == "get_keyword":
return self._get_keyword(arguments)
elif self.endpoint == "get_proteome":
return self._get_proteome(arguments)
elif self.endpoint == "search_proteomes":
return self._search_proteomes(arguments)
else:
return {"error": f"Unknown endpoint: {self.endpoint}"}
[docs]
def _search_diseases(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Search UniProt controlled disease vocabulary."""
query = arguments.get("query", "")
if not query:
return {"error": "query parameter is required"}
size = min(arguments.get("size") or 10, 25)
url = f"{UNIPROT_BASE_URL}/diseases/search"
params = {"query": query, "size": size, "format": "json"}
response = requests.get(url, params=params, timeout=self.timeout)
response.raise_for_status()
data = response.json()
diseases = []
for r in data.get("results", []):
xrefs = []
for xr in r.get("crossReferences", []):
xrefs.append(
{
"database": xr.get("databaseType"),
"id": xr.get("id"),
}
)
diseases.append(
{
"id": r.get("id"),
"name": r.get("name"),
"acronym": r.get("acronym"),
"definition": r.get("definition"),
"alternative_names": r.get("alternativeNames", []),
"cross_references": xrefs,
"reviewed_protein_count": r.get("statistics", {}).get(
"reviewedProteinCount"
),
}
)
return {
"data": diseases,
"metadata": {
"source": "UniProt Controlled Disease Vocabulary",
"query": query,
"total_results": len(diseases),
},
}
[docs]
def _get_disease(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Get a specific disease entry by UniProt disease ID."""
disease_id = arguments.get("disease_id", "")
if not disease_id:
return {"error": "disease_id parameter is required"}
url = f"{UNIPROT_BASE_URL}/diseases/{disease_id}"
params = {"format": "json"}
response = requests.get(url, params=params, timeout=self.timeout)
response.raise_for_status()
r = response.json()
xrefs = []
for xr in r.get("crossReferences", []):
xrefs.append(
{
"database": xr.get("databaseType"),
"id": xr.get("id"),
"properties": xr.get("properties", []),
}
)
return {
"data": {
"id": r.get("id"),
"name": r.get("name"),
"acronym": r.get("acronym"),
"definition": r.get("definition"),
"alternative_names": r.get("alternativeNames", []),
"cross_references": xrefs,
"reviewed_protein_count": r.get("statistics", {}).get(
"reviewedProteinCount"
),
"unreviewed_protein_count": r.get("statistics", {}).get(
"unreviewedProteinCount"
),
},
"metadata": {
"source": "UniProt Controlled Disease Vocabulary",
},
}
[docs]
def _search_keywords(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Search UniProt keyword controlled vocabulary."""
query = arguments.get("query", "")
if not query:
return {"error": "query parameter is required"}
size = min(arguments.get("size") or 10, 25)
url = f"{UNIPROT_BASE_URL}/keywords/search"
params = {"query": query, "size": size, "format": "json"}
response = requests.get(url, params=params, timeout=self.timeout)
response.raise_for_status()
data = response.json()
keywords = []
for r in data.get("results", []):
kw = r.get("keyword", {})
keywords.append(
{
"id": kw.get("id"),
"name": kw.get("name"),
"category": r.get("category", {}).get("name")
if isinstance(r.get("category"), dict)
else r.get("category"),
"definition": r.get("definition"),
"reviewed_protein_count": r.get("statistics", {}).get(
"reviewedProteinCount"
),
}
)
return {
"data": keywords,
"metadata": {
"source": "UniProt Keyword Controlled Vocabulary",
"query": query,
"total_results": len(keywords),
},
}
[docs]
def _get_keyword(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Get a specific keyword entry by UniProt keyword ID."""
keyword_id = arguments.get("keyword_id", "")
if not keyword_id:
return {"error": "keyword_id parameter is required"}
url = f"{UNIPROT_BASE_URL}/keywords/{keyword_id}"
params = {"format": "json"}
response = requests.get(url, params=params, timeout=self.timeout)
response.raise_for_status()
r = response.json()
kw = r.get("keyword", {})
parents = []
for p in r.get("parents", []):
pkw = p.get("keyword", {})
parents.append(
{
"id": pkw.get("id"),
"name": pkw.get("name"),
}
)
go_mappings = []
for g in r.get("geneOntologies", []):
go_mappings.append(
{
"go_id": g.get("goId"),
"name": g.get("name"),
}
)
return {
"data": {
"id": kw.get("id"),
"name": kw.get("name"),
"category": r.get("category", {}).get("name")
if isinstance(r.get("category"), dict)
else r.get("category"),
"definition": r.get("definition"),
"parents": parents,
"go_mappings": go_mappings,
"reviewed_protein_count": r.get("statistics", {}).get(
"reviewedProteinCount"
),
"unreviewed_protein_count": r.get("statistics", {}).get(
"unreviewedProteinCount"
),
},
"metadata": {
"source": "UniProt Keyword Controlled Vocabulary",
},
}
[docs]
def _get_proteome(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Get reference proteome information by UniProt proteome ID."""
proteome_id = arguments.get("proteome_id", "")
if not proteome_id:
return {"error": "proteome_id parameter is required"}
url = f"{UNIPROT_BASE_URL}/proteomes/{proteome_id}"
params = {"format": "json"}
response = requests.get(url, params=params, timeout=self.timeout)
response.raise_for_status()
r = response.json()
tax = r.get("taxonomy", {})
components = []
total_proteins = 0
for c in r.get("components", [])[:30]:
count = c.get("proteinCount", 0) or 0
total_proteins += count
genome_acc = None
for xr in c.get("proteomeCrossReferences", []):
if xr.get("database") == "GenomeAccession":
genome_acc = xr.get("id")
break
components.append(
{
"name": c.get("name"),
"protein_count": count,
"genome_accession": genome_acc,
}
)
return {
"data": {
"id": r.get("id"),
"description": (r.get("description") or "")[:500],
"organism": {
"scientific_name": tax.get("scientificName"),
"common_name": tax.get("commonName"),
"taxon_id": tax.get("taxonId"),
},
"proteome_type": r.get("proteomeType"),
"components": components,
"total_protein_count": total_proteins,
},
"metadata": {
"source": "UniProt Proteomes",
},
}
[docs]
def _search_proteomes(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Search UniProt reference proteomes."""
query = arguments.get("query", "")
if not query:
return {"error": "query parameter is required"}
size = min(arguments.get("size") or 10, 25)
url = f"{UNIPROT_BASE_URL}/proteomes/search"
params = {"query": query, "size": size, "format": "json"}
response = requests.get(url, params=params, timeout=self.timeout)
response.raise_for_status()
data = response.json()
proteomes = []
for r in data.get("results", []):
tax = r.get("taxonomy", {})
total_proteins = sum(
(c.get("proteinCount") or 0) for c in r.get("components", [])
)
proteomes.append(
{
"id": r.get("id"),
"organism": tax.get("scientificName"),
"common_name": tax.get("commonName"),
"taxon_id": tax.get("taxonId"),
"proteome_type": r.get("proteomeType"),
"protein_count": total_proteins,
}
)
return {
"data": proteomes,
"metadata": {
"source": "UniProt Proteomes",
"query": query,
"total_results": len(proteomes),
},
}