# Source code for tooluniverse.sgd_protein_tool
# sgd_protein_tool.py
"""
SGD (Saccharomyces Genome Database) protein-feature & literature tool.

Complements the existing SGDTool (gene overview, phenotype, GO, interaction,
regulation, sequence, disease) by exposing three SGD locus sub-resources that
were previously unwrapped:

  * protein_domain_details    -> mapped protein domains (Pfam, InterPro,
                                 SMART, PROSITE, CDD, Gene3D, SUPERFAMILY,
                                 PANTHER, PRINTS) with residue coordinates.
  * posttranslational_details -> curated post-translational modification
                                 sites (phosphorylation, ubiquitination,
                                 acetylation, ...) with residue + reference.
  * literature_details        -> categorized literature references
                                 (primary, review, interaction, phenotype,
                                 GO, disease, PTM, regulation, ...).

SGD webservice base: https://www.yeastgenome.org/backend

No authentication required. The {locus} path segment accepts an SGD ID
(e.g. S000001855), a systematic name (e.g. YFL039C), or a standard gene
name (e.g. ACT1) -- the backend resolves all three.
"""
from typing import Dict, Any, List
from urllib.parse import quote

import requests

from .base_tool import BaseTool
from .tool_registry import register_tool
# Root of the SGD webservice; locus sub-resources are addressed beneath it.
SGD_BASE_URL = "https://www.yeastgenome.org/backend"

# The default python-requests User-Agent is rejected by SGD's backend on some
# paths, whereas a browser-style value is accepted consistently.
_BROWSER_USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36"
)
_HEADERS = {"Accept": "application/json", "User-Agent": _BROWSER_USER_AGENT}

# Upper bound on references kept per literature category: a single ACT1 query
# yields 1000+ refs (~1 MB), so the payload is capped to stay bounded and
# useful instead of being a megabyte dump.
_MAX_REFS_PER_CATEGORY = 25
@register_tool("SGDProteinTool")
class SGDProteinTool(BaseTool):
    """
    Query SGD protein-domain, post-translational-modification, and literature
    sub-resources for a budding-yeast (S. cerevisiae) gene/locus.

    Dispatch is driven by ``fields.endpoint`` in the tool config; the runtime
    argument is always a single ``locus`` string. No authentication required.
    """

    # Supported ``fields.endpoint`` values mapped to the handler method name.
    _ENDPOINT_HANDLERS = {
        "protein_domain_details": "_protein_domains",
        "posttranslational_details": "_ptm",
        "literature_details": "_literature",
    }

    def __init__(self, tool_config: Dict[str, Any]):
        """
        Store the request timeout and the configured endpoint.

        Args:
            tool_config: Tool configuration. ``timeout`` (default 30) is
                passed to ``requests.get``; ``fields.endpoint`` selects the
                sub-resource handled by :meth:`run`.
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 30)
        # ``fields`` may be absent or explicitly null in the config.
        fields = tool_config.get("fields") or {}
        self.endpoint = fields.get("endpoint", "")

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Execute the SGD sub-resource call for ``arguments["locus"]``.

        Args:
            arguments: Runtime arguments; only the ``locus`` key is read.

        Returns:
            The handler's ``{"status": "success", ...}`` dict, or
            ``{"status": "error", "error": <message>}`` when the locus is
            missing/blank/not a string, the configured endpoint is unknown,
            or the request/parsing fails. Exceptions derived from
            ``Exception`` are converted into error dicts rather than raised.
        """
        try:
            locus = (arguments or {}).get("locus", "")
            if isinstance(locus, str):
                locus = locus.strip()
            # Reject blank values and anything that is not a string: the
            # locus is embedded in the request path, so it must be text.
            if not locus or not isinstance(locus, str):
                return {
                    "status": "error",
                    "error": (
                        "locus parameter is required (gene name e.g. 'ACT1', "
                        "systematic name e.g. 'YFL039C', or SGD ID e.g. "
                        "'S000001855')"
                    ),
                }

            # Guard the lookup so an unhashable misconfigured endpoint still
            # yields the "unknown endpoint" error instead of a TypeError.
            handler_name = (
                self._ENDPOINT_HANDLERS.get(self.endpoint)
                if isinstance(self.endpoint, str)
                else None
            )
            if handler_name is None:
                return {
                    "status": "error",
                    "error": f"Unknown SGD endpoint configured: {self.endpoint!r}",
                }
            return getattr(self, handler_name)(locus)
        except requests.exceptions.Timeout:
            return {
                "status": "error",
                "error": f"SGD API request timed out after {self.timeout} seconds",
            }
        except requests.exceptions.ConnectionError:
            return {
                "status": "error",
                "error": "Failed to connect to SGD API. Check network connectivity.",
            }
        except requests.exceptions.HTTPError as e:
            code = getattr(getattr(e, "response", None), "status_code", "unknown")
            hint = " (locus not found)" if code == 404 else ""
            return {"status": "error", "error": f"SGD API HTTP error: {code}{hint}"}
        except ValueError:
            return {
                "status": "error",
                "error": "SGD API returned a non-JSON response",
            }
        except Exception as e:  # noqa: BLE001 - never propagate to caller
            return {"status": "error", "error": f"Unexpected error querying SGD: {e}"}

    # ----------------------------- helpers ------------------------------- #

    def _get(self, locus: str, sub: str) -> Any:
        """
        Issue the GET for ``/locus/{locus}/{sub}`` and return the parsed JSON.

        The locus is percent-encoded (including ``/``) so caller-supplied
        text cannot add path segments, a query string, or a fragment.

        Raises:
            requests.exceptions.RequestException: on transport or HTTP error
                (``raise_for_status`` is applied to the response).
            ValueError: when the response body is not valid JSON.
        """
        url = f"{SGD_BASE_URL}/locus/{quote(locus, safe='')}/{sub}"
        resp = requests.get(url, headers=_HEADERS, timeout=self.timeout)
        resp.raise_for_status()
        return resp.json()

    @staticmethod
    def _as_dict(value: Any) -> Dict[str, Any]:
        """Return ``value`` when it is a dict, otherwise an empty dict."""
        return value if isinstance(value, dict) else {}

    @staticmethod
    def _dict_rows(payload: Any) -> List[dict]:
        """Return the dict items of a list payload; ``[]`` for non-lists."""
        if not isinstance(payload, list):
            return []
        return [row for row in payload if isinstance(row, dict)]

    @staticmethod
    def _locus_label(raw_list: List[dict]) -> Dict[str, Any]:
        """
        Pull the gene/systematic label from the first row that carries it.

        Returns a dict with ``gene_name``, ``systematic_name`` and
        ``sgd_link``; all three are ``None`` when no row has a non-empty
        ``locus`` mapping.
        """
        for row in raw_list:
            if not isinstance(row, dict):
                continue
            loc = row.get("locus")
            if isinstance(loc, dict) and loc:
                return {
                    "gene_name": loc.get("display_name"),
                    "systematic_name": loc.get("format_name"),
                    "sgd_link": loc.get("link"),
                }
        return {"gene_name": None, "systematic_name": None, "sgd_link": None}

    # ---------------------------- endpoints ------------------------------ #

    def _protein_domains(self, locus: str) -> Dict[str, Any]:
        """
        Fetch ``protein_domain_details`` and flatten each row to a domain dict.

        A description of ``None`` or ``"-"`` is reported as ``None``. Rows
        that are not dicts are skipped; a non-list payload yields no domains.
        """
        rows = self._dict_rows(self._get(locus, "protein_domain_details"))
        domains = []
        for row in rows:
            dom = self._as_dict(row.get("domain"))
            src = self._as_dict(row.get("source"))
            description = dom.get("description")
            domains.append(
                {
                    "accession": dom.get("display_name"),
                    "description": (
                        None if description in (None, "-") else description
                    ),
                    "source": src.get("display_name"),
                    "start": row.get("start"),
                    "end": row.get("end"),
                    "domain_link": dom.get("link"),
                }
            )
        return {
            "status": "success",
            "data": {
                **self._locus_label(rows),
                "domain_count": len(domains),
                "domains": domains,
            },
            "metadata": {
                "source": "SGD",
                "query": locus,
                "endpoint": "locus/protein_domain_details",
            },
        }

    def _ptm(self, locus: str) -> Dict[str, Any]:
        """
        Fetch ``posttranslational_details`` and flatten each row to a site dict.

        Rows that are not dicts are skipped; a non-list payload yields no
        sites.
        """
        rows = self._dict_rows(self._get(locus, "posttranslational_details"))
        sites = []
        for row in rows:
            ref = self._as_dict(row.get("reference"))
            sites.append(
                {
                    "modification": row.get("type"),
                    "residue": row.get("site_residue"),
                    "position": row.get("site_index"),
                    "reference": ref.get("display_name"),
                    "pubmed_id": ref.get("pubmed_id"),
                }
            )
        return {
            "status": "success",
            "data": {
                **self._locus_label(rows),
                "site_count": len(sites),
                "sites": sites,
            },
            "metadata": {
                "source": "SGD",
                "query": locus,
                "endpoint": "locus/posttranslational_details",
            },
        }

    def _literature(self, locus: str) -> Dict[str, Any]:
        """
        Fetch ``literature_details`` and summarize references per category.

        ``counts_by_category`` holds the full length of each category list,
        while ``references`` keeps at most ``_MAX_REFS_PER_CATEGORY`` entries
        per category. Categories whose value is not a list are ignored, as
        are reference entries that are not dicts.
        """
        payload = self._get(locus, "literature_details")
        if not isinstance(payload, dict):
            payload = {}
        counts: Dict[str, int] = {}
        references: Dict[str, List[dict]] = {}
        for category, refs in payload.items():
            if not isinstance(refs, list):
                continue
            counts[category] = len(refs)
            usable = [ref for ref in refs if isinstance(ref, dict)]
            references[category] = [
                {
                    "citation": ref.get("citation") or ref.get("display_name"),
                    "pubmed_id": ref.get("pubmed_id"),
                    "year": ref.get("year"),
                    "reference_link": ref.get("link"),
                }
                for ref in usable[:_MAX_REFS_PER_CATEGORY]
            ]
        return {
            "status": "success",
            "data": {
                "query": locus,
                "total_references": sum(counts.values()),
                "counts_by_category": counts,
                "references": references,
                "truncated_per_category": _MAX_REFS_PER_CATEGORY,
            },
            "metadata": {
                "source": "SGD",
                "query": locus,
                "endpoint": "locus/literature_details",
            },
        }