Source code for tooluniverse.sgd_protein_tool

# sgd_protein_tool.py
"""
SGD (Saccharomyces Genome Database) protein-feature & literature tool.

Complements the existing SGDTool (gene overview, phenotype, GO, interaction,
regulation, sequence, disease) by exposing three SGD locus sub-resources that
were previously unwrapped:

  * protein_domain_details      -> mapped protein domains (Pfam, InterPro,
                                   SMART, PROSITE, CDD, Gene3D, SUPERFAMILY,
                                   PANTHER, PRINTS) with residue coordinates.
  * posttranslational_details   -> curated post-translational modification
                                   sites (phosphorylation, ubiquitination,
                                   acetylation, ...) with residue + reference.
  * literature_details          -> categorized literature references
                                   (primary, review, interaction, phenotype,
                                   GO, disease, PTM, regulation, ...).

SGD webservice base: https://www.yeastgenome.org/backend
No authentication required. The {locus} path segment accepts an SGD ID
(e.g. S000001855), a systematic name (e.g. YFL039C), or a standard gene
name (e.g. ACT1) -- the backend resolves all three.
"""

from typing import Dict, Any, List
from urllib.parse import quote

import requests

from .base_tool import BaseTool
from .tool_registry import register_tool

SGD_BASE_URL = "https://www.yeastgenome.org/backend"

# SGD's backend rejects the default python-requests User-Agent on some paths;
# a browser-style UA is accepted consistently.
_HEADERS = {
    "Accept": "application/json",
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36"
    ),
}

# Per-literature-category cap so a single ACT1 query (1000+ refs, ~1 MB)
# returns a bounded, useful payload instead of a megabyte dump.
_MAX_REFS_PER_CATEGORY = 25



[docs]
@register_tool("SGDProteinTool")
class SGDProteinTool(BaseTool):
    """
    Query SGD protein-domain, post-translational-modification, and literature
    sub-resources for a budding-yeast (S. cerevisiae) gene/locus.

    Dispatch is driven by ``fields.endpoint`` in the tool config; the runtime
    argument is always a single ``locus`` string. No authentication required.
    """


[docs]
    def __init__(self, tool_config: Dict[str, Any]):
        """Set up the tool from its configuration dictionary.

        Two optional settings are read from ``tool_config``:

        * ``timeout`` -- passed as the request timeout (defaults to 30).
        * ``fields.endpoint`` -- name of the SGD sub-resource this tool
          instance serves (defaults to an empty string).
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 30)
        field_settings = tool_config.get("fields", {})
        self.endpoint = field_settings.get("endpoint", "")



[docs]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the SGD sub-resource call. Never raises."""
        try:
            locus = (arguments or {}).get("locus", "")
            if isinstance(locus, str):
                locus = locus.strip()
            if not locus:
                return {
                    "status": "error",
                    "error": (
                        "locus parameter is required (gene name e.g. 'ACT1', "
                        "systematic name e.g. 'YFL039C', or SGD ID e.g. "
                        "'S000001855')"
                    ),
                }

            if self.endpoint == "protein_domain_details":
                return self._protein_domains(locus)
            if self.endpoint == "posttranslational_details":
                return self._ptm(locus)
            if self.endpoint == "literature_details":
                return self._literature(locus)
            return {
                "status": "error",
                "error": f"Unknown SGD endpoint configured: {self.endpoint!r}",
            }
        except requests.exceptions.Timeout:
            return {
                "status": "error",
                "error": f"SGD API request timed out after {self.timeout} seconds",
            }
        except requests.exceptions.ConnectionError:
            return {
                "status": "error",
                "error": "Failed to connect to SGD API. Check network connectivity.",
            }
        except requests.exceptions.HTTPError as e:
            code = getattr(getattr(e, "response", None), "status_code", "unknown")
            hint = " (locus not found)" if code == 404 else ""
            return {"status": "error", "error": f"SGD API HTTP error: {code}{hint}"}
        except ValueError:
            return {
                "status": "error",
                "error": "SGD API returned a non-JSON response",
            }
        except Exception as e:  # noqa: BLE001 - never propagate to caller
            return {"status": "error", "error": f"Unexpected error querying SGD: {e}"}


    # ----------------------------- helpers ------------------------------- #


[docs]
    def _get(self, locus: str, sub: str):
        """Fetch one SGD locus sub-resource and return the decoded JSON body.

        Args:
            locus: Locus identifier placed in the URL path. It is
                percent-encoded before use.
            sub: Name of the sub-resource, used as the final path segment.

        Returns:
            Whatever ``Response.json()`` yields for the response body.

        Raises:
            requests.exceptions.RequestException: On timeout, connection
                failure, or a 4xx/5xx status (via ``raise_for_status``).
            ValueError: If the body cannot be decoded as JSON.
        """
        # Percent-encode the caller-supplied locus so characters such as "/",
        # "?", "#" or spaces stay inside one path segment instead of changing
        # which path or query string is requested. ``str()`` keeps non-string
        # values working the way plain f-string interpolation did.
        safe_locus = quote(str(locus), safe="")
        url = f"{SGD_BASE_URL}/locus/{safe_locus}/{sub}"
        resp = requests.get(url, headers=_HEADERS, timeout=self.timeout)
        resp.raise_for_status()
        return resp.json()



[docs]
    @staticmethod
    def _locus_label(raw_list: List[dict]) -> Dict[str, Any]:
        """Pull the gene/systematic label from the first row that carries it."""
        for row in raw_list:
            loc = row.get("locus") or {}
            if loc:
                return {
                    "gene_name": loc.get("display_name"),
                    "systematic_name": loc.get("format_name"),
                    "sgd_link": loc.get("link"),
                }
        return {"gene_name": None, "systematic_name": None, "sgd_link": None}


    # ---------------------------- endpoints ------------------------------ #


[docs]
    def _protein_domains(self, locus: str) -> Dict[str, Any]:
        raw = self._get(locus, "protein_domain_details")
        if not isinstance(raw, list):
            raw = []

        domains = []
        for row in raw:
            dom = row.get("domain") or {}
            src = row.get("source") or {}
            domains.append(
                {
                    "accession": dom.get("display_name"),
                    "description": (
                        None
                        if dom.get("description") in (None, "-")
                        else dom.get("description")
                    ),
                    "source": src.get("display_name"),
                    "start": row.get("start"),
                    "end": row.get("end"),
                    "domain_link": dom.get("link"),
                }
            )

        label = self._locus_label(raw)
        return {
            "status": "success",
            "data": {
                **label,
                "domain_count": len(domains),
                "domains": domains,
            },
            "metadata": {
                "source": "SGD",
                "query": locus,
                "endpoint": "locus/protein_domain_details",
            },
        }



[docs]
    def _ptm(self, locus: str) -> Dict[str, Any]:
        raw = self._get(locus, "posttranslational_details")
        if not isinstance(raw, list):
            raw = []

        sites = []
        for row in raw:
            ref = row.get("reference") or {}
            sites.append(
                {
                    "modification": row.get("type"),
                    "residue": row.get("site_residue"),
                    "position": row.get("site_index"),
                    "reference": ref.get("display_name"),
                    "pubmed_id": ref.get("pubmed_id"),
                }
            )

        label = self._locus_label(raw)
        return {
            "status": "success",
            "data": {
                **label,
                "site_count": len(sites),
                "sites": sites,
            },
            "metadata": {
                "source": "SGD",
                "query": locus,
                "endpoint": "locus/posttranslational_details",
            },
        }



[docs]
    def _literature(self, locus: str) -> Dict[str, Any]:
        """Return per-category literature references SGD lists for ``locus``.

        Fetches the ``literature_details`` sub-resource, which is expected to
        map category names to lists of references. Categories whose value is
        not a list are ignored, and a payload that is not a dict is treated as
        empty. Counts reflect the full list length of each category, while the
        returned references are capped at ``_MAX_REFS_PER_CATEGORY`` entries
        per category.

        Returns:
            A ``status``/``data``/``metadata`` dictionary; ``data`` holds the
            query, total and per-category counts, the trimmed references and
            the per-category cap that was applied.
        """
        payload = self._get(locus, "literature_details")
        if not isinstance(payload, dict):
            payload = {}

        # Keep only the categories that actually hold a list of references.
        by_category = {
            name: entries
            for name, entries in payload.items()
            if isinstance(entries, list)
        }

        counts: Dict[str, int] = {
            name: len(entries) for name, entries in by_category.items()
        }
        references: Dict[str, List[dict]] = {
            name: [
                {
                    # Fall back to the display name when no citation is given.
                    "citation": entry.get("citation") or entry.get("display_name"),
                    "pubmed_id": entry.get("pubmed_id"),
                    "year": entry.get("year"),
                    "reference_link": entry.get("link"),
                }
                for entry in entries[:_MAX_REFS_PER_CATEGORY]
            ]
            for name, entries in by_category.items()
        }

        data = {
            "query": locus,
            "total_references": sum(counts.values()),
            "counts_by_category": counts,
            "references": references,
            "truncated_per_category": _MAX_REFS_PER_CATEGORY,
        }
        metadata = {
            "source": "SGD",
            "query": locus,
            "endpoint": "locus/literature_details",
        }
        return {"status": "success", "data": data, "metadata": metadata}