tooluniverse.elm_tool 源代码

"""
ELM (Eukaryotic Linear Motif) Tool - Short Linear Motifs in Proteins

Provides access to the ELM database for querying experimentally validated
short linear motifs (SLiMs) in proteins. SLiMs are compact protein interaction
interfaces involved in regulatory functions like degradation, localization,
post-translational modification, and docking.

ELM categorizes motifs into functional classes:
- CLV: Cleavage sites (protease recognition)
- DEG: Degradation motifs (degrons)
- DOC: Docking motifs (protein-protein interaction)
- LIG: Ligand binding motifs
- MOD: Post-translational modification sites
- TRG: Targeting/localization signals

API base: http://elm.eu.org
No authentication required. Data returned as TSV, parsed to JSON.

Reference: Kumar et al., Nucleic Acids Res. 2022
"""

import csv
import io
import requests
from typing import Dict, Any, List
from .base_tool import BaseTool
from .tool_registry import register_tool


ELM_BASE_URL = "http://elm.eu.org"



[文档]
@register_tool("ELMTool")
class ELMTool(BaseTool):
    """
    Tool for querying the ELM (Eukaryotic Linear Motif) database.

    Supported operations:
    - get_instances: Get experimentally validated motif instances for a protein
    - list_classes: List all ELM motif classes with regex patterns
    """


[文档]
    def __init__(self, tool_config: Dict[str, Any]):
        super().__init__(tool_config)
        self.parameter = tool_config.get("parameter", {})
        self.required = self.parameter.get("required", [])
        self.session = requests.Session()
        self.timeout = 30



[文档]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the ELM tool with given arguments."""
        operation = arguments.get("operation")
        if not operation:
            return {"status": "error", "error": "Missing required parameter: operation"}

        handlers = {
            "get_instances": self._get_instances,
            "list_classes": self._list_classes,
            "get_interaction_domains": self._get_interaction_domains,
        }

        handler = handlers.get(operation)
        if not handler:
            return {
                "status": "error",
                "error": "Unknown operation: {}. Available: {}".format(
                    operation, list(handlers.keys())
                ),
            }

        try:
            return handler(arguments)
        except requests.exceptions.Timeout:
            return {"status": "error", "error": "ELM API request timed out"}
        except requests.exceptions.ConnectionError:
            return {"status": "error", "error": "Failed to connect to ELM"}
        except Exception as e:
            return {"status": "error", "error": "ELM error: {}".format(str(e))}



[文档]
    def _parse_tsv(self, text: str) -> List[Dict[str, str]]:
        """Parse ELM TSV response, skipping comment lines."""
        lines = [line for line in text.strip().split("\n") if not line.startswith("#")]
        if not lines:
            return []
        reader = csv.DictReader(io.StringIO("\n".join(lines)), delimiter="\t")
        return [{k: (v or "") for k, v in row.items()} for row in reader]



[文档]
    def _get_instances(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get experimentally validated motif instances for a protein."""
        uniprot_id = arguments.get("uniprot_id") or arguments.get("uniprot_acc")
        if not uniprot_id:
            return {
                "status": "error",
                "error": "Missing required parameter: uniprot_id (or uniprot_acc)",
            }

        motif_type = arguments.get("motif_type")

        resp = self.session.get(
            "{}/instances.tsv?q={}".format(ELM_BASE_URL, uniprot_id),
            timeout=self.timeout,
        )
        if resp.status_code != 200:
            return {
                "status": "error",
                "error": "ELM returned HTTP {}".format(resp.status_code),
            }

        rows = self._parse_tsv(resp.text)
        if not rows:
            return {
                "status": "success",
                "data": [],
                "metadata": {
                    "uniprot_id": uniprot_id,
                    "total_instances": 0,
                    "message": "No motif instances found for {}".format(uniprot_id),
                },
            }

        results = []
        for row in rows:
            elm_id = row.get("ELMIdentifier", "")
            elm_type = row.get("ELMType", "")

            if motif_type and elm_type.upper() != motif_type.upper():
                continue

            start = row.get("Start", "")
            end = row.get("End", "")
            refs = row.get("References", "")
            methods = row.get("Methods", "")
            pdb = row.get("PDB", "")

            results.append(
                {
                    "accession": row.get("Accession", ""),
                    "elm_identifier": elm_id,
                    "elm_type": elm_type,
                    "protein_name": row.get("ProteinName", ""),
                    "start": int(start) if start.isdigit() else None,
                    "end": int(end) if end.isdigit() else None,
                    "references": [r.strip() for r in refs.split(" ") if r.strip()]
                    if refs
                    else [],
                    "methods": [m.strip() for m in methods.split(";") if m.strip()]
                    if methods
                    else [],
                    "instance_logic": row.get("InstanceLogic", ""),
                    "pdb_ids": [p.strip() for p in pdb.split(" ") if p.strip()]
                    if pdb
                    else [],
                    "organism": row.get("Organism", ""),
                }
            )

        type_summary = {}
        for r in results:
            t = r["elm_type"]
            type_summary[t] = type_summary.get(t, 0) + 1

        return {
            "status": "success",
            "data": results,
            "metadata": {
                "uniprot_id": uniprot_id,
                "total_instances": len(results),
                "motif_type_summary": type_summary,
            },
        }



[文档]
    def _get_interaction_domains(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Map ELM motif classes to the protein domains that recognize them.

        For each Short Linear Motif (SLiM) class, ELM records the interaction
        domain (with Pfam accession) that binds/recognizes the motif.
        """
        elm_identifier = arguments.get("elm_identifier")
        query = arguments.get("query")
        max_results = arguments.get("max_results", 100)

        resp = self.session.get(
            "{}/interactiondomains.tsv".format(ELM_BASE_URL),
            timeout=self.timeout,
        )
        if resp.status_code != 200:
            return {
                "status": "error",
                "error": "ELM returned HTTP {}".format(resp.status_code),
            }

        rows = self._parse_tsv(resp.text)

        results = []
        for row in rows:
            elm_id = row.get("ELM identifier", "")
            pfam_acc = row.get("Interaction Domain Id", "")
            domain_desc = row.get("Interaction Domain Description", "")
            domain_name = row.get("Interaction Domain Name", "")

            if elm_identifier and elm_id.upper() != elm_identifier.upper():
                continue

            if query:
                q_lower = query.lower()
                searchable = "{}|{}|{}|{}".format(
                    elm_id, pfam_acc, domain_desc, domain_name
                ).lower()
                if q_lower not in searchable:
                    continue

            results.append(
                {
                    "elm_identifier": elm_id,
                    "pfam_accession": pfam_acc,
                    "interaction_domain_description": domain_desc,
                    "interaction_domain_name": domain_name,
                }
            )

        total = len(results)
        results = results[:max_results]

        return {
            "status": "success",
            "data": results,
            "metadata": {
                "total_mappings": total,
                "returned": len(results),
                "filter_elm_identifier": elm_identifier,
                "filter_query": query,
            },
        }



[文档]
    def _list_classes(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """List ELM motif classes with regex patterns and descriptions."""
        motif_type = arguments.get("motif_type")
        query = arguments.get("query")
        max_results = arguments.get("max_results", 50)

        resp = self.session.get(
            "{}/elms/elms_index.tsv".format(ELM_BASE_URL),
            timeout=self.timeout,
        )
        if resp.status_code != 200:
            return {
                "status": "error",
                "error": "ELM returned HTTP {}".format(resp.status_code),
            }

        rows = self._parse_tsv(resp.text)

        results = []
        for row in rows:
            elm_id = row.get("ELMIdentifier", "")
            description = row.get("Description", "")
            func_name = row.get("FunctionalSiteName", "")

            if motif_type:
                prefix = elm_id.split("_")[0] if "_" in elm_id else ""
                if prefix.upper() != motif_type.upper():
                    continue

            if query:
                q_lower = query.lower()
                searchable = "{}|{}|{}".format(elm_id, description, func_name).lower()
                if q_lower not in searchable:
                    continue

            instances = row.get("#Instances", "0")
            pdb_instances = row.get("#Instances_in_PDB", "0")

            results.append(
                {
                    "accession": row.get("Accession", ""),
                    "elm_identifier": elm_id,
                    "functional_site_name": func_name,
                    "description": description,
                    "regex": row.get("Regex", ""),
                    "probability": float(row.get("Probability", "0") or "0"),
                    "num_instances": int(instances) if instances.isdigit() else 0,
                    "num_pdb_instances": int(pdb_instances)
                    if pdb_instances.isdigit()
                    else 0,
                }
            )

        total = len(results)
        results = results[:max_results]

        return {
            "status": "success",
            "data": results,
            "metadata": {
                "total_classes": total,
                "returned": len(results),
                "filter_type": motif_type,
                "filter_query": query,
            },
        }