Source code for tooluniverse.proteomicsdb_tool

"""
ProteomicsDB Tool - Mass Spectrometry-based Protein Expression Database

Provides access to ProteomicsDB (proteomicsdb.org), a comprehensive database
of mass spectrometry-based protein quantification across human tissues,
cell lines, and body fluids.

API: https://www.proteomicsdb.org/proteomicsdb/logic/
Authentication: None required (free public API).
"""

import requests
from typing import Dict, Any, Optional, List
from .base_tool import BaseTool
from .tool_registry import register_tool

BASE_URL = "https://www.proteomicsdb.org/proteomicsdb/logic"
API_V2_URL = "https://www.proteomicsdb.org/proteomicsdb/logic/api_v2/api.xsodata"


def _odata_request(url, timeout=30):
    """Execute an OData GET request and return parsed JSON."""
    try:
        resp = requests.get(url, timeout=timeout)
        if resp.status_code != 200:
            return {
                "ok": False,
                "error": "ProteomicsDB returned HTTP %d" % resp.status_code,
            }
        data = resp.json()
        return {"ok": True, "data": data}
    except requests.exceptions.Timeout:
        return {"ok": False, "error": "ProteomicsDB request timed out"}
    except requests.exceptions.ConnectionError:
        return {"ok": False, "error": "Failed to connect to ProteomicsDB"}
    except ValueError:
        ct = resp.headers.get("content-type", "")
        return {
            "ok": False,
            "error": "Invalid JSON response from ProteomicsDB",
            "content_type": ct,
            "response_snippet": resp.text[:200],
            "retryable": "text/html" in ct or resp.text.lstrip().startswith("<"),
            "suggestion": "ProteomicsDB API may be under maintenance. Retry in a few minutes.",
        }
    except Exception as e:
        return {"ok": False, "error": "Request failed: %s" % str(e)}


def _xsjs_request(endpoint, params=None, timeout=30):
    """Execute a request to a ProteomicsDB .xsjs endpoint."""
    url = "%s/%s" % (BASE_URL, endpoint)
    try:
        resp = requests.get(url, params=params, timeout=timeout)
        if resp.status_code != 200:
            return {
                "ok": False,
                "error": "ProteomicsDB returned HTTP %d" % resp.status_code,
            }
        data = resp.json()
        return {"ok": True, "data": data}
    except requests.exceptions.Timeout:
        return {"ok": False, "error": "ProteomicsDB request timed out"}
    except requests.exceptions.ConnectionError:
        return {"ok": False, "error": "Failed to connect to ProteomicsDB"}
    except ValueError:
        ct = resp.headers.get("content-type", "")
        return {
            "ok": False,
            "error": "Invalid JSON response from ProteomicsDB",
            "content_type": ct,
            "response_snippet": resp.text[:200],
            "retryable": "text/html" in ct or resp.text.lstrip().startswith("<"),
            "suggestion": "ProteomicsDB API may be under maintenance. Retry in a few minutes.",
        }
    except Exception as e:
        return {"ok": False, "error": "Request failed: %s" % str(e)}


def _resolve_protein_id(uniprot_id, taxcode=9606):
    """Resolve a UniProt accession to a ProteomicsDB internal PROTEIN_ID.

    Returns the first SwissProt (canonical) match if available, else the first match.
    """
    url = (
        "%s/Protein?$filter=UNIQUE_IDENTIFIER eq '%s' and TAXCODE eq %d"
        "&$format=json&$select=PROTEIN_ID,UNIQUE_IDENTIFIER,ENTRY_NAME,"
        "PROTEIN_NAME,GENE_NAME,TAXCODE,MASS"
    ) % (API_V2_URL, uniprot_id, taxcode)

    result = _odata_request(url)
    if not result["ok"]:
        return None, result["error"]

    results = result["data"].get("d", {}).get("results", [])
    if not results:
        return None, "Protein '%s' not found in ProteomicsDB" % uniprot_id

    # Prefer the canonical (non-isoform) SwissProt entry
    for entry in results:
        uid = entry.get("UNIQUE_IDENTIFIER", "")
        if uid == uniprot_id and "-" not in uid:
            return entry["PROTEIN_ID"], None

    # Fallback to first result
    return results[0]["PROTEIN_ID"], None



[docs]
@register_tool("ProteomicsDBTool")
class ProteomicsDBTool(BaseTool):
    """
    Tool for querying ProteomicsDB, a mass spectrometry-based protein
    expression database covering human tissues, cell lines, and body fluids.

    Complements HPA (antibody-based) with MS-based quantitative proteomics
    from the Human Proteome Project. Provides iBAQ and TOP3 quantification
    across 340+ biological sources.
    """


[docs]
    def __init__(self, tool_config):
        super().__init__(tool_config)
        self.parameter = tool_config.get("parameter", {})
        self.required = self.parameter.get("required", [])



[docs]
    def run(self, arguments):
        """Execute a ProteomicsDB query."""
        operation = arguments.get("operation")
        if not operation:
            return {"status": "error", "error": "Missing required parameter: operation"}

        handlers = {
            "get_protein_expression": self._get_protein_expression,
            "search_proteins": self._search_proteins,
            "get_expression_summary": self._get_expression_summary,
            "list_tissues": self._list_tissues,
            "get_peptides_for_protein": self._get_peptides_for_protein,
        }

        handler = handlers.get(operation)
        if not handler:
            return {
                "status": "error",
                "error": "Unknown operation: %s" % operation,
                "available_operations": list(handlers.keys()),
            }

        try:
            return handler(arguments)
        except Exception as e:
            return {"status": "error", "error": "Operation failed: %s" % str(e)}



[docs]
    def _get_protein_expression(self, arguments):
        """Get protein expression across tissues/cell lines."""
        uniprot_id = arguments.get("uniprot_id")
        if not uniprot_id:
            return {"status": "error", "error": "uniprot_id parameter is required"}

        tissue_category = arguments.get("tissue_category", "tissue;fluid;cell line")
        calculation_method = arguments.get("calculation_method", "iBAQ")

        # Map user-friendly category to API format
        category_map = {
            "tissue": "tissue",
            "cell line": "cell line",
            "cell_line": "cell line",
            "fluid": "fluid",
            "all": "tissue;fluid;cell line",
        }
        if tissue_category in category_map:
            bio_source = category_map[tissue_category]
        else:
            bio_source = tissue_category

        # Map calculation method
        calc_map = {"iBAQ": "iBAQ", "ibaq": "iBAQ", "TOP3": "top3", "top3": "top3"}
        calc_unit = calc_map.get(calculation_method, "iBAQ")

        # Use the heatmap cluster endpoint (the primary expression data API)
        result = _xsjs_request(
            "getExpressionProfileHeatmapCluster.xsjs",
            params={
                "proteins": uniprot_id,
                "omics": "Proteomics",
                "biologicalSource": bio_source,
                "quantification": "MS1",
                "calculationMethod": calc_unit,
                "swissprotOnly": 1,
                "noIsoforms": 1,
                "source": "db",
                "datasetIds": "",
                "impute": 0,
                "taxcode": 9606,
            },
            timeout=60,
        )

        if not result["ok"]:
            return {"status": "error", "error": result["error"]}

        data = result["data"]
        mapdata = data.get("mapdata", [])
        tissuedata = data.get("tissuedata", [])
        proteindata = data.get("proteindata", [])

        if not mapdata:
            return {
                "status": "success",
                "data": {
                    "uniprot_id": uniprot_id,
                    "calculation_method": calc_unit,
                    "tissue_category": bio_source,
                    "num_tissues": 0,
                    "expression_records": [],
                },
            }

        # Build tissue lookup: tissue_id -> [name, sap_synonym, category]
        tissue_lookup = {}
        for t in tissuedata:
            # Format: [tissue_id, name, sap_synonym, category]
            if len(t) >= 4:
                tissue_lookup[t[0]] = {
                    "tissue_name": t[1] or "",
                    "tissue_group": t[2] or "",
                    "tissue_category": t[3] or "",
                }

        # Build expression records
        # mapdata format: [protein_id, tissue_id, val1, val2, val3, val4]
        # val1 appears to be the main normalized expression value (log10 iBAQ)
        records = []
        for row in mapdata:
            if len(row) < 3:
                continue
            tissue_id = row[1]
            tissue_info = tissue_lookup.get(tissue_id, {})
            rec = {
                "tissue_id": tissue_id,
                "tissue_name": tissue_info.get("tissue_name", ""),
                "tissue_group": tissue_info.get("tissue_group", ""),
                "tissue_category": tissue_info.get("tissue_category", ""),
                "expression_value": row[2] if len(row) > 2 else None,
            }
            # Add additional quantification values if present
            if len(row) > 3:
                rec["median_expression"] = row[3]
            if len(row) > 4:
                rec["min_expression"] = row[4]
            if len(row) > 5:
                rec["max_expression"] = row[5]
            records.append(rec)

        # Sort by expression value descending
        records.sort(key=lambda r: r.get("expression_value") or 0, reverse=True)

        # Get protein info
        protein_name = ""
        gene_name = ""
        if proteindata and len(proteindata[0]) >= 4:
            protein_name = proteindata[0][1]
            gene_name = proteindata[0][2]

        return {
            "status": "success",
            "data": {
                "uniprot_id": uniprot_id,
                "protein_name": protein_name,
                "gene_name": gene_name,
                "calculation_method": calc_unit,
                "tissue_category": bio_source,
                "num_tissues": len(records),
                "expression_records": records,
            },
        }



[docs]
    def _search_proteins(self, arguments):
        """Search for proteins by gene name, UniProt ID, or protein name."""
        query = arguments.get("query")
        if not query:
            return {"status": "error", "error": "query parameter is required"}

        organism_id = arguments.get("organism_id", 9606)
        max_results = arguments.get("max_results", 20)

        # Try gene name first, then UniProt ID, then protein name
        # Use OData substringof for flexible matching
        url = (
            "%s/Protein?$filter=(substringof('%s',GENE_NAME) or "
            "substringof('%s',UNIQUE_IDENTIFIER) or "
            "substringof('%s',PROTEIN_NAME)) and TAXCODE eq %d"
            "&$format=json&$select=PROTEIN_ID,UNIQUE_IDENTIFIER,ENTRY_NAME,"
            "PROTEIN_NAME,GENE_NAME,TAXCODE,MASS"
            "&$top=%d"
        ) % (API_V2_URL, query, query, query, organism_id, max_results)

        result = _odata_request(url, timeout=30)
        if not result["ok"]:
            return {"status": "error", "error": result["error"]}

        raw_results = result["data"].get("d", {}).get("results", [])

        # Deduplicate by UNIQUE_IDENTIFIER (same protein can appear with
        # different PROTEIN_IDs due to database versions)
        seen = set()
        proteins = []
        for r in raw_results:
            uid = r.get("UNIQUE_IDENTIFIER", "")
            if uid in seen:
                continue
            seen.add(uid)
            mass_val = r.get("MASS")
            if mass_val is not None:
                try:
                    mass_val = float(mass_val)
                except (ValueError, TypeError):
                    mass_val = None
            proteins.append(
                {
                    "protein_id": r.get("PROTEIN_ID"),
                    "uniprot_id": uid,
                    "entry_name": r.get("ENTRY_NAME", ""),
                    "protein_name": r.get("PROTEIN_NAME", ""),
                    "gene_name": r.get("GENE_NAME", ""),
                    "organism_id": r.get("TAXCODE"),
                    "mass_da": mass_val,
                }
            )

        return {
            "status": "success",
            "data": {
                "query": query,
                "organism_id": organism_id,
                "num_results": len(proteins),
                "proteins": proteins,
            },
        }



[docs]
    def _get_expression_summary(self, arguments):
        """Get top tissues by expression level for a protein."""
        uniprot_id = arguments.get("uniprot_id")
        if not uniprot_id:
            return {"status": "error", "error": "uniprot_id parameter is required"}

        top_n = arguments.get("top_n", 10)

        # Get expression across all categories
        expr_result = self._get_protein_expression(
            {
                "uniprot_id": uniprot_id,
                "tissue_category": "all",
                "calculation_method": "iBAQ",
            }
        )

        if expr_result.get("status") != "success":
            return expr_result

        expr_data = expr_result["data"]
        records = expr_data.get("expression_records", [])

        # Already sorted by expression value descending
        top_records = records[:top_n]

        # Separate by category
        tissue_count = sum(1 for r in records if r.get("tissue_category") == "tissue")
        cell_line_count = sum(
            1 for r in records if r.get("tissue_category") == "cell line"
        )
        fluid_count = sum(1 for r in records if r.get("tissue_category") == "fluid")

        return {
            "status": "success",
            "data": {
                "uniprot_id": uniprot_id,
                "protein_name": expr_data.get("protein_name", ""),
                "gene_name": expr_data.get("gene_name", ""),
                "total_sources": len(records),
                "tissue_count": tissue_count,
                "cell_line_count": cell_line_count,
                "fluid_count": fluid_count,
                "top_n": top_n,
                "top_expression": top_records,
            },
        }



[docs]
    def _list_tissues(self, arguments):
        """List all available tissues/cell lines in ProteomicsDB."""
        tissue_category = arguments.get("tissue_category")

        # Build OData filter
        filter_parts = []
        if tissue_category:
            cat_map = {
                "tissue": "tissue",
                "cell line": "cell line",
                "cell_line": "cell line",
                "fluid": "fluid",
            }
            cat_val = cat_map.get(tissue_category, tissue_category)
            filter_parts.append("CATEGORY eq '%s'" % cat_val)

        filter_str = ""
        if filter_parts:
            filter_str = "&$filter=" + " and ".join(filter_parts)

        url = (
            "%s/Tissue?$format=json&$select=TISSUE_ID,NAME,CATEGORY,SAP_SYNONYM"
            "&$top=5000%s"
        ) % (API_V2_URL, filter_str)

        result = _odata_request(url, timeout=30)
        if not result["ok"]:
            return {"status": "error", "error": result["error"]}

        raw_results = result["data"].get("d", {}).get("results", [])

        tissues = []
        for r in raw_results:
            name = r.get("NAME", "")
            category = r.get("CATEGORY", "")
            # Skip entries with empty name or category
            if not name or not category:
                continue
            tissues.append(
                {
                    "tissue_id": r.get("TISSUE_ID", ""),
                    "name": name,
                    "category": category,
                    "group": r.get("SAP_SYNONYM", ""),
                }
            )

        # Sort by category then name
        tissues.sort(key=lambda t: (t["category"], t["name"]))

        # Count by category
        cat_counts = {}
        for t in tissues:
            cat = t["category"]
            cat_counts[cat] = cat_counts.get(cat, 0) + 1

        return {
            "status": "success",
            "data": {
                "filter_category": tissue_category,
                "total_sources": len(tissues),
                "category_counts": cat_counts,
                "tissues": tissues,
            },
        }



[docs]
    @staticmethod
    def _to_float(value):
        """Best-effort convert a numeric string to float; return None on failure."""
        if value is None:
            return None
        try:
            return float(value)
        except (TypeError, ValueError):
            return value



[docs]
    @staticmethod
    def _to_int(value):
        """Best-effort convert a numeric string to int; leave as-is on failure."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return value



[docs]
    def _get_peptides_for_protein(self, arguments):
        """Get peptide-level identification evidence for a protein.

        Returns per-peptide sequence, uniqueness, peptide & protein q-values,
        peptide score, search engine, and source experiment/project/PubMed -
        the identification-level evidence that the tissue-level expression tools
        do not expose.
        """
        uniprot_id = arguments.get("uniprot_id")
        if not uniprot_id:
            return {"status": "error", "error": "uniprot_id parameter is required"}

        max_results = arguments.get("max_results", 50)
        try:
            max_results = int(max_results)
        except (TypeError, ValueError):
            max_results = 50
        if max_results < 1:
            max_results = 1

        # The proteinpeptideresult OData service takes the protein accession as
        # an InputParam and returns one row per peptide identification.
        select_fields = (
            "PEPTIDE_SEQUENCE,PEPTIDE_MASS,ISUNIQUE,ISUNIQUE_PROTEIN,"
            "PEPTIDE_Q_VALUE,PROTEIN_Q_VALUE,PEPTIDE_SCORE,SEARCH_ENGINE,"
            "START_POSITION,END_POSITION,EXPERIMENT_ID,EXPERIMENT_NAME,"
            "PROJECT_NAME,PROJECT_DESCRIPTION,PUBMEDID,UNIQUE_IDENTIFIER,"
            "GENE_NAME,PROTEIN_NAME,ENTRY_NAME"
        )
        safe_id = str(uniprot_id).replace("'", "")
        url = (
            "%s/api/proteinpeptideresult.xsodata/"
            "InputParams(PROTEINFILTER='%s')/Results"
            "?$format=json&$top=%d&$select=%s"
        ) % (BASE_URL, safe_id, max_results, select_fields)

        result = _odata_request(url, timeout=30)
        if not result["ok"]:
            return {"status": "error", "error": result["error"]}

        raw = result["data"].get("d", {}).get("results", [])
        if not raw:
            return {
                "status": "success",
                "data": {
                    "uniprot_id": uniprot_id,
                    "num_peptides": 0,
                    "peptides": [],
                },
            }

        peptides = []
        for r in raw:
            peptides.append(
                {
                    "peptide_sequence": r.get("PEPTIDE_SEQUENCE", ""),
                    "peptide_mass": self._to_float(r.get("PEPTIDE_MASS")),
                    "is_unique": self._to_int(r.get("ISUNIQUE")),
                    "is_unique_to_protein": self._to_int(r.get("ISUNIQUE_PROTEIN")),
                    "peptide_q_value": self._to_float(r.get("PEPTIDE_Q_VALUE")),
                    "protein_q_value": self._to_float(r.get("PROTEIN_Q_VALUE")),
                    "peptide_score": self._to_float(r.get("PEPTIDE_SCORE")),
                    "search_engine": self._to_int(r.get("SEARCH_ENGINE")),
                    "start_position": self._to_int(r.get("START_POSITION")),
                    "end_position": self._to_int(r.get("END_POSITION")),
                    "experiment_id": r.get("EXPERIMENT_ID"),
                    "experiment_name": r.get("EXPERIMENT_NAME", ""),
                    "project_name": r.get("PROJECT_NAME", ""),
                    "project_description": r.get("PROJECT_DESCRIPTION", ""),
                    "pubmed_id": r.get("PUBMEDID"),
                }
            )

        first = raw[0]
        return {
            "status": "success",
            "data": {
                "uniprot_id": uniprot_id,
                "gene_name": first.get("GENE_NAME", ""),
                "protein_name": first.get("PROTEIN_NAME", ""),
                "entry_name": first.get("ENTRY_NAME", ""),
                "num_peptides": len(peptides),
                "peptides": peptides,
            },
        }