# Source code for tooluniverse.proteomicsdb_tool
"""
ProteomicsDB Tool - Mass Spectrometry-based Protein Expression Database
Provides access to ProteomicsDB (proteomicsdb.org), a comprehensive database
of mass spectrometry-based protein quantification across human tissues,
cell lines, and body fluids.
API: https://www.proteomicsdb.org/proteomicsdb/logic/
Authentication: None required (free public API).
"""
import requests
from typing import Dict, Any, Optional, List
from .base_tool import BaseTool
from .tool_registry import register_tool
# Root of the ProteomicsDB "logic" web services; .xsjs endpoint names are
# appended to it when building request URLs.
BASE_URL = "https://www.proteomicsdb.org/proteomicsdb/logic"
# Root of the api_v2 OData service; entity sets such as Protein and Tissue
# are queried beneath it.
API_V2_URL = "https://www.proteomicsdb.org/proteomicsdb/logic/api_v2/api.xsodata"
def _odata_request(url, timeout=30):
    """Fetch an OData URL with GET and wrap the outcome in a result dict.

    Args:
        url: Fully built request URL, query string included.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        ``{"ok": True, "data": <parsed JSON>}`` on an HTTP 200 response with a
        JSON body, otherwise ``{"ok": False, "error": <message>}``. Failures
        are reported through the dict; nothing is raised to the caller.
    """
    try:
        response = requests.get(url, timeout=timeout)
        status = response.status_code
        if status == 200:
            return {"ok": True, "data": response.json()}
        failure = "ProteomicsDB returned HTTP %d" % status
    except requests.exceptions.Timeout:
        failure = "ProteomicsDB request timed out"
    except requests.exceptions.ConnectionError:
        failure = "Failed to connect to ProteomicsDB"
    except ValueError:
        # Raised by response.json() when the body is not valid JSON.
        failure = "Invalid JSON response from ProteomicsDB"
    except Exception as exc:
        failure = "Request failed: %s" % str(exc)
    return {"ok": False, "error": failure}
def _xsjs_request(endpoint, params=None, timeout=30):
    """Call a ProteomicsDB .xsjs endpoint with GET and wrap the outcome.

    Args:
        endpoint: Endpoint name appended to ``BASE_URL``.
        params: Optional query parameters forwarded to ``requests.get``.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        ``{"ok": True, "data": <parsed JSON>}`` on an HTTP 200 response with a
        JSON body, otherwise ``{"ok": False, "error": <message>}``. Failures
        are reported through the dict; nothing is raised to the caller.
    """
    target = "%s/%s" % (BASE_URL, endpoint)
    try:
        reply = requests.get(target, params=params, timeout=timeout)
        if reply.status_code == 200:
            outcome = {"ok": True, "data": reply.json()}
        else:
            outcome = {
                "ok": False,
                "error": "ProteomicsDB returned HTTP %d" % reply.status_code,
            }
    except requests.exceptions.Timeout:
        outcome = {"ok": False, "error": "ProteomicsDB request timed out"}
    except requests.exceptions.ConnectionError:
        outcome = {"ok": False, "error": "Failed to connect to ProteomicsDB"}
    except ValueError:
        # Raised by reply.json() when the body is not valid JSON.
        outcome = {"ok": False, "error": "Invalid JSON response from ProteomicsDB"}
    except Exception as exc:
        outcome = {"ok": False, "error": "Request failed: %s" % str(exc)}
    return outcome
def _resolve_protein_id(uniprot_id, taxcode=9606):
"""Resolve a UniProt accession to a ProteomicsDB internal PROTEIN_ID.
Returns the first SwissProt (canonical) match if available, else the first match.
"""
url = (
"%s/Protein?$filter=UNIQUE_IDENTIFIER eq '%s' and TAXCODE eq %d"
"&$format=json&$select=PROTEIN_ID,UNIQUE_IDENTIFIER,ENTRY_NAME,"
"PROTEIN_NAME,GENE_NAME,TAXCODE,MASS"
) % (API_V2_URL, uniprot_id, taxcode)
result = _odata_request(url)
if not result["ok"]:
return None, result["error"]
results = result["data"].get("d", {}).get("results", [])
if not results:
return None, "Protein '%s' not found in ProteomicsDB" % uniprot_id
# Prefer the canonical (non-isoform) SwissProt entry
for entry in results:
uid = entry.get("UNIQUE_IDENTIFIER", "")
if uid == uniprot_id and "-" not in uid:
return entry["PROTEIN_ID"], None
# Fallback to first result
return results[0]["PROTEIN_ID"], None
# [docs]  (Sphinx viewcode link marker left over from HTML extraction; not code)
@register_tool("ProteomicsDBTool")
class ProteomicsDBTool(BaseTool):
    """
    Tool for querying ProteomicsDB, a mass spectrometry-based protein
    expression database covering human tissues, cell lines, and body fluids.

    Complements HPA (antibody-based) with MS-based quantitative proteomics
    from the Human Proteome Project. Provides iBAQ and TOP3 quantification
    across 340+ biological sources.

    Every operation returns a dict with ``"status"`` set to ``"success"``
    (payload under ``"data"``) or ``"error"`` (message under ``"error"``).
    """

    # Value sent to the API to select every biological source category.
    _ALL_SOURCES = "tissue;fluid;cell line"

    # Accepted user-facing category spellings -> value sent to the API.
    _CATEGORY_ALIASES = {
        "tissue": "tissue",
        "cell line": "cell line",
        "cell_line": "cell line",
        "fluid": "fluid",
        "all": _ALL_SOURCES,
    }

    # Lower-cased calculation method -> value sent to the API.
    _CALC_METHODS = {"ibaq": "iBAQ", "top3": "top3"}

    def __init__(self, tool_config):
        """Store the tool's parameter schema and required-parameter list."""
        super().__init__(tool_config)
        # "or {}" also covers a config whose "parameter" entry is null.
        self.parameter = tool_config.get("parameter") or {}
        self.required = self.parameter.get("required", [])

    def run(self, arguments):
        """Execute a ProteomicsDB query.

        Args:
            arguments: Dict with an ``"operation"`` key naming one of
                ``get_protein_expression``, ``search_proteins``,
                ``get_expression_summary`` or ``list_tissues``, plus that
                operation's own parameters.

        Returns:
            The handler's result dict, or an error dict when the operation
            is missing, unknown, or raises.
        """
        arguments = arguments or {}
        operation = arguments.get("operation")
        if not operation:
            return {"status": "error", "error": "Missing required parameter: operation"}

        handlers = {
            "get_protein_expression": self._get_protein_expression,
            "search_proteins": self._search_proteins,
            "get_expression_summary": self._get_expression_summary,
            "list_tissues": self._list_tissues,
        }
        handler = handlers.get(operation)
        if not handler:
            return {
                "status": "error",
                "error": "Unknown operation: %s" % operation,
                "available_operations": list(handlers),
            }

        # Top-level boundary: convert any unexpected failure into an error dict.
        try:
            return handler(arguments)
        except Exception as e:
            return {"status": "error", "error": "Operation failed: %s" % str(e)}

    @staticmethod
    def _odata_literal(value):
        """Return *value* as a quoted, URL-safe OData string literal.

        A single quote inside the value is doubled (the OData escape) and the
        result is percent-encoded so characters such as "&" or "#" cannot
        terminate the filter or the query string early.
        """
        from urllib.parse import quote

        return "'%s'" % quote(str(value).replace("'", "''"), safe="")

    @staticmethod
    def _odata_rows(payload):
        """Extract the ``d.results`` list from an OData JSON payload, or ``[]``."""
        body = payload.get("d") if isinstance(payload, dict) else None
        rows = body.get("results") if isinstance(body, dict) else None
        return rows if isinstance(rows, list) else []

    @staticmethod
    def _int_argument(arguments, name, default):
        """Read a non-negative integer argument.

        Returns ``(value, None)`` on success (``default`` when the argument is
        absent or ``None``) and ``(None, message)`` when it is not usable.
        """
        raw = arguments.get(name)
        if raw is None:
            return default, None
        try:
            value = int(raw)
        except (TypeError, ValueError):
            return None, "%s must be a non-negative integer" % name
        if value < 0:
            return None, "%s must be a non-negative integer" % name
        return value, None

    def _get_protein_expression(self, arguments):
        """Get protein expression across tissues/cell lines.

        Args:
            arguments: Dict with ``uniprot_id`` (required), optional
                ``tissue_category`` ("tissue", "cell line"/"cell_line",
                "fluid", "all", or a raw API value passed through unchanged)
                and optional ``calculation_method`` ("iBAQ" or "TOP3",
                case-insensitive; anything else falls back to iBAQ).

        Returns:
            Result dict whose ``data`` holds the protein and gene names, the
            effective method and category, and ``expression_records`` sorted
            by ``expression_value`` in descending order.
        """
        uniprot_id = arguments.get("uniprot_id")
        if not uniprot_id:
            return {"status": "error", "error": "uniprot_id parameter is required"}

        # Missing or None means "all"; known aliases are matched
        # case-insensitively, anything else is sent to the API verbatim.
        requested_category = arguments.get("tissue_category") or "all"
        if isinstance(requested_category, str):
            bio_source = self._CATEGORY_ALIASES.get(
                requested_category.strip().lower(), requested_category
            )
        else:
            bio_source = requested_category

        # Case-insensitive so that e.g. "Top3" is not silently treated as iBAQ.
        requested_method = arguments.get("calculation_method") or "iBAQ"
        calc_unit = self._CALC_METHODS.get(
            str(requested_method).strip().lower(), "iBAQ"
        )

        # Use the heatmap cluster endpoint (the primary expression data API)
        result = _xsjs_request(
            "getExpressionProfileHeatmapCluster.xsjs",
            params={
                "proteins": uniprot_id,
                "omics": "Proteomics",
                "biologicalSource": bio_source,
                "quantification": "MS1",
                "calculationMethod": calc_unit,
                "swissprotOnly": 1,
                "noIsoforms": 1,
                "source": "db",
                "datasetIds": "",
                "impute": 0,
                "taxcode": 9606,
            },
            timeout=60,
        )
        if not result["ok"]:
            return {"status": "error", "error": result["error"]}

        data = result["data"]
        if not isinstance(data, dict):
            return {
                "status": "error",
                "error": "Unexpected response format from ProteomicsDB",
            }
        # "or []" also covers fields that are present but null.
        mapdata = data.get("mapdata") or []
        tissuedata = data.get("tissuedata") or []
        proteindata = data.get("proteindata") or []

        # Protein info comes from the first proteindata row: name at index 1,
        # gene at index 2.
        protein_name = ""
        gene_name = ""
        if proteindata:
            first = proteindata[0]
            if isinstance(first, (list, tuple)) and len(first) >= 4:
                protein_name = first[1]
                gene_name = first[2]

        # Build tissue lookup: tissue_id -> name / group / category.
        # tissuedata row format: [tissue_id, name, sap_synonym, category]
        tissue_lookup = {}
        for t in tissuedata:
            if len(t) >= 4:
                tissue_lookup[t[0]] = {
                    "tissue_name": t[1] or "",
                    "tissue_group": t[2] or "",
                    "tissue_category": t[3] or "",
                }

        # Build expression records.
        # mapdata row format: [protein_id, tissue_id, val1, val2, val3, val4]
        # val1 appears to be the main normalized expression value (log10 iBAQ).
        # NOTE(review): the median/min/max labels for val2..val4 are kept from
        # the original code and are not confirmed by anything visible here.
        extra_fields = ("median_expression", "min_expression", "max_expression")
        records = []
        for row in mapdata:
            if len(row) < 3:
                continue
            tissue_id = row[1]
            tissue_info = tissue_lookup.get(tissue_id, {})
            rec = {
                "tissue_id": tissue_id,
                "tissue_name": tissue_info.get("tissue_name", ""),
                "tissue_group": tissue_info.get("tissue_group", ""),
                "tissue_category": tissue_info.get("tissue_category", ""),
                "expression_value": row[2],
            }
            # Add additional quantification values if present.
            rec.update(zip(extra_fields, row[3:6]))
            records.append(rec)

        # Sort by expression value descending; missing values count as 0.
        records.sort(key=lambda r: r.get("expression_value") or 0, reverse=True)

        return {
            "status": "success",
            "data": {
                "uniprot_id": uniprot_id,
                "protein_name": protein_name,
                "gene_name": gene_name,
                "calculation_method": calc_unit,
                "tissue_category": bio_source,
                "num_tissues": len(records),
                "expression_records": records,
            },
        }

    def _search_proteins(self, arguments):
        """Search for proteins by gene name, UniProt ID, or protein name.

        Args:
            arguments: Dict with ``query`` (required), optional
                ``organism_id`` (default 9606) and optional ``max_results``
                (default 20); both numbers must be non-negative integers.

        Returns:
            Result dict whose ``data.proteins`` lists the matches,
            de-duplicated by UniProt identifier.
        """
        query = arguments.get("query")
        if not query:
            return {"status": "error", "error": "query parameter is required"}

        organism_id, problem = self._int_argument(arguments, "organism_id", 9606)
        if problem:
            return {"status": "error", "error": problem}
        max_results, problem = self._int_argument(arguments, "max_results", 20)
        if problem:
            return {"status": "error", "error": problem}

        # One substring match over gene name, UniProt ID and protein name.
        # The query is escaped and percent-encoded before interpolation.
        needle = self._odata_literal(query)
        url = (
            "%s/Protein?$filter=(substringof(%s,GENE_NAME) or "
            "substringof(%s,UNIQUE_IDENTIFIER) or "
            "substringof(%s,PROTEIN_NAME)) and TAXCODE eq %d"
            "&$format=json&$select=PROTEIN_ID,UNIQUE_IDENTIFIER,ENTRY_NAME,"
            "PROTEIN_NAME,GENE_NAME,TAXCODE,MASS"
            "&$top=%d"
        ) % (API_V2_URL, needle, needle, needle, organism_id, max_results)
        result = _odata_request(url, timeout=30)
        if not result["ok"]:
            return {"status": "error", "error": result["error"]}

        # Deduplicate by UNIQUE_IDENTIFIER (same protein can appear with
        # different PROTEIN_IDs due to database versions).
        seen = set()
        proteins = []
        for r in self._odata_rows(result["data"]):
            uid = r.get("UNIQUE_IDENTIFIER", "")
            if uid in seen:
                continue
            seen.add(uid)

            # MASS is converted to float when possible, else reported as None.
            mass_val = r.get("MASS")
            if mass_val is not None:
                try:
                    mass_val = float(mass_val)
                except (ValueError, TypeError):
                    mass_val = None

            proteins.append(
                {
                    "protein_id": r.get("PROTEIN_ID"),
                    "uniprot_id": uid,
                    "entry_name": r.get("ENTRY_NAME", ""),
                    "protein_name": r.get("PROTEIN_NAME", ""),
                    "gene_name": r.get("GENE_NAME", ""),
                    "organism_id": r.get("TAXCODE"),
                    "mass_da": mass_val,
                }
            )

        return {
            "status": "success",
            "data": {
                "query": query,
                "organism_id": organism_id,
                "num_results": len(proteins),
                "proteins": proteins,
            },
        }

    def _get_expression_summary(self, arguments):
        """Get top tissues by expression level for a protein.

        Args:
            arguments: Dict with ``uniprot_id`` (required) and optional
                ``top_n`` (default 10, must be a non-negative integer).

        Returns:
            Result dict whose ``data`` holds per-category source counts and
            the ``top_n`` highest-expression records (iBAQ, all categories).
        """
        uniprot_id = arguments.get("uniprot_id")
        if not uniprot_id:
            return {"status": "error", "error": "uniprot_id parameter is required"}

        # A negative top_n would make records[:top_n] drop entries from the
        # end instead of taking the head, so it is rejected.
        top_n, problem = self._int_argument(arguments, "top_n", 10)
        if problem:
            return {"status": "error", "error": problem}

        # Get expression across all categories
        expr_result = self._get_protein_expression(
            {
                "uniprot_id": uniprot_id,
                "tissue_category": "all",
                "calculation_method": "iBAQ",
            }
        )
        if expr_result.get("status") != "success":
            return expr_result

        expr_data = expr_result["data"]
        # Already sorted by expression value descending.
        records = expr_data.get("expression_records", [])

        def count_category(label):
            return sum(1 for r in records if r.get("tissue_category") == label)

        return {
            "status": "success",
            "data": {
                "uniprot_id": uniprot_id,
                "protein_name": expr_data.get("protein_name", ""),
                "gene_name": expr_data.get("gene_name", ""),
                "total_sources": len(records),
                "tissue_count": count_category("tissue"),
                "cell_line_count": count_category("cell line"),
                "fluid_count": count_category("fluid"),
                "top_n": top_n,
                "top_expression": records[:top_n],
            },
        }

    def _list_tissues(self, arguments):
        """List all available tissues/cell lines in ProteomicsDB.

        Args:
            arguments: Dict with optional ``tissue_category`` ("tissue",
                "cell line"/"cell_line", "fluid"; "all" or no value lists
                every category; other values are used as the filter as-is).

        Returns:
            Result dict whose ``data`` holds the tissues sorted by category
            then name, plus a per-category count.
        """
        tissue_category = arguments.get("tissue_category")

        # Build the optional OData category filter. "all" means no filter,
        # matching how get_protein_expression treats the same value.
        filter_str = ""
        if tissue_category:
            cat_val = self._CATEGORY_ALIASES.get(
                str(tissue_category).strip().lower(), tissue_category
            )
            if cat_val != self._ALL_SOURCES:
                filter_str = "&$filter=CATEGORY eq %s" % self._odata_literal(cat_val)

        url = (
            "%s/Tissue?$format=json&$select=TISSUE_ID,NAME,CATEGORY,SAP_SYNONYM"
            "&$top=5000%s"
        ) % (API_V2_URL, filter_str)
        result = _odata_request(url, timeout=30)
        if not result["ok"]:
            return {"status": "error", "error": result["error"]}

        tissues = []
        for r in self._odata_rows(result["data"]):
            name = r.get("NAME", "")
            category = r.get("CATEGORY", "")
            # Skip entries with empty name or category
            if not name or not category:
                continue
            tissues.append(
                {
                    "tissue_id": r.get("TISSUE_ID", ""),
                    "name": name,
                    "category": category,
                    "group": r.get("SAP_SYNONYM", ""),
                }
            )

        # Sort by category then name
        tissues.sort(key=lambda t: (t["category"], t["name"]))

        # Count by category
        cat_counts = {}
        for t in tissues:
            cat_counts[t["category"]] = cat_counts.get(t["category"], 0) + 1

        return {
            "status": "success",
            "data": {
                "filter_category": tissue_category,
                "total_sources": len(tissues),
                "category_counts": cat_counts,
                "tissues": tissues,
            },
        }