# Source code for tooluniverse.proteomexchange_tool
# proteomexchange_tool.py
"""
ProteomeXchange REST API tool for ToolUniverse.
ProteomeXchange (PX) is a consortium providing a single point of
submission for proteomics data, coordinating PRIDE, MassIVE,
PeptideAtlas, jPOST, and iProX. It provides standardized metadata
for proteomics datasets using controlled vocabulary (CV) terms.
API: https://proteomecentral.proteomexchange.org/cgi/GetDataset
No authentication required. Free for all use.
"""
import requests
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool
PX_BASE_URL = "https://proteomecentral.proteomexchange.org/cgi"
@register_tool("ProteomeXchangeTool")
class ProteomeXchangeTool(BaseTool):
    """
    Tool for querying ProteomeXchange, the proteomics data consortium.

    Provides access to metadata for proteomics datasets including
    species, instruments, publications, and data files from PRIDE,
    MassIVE, PeptideAtlas, jPOST, and iProX.
    No authentication required.
    """

    def __init__(self, tool_config: Dict[str, Any]):
        """Initialize the tool from its ToolUniverse configuration.

        Args:
            tool_config: Tool configuration dict. Recognized keys:
                ``timeout`` (seconds, default 30) and
                ``fields.endpoint_type`` (``"get_dataset"`` or
                ``"search_datasets"``, default ``"get_dataset"``).
        """
        super().__init__(tool_config)
        # Per-request HTTP timeout, in seconds.
        self.timeout = tool_config.get("timeout", 30)
        # Selects which API operation this instance performs; see _dispatch().
        self.endpoint_type = tool_config.get("fields", {}).get(
            "endpoint_type", "get_dataset"
        )

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the ProteomeXchange API call.

        Args:
            arguments: Endpoint-specific parameters (see _get_dataset
                and _search_datasets for the keys each one accepts).

        Returns:
            On success, a dict with ``"data"`` and ``"metadata"`` keys;
            on failure, a dict with a single ``"error"`` key. This
            method never raises — all exceptions are converted to
            error dicts.
        """
        try:
            return self._dispatch(arguments)
        except requests.exceptions.Timeout:
            # Must precede ConnectionError: ConnectTimeout subclasses both,
            # and we want it reported as a timeout.
            return {
                "error": f"ProteomeXchange API request timed out after {self.timeout} seconds"
            }
        except requests.exceptions.ConnectionError:
            return {
                "error": "Failed to connect to ProteomeXchange API. Check network connectivity."
            }
        except requests.exceptions.HTTPError as e:
            # raise_for_status() attaches the response, so .response is set here.
            return {
                "error": f"ProteomeXchange API HTTP error: {e.response.status_code}"
            }
        except Exception as e:
            return {"error": f"Unexpected error querying ProteomeXchange: {str(e)}"}

    def _dispatch(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Route to the endpoint selected by ``self.endpoint_type``."""
        if self.endpoint_type == "get_dataset":
            return self._get_dataset(arguments)
        elif self.endpoint_type == "search_datasets":
            return self._search_datasets(arguments)
        else:
            return {"error": f"Unknown endpoint_type: {self.endpoint_type}"}

    def _extract_cv_value(self, terms, accession_prefix=None, name_match=None):
        """Extract a value from a list of controlled-vocabulary (CV) terms.

        Args:
            terms: Expected to be a list of dicts carrying "accession",
                "name", and "value" keys; non-dict entries are skipped.
            accession_prefix: If given, match terms whose "accession"
                starts with this prefix.
            name_match: If given, match terms whose "name" contains this
                substring (case-insensitive).

        Returns:
            The "value" of the first matching term, or None when nothing
            matches or ``terms`` is not a list.
        """
        if not isinstance(terms, list):
            return None
        for term in terms:
            if not isinstance(term, dict):
                continue
            if accession_prefix and term.get("accession", "").startswith(
                accession_prefix
            ):
                return term.get("value", "")
            if name_match and name_match.lower() in term.get("name", "").lower():
                return term.get("value", "")
        return None

    def _get_dataset(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get a ProteomeXchange dataset by PX identifier.

        Args:
            arguments: Must contain ``px_id`` (e.g. ``"PXD000001"``).

        Returns:
            Dict with ``"data"`` (title, species, identifiers,
            instruments, publications, file_count) and ``"metadata"``
            keys, or an ``"error"`` dict if ``px_id`` is missing.

        Raises:
            requests.exceptions.RequestException: On network/HTTP
                failure (handled by run()).
        """
        px_id = arguments.get("px_id", "")
        if not px_id:
            return {"error": "px_id parameter is required (e.g., 'PXD000001')"}
        url = f"{PX_BASE_URL}/GetDataset"
        params = {"ID": px_id, "outputMode": "JSON"}
        response = requests.get(url, params=params, timeout=self.timeout)
        response.raise_for_status()
        raw = response.json()
        # Title lives in a CV term list under "title".
        title_terms = raw.get("title", {}).get("terms", [])
        title = self._extract_cv_value(title_terms, name_match="dataset title") or ""
        # Species: one CV term group per organism.
        species_groups = raw.get("species", [])
        species_list = []
        for group in species_groups:
            if isinstance(group, dict):
                terms = group.get("terms", [])
                sp = self._extract_cv_value(terms, name_match="taxonomy")
                # The API sometimes emits the literal string "null";
                # filter it here, consistent with instrument handling.
                if sp and sp != "null":
                    species_list.append(sp)
        # Identifiers: the PX ID plus partner-repository accessions.
        identifiers = []
        for ident in raw.get("identifiers", []):
            if isinstance(ident, dict):
                val = ident.get("value", "")
                name = ident.get("name", "")
                if val:
                    identifiers.append({"name": name, "value": val})
        # Instruments: one CV term group per instrument model.
        instruments = []
        for inst_group in raw.get("instruments", []):
            if isinstance(inst_group, dict):
                terms = inst_group.get("terms", [])
                inst = self._extract_cv_value(terms, name_match="instrument model")
                if inst and inst != "null":
                    instruments.append(inst)
        # Publications: PubMed IDs and DOIs, one entry per publication
        # (either field may be None when the CV term is absent).
        publications = []
        for pub in raw.get("publications", []):
            if isinstance(pub, dict):
                terms = pub.get("terms", [])
                pmid = self._extract_cv_value(terms, name_match="PubMed identifier")
                doi = self._extract_cv_value(
                    terms, name_match="Dataset with its publication"
                )
                publications.append(
                    {
                        "pubmed_id": pmid,
                        "doi": doi,
                    }
                )
        # We report only the file count, not the (potentially large) file list.
        data_files = raw.get("datasetFiles", [])
        result = {
            "px_id": px_id,
            "title": title,
            "species": species_list,
            "identifiers": identifiers,
            "instruments": instruments,
            "publications": publications,
            "file_count": len(data_files),
        }
        return {
            "data": result,
            "metadata": {
                "source": "ProteomeXchange",
                "query": px_id,
                "endpoint": "get_dataset",
            },
        }

    def _search_datasets(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search ProteomeXchange datasets via the MassIVE PROXI API.

        Args:
            arguments: Optional ``query`` (free-text filter) and
                ``limit`` (max results, clamped to 0..50, default 10).

        Returns:
            Dict with ``"data"`` (list of {accession, title, species})
            and ``"metadata"`` keys, or an ``"error"`` dict when the
            API returns an unexpected payload.

        Raises:
            requests.exceptions.RequestException: On network/HTTP
                failure (handled by run()).
        """
        query = arguments.get("query", "")
        # Clamp to [0, 50]: a negative value would otherwise slice from
        # the end of the result list (raw[:-k]) and silently misbehave.
        limit = max(0, min(arguments.get("limit", 10), 50))
        # ProteomeCentral has no search endpoint; use MassIVE's PROXI API.
        url = "https://massive.ucsd.edu/ProteoSAFe/proxi/v0.1/datasets"
        params = {"resultType": "compact"}
        if query:
            params["filter"] = query
        response = requests.get(url, params=params, timeout=self.timeout)
        response.raise_for_status()
        raw = response.json()
        # PROXI returns a JSON array on success; anything else (e.g. an
        # error object) would raise a confusing TypeError when sliced.
        if not isinstance(raw, list):
            return {"error": "Unexpected response format from ProteomeXchange/MassIVE search"}
        datasets = []
        for ds in raw[:limit]:
            if not isinstance(ds, dict):
                continue
            # Accession is a list of CV terms; take the first dict entry.
            accession_list = ds.get("accession", [])
            accession = ""
            if isinstance(accession_list, list):
                for acc in accession_list:
                    if isinstance(acc, dict):
                        accession = acc.get("value", "")
                        break
            title = ds.get("title", "")
            # Species: nested lists of CV terms; keep "common name" values.
            species_groups = ds.get("species", [])
            species_names = []
            for group in species_groups:
                if isinstance(group, list):
                    for term in group:
                        if isinstance(term, dict) and "common name" in term.get(
                            "name", ""
                        ):
                            val = term.get("value", "")
                            if val and val != "null":
                                species_names.append(val)
            datasets.append(
                {
                    "accession": accession,
                    "title": title,
                    "species": species_names,
                }
            )
        return {
            "data": datasets,
            "metadata": {
                "source": "ProteomeXchange/MassIVE",
                "total_returned": len(datasets),
                "query": query or "(all)",
                "endpoint": "search_datasets",
            },
        }