Source code for tooluniverse.europepmc_annotations_tool

# europepmc_annotations_tool.py
"""
Europe PMC Annotations API tool for ToolUniverse.

Provides access to text-mined annotations from scientific articles using
Europe PMC's SciLite text mining pipeline. Extracts structured entities
including chemicals, organisms, gene ontology terms, diseases, and
gene/protein mentions from published literature.

API: https://www.ebi.ac.uk/europepmc/annotations_api/
No authentication required.
"""

import requests
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool

EUROPEPMC_ANNOTATIONS_URL = "https://www.ebi.ac.uk/europepmc/annotations_api"


[docs] @register_tool("EuroPMCAnnotationsTool") class EuroPMCAnnotationsTool(BaseTool): """ Tool for extracting text-mined annotations from scientific articles via the Europe PMC Annotations API. Supports annotation types: Chemicals, Organisms, Gene Ontology, Diseases, Genes & Proteins, Accession Numbers. No authentication required. """
[docs] def __init__(self, tool_config: Dict[str, Any]): super().__init__(tool_config) self.timeout = tool_config.get("timeout", 30) self.endpoint_type = tool_config.get("fields", {}).get( "endpoint_type", "by_article" )
[docs] def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Execute the Europe PMC Annotations API call.""" try: return self._dispatch(arguments) except requests.exceptions.Timeout: return { "error": f"Europe PMC Annotations API timed out after {self.timeout}s" } except requests.exceptions.ConnectionError: return {"error": "Failed to connect to Europe PMC Annotations API"} except requests.exceptions.HTTPError as e: status = e.response.status_code if e.response else "unknown" return {"error": f"Europe PMC Annotations API HTTP error: {status}"} except Exception as e: return {"error": f"Unexpected error: {str(e)}"}
[docs] def _dispatch(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Route to appropriate endpoint.""" if self.endpoint_type == "by_article": return self._by_article(arguments) elif self.endpoint_type == "batch_by_type": return self._batch_by_type(arguments) elif self.endpoint_type == "chemicals_shortcut": return self._chemicals_shortcut(arguments) return {"error": f"Unknown endpoint_type: {self.endpoint_type}"}
[docs] def _fetch_annotations( self, article_ids: str, annotation_type: str = None, page_size: int = None ): """Fetch annotations from the API.""" url = f"{EUROPEPMC_ANNOTATIONS_URL}/annotationsByArticleIds" params = { "articleIds": article_ids, "format": "JSON", } if annotation_type: params["type"] = annotation_type if page_size: params["pageSize"] = page_size response = requests.get(url, params=params, timeout=self.timeout) response.raise_for_status() return response.json()
[docs] def _by_article(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get annotations from a single article.""" article_id = arguments.get("article_id", "") annotation_type = arguments.get("annotation_type") if not article_id: return {"error": "article_id is required (e.g., 'PMC:PMC4353746')"} raw = self._fetch_annotations(article_id, annotation_type) if not isinstance(raw, list) or len(raw) == 0: return { "data": { "article_id": article_id, "pmcid": None, "source": None, "annotation_count": 0, "annotations": [], }, "metadata": { "source": "Europe PMC Annotations API", "endpoint": "annotationsByArticleIds", }, } article = raw[0] annotations_raw = article.get("annotations", []) annotations = [] for ann in annotations_raw: tags = [] for tag in ann.get("tags", []): tags.append( { "name": tag.get("name", ""), "uri": tag.get("uri", ""), } ) annotations.append( { "exact": ann.get("exact", ""), "prefix": ann.get("prefix"), "postfix": ann.get("postfix"), "type": ann.get("type", ""), "section": ann.get("section"), "provider": ann.get("provider"), "tags": tags, } ) return { "data": { "article_id": article_id, "pmcid": article.get("pmcid"), "source": article.get("source"), "annotation_count": len(annotations), "annotations": annotations[:200], }, "metadata": { "source": "Europe PMC Annotations API", "endpoint": "annotationsByArticleIds", }, }
[docs] def _batch_by_type(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get annotations of a specific type from multiple articles.""" article_ids = arguments.get("article_ids", "") annotation_type = arguments.get("annotation_type", "") page_size = arguments.get("page_size") if not article_ids: return { "error": "article_ids is required (e.g., 'PMC:PMC4353746,PMC:PMC3531190')" } if not annotation_type: return {"error": "annotation_type is required (e.g., 'Chemicals')"} raw = self._fetch_annotations(article_ids, annotation_type, page_size) if not isinstance(raw, list): raw = [] total_annotations = 0 articles = [] for article in raw: annotations_raw = article.get("annotations", []) total_annotations += len(annotations_raw) annotations = [] for ann in annotations_raw: annotations.append( { "exact": ann.get("exact", ""), "type": ann.get("type", ""), "tags": ann.get("tags", []), } ) articles.append( { "article_id": f"{article.get('source', '')}:{article.get('extId', '')}", "pmcid": article.get("pmcid"), "annotation_count": len(annotations), "annotations": annotations[:100], } ) return { "data": { "article_count": len(articles), "annotation_type": annotation_type, "total_annotations": total_annotations, "articles": articles, }, "metadata": { "source": "Europe PMC Annotations API", "endpoint": "annotationsByArticleIds", }, }
[docs] def _chemicals_shortcut(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Extract chemical mentions from an article.""" article_id = arguments.get("article_id", "") if not article_id: return {"error": "article_id is required (e.g., 'PMC:PMC4353746')"} raw = self._fetch_annotations(article_id, "Chemicals") if not isinstance(raw, list) or len(raw) == 0: return { "data": { "article_id": article_id, "chemical_count": 0, "chemicals": [], }, "metadata": { "source": "Europe PMC Annotations API", "endpoint": "annotationsByArticleIds?type=Chemicals", }, } article = raw[0] annotations_raw = article.get("annotations", []) chemicals = [] for ann in annotations_raw: tags = ann.get("tags", []) chebi_uri = None chebi_name = None if tags: chebi_uri = tags[0].get("uri") chebi_name = tags[0].get("name") context = "" prefix = ann.get("prefix", "") or "" postfix = ann.get("postfix", "") or "" exact = ann.get("exact", "") context = f"...{prefix} [{exact}] {postfix}..." chemicals.append( { "name": exact, "chebi_uri": chebi_uri, "chebi_name": chebi_name, "context": context, "section": ann.get("section"), } ) return { "data": { "article_id": article_id, "chemical_count": len(chemicals), "chemicals": chemicals[:200], }, "metadata": { "source": "Europe PMC Annotations API", "endpoint": "annotationsByArticleIds?type=Chemicals", }, }