Source code for tooluniverse.europe_pmc_tool
import requests
from .base_tool import BaseTool
from .tool_registry import register_tool
from .http_utils import request_with_retry
import xml.etree.ElementTree as ET
import re
def _normalize_pmcid(pmcid: str | None) -> tuple[str | None, str | None]:
"""
Return (pmcid_norm, pmcid_digits).
pmcid_norm: "PMC123" form (or None)
pmcid_digits: "123" digits only (or None)
"""
if not isinstance(pmcid, str):
return None, None
s = pmcid.strip()
if not s:
return None, None
upper = s.upper()
pmcid_norm = upper if upper.startswith("PMC") else f"PMC{upper}"
digits = pmcid_norm[3:]
if not digits.isdigit():
return pmcid_norm, None
return pmcid_norm, digits
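# Illustrative behavior (IDs are hypothetical), a quick sketch of the contract:
#   _normalize_pmcid("PMC123")   -> ("PMC123", "123")
#   _normalize_pmcid("123")      -> ("PMC123", "123")
#   _normalize_pmcid(" pmc123 ") -> ("PMC123", "123")
#   _normalize_pmcid("PMCabc")   -> ("PMCABC", None)  # non-numeric suffix
#   _normalize_pmcid(None)       -> (None, None)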
def _build_ncbi_pmc_oai_url(pmcid_digits: str | None) -> str | None:
if not isinstance(pmcid_digits, str) or not pmcid_digits.isdigit():
return None
return (
"https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi"
f"?verb=GetRecord&metadataPrefix=pmc&identifier=oai:pubmedcentral.nih.gov:{pmcid_digits}"
)
def _build_ncbi_pmc_efetch_url(pmcid_digits: str | None) -> str | None:
if not isinstance(pmcid_digits, str) or not pmcid_digits.isdigit():
return None
return (
"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
f"?db=pmc&id={pmcid_digits}&retmode=xml"
)
def _build_ncbi_pmc_html_url(pmcid_norm: str | None) -> str | None:
if not isinstance(pmcid_norm, str) or not pmcid_norm.strip():
return None
s = pmcid_norm.strip()
s = s if s.upper().startswith("PMC") else f"PMC{s}"
return f"https://pmc.ncbi.nlm.nih.gov/articles/{s}/"
def _extract_text_from_html(html_text: str) -> str:
"""
Best-effort HTML -> text extraction.
For PMC pages, try to restrict to the main content region to avoid nav/JS/CSS noise.
"""
html_text = html_text or ""
# Prefer main article content on PMC pages.
    m = re.search(
        r'(?is)<main[^>]*id=["\']maincontent["\'][^>]*>(.*?)</main>', html_text
    )
    if not m:
        m = re.search(
            r'(?is)<div[^>]*id=["\']maincontent["\'][^>]*>(.*?)</div>', html_text
        )
if m:
html_text = m.group(1)
# Parse using stdlib HTMLParser (more robust than regex-only stripping).
try:
from html.parser import HTMLParser
class _TextExtractor(HTMLParser):
def __init__(self):
super().__init__(convert_charrefs=True)
self._parts: list[str] = []
self._skip_depth = 0
def handle_starttag(self, tag, attrs):
t = (tag or "").lower()
if t in {"script", "style", "noscript"}:
self._skip_depth += 1
return
if self._skip_depth:
return
if t in {
"p",
"br",
"div",
"section",
"li",
"tr",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
}:
self._parts.append(" ")
def handle_endtag(self, tag):
t = (tag or "").lower()
if t in {"script", "style", "noscript"} and self._skip_depth:
self._skip_depth -= 1
return
if self._skip_depth:
return
if t in {"p", "br", "div", "section", "li", "tr"}:
self._parts.append(" ")
def handle_data(self, data):
if self._skip_depth:
return
if data:
self._parts.append(data)
parser = _TextExtractor()
parser.feed(html_text)
text = "".join(parser._parts)
except Exception:
# Fallback: regex stripping (kept for safety).
html_text = re.sub(r"(?is)<(script|style|noscript).*?>.*?</\1>", " ", html_text)
text = re.sub(r"(?s)<[^>]+>", " ", html_text)
return " ".join((text or "").split())
def _extract_abstract_from_pmc_html(html_text: str) -> str | None:
# Try meta tags first (most robust for machines).
candidates = [
        r'(?is)<meta\s+name=["\']citation_abstract["\']\s+content=["\'](.*?)["\']',
        r'(?is)<meta\s+name=["\']DC\.Description["\']\s+content=["\'](.*?)["\']',
        r'(?is)<meta\s+name=["\']dc\.description["\']\s+content=["\'](.*?)["\']',
]
for pat in candidates:
m = re.search(pat, html_text or "")
if m:
abstract = m.group(1)
abstract = _extract_text_from_html(abstract)
if abstract:
return abstract
return None
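# Minimal sketch (the meta tag content is invented):
#   _extract_abstract_from_pmc_html(
#       '<meta name="citation_abstract" content="Short abstract.">'
#   )
#   -> "Short abstract."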
def _detect_ncbi_oai_error(xml_text: str) -> dict | None:
"""
NCBI PMC OAI-PMH often returns HTTP 200 even for logical errors, e.g.:
<error code="cannotDisseminateFormat">...</error>
Returns a small structured dict when an error is detected, else None.
"""
if not isinstance(xml_text, str) or not xml_text.strip():
return None
try:
root = ET.fromstring(xml_text)
except ET.ParseError:
return None
# Only treat OAI-PMH top-level <error> as a logical error.
if not (root.tag or "").lower().endswith("oai-pmh"):
return None
for el in list(root):
if (el.tag or "").endswith("error"):
code = el.attrib.get("code")
msg = " ".join((el.text or "").split()) or None
return {"code": code, "message": msg}
return None
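# Minimal sketch of the payload this detects (the namespace is the real
# OAI-PMH one; the message text is invented):
#   xml = ('<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">'
#          '<error code="cannotDisseminateFormat">pmc not supported</error>'
#          '</OAI-PMH>')
#   _detect_ncbi_oai_error(xml)
#   -> {"code": "cannotDisseminateFormat", "message": "pmc not supported"}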
def _fetch_fulltext_with_trace(
session: requests.Session,
*,
europe_fulltext_xml_url: str | None,
pmcid: str | None,
timeout: int = 20,
) -> dict:
"""
Fetch full text content with a trace of attempts.
Returns:
{
ok: bool,
url: str|None,
source: str|None,
format: "xml"|"html"|None,
content_type: str|None,
status_code: int|None,
content: str|None,
trace: list[dict],
}
"""
pmcid_norm, pmcid_digits = _normalize_pmcid(pmcid)
trace: list[dict] = []
def _record(
attempt: str, url: str | None, resp, *, note: str | None = None
) -> None:
headers = getattr(resp, "headers", {}) or {}
entry = {
"attempt": attempt,
"url": url,
"status_code": getattr(resp, "status_code", None),
"content_type": headers.get("content-type"),
"note": note,
}
trace.append(entry)
# 1) Europe PMC fullTextXML
if isinstance(europe_fulltext_xml_url, str) and europe_fulltext_xml_url.strip():
url = europe_fulltext_xml_url.strip()
resp = request_with_retry(session, "GET", url, timeout=timeout, max_attempts=2)
_record("europe_pmc_fulltextxml", getattr(resp, "url", url), resp)
if resp.status_code == 200 and (resp.text or "").strip():
headers = getattr(resp, "headers", {}) or {}
return {
"ok": True,
"url": getattr(resp, "url", url),
"source": "Europe PMC fullTextXML",
"format": "xml",
"content_type": headers.get("content-type"),
"status_code": resp.status_code,
"content": resp.text,
"trace": trace,
}
# 2) NCBI PMC OAI-PMH (JATS XML)
oai_url = _build_ncbi_pmc_oai_url(pmcid_digits)
if oai_url:
resp = request_with_retry(
session, "GET", oai_url, timeout=timeout, max_attempts=2
)
oai_err = None
if resp.status_code == 200 and (resp.text or "").strip():
oai_err = _detect_ncbi_oai_error(resp.text)
note = None
if oai_err:
code = oai_err.get("code") or "unknown"
msg = oai_err.get("message")
note = f"oai_error:{code}" + (f":{msg}" if msg else "")
_record("ncbi_pmc_oai", getattr(resp, "url", oai_url), resp, note=note)
if resp.status_code == 200 and (resp.text or "").strip() and not oai_err:
headers = getattr(resp, "headers", {}) or {}
return {
"ok": True,
"url": getattr(resp, "url", oai_url),
"source": "NCBI PMC OAI (JATS)",
"format": "xml",
"content_type": headers.get("content-type"),
"status_code": resp.status_code,
"content": resp.text,
"trace": trace,
}
# 3) NCBI PMC efetch (XML) - may return a restricted stub.
efetch_url = _build_ncbi_pmc_efetch_url(pmcid_digits)
if efetch_url:
resp = request_with_retry(
session, "GET", efetch_url, timeout=timeout, max_attempts=2
)
note = None
if resp.status_code == 200 and (resp.text or "").strip():
# Some publishers return a stub like: "does not allow download".
lowered = (resp.text or "").lower()
if "does not allow download" not in lowered:
_record("ncbi_pmc_efetch", getattr(resp, "url", efetch_url), resp)
headers = getattr(resp, "headers", {}) or {}
return {
"ok": True,
"url": getattr(resp, "url", efetch_url),
"source": "NCBI PMC efetch (XML)",
"format": "xml",
"content_type": headers.get("content-type"),
"status_code": resp.status_code,
"content": resp.text,
"trace": trace,
}
note = "restricted_stub"
_record("ncbi_pmc_efetch", getattr(resp, "url", efetch_url), resp, note=note)
# 4) NCBI PMC HTML (last resort)
html_url = _build_ncbi_pmc_html_url(pmcid_norm)
if html_url:
resp = request_with_retry(
session, "GET", html_url, timeout=timeout, max_attempts=2
)
note = "forbidden" if resp.status_code == 403 else None
_record("ncbi_pmc_html", getattr(resp, "url", html_url), resp, note=note)
if resp.status_code == 200 and (resp.text or "").strip():
headers = getattr(resp, "headers", {}) or {}
return {
"ok": True,
"url": getattr(resp, "url", html_url),
"source": "NCBI PMC HTML",
"format": "html",
"content_type": headers.get("content-type"),
"status_code": resp.status_code,
"content": resp.text,
"trace": trace,
}
last = trace[-1] if trace else {}
return {
"ok": False,
"url": last.get("url"),
"source": None,
"format": None,
"content_type": last.get("content_type"),
"status_code": last.get("status_code"),
"content": None,
"trace": trace,
}
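# Usage sketch (requires network access; the PMCID is an arbitrary example):
#   session = requests.Session()
#   result = _fetch_fulltext_with_trace(
#       session, europe_fulltext_xml_url=None, pmcid="PMC3257301"
#   )
#   if result["ok"]:
#       print(result["source"], result["format"], result["status_code"])
#   for step in result["trace"]:
#       print(step["attempt"], step["status_code"], step["note"])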
@register_tool("EuropePMCTool")
class EuropePMCTool(BaseTool):
"""
    Tool to search Europe PMC for articles, including their abstracts.
"""
def __init__(
self,
tool_config,
base_url="https://www.ebi.ac.uk/europepmc/webservices/rest/search",
):
super().__init__(tool_config)
self.base_url = base_url
self.session = requests.Session()
self.session.headers.update(
{
"Accept": "application/json",
# Some upstreams (notably NCBI/PMC) return 403 for the default
# python-requests User-Agent. Set a conservative browser UA.
"User-Agent": (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/120.0.0.0 Safari/537.36"
),
}
)
def run(self, arguments):
query = arguments.get("query")
limit = arguments.get("limit", 5)
enrich_missing_abstract = bool(arguments.get("enrich_missing_abstract", False))
extract_terms_from_fulltext = arguments.get("extract_terms_from_fulltext")
require_has_ft = bool(arguments.get("require_has_ft", False))
fulltext_terms = arguments.get("fulltext_terms")
if not query:
return [
{
"title": "Error",
"abstract": None,
"authors": [],
"journal": None,
"year": None,
"doi": None,
"doi_url": None,
"url": None,
"citations": 0,
"open_access": False,
"keywords": [],
"source": "Europe PMC",
"error": "`query` parameter is required.",
"retryable": False,
}
]
if isinstance(fulltext_terms, list) and [
t for t in fulltext_terms if isinstance(t, str) and t.strip()
]:
require_has_ft = True
terms = [
t.strip() for t in fulltext_terms if isinstance(t, str) and t.strip()
]
def _escape_phrase(s: str) -> str:
# Europe PMC uses a Lucene-like syntax; keep this conservative.
return s.replace('"', '\\"')
clause = " OR ".join([f'BODY:"{_escape_phrase(t)}"' for t in terms])
query = f"({query}) AND ({clause})"
if require_has_ft:
query = f"({query}) AND HAS_FT:Y"
return self._search(
query,
limit,
enrich_missing_abstract=enrich_missing_abstract,
extract_terms_from_fulltext=extract_terms_from_fulltext,
)
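    # Usage sketch (arguments are illustrative; BaseTool is assumed to accept
    # a plain dict config):
    #   tool = EuropePMCTool(tool_config={})
    #   articles = tool.run({
    #       "query": "CRISPR base editing",
    #       "limit": 3,
    #       "enrich_missing_abstract": True,
    #       "fulltext_terms": ["prime editing"],  # also forces HAS_FT:Y
    #   })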
def _extract_abstract_from_fulltext_xml(self, xml_text: str) -> str | None:
try:
root = ET.fromstring(xml_text)
except ET.ParseError:
return None
for el in root.iter():
if self._local_name(el.tag).lower() == "abstract":
text = " ".join("".join(el.itertext()).split())
if text:
return text
return None
def _build_fulltext_xml_url(
self, *, source_db: str | None, article_id: str | None, pmcid: str | None
) -> str | None:
if pmcid:
return (
f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML"
)
if source_db and article_id:
return f"https://www.ebi.ac.uk/europepmc/webservices/rest/{source_db}/{article_id}/fullTextXML"
return None
def _build_pmc_oai_url(self, pmcid: str | None) -> str | None:
"""
Build an NCBI PMC OAI-PMH URL to retrieve JATS XML for a PMC article.
Europe PMC fullTextXML is not always available even when an article is in PMC.
The OAI endpoint provides a robust fallback for extracting full text/abstract.
"""
if not isinstance(pmcid, str):
return None
s = pmcid.strip()
if not s:
return None
s = s.upper()
if s.startswith("PMC"):
s = s[3:]
if not s.isdigit():
return None
return (
"https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi"
f"?verb=GetRecord&metadataPrefix=pmc&identifier=oai:pubmedcentral.nih.gov:{s}"
)
def _fetch_fulltext_with_trace(
self, *, fulltext_url: str | None, pmcid: str | None, timeout: int = 20
) -> dict:
return _fetch_fulltext_with_trace(
self.session,
europe_fulltext_xml_url=fulltext_url,
pmcid=pmcid,
timeout=timeout,
)
def _search(
self,
query,
limit,
*,
enrich_missing_abstract: bool = False,
extract_terms_from_fulltext: list | None = None,
):
# First try core mode to get abstracts
core_params = {
"query": query,
"resultType": "core",
"pageSize": limit,
"format": "json",
}
core_response = request_with_retry(
self.session,
"GET",
self.base_url,
params=core_params,
timeout=20,
max_attempts=3,
)
# Then try lite mode to get journal information
lite_params = {
"query": query,
"resultType": "lite",
"pageSize": limit,
"format": "json",
}
lite_response = request_with_retry(
self.session,
"GET",
self.base_url,
params=lite_params,
timeout=20,
max_attempts=3,
)
if core_response.status_code != 200:
return [
{
"title": "Error",
"abstract": None,
"authors": [],
"journal": None,
"year": None,
"doi": None,
"url": None,
"citations": 0,
"open_access": False,
"keywords": [],
"source": "Europe PMC",
"error": f"Europe PMC API error {core_response.status_code}",
"reason": core_response.reason,
"retryable": core_response.status_code
in (408, 429, 500, 502, 503, 504),
}
]
# Get core mode results
try:
core_payload = core_response.json()
except ValueError:
return [
{
"title": "Error",
"abstract": None,
"authors": [],
"journal": None,
"year": None,
"doi": None,
"url": None,
"citations": 0,
"open_access": False,
"keywords": [],
"source": "Europe PMC",
"error": "Europe PMC returned invalid JSON",
"retryable": True,
}
]
core_results = core_payload.get("resultList", {}).get("result", [])
lite_results = []
# If lite mode also succeeds, get journal information
if lite_response.status_code == 200:
try:
lite_payload = lite_response.json()
except ValueError:
lite_payload = {}
lite_results = lite_payload.get("resultList", {}).get("result", [])
# Create ID to record mapping
lite_map = {rec.get("id"): rec for rec in lite_results}
articles = []
for rec in core_results:
# Extract basic information
title = rec.get("title")
abstract = rec.get("abstractText") or None
year = rec.get("pubYear")
# Extract author information
authors = []
author_list = rec.get("authorList", {}).get("author", [])
if isinstance(author_list, list):
for author in author_list:
if isinstance(author, dict):
full_name = author.get("fullName", "")
if full_name:
authors.append(full_name)
elif isinstance(author_list, dict):
full_name = author_list.get("fullName", "")
if full_name:
authors.append(full_name)
# Get journal information from lite mode
journal = None
if rec.get("id") in lite_map:
lite_rec = lite_map[rec["id"]]
journal = lite_rec.get("journalTitle")
# If still no journal information, use source field
if not journal:
journal = rec.get("source")
# Extract DOI
doi = rec.get("doi") or None
doi_url = f"https://doi.org/{doi}" if doi else None
source_db = rec.get("source") or None
article_id = rec.get("id") or None
pmid = rec.get("pmid") or None
pmcid = rec.get("pmcid") or None
if isinstance(pmcid, str):
pmcid = pmcid.strip() or None
# Extract citation count
citations = rec.get("citedByCount", 0)
if citations:
try:
citations = int(citations)
except (ValueError, TypeError):
citations = 0
# Extract open access status
open_access_raw = rec.get("isOpenAccess", False)
# Normalize to boolean (API can return 'Y'/'N' or True/False)
if isinstance(open_access_raw, str):
open_access = open_access_raw.upper() == "Y"
else:
open_access = bool(open_access_raw)
            # Extract keywords. Note: `hasTextMinedTerms` is a Y/N flag in
            # Europe PMC results, not a keyword container, so read from
            # `keywordList` instead.
            keywords = []
            keyword_list = rec.get("keywordList") or {}
            if isinstance(keyword_list, dict):
                kws = keyword_list.get("keyword", [])
                if isinstance(kws, list):
                    keywords.extend([k for k in kws if isinstance(k, str) and k])
                elif isinstance(kws, str) and kws:
                    keywords.append(kws)
# Build URL
url = (
f"https://europepmc.org/article/{source_db}/{article_id}"
if source_db and article_id
else None
)
fulltext_xml_url = self._build_fulltext_xml_url(
source_db=source_db, article_id=article_id, pmcid=pmcid
)
articles.append(
{
"title": title or "Title not available",
"abstract": abstract,
"authors": authors,
"journal": journal or None,
"year": year,
"doi": doi,
"url": url,
"source_db": source_db,
"article_id": article_id,
"pmid": pmid,
"pmcid": pmcid,
"doi_url": doi_url,
"fulltext_xml_url": fulltext_xml_url,
"citations": citations,
"open_access": open_access,
"keywords": keywords,
"source": "Europe PMC",
"data_quality": {
"has_abstract": bool(abstract),
"has_authors": bool(authors),
"has_journal": bool(journal),
"has_year": bool(year),
"has_doi": bool(doi),
"has_citations": bool(citations and citations > 0),
"has_keywords": bool(keywords),
"has_url": bool(url),
},
}
)
if enrich_missing_abstract:
# Keep enrichment bounded to avoid slow calls.
max_enrich = min(int(limit) if limit else 5, 3)
enriched = 0
for a in articles:
if enriched >= max_enrich:
break
if not isinstance(a, dict):
continue
if a.get("abstract"):
continue
fulltext_url = a.get("fulltext_xml_url")
if not isinstance(fulltext_url, str) or not fulltext_url:
continue
fetch = self._fetch_fulltext_with_trace(
fulltext_url=fulltext_url, pmcid=a.get("pmcid"), timeout=20
)
content = fetch.get("content")
if not isinstance(content, str) or not content.strip():
continue
abstract_from_fulltext = None
if fetch.get("format") == "xml":
abstract_from_fulltext = self._extract_abstract_from_fulltext_xml(
content
)
elif fetch.get("format") == "html":
abstract_from_fulltext = _extract_abstract_from_pmc_html(content)
if abstract_from_fulltext:
a["abstract"] = abstract_from_fulltext
a["abstract_source"] = fetch.get("source") or "fulltext"
a["abstract_retrieval_trace"] = fetch.get("trace") or []
if isinstance(a.get("data_quality"), dict):
a["data_quality"]["has_abstract"] = True
enriched += 1
# Extract fulltext snippets if requested
if extract_terms_from_fulltext and isinstance(
extract_terms_from_fulltext, list
):
# Filter valid terms -- accept any number, process in batches of 5
all_valid_terms = [
t.strip()
for t in extract_terms_from_fulltext
if isinstance(t, str) and t.strip()
]
if all_valid_terms:
# Chunk terms into batches of 5 to stay within safe limits
batch_size = 5
term_batches = [
all_valid_terms[i : i + batch_size]
for i in range(0, len(all_valid_terms), batch_size)
]
# Process up to 3 OA articles to avoid latency
max_snippet_articles = 3
processed = 0
for a in articles:
if processed >= max_snippet_articles:
break
if not isinstance(a, dict):
continue
# Only process open access articles with fulltext URLs
if not a.get("open_access"):
continue
fulltext_url = a.get("fulltext_xml_url")
if not isinstance(fulltext_url, str) or not fulltext_url:
continue
                # Extract snippets (same logic as EuropePMCFullTextSnippetsTool)
try:
fetch = self._fetch_fulltext_with_trace(
fulltext_url=fulltext_url, pmcid=a.get("pmcid"), timeout=30
)
content = fetch.get("content")
if not isinstance(content, str) or not content.strip():
continue
if fetch.get("format") == "xml":
try:
root = ET.fromstring(content or "")
text = " ".join("".join(root.itertext()).split())
except ET.ParseError:
continue
else:
text = _extract_text_from_html(content)
# Extract snippets around terms, processing all batches
snippets = []
total_chars = 0
max_total_chars = 8000
window_chars = 220
max_snippets_per_term = 3
low = text.lower()
for batch in term_batches:
for term in batch:
needle = term.lower()
found = 0
for m in re.finditer(re.escape(needle), low):
if found >= max_snippets_per_term:
break
start = max(0, m.start() - window_chars)
end = min(len(text), m.end() + window_chars)
snippet = text[start:end].strip()
if total_chars + len(snippet) > max_total_chars:
break
snippets.append({"term": term, "snippet": snippet})
total_chars += len(snippet)
found += 1
# Stop processing more batches if char budget exhausted
if total_chars >= max_total_chars:
break
if snippets:
a["fulltext_snippets"] = snippets
a["fulltext_snippets_count"] = len(snippets)
a["fulltext_snippets_source"] = fetch.get("source")
a["fulltext_snippets_retrieval_trace"] = (
fetch.get("trace") or []
)
processed += 1
except Exception:
# Silently skip articles that fail snippet extraction
continue
return articles
@register_tool("EuropePMCFullTextSnippetsTool")
class EuropePMCFullTextSnippetsTool(BaseTool):
"""
Fetch Europe PMC fullTextXML (open access) and return bounded text snippets
around user-provided terms. This helps answer questions where the crucial
detail is present in the full text (e.g., methods/section titles) but not
necessarily in the abstract.
"""
def __init__(self, tool_config):
super().__init__(tool_config)
self.session = requests.Session()
self.session.headers.update(
{
"Accept": "application/xml, text/xml;q=0.9, */*;q=0.8",
# NCBI/PMC frequently blocks the default python-requests UA.
"User-Agent": (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/120.0.0.0 Safari/537.36"
),
}
)
def _build_pmc_html_url(self, pmcid: str | None) -> str | None:
if not isinstance(pmcid, str):
return None
s = pmcid.strip()
if not s:
return None
s = s if s.upper().startswith("PMC") else f"PMC{s}"
return f"https://pmc.ncbi.nlm.nih.gov/articles/{s}/"
def _build_pmc_oai_url(self, pmcid: str | None) -> str | None:
if not isinstance(pmcid, str):
return None
s = pmcid.strip()
if not s:
return None
s = s.upper()
if s.startswith("PMC"):
s = s[3:]
if not s.isdigit():
return None
return (
"https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi"
f"?verb=GetRecord&metadataPrefix=pmc&identifier=oai:pubmedcentral.nih.gov:{s}"
)
def _build_fulltext_xml_url(self, arguments: dict) -> str | None:
fulltext_xml_url = arguments.get("fulltext_xml_url")
if isinstance(fulltext_xml_url, str) and fulltext_xml_url.strip():
return fulltext_xml_url.strip()
pmcid = arguments.get("pmcid")
if isinstance(pmcid, str) and pmcid.strip():
pmcid = pmcid.strip()
pmcid = pmcid if pmcid.upper().startswith("PMC") else f"PMC{pmcid}"
return (
f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML"
)
source_db = arguments.get("source_db") or arguments.get("source")
article_id = arguments.get("article_id")
if (
isinstance(source_db, str)
and source_db.strip()
and isinstance(article_id, str)
and article_id.strip()
):
return f"https://www.ebi.ac.uk/europepmc/webservices/rest/{source_db.strip()}/{article_id.strip()}/fullTextXML"
return None
def _extract_text(self, xml_text: str) -> str:
root = ET.fromstring(xml_text)
# Collapse whitespace to make snippets readable and stable.
return " ".join("".join(root.itertext()).split())
def _extract_text_from_html(self, html_text: str) -> str:
return _extract_text_from_html(html_text or "")
def run(self, arguments):
fulltext_url = self._build_fulltext_xml_url(arguments)
terms = arguments.get("terms")
if terms is None:
terms = arguments.get("keywords")
if not fulltext_url:
return {
"status": "error",
"error": "Provide `fulltext_xml_url`, or `pmcid`, or (`source_db` + `article_id`).",
"retryable": False,
}
if not isinstance(terms, list) or not [
t for t in terms if isinstance(t, str) and t.strip()
]:
return {
"status": "error",
"error": "Provide a non-empty list of search terms via `terms` (preferred) or `keywords` (alias).",
"retryable": False,
}
try:
window_chars = int(arguments.get("window_chars", 220))
except (TypeError, ValueError):
window_chars = 220
window_chars = max(20, min(window_chars, 2000))
try:
max_snippets_per_term = int(arguments.get("max_snippets_per_term", 3))
except (TypeError, ValueError):
max_snippets_per_term = 3
max_snippets_per_term = max(1, min(max_snippets_per_term, 10))
try:
max_total_chars = int(arguments.get("max_total_chars", 8000))
except (TypeError, ValueError):
max_total_chars = 8000
max_total_chars = max(1000, min(max_total_chars, 50000))
fetch = _fetch_fulltext_with_trace(
self.session,
europe_fulltext_xml_url=fulltext_url,
pmcid=arguments.get("pmcid"),
timeout=30,
)
content = fetch.get("content")
if not isinstance(content, str) or not content.strip():
return {
"status": "error",
"error": "Full text fetch failed",
"url": fetch.get("url") or fulltext_url,
"status_code": fetch.get("status_code"),
"retryable": fetch.get("status_code") in (408, 429, 500, 502, 503, 504),
"retrieval_trace": fetch.get("trace") or [],
}
try:
if fetch.get("format") == "xml":
text = self._extract_text(content)
else:
text = _extract_text_from_html(content)
except ET.ParseError:
return {
"status": "error",
"error": "Full text returned invalid XML",
"url": fetch.get("url") or fulltext_url,
"retryable": True,
"retrieval_trace": fetch.get("trace") or [],
}
snippets = []
total_chars = 0
low = text.lower()
for raw_term in terms:
if not isinstance(raw_term, str):
continue
term = raw_term.strip()
if not term:
continue
needle = term.lower()
found = 0
for m in re.finditer(re.escape(needle), low):
if found >= max_snippets_per_term:
break
start = max(0, m.start() - window_chars)
end = min(len(text), m.end() + window_chars)
snippet = text[start:end].strip()
# Bound total output size
if total_chars + len(snippet) > max_total_chars:
break
snippets.append({"term": term, "snippet": snippet})
total_chars += len(snippet)
found += 1
return {
"status": "success",
"url": fetch.get("url"),
"source": fetch.get("source"),
"format": fetch.get("format"),
"content_type": fetch.get("content_type"),
"retrieval_trace": fetch.get("trace") or [],
"snippets": snippets,
"snippets_count": len(snippets),
"truncated": total_chars >= max_total_chars,
}
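    # Usage sketch (PMCID and terms are arbitrary examples):
    #   tool = EuropePMCFullTextSnippetsTool(tool_config={})
    #   out = tool.run({
    #       "pmcid": "PMC3257301",
    #       "terms": ["randomized", "double-blind"],
    #       "window_chars": 150,
    #   })
    #   # on success, out["snippets"] is a list of {"term", "snippet"} dicts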
@register_tool("EuropePMCFullTextFetchTool")
class EuropePMCFullTextFetchTool(BaseTool):
"""
Fetch full text content for a PMC article with deterministic fallbacks and
machine-readable provenance (retrieval_trace).
This tool is intended for machine consumption: it always returns a structured
status payload and, when successful, includes source/format/content_type.
"""
def __init__(self, tool_config):
super().__init__(tool_config)
self.session = requests.Session()
self.session.headers.update(
{
"Accept": "application/xml, text/xml;q=0.9, text/html;q=0.8, */*;q=0.7",
# NCBI/PMC frequently blocks the default python-requests UA.
"User-Agent": (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/120.0.0.0 Safari/537.36"
),
}
)
def _build_fulltext_xml_url(self, arguments: dict) -> str | None:
fulltext_xml_url = arguments.get("fulltext_xml_url")
if isinstance(fulltext_xml_url, str) and fulltext_xml_url.strip():
return fulltext_xml_url.strip()
pmcid = arguments.get("pmcid")
if isinstance(pmcid, str) and pmcid.strip():
pmcid = pmcid.strip()
pmcid = pmcid if pmcid.upper().startswith("PMC") else f"PMC{pmcid}"
return (
f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML"
)
source_db = arguments.get("source_db") or arguments.get("source")
article_id = arguments.get("article_id")
if (
isinstance(source_db, str)
and source_db.strip()
and isinstance(article_id, str)
and article_id.strip()
):
return f"https://www.ebi.ac.uk/europepmc/webservices/rest/{source_db.strip()}/{article_id.strip()}/fullTextXML"
return None
def run(self, arguments):
fulltext_url = self._build_fulltext_xml_url(arguments)
pmcid = arguments.get("pmcid")
output_format = arguments.get("output_format", "text")
if output_format not in ("text", "raw"):
output_format = "text"
include_raw = bool(arguments.get("include_raw", False))
try:
max_chars = int(arguments.get("max_chars", 200000))
except (TypeError, ValueError):
max_chars = 200000
max_chars = max(1000, min(max_chars, 2_000_000))
try:
max_raw_chars = int(arguments.get("max_raw_chars", 200000))
except (TypeError, ValueError):
max_raw_chars = 200000
max_raw_chars = max(1000, min(max_raw_chars, 2_000_000))
try:
timeout = int(arguments.get("timeout", 30))
except (TypeError, ValueError):
timeout = 30
timeout = max(5, min(timeout, 120))
if not fulltext_url and not isinstance(pmcid, str):
return {
"status": "error",
"error": "Provide `pmcid`, or `fulltext_xml_url`, or (`source_db` + `article_id`).",
"retryable": False,
}
fetch = _fetch_fulltext_with_trace(
self.session,
europe_fulltext_xml_url=fulltext_url,
pmcid=pmcid,
timeout=timeout,
)
content = fetch.get("content")
if not isinstance(content, str) or not content.strip():
return {
"status": "error",
"error": "Full text fetch failed",
"url": fetch.get("url") or fulltext_url,
"status_code": fetch.get("status_code"),
"retryable": fetch.get("status_code") in (408, 429, 500, 502, 503, 504),
"retrieval_trace": fetch.get("trace") or [],
}
raw_out = None
truncated_raw = False
if include_raw:
raw_out = content[:max_raw_chars]
truncated_raw = len(content) > max_raw_chars
if output_format == "raw":
return {
"status": "success",
"url": fetch.get("url"),
"source": fetch.get("source"),
"format": fetch.get("format"),
"content_type": fetch.get("content_type"),
"retrieval_trace": fetch.get("trace") or [],
"content": content[:max_chars],
"truncated": len(content) > max_chars,
"raw": raw_out,
"raw_truncated": truncated_raw if include_raw else None,
}
# output_format == "text"
try:
if fetch.get("format") == "xml":
root = ET.fromstring(content or "")
text = " ".join("".join(root.itertext()).split())
else:
text = _extract_text_from_html(content)
except ET.ParseError:
return {
"status": "error",
"error": "Full text returned invalid XML",
"url": fetch.get("url") or fulltext_url,
"retryable": True,
"retrieval_trace": fetch.get("trace") or [],
}
out_text = text[:max_chars]
return {
"status": "success",
"url": fetch.get("url"),
"source": fetch.get("source"),
"format": fetch.get("format"),
"content_type": fetch.get("content_type"),
"retrieval_trace": fetch.get("trace") or [],
"text": out_text,
"truncated": len(text) > max_chars,
"raw": raw_out,
"raw_truncated": truncated_raw if include_raw else None,
}
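    # Usage sketch (arguments are illustrative):
    #   tool = EuropePMCFullTextFetchTool(tool_config={})
    #   out = tool.run({"pmcid": "PMC3257301", "output_format": "text",
    #                   "max_chars": 5000})
    #   # on success, see out["text"], out["source"], out["retrieval_trace"]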
@register_tool("EuropePMCRESTTool")
class EuropePMCRESTTool(BaseTool):
"""
Generic REST tool for Europe PMC API endpoints.
Supports citations, references, and other article-related endpoints.
"""
def __init__(self, tool_config):
super().__init__(tool_config)
self.base_url = "https://www.ebi.ac.uk/europepmc/webservices/rest"
self.session = requests.Session()
self.session.headers.update({"Accept": "application/json"})
self.timeout = 30
def _build_url(self, arguments):
"""Build URL from endpoint template and arguments."""
endpoint = self.tool_config["fields"]["endpoint"]
url = endpoint
for key, value in arguments.items():
placeholder = f"{{{key}}}"
if placeholder in url:
url = url.replace(placeholder, str(value))
return url
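    # Example of the expansion (the endpoint template is hypothetical):
    #   fields.endpoint = "https://www.ebi.ac.uk/europepmc/webservices/rest/MED/{article_id}/citations"
    #   arguments       = {"article_id": "12345", "page_size": 25}
    #   -> ".../rest/MED/12345/citations"; page_size is sent later by run()
    #      as the pageSize query parameter rather than in the path.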
def run(self, arguments):
"""Execute the Europe PMC REST API request."""
try:
url = self._build_url(arguments)
# Extract query parameters (those not in URL path)
params = {"format": "json"}
endpoint_template = self.tool_config["fields"]["endpoint"]
# Add parameters that are not path parameters
for key, value in arguments.items():
placeholder = f"{{{key}}}"
if placeholder not in endpoint_template and value is not None:
# Europe PMC expects pageSize, not page_size.
if key == "page_size":
params["pageSize"] = value
else:
params[key] = value
response = request_with_retry(
self.session,
"GET",
url,
params=params,
timeout=self.timeout,
max_attempts=3,
)
if response.status_code == 200:
data = response.json()
return {"status": "success", "data": data, "url": response.url}
else:
return {
"status": "error",
"error": f"Europe PMC API returned status {response.status_code}",
"url": response.url,
"status_code": response.status_code,
"detail": response.text[:200] if response.text else None,
}
except Exception as e:
return {
"status": "error",
"error": f"Europe PMC API request failed: {str(e)}",
"url": url if "url" in locals() else None,
}