import requests
import re
from .base_tool import BaseTool
from html import unescape
from .tool_registry import register_tool
import io
import os
import sys
import subprocess
import pdfplumber
from playwright.sync_api import sync_playwright
@register_tool("URLToPDFTextTool")
class URLToPDFTextTool(BaseTool):
    """
    Loads a webpage (with JavaScript), exports it as a PDF, and extracts text.

    Expects: {"url": "https://..."}
    Optional: {"timeout": <seconds>} (default 30)
    Returns: {"text": "<extracted text>"} or {"error": "..."}

    The key that carries the extracted text defaults to "text" and can be
    overridden through ``tool_config["fields"]["return_key"]``.
    """

    # Values of PLAYWRIGHT_SKIP_BROWSER_INSTALL that disable auto-install.
    _SKIP_INSTALL_VALUES = ("1", "true", "True")

    def __init__(self, tool_config):
        """Initialise the tool and read the optional ``return_key`` field."""
        super().__init__(tool_config)
        # Tolerate a config without a "fields" section instead of raising
        # KeyError; the default return key is "text".
        fields = tool_config.get("fields") or {}
        self.return_key = fields.get("return_key", "text")

    @staticmethod
    def _get_running_loop():
        """Return the asyncio loop running in this thread, or None.

        A running loop is what Colab/Jupyter provide; in that situation the
        Playwright sync API cannot be used and the async API is used instead.
        """
        import asyncio

        try:
            return asyncio.get_running_loop()
        except RuntimeError:
            return None

    @staticmethod
    def _run_on_running_loop(loop, coro_factory):
        """Run ``coro_factory()`` to completion on an already-running loop.

        ``nest_asyncio`` is applied first: without it ``run_until_complete``
        raises on a loop that is already running. The coroutine is created
        only after patching succeeded, so a failed import never leaves an
        un-awaited coroutine behind.
        """
        import nest_asyncio

        nest_asyncio.apply()
        return loop.run_until_complete(coro_factory())

    @staticmethod
    def _render_pdf_sync(url, timeout_ms):
        """Render ``url`` to PDF bytes with the sync Playwright API."""
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto(url, timeout=timeout_ms, wait_until="networkidle")
                return page.pdf(format="A4", print_background=True)
            finally:
                # Close the browser even when navigation or export fails.
                browser.close()

    @staticmethod
    async def _render_pdf_async(url, timeout_ms):
        """Render ``url`` to PDF bytes with the async Playwright API."""
        from playwright.async_api import async_playwright

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()
                await page.goto(url, timeout=timeout_ms, wait_until="networkidle")
                return await page.pdf(format="A4", print_background=True)
            finally:
                # Close the browser even when navigation or export fails.
                await browser.close()

    def _ensure_playwright_browsers(
        self,
        browsers=("chromium",),
        with_deps: bool = False,
        timeout_seconds: int = 600,
    ):
        """
        Ensure Playwright browser binaries are installed.

        The first entry of ``browsers`` is test-launched. If that fails,
        ``python -m playwright install <browsers>`` is run with the current
        interpreter (adding ``--with-deps`` when ``with_deps`` is true, and
        bounded by ``timeout_seconds``), and the launch is tried again.

        When the PLAYWRIGHT_SKIP_BROWSER_INSTALL environment variable is
        "1", "true" or "True", the install step is skipped; a working
        browser is still reported as success.

        Returns:
            None on success, or an error string on failure.
        """
        browsers = tuple(browsers)
        if not browsers:
            return "No Playwright browsers were specified."
        probe_name = browsers[0]

        loop = self._get_running_loop()

        def launch_sync():
            with sync_playwright() as p:
                browser = getattr(p, probe_name).launch(
                    headless=True, timeout=10_000
                )
                browser.close()

        async def launch_async():
            from playwright.async_api import async_playwright

            async with async_playwright() as p:
                browser = await getattr(p, probe_name).launch(
                    headless=True, timeout=10_000
                )
                await browser.close()

        def can_launch():
            """Return (ok, error_message) for a test launch of the browser."""
            try:
                if loop is not None:
                    # Inside a running loop the sync API is unusable; drive
                    # the async check re-entrantly on that loop.
                    self._run_on_running_loop(loop, launch_async)
                else:
                    launch_sync()
            except Exception as exc:  # any failure means "not usable"
                return False, str(exc)
            return True, None

        ok, msg = can_launch()
        if ok:
            return None  # browsers are already installed

        # Allow user to skip auto-install via env var. This is checked after
        # the launch test so that an already-working browser is not reported
        # as a failure just because auto-install is disabled.
        skip_flag = os.environ.get("PLAYWRIGHT_SKIP_BROWSER_INSTALL", "")
        if skip_flag in self._SKIP_INSTALL_VALUES:
            return (
                "PLAYWRIGHT_SKIP_BROWSER_INSTALL is set; skipping browser install. "
                f"Browser launch failed: {msg}"
            )

        # Attempt install using the same Python executable
        cmd = [sys.executable, "-m", "playwright", "install", *browsers]
        if with_deps:
            cmd.append("--with-deps")
        try:
            subprocess.run(
                cmd, check=True, capture_output=True, text=True, timeout=timeout_seconds
            )
        except subprocess.CalledProcessError as e:
            stdout = e.stdout or ""
            stderr = e.stderr or ""
            return f"playwright install failed (exit {e.returncode}). stdout:\n{stdout}\nstderr:\n{stderr}"
        except Exception as e:
            # Covers a missing executable, a timeout, and similar failures.
            return f"Failed to run playwright install: {e}"

        # Try launching again after install
        ok, msg = can_launch()
        if ok:
            return None
        return f"Browsers installed but launch still fails: {msg}"

    def run(self, arguments: dict):
        """
        Render ``arguments["url"]`` to a PDF and return its extracted text.

        Args:
            arguments: Mapping with a required "url" (http:// or https://)
                and an optional "timeout" in seconds (default 30) used for
                page navigation.

        Returns:
            ``{self.return_key: text}`` on success, otherwise
            ``{"error": message}``.
        """
        url = arguments.get("url")
        if not url:
            return {"error": "Parameter 'url' is required."}
        if not isinstance(url, str) or not url.startswith(("http://", "https://")):
            return {"error": "URL must start with http:// or https://"}

        raw_timeout = arguments.get("timeout", 30)
        if raw_timeout is None:
            raw_timeout = 30
        try:
            # Playwright expects milliseconds. Converting here also stops a
            # string such as "30" from being repeated 1000 times by ``*``.
            timeout_ms = float(raw_timeout) * 1000
        except (TypeError, ValueError):
            return {"error": "Parameter 'timeout' must be a number of seconds."}
        if not 0 <= timeout_ms < float("inf"):
            return {
                "error": "Parameter 'timeout' must be a non-negative, finite number of seconds."
            }

        # Ensure browsers are installed (auto-install if needed)
        ensure_error = self._ensure_playwright_browsers(
            browsers=("chromium",), with_deps=False
        )
        if ensure_error is not None:
            return {"error": f"Playwright browser check/install failed: {ensure_error}"}

        # Step 1: Render the page to PDF. Inside an active asyncio event loop
        # (Colab/Jupyter) the async API is used, otherwise the sync API.
        loop = self._get_running_loop()
        if loop is not None:
            try:
                pdf_bytes = self._run_on_running_loop(
                    loop, lambda: self._render_pdf_async(url, timeout_ms)
                )
            except Exception as e:
                return {"error": f"Failed to render webpage to PDF (async): {e}"}
        else:
            try:
                pdf_bytes = self._render_pdf_sync(url, timeout_ms)
            except Exception as e:
                return {"error": f"Failed to render webpage to PDF (sync): {e}"}

        # Step 2: Extract text from PDF
        try:
            with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
                extracted = (page.extract_text() for page in pdf.pages)
                page_texts = [chunk for chunk in extracted if chunk]
        except Exception as e:
            return {"error": f"Failed to extract text from PDF: {e}"}

        text = "\n".join(page_texts).strip()
        if not text:
            return {"error": "No text could be extracted from rendered PDF."}
        return {self.return_key: text}