Source code for tooluniverse.url_tool

import requests
import re
from .base_tool import BaseTool
from html import unescape
from .tool_registry import register_tool
import io
import os
import sys
import subprocess
import pdfplumber
from playwright.sync_api import sync_playwright


@register_tool("URLHTMLTagTool")
class URLHTMLTagTool(BaseTool):
    """
    Fetches a webpage and extracts the content of a specified HTML tag.

    Expects: {"url": "https://..."}
    The tag to extract is specified in the tool's configuration
    (``fields.tag``, default ``"title"``); the key used for the result is
    ``fields.return_key`` (default ``"content"``).
    Optional: {"timeout": <seconds>} (default 20)
    Returns: {"content": "<extracted content>"} or {"error": "..."}
    """

    def __init__(self, tool_config):
        """Read the tag name and result key from ``tool_config["fields"]``."""
        super().__init__(tool_config)
        # Both settings have defaults, so a config without a "fields"
        # section is accepted instead of raising KeyError.
        fields = tool_config.get("fields") or {}
        self.tag_to_fetch = fields.get("tag", "title")
        self.return_key = fields.get("return_key", "content")

    def _extract_tag_content(self, html_text):
        """
        Return the inner text of the first ``<tag ...>...</tag>`` element in
        *html_text*, or None when no such element is found.

        The opening tag may carry attributes (``<h1 class="x">``). The tag
        name is escaped so it is always matched literally, and it must be
        followed by whitespace or ``>`` so that e.g. ``a`` does not match
        ``<abbr>``. Matching is case-insensitive and spans newlines.
        """
        tag = re.escape(str(self.tag_to_fetch))
        match = re.search(
            rf"<{tag}(?:\s[^>]*)?>(.*?)</{tag}\s*>",
            html_text,
            flags=re.IGNORECASE | re.DOTALL,
        )
        if match is None:
            return None
        # Collapse runs of whitespace first, then decode HTML entities.
        collapsed = re.sub(r"\s+", " ", match.group(1).strip())
        return unescape(collapsed)

    def run(self, arguments: dict):
        """
        Fetch ``arguments["url"]`` and return the configured tag's content.

        Args:
            arguments: dict with a required "url" (http/https) and an
                optional "timeout" in seconds (default 20).

        Returns:
            ``{self.return_key: <text>}`` on success, otherwise a dict with
            an "error" key (plus "detail" for non-200 HTTP responses).
        """
        url = arguments.get("url")
        if not url:
            return {"error": "Parameter 'url' is required."}

        # Basic validation; a non-string URL is rejected rather than
        # raising AttributeError on .startswith().
        if not isinstance(url, str) or not url.startswith(("http://", "https://")):
            return {"error": "URL must start with http:// or https://"}

        timeout = arguments.get("timeout", 20)
        try:
            resp = requests.get(url, timeout=timeout)
        except requests.Timeout:
            return {"error": "Request timed out."}
        except Exception as e:
            return {"error": f"Request failed: {e}"}

        if resp.status_code != 200:
            return {"error": f"HTTP {resp.status_code}", "detail": resp.text[:300]}

        # Accept HTML, and still attempt extraction for any other text/* type.
        ctype = resp.headers.get("Content-Type", "").lower()
        if "html" not in ctype and not ctype.startswith("text/"):
            return {"error": "Response is not HTML."}

        cleaned = self._extract_tag_content(resp.text)
        if cleaned is None:
            return {"error": f"No <{self.tag_to_fetch}> tag found."}
        return {self.return_key: cleaned}
@register_tool("URLToPDFTextTool")
class URLToPDFTextTool(BaseTool):
    """
    Loads a webpage (with JavaScript), exports it as a PDF, and extracts text.

    Expects: {"url": "https://..."}
    Optional: {"timeout": <seconds>} (default 30)
    Returns: {"text": "<extracted text>"} or {"error": "..."}
    """

    def __init__(self, tool_config):
        """Read the result key from ``tool_config["fields"]`` (default "text")."""
        super().__init__(tool_config)
        # "return_key" has a default, so a missing "fields" section is fine.
        fields = tool_config.get("fields") or {}
        self.return_key = fields.get("return_key", "text")

    @staticmethod
    def _get_running_loop():
        """
        Return the asyncio event loop running in this thread (as happens in
        Colab/Jupyter), or None when no loop is running.
        """
        import asyncio

        # get_running_loop() never creates a loop as a side effect and is
        # not deprecated, unlike get_event_loop() outside a running loop.
        try:
            return asyncio.get_running_loop()
        except RuntimeError:
            return None

    @staticmethod
    def _run_in_running_loop(loop, coro_factory):
        """
        Run ``coro_factory()`` to completion on an already-running *loop*.

        nest_asyncio is applied first because a plain
        ``loop.run_until_complete`` raises "This event loop is already
        running" on a running loop. The coroutine is created only after
        patching so that a patch failure does not leave a never-awaited
        coroutine behind.
        """
        import nest_asyncio

        nest_asyncio.apply()
        return loop.run_until_complete(coro_factory())

    def _ensure_playwright_browsers(
        self,
        browsers=("chromium",),
        with_deps: bool = False,
        timeout_seconds: int = 600,
    ):
        """
        Ensure Playwright browser binaries are installed.

        The first entry of *browsers* is test-launched. If the launch fails,
        ``python -m playwright install <browsers>`` is run (with
        ``--with-deps`` when *with_deps* is true) and the launch is retried.
        When the environment variable PLAYWRIGHT_SKIP_BROWSER_INSTALL is
        "1", "true" or "True", the install step is skipped: an already
        working browser is still accepted, a missing one is reported.

        Args:
            browsers: Playwright browser names to install.
            with_deps: also install OS-level dependencies.
            timeout_seconds: time limit for the install subprocess.

        Returns:
            None on success, or an error string on failure.
        """
        skip_install = os.environ.get("PLAYWRIGHT_SKIP_BROWSER_INSTALL", "") in (
            "1",
            "true",
            "True",
        )

        # Detect if running inside an active asyncio event loop (Colab/Jupyter)
        loop = self._get_running_loop()

        def try_launch_one_sync():
            try:
                from playwright.sync_api import sync_playwright

                with sync_playwright() as p:
                    b = getattr(p, browsers[0])
                    browser = b.launch(headless=True, timeout=10_000)
                    browser.close()
                return True, None
            except Exception as e:
                return False, str(e)

        async def try_launch_one_async():
            try:
                from playwright.async_api import async_playwright

                async with async_playwright() as p:
                    b = getattr(p, browsers[0])
                    browser = await b.launch(headless=True, timeout=10_000)
                    await browser.close()
                return True, None
            except Exception as e:
                return False, str(e)

        def check_launch():
            """Test-launch the browser with the API that fits this thread."""
            if loop is None:
                return try_launch_one_sync()
            # Use async Playwright API for browser launch check
            try:
                return self._run_in_running_loop(loop, try_launch_one_async)
            except Exception as e:
                return False, str(e)

        ok, msg = check_launch()
        if ok:
            return None  # browsers are already installed

        # Allow user to skip auto-install via env var. This is only an error
        # when the browser is actually unusable, so the tool keeps working
        # in environments that pre-install browsers and forbid downloads.
        if skip_install:
            return (
                "PLAYWRIGHT_SKIP_BROWSER_INSTALL is set; skipping browser install. "
                f"Browser launch failed: {msg}"
            )

        # Attempt install using the same Python executable
        cmd = [sys.executable, "-m", "playwright", "install"] + list(browsers)
        if with_deps:
            cmd.append("--with-deps")
        try:
            subprocess.run(
                cmd, check=True, capture_output=True, text=True, timeout=timeout_seconds
            )
        except subprocess.CalledProcessError as e:
            stdout = e.stdout or ""
            stderr = e.stderr or ""
            return f"playwright install failed (exit {e.returncode}). stdout:\n{stdout}\nstderr:\n{stderr}"
        except Exception as e:
            return f"Failed to run playwright install: {e}"

        # Try launching again after install
        ok2, msg2 = check_launch()
        if ok2:
            return None
        return f"Browsers installed but launch still fails: {msg2}"

    @staticmethod
    def _render_pdf_sync(url, timeout):
        """Render *url* to PDF bytes with the sync Playwright API."""
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                # Playwright timeouts are in milliseconds.
                page.goto(url, timeout=timeout * 1000, wait_until="networkidle")
                return page.pdf(format="A4", print_background=True)
            finally:
                # Close the browser even when navigation or export fails.
                browser.close()

    @staticmethod
    async def _render_pdf_async(url, timeout):
        """Render *url* to PDF bytes with the async Playwright API."""
        from playwright.async_api import async_playwright

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()
                # Playwright timeouts are in milliseconds.
                await page.goto(url, timeout=timeout * 1000, wait_until="networkidle")
                return await page.pdf(format="A4", print_background=True)
            finally:
                # Close the browser even when navigation or export fails.
                await browser.close()

    def _extract_pdf_text(self, pdf_bytes):
        """
        Extract text from in-memory PDF bytes with pdfplumber.

        Returns ``{self.return_key: <text>}``, or an error dict when the PDF
        yields no text or cannot be parsed.
        """
        try:
            with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
                page_texts = [page.extract_text() for page in pdf.pages]
            # Pages without extractable text are skipped.
            text = "\n".join(t for t in page_texts if t).strip()
            if not text:
                return {"error": "No text could be extracted from rendered PDF."}
            return {self.return_key: text}
        except Exception as e:
            return {"error": f"Failed to extract text from PDF: {e}"}

    def run(self, arguments: dict):
        """
        Render ``arguments["url"]`` to PDF in headless Chromium and return
        the text extracted from it.

        Args:
            arguments: dict with a required "url" (http/https) and an
                optional "timeout" in seconds for page navigation
                (default 30).

        Returns:
            ``{self.return_key: <text>}`` on success, otherwise
            ``{"error": "..."}``.
        """
        url = arguments.get("url")
        if not url:
            return {"error": "Parameter 'url' is required."}
        # A non-string URL is rejected rather than raising AttributeError.
        if not isinstance(url, str) or not url.startswith(("http://", "https://")):
            return {"error": "URL must start with http:// or https://"}

        timeout = arguments.get("timeout", 30)

        # Ensure browsers are installed (auto-install if needed)
        ensure_error = self._ensure_playwright_browsers(
            browsers=("chromium",), with_deps=False
        )
        if ensure_error is not None:
            return {"error": f"Playwright browser check/install failed: {ensure_error}"}

        # Step 1: Render the page to PDF. Inside an active asyncio event loop
        # (Colab/Jupyter) the async Playwright API is used, otherwise the
        # sync API.
        loop = self._get_running_loop()
        if loop is not None:
            try:
                pdf_bytes = self._run_in_running_loop(
                    loop, lambda: self._render_pdf_async(url, timeout)
                )
            except Exception as e:
                return {"error": f"Failed to render webpage to PDF (async): {e}"}
        else:
            try:
                pdf_bytes = self._render_pdf_sync(url, timeout)
            except Exception as e:
                return {"error": f"Failed to render webpage to PDF (sync): {e}"}

        # Step 2: Extract text from PDF
        return self._extract_pdf_text(pdf_bytes)