Source code for tooluniverse.web_search_tool
"""
Web search tools for ToolUniverse using DDGS (Dux Distributed Global Search).
This module provides web search capabilities using the ddgs library,
which supports multiple search engines including DuckDuckGo, Google, Bing, etc.
"""
import json
import re
import subprocess
import sys
import time
from html import unescape
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import parse_qs, unquote, urlparse
import requests
from .base_tool import BaseTool
from .tool_registry import register_tool
@register_tool("WebSearchTool")
class WebSearchTool(BaseTool):
"""
Web search tool using DDGS library.
This tool performs web searches using the DDGS library which supports
multiple search engines including Google, Bing, Brave, Yahoo, DuckDuckGo, etc.
"""
# DDGS instance will be created per request to avoid session issues
    def _search_with_ddgs(
        self,
        query: str,
        max_results: int = 10,
        backend: str = "auto",
        region: str = "us-en",
        safesearch: str = "moderate",
    ) -> List[Dict[str, Any]]:
        """
        Perform a web search using the DDGS library and return formatted results.

        Args:
            query: Search query string
            max_results: Maximum number of results to return
            backend: Search engine backend (auto, google, bing, brave, etc.)
            region: Search region (e.g., 'us-en', 'cn-zh')
            safesearch: Safe search level ('on', 'moderate', 'off')

        Returns:
            List of search results with title, url, and snippet
        """
        # Run DDGS in a subprocess so native-library panics cannot crash the
        # current ToolUniverse process.
        payload = {
            "query": query,
            "max_results": max_results,
            "backend": backend,
            "region": region,
            "safesearch": safesearch,
        }
        ddgs_script = """
import json
import sys
from ddgs import DDGS
args = json.loads(sys.argv[1])
results = list(
    DDGS().text(
        query=args["query"],
        max_results=args["max_results"],
        backend=args["backend"],
        region=args["region"],
        safesearch=args["safesearch"],
    )
)
print(json.dumps(results))
"""
        completed = subprocess.run(
            [sys.executable, "-c", ddgs_script, json.dumps(payload)],
            capture_output=True,
            text=True,
            timeout=30,
        )
        if completed.returncode != 0:
            stderr_preview = (completed.stderr or "").strip()[:300]
            stdout_preview = (completed.stdout or "").strip()[:300]
            detail = stderr_preview or stdout_preview or "Unknown subprocess failure"
            raise RuntimeError(
                f"DDGS subprocess failed with exit code {completed.returncode}: {detail}"
            )
        try:
            search_results = json.loads(completed.stdout or "[]")
        except json.JSONDecodeError as error:
            raise RuntimeError(
                f"DDGS subprocess returned invalid JSON: {str(error)}"
            ) from error

        # Convert DDGS results to our expected format.
        results = []
        for i, result in enumerate(search_results):
            results.append(
                {
                    "title": result.get("title", ""),
                    "url": result.get("href", ""),
                    "snippet": result.get("body", ""),
                    "rank": i + 1,
                }
            )
        return results
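
    # Illustrative shape of the mapping above (a sketch, not captured output):
    # a raw DDGS item such as
    #     {"title": "Requests: HTTP for Humans",
    #      "href": "https://requests.readthedocs.io",
    #      "body": "Requests is an elegant and simple HTTP library..."}
    # is returned as
    #     {"title": "Requests: HTTP for Humans",
    #      "url": "https://requests.readthedocs.io",
    #      "snippet": "Requests is an elegant and simple HTTP library...",
    #      "rank": 1}
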
    def _search_with_fallback(
        self,
        query: str,
        max_results: int,
        backend: str,
        region: str,
        safesearch: str,
    ) -> Tuple[
        List[Dict[str, Any]],
        Optional[str],
        List[str],
        Optional[str],
        Dict[str, str],
    ]:
        """
        Search with a backend fallback strategy.

        Returns:
            (results, backend_used, attempted_backends, warning, provider_errors)
        """
        attempted_backends = []
        last_error = None
        provider_errors: Dict[str, str] = {}
        had_empty_success = False
        backends_to_try = [backend]
        stop_ddgs_backends = False
        if backend == "auto":
            # Try the explicit DuckDuckGo backend first. It tends to fail with
            # regular Python exceptions in restricted environments, while some
            # other backends can trigger native panics.
            backends_to_try = ["duckduckgo", "auto", "bing", "brave"]
        else:
            backends_to_try.append("auto")

        # Deduplicate the backend list while preserving order.
        seen = set()
        unique_backends = []
        for backend_name in backends_to_try:
            if backend_name not in seen:
                seen.add(backend_name)
                unique_backends.append(backend_name)

        for backend_name in unique_backends:
            if stop_ddgs_backends:
                break
            attempted_backends.append(backend_name)
            try:
                results = self._search_with_ddgs(
                    query=query,
                    max_results=max_results,
                    backend=backend_name,
                    region=region,
                    safesearch=safesearch,
                )
                if results:
                    return (
                        results,
                        backend_name,
                        attempted_backends,
                        None,
                        provider_errors,
                    )
                had_empty_success = True
            except Exception as error:
                error_str = str(error)
                last_error = error_str
                provider_errors[backend_name] = error_str
                # If DNS resolution is unavailable, additional DDGS backends
                # are unlikely to help and may trigger avoidable native panics.
                if any(
                    token in error_str
                    for token in [
                        "NameResolutionError",
                        "nodename nor servname provided",
                        "Temporary failure in name resolution",
                    ]
                ):
                    stop_ddgs_backends = True
                continue

        # Fallback providers that do not depend on DDGS.
        provider_chain = [
            ("duckduckgo_html", self._search_with_duckduckgo_html),
            ("wikipedia_api", self._search_with_wikipedia_api),
        ]
        for provider_name, provider_func in provider_chain:
            attempted_backends.append(provider_name)
            try:
                results = provider_func(query=query, max_results=max_results)
                if results:
                    return (
                        results,
                        provider_name,
                        attempted_backends,
                        None,
                        provider_errors,
                    )
                had_empty_success = True
            except Exception as error:
                last_error = str(error)
                provider_errors[provider_name] = str(error)
                continue

        # Every provider either failed or returned nothing; return an empty
        # result set together with whatever error context was collected.
        if had_empty_success:
            return [], "empty", attempted_backends, None, provider_errors
        if provider_errors:
            warning = "All search providers failed"
        else:
            warning = last_error or "No search providers returned results"
        return [], "none", attempted_backends, warning, provider_errors
    def _search_with_duckduckgo_html(
        self, query: str, max_results: int
    ) -> List[Dict[str, Any]]:
        """Fallback provider: parse DuckDuckGo HTML results without DDGS."""
        response = requests.get(
            "https://duckduckgo.com/html/",
            params={"q": query},
            timeout=15,
            headers={"User-Agent": "ToolUniverse/1.0"},
        )
        response.raise_for_status()
        html_text = response.text

        result_pattern = re.compile(
            r'<a[^>]*class="result__a"[^>]*href="([^"]+)"[^>]*>(.*?)</a>',
            flags=re.IGNORECASE | re.DOTALL,
        )
        snippet_pattern = re.compile(
            r'<a[^>]*class="result__snippet"[^>]*>(.*?)</a>',
            flags=re.IGNORECASE | re.DOTALL,
        )
        anchors = result_pattern.findall(html_text)
        snippets = snippet_pattern.findall(html_text)

        results = []
        for index, (href, raw_title) in enumerate(anchors[:max_results], start=1):
            # DuckDuckGo wraps result links in a redirect; the real target is
            # carried in the "uddg" query parameter.
            parsed_href = href
            if href.startswith("/l/?") or "uddg=" in href:
                parsed = urlparse(href)
                query_map = parse_qs(parsed.query)
                uddg_values = query_map.get("uddg")
                if uddg_values:
                    parsed_href = unquote(uddg_values[0])
            clean_title = unescape(re.sub(r"<[^>]+>", "", raw_title)).strip()
            raw_snippet = snippets[index - 1] if index - 1 < len(snippets) else ""
            clean_snippet = unescape(re.sub(r"<[^>]+>", "", raw_snippet)).strip()
            if not parsed_href:
                continue
            results.append(
                {
                    "title": clean_title,
                    "url": parsed_href,
                    "snippet": clean_snippet,
                    "rank": index,
                }
            )
        return results
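
    # Behavior of the redirect unwrapping above on a representative href (the
    # value is fabricated for illustration; "uddg" is the parameter the code
    # keys on). Note that parse_qs already percent-decodes values, so the
    # extra unquote() is a no-op except for double-encoded targets:
    #
    #   href = "/l/?kh=-1&uddg=https%3A%2F%2Fdocs.python.org%2F3%2F"
    #   parse_qs(urlparse(href).query)["uddg"][0]
    #   # -> "https://docs.python.org/3/"
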
    def _search_with_wikipedia_api(
        self, query: str, max_results: int
    ) -> List[Dict[str, Any]]:
        """Fallback provider: Wikipedia OpenSearch API."""
        response = requests.get(
            "https://en.wikipedia.org/w/api.php",
            params={
                "action": "opensearch",
                "search": query,
                "limit": max_results,
                "namespace": 0,
                "format": "json",
            },
            timeout=15,
            headers={"User-Agent": "ToolUniverse/1.0"},
        )
        response.raise_for_status()
        data = response.json()
        if not isinstance(data, list) or len(data) < 4:
            return []
        titles = data[1] if isinstance(data[1], list) else []
        descriptions = data[2] if isinstance(data[2], list) else []
        urls = data[3] if isinstance(data[3], list) else []

        results = []
        for index, title in enumerate(titles[:max_results], start=1):
            url = urls[index - 1] if index - 1 < len(urls) else ""
            snippet = descriptions[index - 1] if index - 1 < len(descriptions) else ""
            if not url:
                continue
            results.append(
                {
                    "title": str(title),
                    "url": str(url),
                    "snippet": str(snippet),
                    "rank": index,
                }
            )
        return results
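
    # Shape of the OpenSearch payload the parsing above expects (a sketch;
    # Wikipedia commonly returns empty description strings):
    #
    #   ["aspirin",
    #    ["Aspirin", "Aspirin poisoning"],
    #    ["", ""],
    #    ["https://en.wikipedia.org/wiki/Aspirin",
    #     "https://en.wikipedia.org/wiki/Aspirin_poisoning"]]
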
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Execute a web search using DDGS.

        Args:
            arguments: Dictionary containing:
                - query: Search query string
                - max_results: Maximum number of results (default: 10)
                - search_type: Type of search (default: 'general')
                - backend: Search engine backend (default: 'auto')
                - region: Search region (default: 'us-en')
                - safesearch: Safe search level (default: 'moderate')

        Returns:
            Dictionary containing search results
        """
        try:
            query = arguments.get("query", "").strip()
            max_results = int(arguments.get("max_results", 10))
            search_type = arguments.get("search_type", "general")
            backend = arguments.get("backend", "auto")
            region = arguments.get("region", "us-en")
            safesearch = arguments.get("safesearch", "moderate")

            if not query:
                error_msg = "Query parameter is required"
                return {
                    "status": "error",
                    "error": error_msg,
                    "data": {
                        "status": "error",
                        "error": error_msg,
                        "query": "",
                        "total_results": 0,
                        "results": [],
                    },
                }

            # Clamp max_results to the range 1-50.
            max_results = max(1, min(max_results, 50))

            # Modify the query based on search type.
            if search_type == "api_documentation":
                query = f"{query} API documentation python library"
            elif search_type == "python_packages":
                query = f"{query} python package pypi"
            elif search_type == "github":
                query = f"{query} site:github.com"

            # Perform the search using fallback backends to improve reliability.
            (
                results,
                backend_used,
                attempted_backends,
                search_warning,
                provider_errors,
            ) = self._search_with_fallback(
                query=query,
                max_results=max_results,
                backend=backend,
                region=region,
                safesearch=safesearch,
            )
            if search_warning:
                return {
                    "status": "success",
                    "warning": search_warning,
                    "data": {
                        "status": "success",
                        "query": query,
                        "search_type": search_type,
                        "total_results": 0,
                        "results": [],
                        "attempted_backends": attempted_backends,
                        "backend_used": backend_used or "none",
                        "all_providers_failed": backend_used == "none",
                        "provider_errors": provider_errors,
                        "warning": search_warning,
                    },
                }

            # Brief fixed delay to space out consecutive searches and stay
            # polite to the search providers.
            time.sleep(0.5)

            result_data = {
                "status": "success",
                "query": query,
                "search_type": search_type,
                "total_results": len(results),
                "results": results,
                "backend_used": backend_used or backend,
                "attempted_backends": attempted_backends,
            }
            if provider_errors:
                result_data["provider_errors"] = provider_errors
            return {"status": "success", "data": result_data}
        except Exception as e:
            error_msg = str(e)
            return {
                "status": "error",
                "error": error_msg,
                "data": {
                    "status": "error",
                    "error": error_msg,
                    "query": arguments.get("query", ""),
                    "total_results": 0,
                    "results": [],
                },
            }
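
    # Successful responses from run() follow this envelope (a sketch derived
    # from the construction above; field values are illustrative):
    #
    #   {"status": "success",
    #    "data": {"status": "success",
    #             "query": "CRISPR review",
    #             "search_type": "general",
    #             "total_results": 2,
    #             "results": [{"title": "...", "url": "...",
    #                          "snippet": "...", "rank": 1}, ...],
    #             "backend_used": "duckduckgo",
    #             "attempted_backends": ["duckduckgo"]}}
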
@register_tool("WebAPIDocumentationSearchTool")
class WebAPIDocumentationSearchTool(WebSearchTool):
"""
Specialized web search tool for API documentation and Python libraries.
This tool is optimized for finding API documentation, Python packages,
and technical resources using DDGS with multiple search engines.
"""
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Execute an API-documentation-focused search.

        Args:
            arguments: Dictionary containing:
                - query: Search query string
                - max_results: Maximum number of results (default: 10)
                - focus: Focus area ('api_docs', 'python_packages', etc.)
                - backend: Search engine backend (default: 'auto')

        Returns:
            Dictionary containing search results
        """
        try:
            query = arguments.get("query", "").strip()
            focus = arguments.get("focus", "api_docs")
            backend = arguments.get("backend", "auto")

            if not query:
                error_msg = "Query parameter is required"
                return {
                    "status": "error",
                    "error": error_msg,
                    "data": {
                        "status": "error",
                        "error": error_msg,
                        "query": "",
                        "total_results": 0,
                        "results": [],
                    },
                }

            # Enhance the query based on the requested focus.
            if focus == "api_docs":
                enhanced_query = f'"{query}" API documentation official docs'
            elif focus == "python_packages":
                enhanced_query = f'"{query}" python package pypi install pip'
            elif focus == "github_repos":
                enhanced_query = f'"{query}" github repository source code'
            else:
                enhanced_query = f'"{query}" documentation API reference'

            # Delegate to the parent class search with the enhanced query.
            arguments["query"] = enhanced_query
            arguments["search_type"] = "api_documentation"
            arguments["backend"] = backend
            result = super().run(arguments)

            # Extract data from the parent result and add focus-specific metadata.
            if result["status"] == "success" and "data" in result:
                result_data = result["data"]
                result_data["focus"] = focus
                result_data["enhanced_query"] = enhanced_query

                # Filter results for better relevance.
                if focus == "python_packages":
                    result_data["results"] = [
                        r
                        for r in result_data["results"]
                        if (
                            "pypi.org" in r.get("url", "")
                            or "python" in r.get("title", "").lower()
                        )
                    ]
                elif focus == "github_repos":
                    result_data["results"] = [
                        r
                        for r in result_data["results"]
                        if "github.com" in r.get("url", "")
                    ]

                # Update total_results after filtering.
                result_data["total_results"] = len(result_data["results"])
                return {"status": "success", "data": result_data}
            return result
        except Exception as e:
            error_msg = str(e)
            return {
                "status": "error",
                "error": error_msg,
                "data": {
                    "status": "error",
                    "error": error_msg,
                    "query": arguments.get("query", ""),
                    "total_results": 0,
                    "results": [],
                },
            }
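

# Minimal usage sketch for this module (a sketch only: it assumes these tools
# can be constructed without arguments, which may not match the real
# tooluniverse BaseTool API; in practice they are invoked through the
# ToolUniverse tool registry):
#
#   from tooluniverse.web_search_tool import WebAPIDocumentationSearchTool
#
#   tool = WebAPIDocumentationSearchTool()
#   response = tool.run({"query": "httpx", "focus": "python_packages"})
#   for hit in response["data"]["results"]:
#       print(hit["rank"], hit["title"], hit["url"])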