tooluniverse.datagov_tool 源代码

"""
Data.gov search tool for ToolUniverse.

Searches the US federal open data catalog (catalog.data.gov) for datasets
from EPA, CDC, Census, NIH, USDA, NOAA, and 100+ other federal agencies.

The legacy CKAN /api/3/action/package_search endpoint was retired in 2025;
the catalog now serves a Solr-backed JSON search at /search?_format=json
with different param names (`_q` instead of `q`, `organization` slug
instead of CKAN `fq` filter). This tool talks to the new endpoint and
normalises the response into the same {datasets:[{title, description,
organization, ...}], total_count, returned} shape the previous CKAN
version emitted, so callers don't see a behavioural change.
"""

import requests
from .base_tool import BaseTool
from .tool_registry import register_tool

# Search endpoint of the Data.gov catalog; requests add `_format=json` to get
# the JSON form of the search results.
DATAGOV_SEARCH = "https://catalog.data.gov/search"

# Headers sent with every search request: a Chrome-style User-Agent that is
# additionally tagged with "ToolUniverse/DataGov", and an Accept header that
# prefers JSON.
_BROWSER_HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/124.0.0.0 Safari/537.36",
            "ToolUniverse/DataGov",
        )
    ),
    "Accept": "application/json,*/*;q=0.8",
}


@register_tool("DataGovTool")
class DataGovTool(BaseTool):
    """Search US federal open data catalog (Data.gov) for datasets."""

    def run(self, arguments=None):
        """Run a catalog search and return normalised dataset records.

        Args:
            arguments: Optional dict with the keys
                ``query`` (required, non-empty string),
                ``organization`` (optional organization slug, e.g. 'epa-gov'),
                ``rows`` (optional integer, default 10, clamped to 1-100).

        Returns:
            A dict with ``status`` set to ``"success"`` (plus ``data`` and
            ``metadata``) or ``"error"`` (plus ``error.message`` and
            ``error.details``). Invalid input, HTTP failures and malformed
            responses are reported through the error form, not raised.
        """
        arguments = arguments or {}
        raw_query = arguments.get("query")
        # A non-string query is treated like a missing one instead of
        # crashing on ``.strip()``.
        query = raw_query.strip() if isinstance(raw_query, str) else ""
        organization = arguments.get("organization")

        if not query:
            return self._error(
                "Missing required parameter: query",
                "Provide a search query string.",
            )

        rows = self._parse_rows(arguments.get("rows"))
        if rows is None:
            return self._error(
                "Invalid parameter: rows",
                "rows must be an integer (values are clamped to 1-100).",
            )

        params = {"_q": query, "_format": "json", "rows": rows}
        if organization:
            # The new endpoint takes the organization *slug* (e.g. 'epa-gov')
            # as a separate query param, not as a CKAN fq filter.
            params["organization"] = organization

        try:
            resp = requests.get(
                DATAGOV_SEARCH, params=params, headers=_BROWSER_HEADERS, timeout=30
            )
            resp.raise_for_status()
        except requests.RequestException as exc:
            return self._error("Data.gov API request failed", str(exc))

        # Decode separately from the request: recent versions of requests
        # raise a JSONDecodeError that is *also* a RequestException, so a
        # shared try block would report bad JSON as a failed request.
        try:
            body = resp.json()
        except ValueError as exc:
            return self._error(
                "Data.gov API returned non-JSON response", str(exc)
            )

        if not isinstance(body, dict):
            return self._error(
                "Data.gov API returned an unexpected response",
                f"Expected a JSON object, got {type(body).__name__}.",
            )

        results = body.get("results") or []
        if not isinstance(results, list):
            return self._error(
                "Data.gov API returned an unexpected response",
                f"Expected 'results' to be a list, got {type(results).__name__}.",
            )

        # Entries that are not JSON objects cannot be normalised; skip them.
        datasets = [
            self._normalise_dataset(pkg) for pkg in results if isinstance(pkg, dict)
        ]

        return {
            "status": "success",
            "data": {
                "query": query,
                "organization": organization,
                # Mirrors the number of entries in the response's 'results'
                # list (i.e. this page), as in the original implementation.
                "total_count": len(results),
                "returned": len(datasets),
                "datasets": datasets,
            },
            "metadata": {
                "source": "Data.gov (Solr search)",
                "api": DATAGOV_SEARCH,
            },
        }

    @staticmethod
    def _error(message, details):
        """Build the structured error payload used for every failure path."""
        return {
            "status": "error",
            "error": {"message": message, "details": details},
        }

    @staticmethod
    def _parse_rows(value):
        """Return ``value`` as an int clamped to 1-100, or None if invalid.

        ``None`` (argument omitted or explicitly null) yields the default 10.
        """
        if value is None:
            return 10
        try:
            rows = int(value)
        except (TypeError, ValueError, OverflowError):
            return None
        return max(1, min(rows, 100))

    @staticmethod
    def _normalise_resources(pkg):
        """Build a thin resources list from a result's DCAT distributions.

        The old CKAN response had a full 'resources' array; here an
        equivalent is rebuilt from the first 10 entries of
        ``pkg['dcat']['distribution']``, skipping anything that is not a dict.
        """
        dcat = pkg.get("dcat")
        if not isinstance(dcat, dict):
            return []
        distributions = dcat.get("distribution")
        if not isinstance(distributions, list):
            return []
        return [
            {
                "name": dist.get("title"),
                "url": dist.get("accessURL") or dist.get("downloadURL"),
                "format": dist.get("format") or dist.get("mediaType"),
                "description": dist.get("description"),
            }
            for dist in distributions[:10]
            if isinstance(dist, dict)
        ]

    @classmethod
    def _normalise_dataset(cls, pkg):
        """Map one search result onto the CKAN-style dataset record."""
        org = pkg.get("organization")
        if not isinstance(org, dict):
            org = {}
        slug = pkg.get("slug")
        return {
            "title": pkg.get("title", ""),
            # Truncate long descriptions; an empty description becomes None.
            "description": (pkg.get("description") or "")[:500] or None,
            "organization": org.get("slug") or org.get("name"),
            "organization_title": org.get("name"),
            "metadata_modified": pkg.get("last_harvested_date"),
            "tags": pkg.get("keyword") or [],
            "resources": cls._normalise_resources(pkg),
            "url": f"https://catalog.data.gov/dataset/{slug}" if slug else None,
        }