# Source code for tooluniverse.datagov_tool

"""
Data.gov search tool for ToolUniverse.

Searches the US federal open data catalog (catalog.data.gov) for datasets
from EPA, CDC, Census, NIH, USDA, NOAA, and 100+ other federal agencies.

The legacy CKAN /api/3/action/package_search endpoint was retired in 2025;
the catalog now serves a Solr-backed JSON search at /search?_format=json
with different param names (`_q` instead of `q`, `organization` slug
instead of CKAN `fq` filter). This tool talks to the new endpoint and
normalises the response into the same {datasets:[{title, description,
organization, ...}], total_count, returned} shape the previous CKAN
version emitted, so callers don't see a behavioural change.
"""

import requests
from .base_tool import BaseTool
from .tool_registry import register_tool

# URL of the catalog search endpoint queried by DataGovTool.
DATAGOV_SEARCH = "https://catalog.data.gov/search"

# Headers attached to every search request: a browser-style User-Agent that
# also carries a ToolUniverse/DataGov product token, plus an Accept header
# that prefers JSON.
_BROWSER_HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/124.0.0.0 Safari/537.36",
            "ToolUniverse/DataGov",
        )
    ),
    "Accept": "application/json,*/*;q=0.8",
}



# [docs]
@register_tool("DataGovTool")
class DataGovTool(BaseTool):
    """Search US federal open data catalog (Data.gov) for datasets.

    ``run`` sends one GET request to ``DATAGOV_SEARCH`` and reshapes every
    hit of the JSON response into a flat dataset record. All failures
    (bad arguments, transport/HTTP errors, unparseable or unexpected
    responses) are reported as ``{"status": "error", ...}`` dictionaries
    instead of being raised.
    """

    _DEFAULT_ROWS = 10
    _MAX_ROWS = 100
    _MAX_RESOURCES = 10
    _MAX_DESCRIPTION_CHARS = 500

    @staticmethod
    def _error_response(message, details):
        """Build the error envelope returned by ``run``."""
        return {
            "status": "error",
            "error": {"message": message, "details": details},
        }

    @staticmethod
    def _normalise_package(pkg):
        """Convert one search hit (a dict) into the flat dataset record."""
        org = pkg.get("organization")
        if not isinstance(org, dict):
            org = {}
        dcat = pkg.get("dcat")
        if not isinstance(dcat, dict):
            dcat = {}
        distributions = dcat.get("distribution")
        if not isinstance(distributions, list):
            distributions = []

        # Build a thin 'resources' list from the hit's dcat.distribution
        # entries. Only the first few entries are looked at, and entries
        # that are not JSON objects are skipped.
        resources = [
            {
                "name": dist.get("title"),
                "url": dist.get("accessURL") or dist.get("downloadURL"),
                "format": dist.get("format") or dist.get("mediaType"),
                "description": dist.get("description"),
            }
            for dist in distributions[: DataGovTool._MAX_RESOURCES]
            if isinstance(dist, dict)
        ]

        description = pkg.get("description") or ""
        slug = pkg.get("slug")
        return {
            "title": pkg.get("title", ""),
            # Truncate long descriptions; an empty description becomes None.
            "description": (
                description[: DataGovTool._MAX_DESCRIPTION_CHARS] or None
            ),
            "organization": org.get("slug") or org.get("name"),
            "organization_title": org.get("name"),
            "metadata_modified": pkg.get("last_harvested_date"),
            "tags": pkg.get("keyword") or [],
            "resources": resources,
            "url": f"https://catalog.data.gov/dataset/{slug}" if slug else None,
        }

    def run(self, arguments=None):
        """Run a catalog search and return normalised dataset records.

        Args:
            arguments: Optional mapping with the keys

                * ``query`` -- required, non-empty search string (surrounding
                  whitespace is stripped; a non-string value is treated as
                  missing).
                * ``organization`` -- optional organization slug
                  (e.g. ``'epa-gov'``) forwarded as its own query parameter.
                * ``rows`` -- optional number of results to request;
                  defaults to 10 (also when ``None``) and is clamped to the
                  range 1..100.

        Returns:
            dict: On success ``{"status": "success", "data": {...},
            "metadata": {...}}`` where ``data`` holds ``query``,
            ``organization``, ``total_count``, ``returned`` and ``datasets``.
            On failure ``{"status": "error", "error": {"message": ...,
            "details": ...}}``.
        """
        arguments = arguments or {}
        raw_query = arguments.get("query")
        query = raw_query.strip() if isinstance(raw_query, str) else ""
        organization = arguments.get("organization")

        if not query:
            return self._error_response(
                "Missing required parameter: query",
                "Provide a search query string.",
            )

        raw_rows = arguments.get("rows")
        if raw_rows is None:
            rows = self._DEFAULT_ROWS
        else:
            try:
                rows = int(raw_rows)
            except (TypeError, ValueError, OverflowError):
                return self._error_response(
                    "Invalid parameter: rows",
                    f"rows must be an integer between 1 and {self._MAX_ROWS}.",
                )
        rows = max(1, min(rows, self._MAX_ROWS))

        params = {"_q": query, "_format": "json", "rows": rows}
        if organization:
            # The new endpoint takes the organization *slug* (e.g. 'epa-gov')
            # as a separate query param, not as a CKAN fq filter.
            params["organization"] = organization

        try:
            resp = requests.get(
                DATAGOV_SEARCH, params=params, headers=_BROWSER_HEADERS, timeout=30
            )
            resp.raise_for_status()
        except requests.RequestException as exc:
            return self._error_response("Data.gov API request failed", str(exc))

        # Decode separately from the request: recent versions of requests
        # raise a JSON error that is *also* a RequestException, so sharing a
        # try block would report a bad payload as a transport failure.
        try:
            body = resp.json()
        except ValueError as exc:
            return self._error_response(
                "Data.gov API returned non-JSON response", str(exc)
            )

        if not isinstance(body, dict):
            return self._error_response(
                "Data.gov API returned an unexpected response",
                f"Expected a JSON object, got {type(body).__name__}.",
            )

        results = body.get("results")
        if not isinstance(results, list):
            results = []
        datasets = [
            self._normalise_package(pkg) for pkg in results if isinstance(pkg, dict)
        ]

        return {
            "status": "success",
            "data": {
                "query": query,
                "organization": organization,
                # NOTE(review): this is the number of hits in this response,
                # not necessarily the catalog-wide match count -- confirm
                # whether the endpoint exposes a total and use it if so.
                "total_count": len(results),
                "returned": len(datasets),
                "datasets": datasets,
            },
            "metadata": {
                "source": "Data.gov (Solr search)",
                "api": DATAGOV_SEARCH,
            },
        }