# Source code for tooluniverse.re3data_tool

"""
re3data tool for searching and retrieving research data repository metadata.

re3data.org is a global registry of 3,000+ research data repositories covering
all academic disciplines. The API returns XML which this tool parses into JSON.
"""

import xml.etree.ElementTree as ET
from typing import Any, Dict, List, Optional
from urllib.parse import quote

import requests

from .base_tool import BaseTool
from .tool_registry import register_tool
from .http_utils import request_with_retry

# Prefix-to-URI namespace map handed to ElementTree find()/findall() so that
# paths can be written as "r3d:<tag>" (re3data schema 2-2 namespace).
_NS = {"r3d": "http://www.re3data.org/schema/2-2"}


def _text(el: Optional[ET.Element], tag: str) -> Optional[str]:
    """Extract text from a namespaced child element."""
    child = el.find(f"r3d:{tag}", _NS) if el is not None else None
    return child.text.strip() if child is not None and child.text else None


def _texts(el: Optional[ET.Element], tag: str) -> List[str]:
    """Extract text from all matching namespaced child elements."""
    if el is None:
        return []
    return [
        c.text.strip()
        for c in el.findall(f"r3d:{tag}", _NS)
        if c.text and c.text.strip()
    ]


def _parse_repo_list(xml_text: str) -> List[Dict[str, Any]]:
    """Parse the search results XML list into a list of repository summaries."""
    root = ET.fromstring(xml_text)
    repos = []
    for repo_el in root.findall("repository"):
        repo_id = repo_el.findtext("id", "").strip()
        name = repo_el.findtext("name", "").strip()
        doi = repo_el.findtext("doi", "").strip()
        repos.append({"id": repo_id, "name": name, "doi": doi})
    return repos


def _parse_repo_detail(xml_text: str) -> Dict[str, Any]:
    """Convert a single-repository XML document into a plain dictionary.

    Returns:
        A dict with the repository's scalar fields (``id``, ``name``, ``url``,
        ``description``, ``type``, ``size``, ``mission_statement_url``) and
        list fields (``languages``, ``subjects``, ``keywords``,
        ``content_types``, ``data_access``, ``data_licenses``,
        ``institutions``). If no ``r3d:repository`` element is found below
        the root, a dict holding only an ``error`` message is returned.
    """
    repo = ET.fromstring(xml_text).find(".//r3d:repository", _NS)
    if repo is None:
        return {"error": "No repository element found in response"}

    # Each subject is reported together with its subjectScheme attribute.
    subjects = [
        {"subject": node.text.strip(), "scheme": node.get("subjectScheme", "")}
        for node in repo.findall("r3d:subject", _NS)
        if node.text and node.text.strip()
    ]

    # Access entries without a type are dropped; restrictions are optional.
    data_access = []
    for node in repo.findall("r3d:dataAccess", _NS):
        kind = _text(node, "dataAccessType")
        if not kind:
            continue
        record = {"type": kind}
        limits = _texts(node, "dataAccessRestriction")
        if limits:
            record["restrictions"] = limits
        data_access.append(record)

    # Institutions need a name; the remaining keys appear only when present.
    optional_inst_fields = (
        ("country", "institutionCountry"),
        ("type", "institutionType"),
        ("url", "institutionURL"),
    )
    institutions = []
    for node in repo.findall("r3d:institution", _NS):
        label = _text(node, "institutionName")
        if not label:
            continue
        record = {"name": label}
        for key, child_tag in optional_inst_fields:
            value = _text(node, child_tag)
            if value:
                record[key] = value
        institutions.append(record)

    # Licenses always carry a "url" key, even when no URL was given.
    data_licenses = []
    for node in repo.findall("r3d:dataLicense", _NS):
        title = _text(node, "dataLicenseName")
        if title:
            data_licenses.append(
                {"name": title, "url": _text(node, "dataLicenseURL")}
            )

    return {
        "id": _text(repo, "re3data.orgIdentifier"),
        "name": _text(repo, "repositoryName"),
        "url": _text(repo, "repositoryURL"),
        "description": _text(repo, "description"),
        "type": _text(repo, "type"),
        "size": _text(repo, "size"),
        "languages": _texts(repo, "repositoryLanguage"),
        "subjects": subjects,
        "keywords": _texts(repo, "keyword"),
        "content_types": _texts(repo, "contentType"),
        "data_access": data_access,
        "data_licenses": data_licenses,
        "institutions": institutions,
        "mission_statement_url": _text(repo, "missionStatementURL"),
    }



# [docs]
@register_tool("Re3DataTool")
class Re3DataTool(BaseTool):
    """Search and retrieve metadata from the re3data.org registry of research data repositories.

    One class serves two tool names, chosen by the ``name`` entry of the tool
    configuration:

    * ``re3data_search_repositories`` -- free-text search of the registry.
    * ``re3data_get_repository`` -- detailed metadata for one repository ID.

    ``run`` and both handlers return a dict with a ``status`` key
    (``"success"`` or ``"error"``); request and parsing failures are reported
    in that dict instead of being raised.
    """

    BASE_URL = "https://www.re3data.org/api/beta"

    def __init__(self, tool_config):
        """Store the tool configuration and prepare a reusable HTTP session."""
        super().__init__(tool_config)
        # A single session so every request carries the same User-Agent.
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": "ToolUniverse/1.0 (re3data client)"})
        # Forwarded as ``timeout`` to request_with_retry (presumably seconds).
        self.timeout = 30

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Dispatch *arguments* to the handler matching the configured tool name.

        Returns:
            The handler's result dict, or an error dict when the configured
            name is not one of the two supported tools.
        """
        tool_name = self.tool_config.get("name", "")
        handlers = {
            "re3data_search_repositories": self._search_repositories,
            "re3data_get_repository": self._get_repository,
        }
        handler = handlers.get(tool_name)
        if handler is None:
            return {"status": "error", "error": f"Unknown tool: {tool_name}"}
        return handler(arguments)

    @staticmethod
    def _filter_terms(value: Any) -> List[str]:
        """Normalize a filter argument into a list of lower-cased terms.

        Accepts a single value or a list/tuple/set of values; empty entries
        are dropped. A plain string is kept as one term.
        """
        if not value:
            return []
        items = value if isinstance(value, (list, tuple, set, frozenset)) else [value]
        return [str(item).lower() for item in items if item]

    def _search_repositories(self, args: Dict[str, Any]) -> Dict[str, Any]:
        """Search the registry and return matching repository summaries.

        Args:
            args: Tool arguments. ``query`` is required. ``subjects`` (a
                string or a list of strings) optionally narrows the results
                client-side to repositories whose name or ID contains any of
                the terms, case-insensitively. ``countries`` is accepted but
                not applied, because the list response carries no country
                information.

        Returns:
            ``{"status": "success", "data": [...], "count": n}`` on success,
            otherwise a dict with ``status`` ``"error"`` and an ``error``
            message (plus ``detail`` for non-200 HTTP responses).
        """
        query = args.get("query")
        if not query:
            return {"status": "error", "error": "`query` parameter is required."}

        # Normalized up front so a list-valued argument cannot raise later
        # and be misreported as an API failure.
        subject_terms = self._filter_terms(args.get("subjects"))

        try:
            resp = request_with_retry(
                self.session,
                "GET",
                f"{self.BASE_URL}/repositories",
                params={"query": query},
                timeout=self.timeout,
                max_attempts=3,
            )
            if resp.status_code != 200:
                return {
                    "status": "error",
                    "error": f"re3data API error: HTTP {resp.status_code}",
                    "detail": resp.text[:500],
                }

            repos = _parse_repo_list(resp.text)

            # Client-side filtering: the request itself only sends `query`.
            if subject_terms:
                repos = [
                    repo
                    for repo in repos
                    if any(
                        term in repo.get("name", "").lower()
                        or term in repo.get("id", "").lower()
                        for term in subject_terms
                    )
                ]

            return {
                "status": "success",
                "data": repos,
                "count": len(repos),
            }
        except Exception as e:
            # Tool boundary: report any failure as an error result.
            return {"status": "error", "error": f"re3data API error: {str(e)}"}

    def _get_repository(self, args: Dict[str, Any]) -> Dict[str, Any]:
        """Fetch and parse the detailed record of a single repository.

        Args:
            args: Tool arguments; ``repository_id`` is required.

        Returns:
            ``{"status": "success", "data": {...}, "url": ...}`` on success,
            otherwise a dict with ``status`` ``"error"`` and an ``error``
            message (plus ``detail`` for unexpected HTTP statuses).
        """
        repo_id = str(args.get("repository_id") or "").strip()
        if not repo_id:
            return {
                "status": "error",
                "error": "`repository_id` parameter is required.",
            }

        try:
            # Percent-encode the ID so characters such as "/", "?" or "#"
            # cannot change which path is requested.
            encoded_id = quote(repo_id, safe="")
            url = f"{self.BASE_URL}/repository/{encoded_id}"
            resp = request_with_retry(
                self.session,
                "GET",
                url,
                timeout=self.timeout,
                max_attempts=3,
            )
            if resp.status_code == 404:
                return {
                    "status": "error",
                    "error": f"Repository not found: {repo_id}",
                }
            if resp.status_code != 200:
                return {
                    "status": "error",
                    "error": f"re3data API error: HTTP {resp.status_code}",
                    "detail": resp.text[:500],
                }

            detail = _parse_repo_detail(resp.text)
            # The parser signals "no repository element" with an error dict;
            # surface that as a failure rather than a successful result.
            if "error" in detail:
                return {"status": "error", "error": detail["error"]}

            return {
                "status": "success",
                "data": detail,
                "url": url,
            }
        except Exception as e:
            # Tool boundary: report any failure as an error result.
            return {"status": "error", "error": f"re3data API error: {str(e)}"}