Source code for tooluniverse.fatcat_tool

import requests
from html.parser import HTMLParser
from .base_tool import BaseTool
from .tool_registry import register_tool



[docs]
class FatcatResultParser(HTMLParser):
    """Parse Fatcat search results from HTML.

    Collects every anchor whose ``href`` has the exact shape
    ``/fatcat/release/{release_id}`` and whose visible text is longer than
    three characters.  Each match is appended to ``self.results`` as a dict
    with the keys ``release_id``, ``url`` and ``title``.

    Attributes:
        results: Parsed result dicts, in document order.
        current_result: The result being built while inside a release anchor,
            otherwise ``None``.
        in_title: ``True`` while the parser is inside a release anchor.
        title_text: Stripped text fragments collected for the current anchor.
    """

    # Path prefix that identifies a release link on the search page.
    _RELEASE_PREFIX = "/fatcat/release/"
    # Path segments under the release prefix that are pages, not releases.
    _NON_RELEASE_IDS = frozenset({"lookup", "search"})
    # Site prefix used to turn the relative href into an absolute URL.
    _SITE_ROOT = "https://scholar.archive.org"

    def __init__(self):
        super().__init__()
        self.results = []
        self.current_result = None
        self.in_title = False
        self.title_text = []

    def handle_starttag(self, tag, attrs):
        """Start collecting a title when a release anchor opens."""
        if tag != "a":
            return

        # HTMLParser reports a valueless attribute (``<a href>``) as None;
        # normalise it so the string checks below cannot raise.
        href = dict(attrs).get("href") or ""
        if not href.startswith(self._RELEASE_PREFIX):
            return

        # Only ``/fatcat/release/{id}`` qualifies: the identifier must be
        # non-empty and must not contain further path segments.
        release_id = href[len(self._RELEASE_PREFIX):]
        if not release_id or "/" in release_id:
            return
        if release_id in self._NON_RELEASE_IDS:
            return

        self.current_result = {
            "release_id": release_id,
            "url": f"{self._SITE_ROOT}{href}",
        }
        self.in_title = True
        self.title_text = []

    def handle_data(self, data):
        """Accumulate text found inside the current release anchor."""
        if self.in_title:
            self.title_text.append(data.strip())

    def handle_endtag(self, tag):
        """Finish the current result when its anchor closes."""
        if tag != "a" or not self.in_title:
            return

        self.in_title = False
        if self.current_result and self.title_text:
            # Drop whitespace-only fragments before joining.
            title = " ".join(filter(None, self.title_text))
            if len(title) > 3:  # Filter out very short titles
                self.current_result["title"] = title
                self.results.append(self.current_result)
        self.current_result = None
        self.title_text = []





[docs]
class FatcatMetadataParser(HTMLParser):
    """Extract metadata from Fatcat release page meta tags.

    Reads ``<meta name=... content=...>`` tags and records:

    * ``citation_author`` values in ``self.authors`` (one entry per tag);
    * the year from ``citation_publication_date`` in ``self.metadata["year"]``
      as an ``int``, taken from the leading component of a ``-`` or ``/``
      separated date;
    * ``citation_doi``, ``citation_journal_title``, ``citation_publisher``,
      ``abstract`` and ``citation_pdf_url`` in ``self.metadata`` under the
      keys ``doi``, ``journal``, ``publisher``, ``abstract`` and ``pdf_url``.

    Tags with missing, empty or whitespace-only ``content`` are ignored, so
    absent values stay absent from ``metadata`` rather than being stored as
    empty strings.
    """

    # Maps a meta tag name to the metadata key its content is stored under.
    _FIELD_BY_META_NAME = {
        "citation_doi": "doi",
        "citation_journal_title": "journal",
        "citation_publisher": "publisher",
        "abstract": "abstract",
        "citation_pdf_url": "pdf_url",
    }

    def __init__(self):
        super().__init__()
        self.metadata = {}
        self.authors = []

    def handle_starttag(self, tag, attrs):
        """Record the metadata carried by a single ``<meta>`` tag."""
        if tag != "meta":
            return

        attrs_dict = dict(attrs)
        # HTMLParser yields None for valueless attributes; treat as empty.
        name = attrs_dict.get("name") or ""
        content = attrs_dict.get("content") or ""
        if not content.strip():
            return

        if name == "citation_author":
            self.authors.append(content)
        elif name == "citation_publication_date":
            # Dates appear as YYYY-MM-DD or YYYY/MM/DD; the year is the
            # leading component in both spellings.
            year_text = content.strip().replace("/", "-").split("-")[0]
            try:
                self.metadata["year"] = int(year_text)
            except ValueError:
                # Unparseable date: leave any previously found year intact.
                pass
        elif name in self._FIELD_BY_META_NAME:
            self.metadata[self._FIELD_BY_META_NAME[name]] = content





[docs]
@register_tool("FatcatScholarTool")
class FatcatScholarTool(BaseTool):
    """
    Search Internet Archive Scholar via Fatcat releases search.

    Uses web scraping of the scholar.archive.org interface to retrieve
    bibliographic information about research papers and publications.

    Parameters (arguments):
        query (str): Query string
        max_results (int): Max results (default 10, max 100)

    Detailed metadata (authors, year, DOI, ...) is fetched from the release
    page for at most the first 10 results; any further results are returned
    with their title and URL only.
    """

    # Used when ``max_results`` is absent or None.
    _DEFAULT_MAX_RESULTS = 10
    # Upper bound applied to ``max_results``.
    _MAX_RESULTS = 100
    # Cap on per-release page requests, to avoid excessive requests.
    _MAX_METADATA_FETCHES = 10

    def __init__(self, tool_config):
        super().__init__(tool_config)
        self.base_url = "https://scholar.archive.org/fatcat/release/search"

    @staticmethod
    def _error_response(error_msg):
        """Build the error dict returned for a failed call."""
        return {"status": "error", "data": {"error": error_msg}, "error": error_msg}

    @staticmethod
    def _fetch_release_metadata(url):
        """Fetch a release page and return its parsed metadata, or None.

        Best-effort: any network failure, non-200 status or parse failure
        yields None so the caller can keep the basic search-result fields.
        """
        try:
            release_resp = requests.get(url, timeout=10)
        except requests.RequestException:
            return None
        if release_resp.status_code != 200:
            return None

        meta_parser = FatcatMetadataParser()
        try:
            meta_parser.feed(release_resp.text)
        except Exception:
            # Deliberately broad: a malformed page must not fail the search.
            return None

        metadata = meta_parser.metadata
        return {
            "authors": meta_parser.authors,
            "year": metadata.get("year"),
            "doi": metadata.get("doi"),
            "journal": metadata.get("journal"),
            "publisher": metadata.get("publisher"),
            "abstract": metadata.get("abstract"),
            "pdf_url": metadata.get("pdf_url"),
        }

    def run(self, arguments=None):
        """Run a Fatcat release search.

        Args:
            arguments: Dict with ``query`` (required) and optional
                ``max_results`` (clamped to the range 1..100).

        Returns:
            ``{"status": "success", "data": [...]}`` with one dict per result
            (keys: title, authors, year, doi, journal, publisher, abstract,
            pdf_url, url, source), or an error dict with ``status`` set to
            ``"error"`` and the message under both ``error`` and
            ``data["error"]``.
        """
        arguments = arguments or {}
        query = arguments.get("query")
        if not query:
            return self._error_response("`query` parameter is required.")

        # Validate instead of letting int() raise on None / non-numeric input.
        requested = arguments.get("max_results")
        if requested is None:
            requested = self._DEFAULT_MAX_RESULTS
        try:
            max_results = int(requested)
        except (TypeError, ValueError, OverflowError):
            return self._error_response("`max_results` must be an integer.")

        # Limit results to reasonable range
        limit = max(1, min(max_results, self._MAX_RESULTS))

        params = {
            "q": query,
            "limit": limit,
        }

        try:
            resp = requests.get(self.base_url, params=params, timeout=30)
            resp.raise_for_status()
        except requests.RequestException as e:
            error_msg = f"Network/API error calling Fatcat: {str(e)}"
            return {
                "status": "error",
                "data": {
                    "error": "Network/API error calling Fatcat",
                    "reason": str(e),
                },
                "error": error_msg,
            }

        # Parse HTML to extract results
        try:
            parser = FatcatResultParser()
            parser.feed(resp.text)
            raw_results = parser.results
        except Exception as e:
            return self._error_response(
                f"Failed to parse Fatcat search results: {str(e)}"
            )

        results = []
        for index, r in enumerate(raw_results[:limit]):
            result = {
                "title": r.get("title", ""),
                "authors": [],
                "year": None,
                "doi": None,
                "journal": None,
                "publisher": None,
                "abstract": None,
                "pdf_url": None,
                "url": r.get("url", ""),
                "source": "Fatcat/IA Scholar",
            }

            # Only the first few results are enriched from their release
            # page; the rest keep the basic info from the search page.
            if index < self._MAX_METADATA_FETCHES:
                details = self._fetch_release_metadata(r["url"])
                if details is not None:
                    result.update(details)

            results.append(result)

        return {"status": "success", "data": results}