Source code for tooluniverse.crossref_tool
import re

import requests

from .base_tool import BaseTool
from .tool_registry import register_tool
@register_tool("CrossrefTool")
class CrossrefTool(BaseTool):
    """
    Search the Crossref Works API for articles by keyword.

    Expects a ``query`` string (required), an optional ``limit`` on the number
    of results (default 10, capped at 100), and an optional Crossref ``filter``
    string such as ``type:journal-article,from-pub-date:2020-01-01``.
    """
    def __init__(
        self,
        tool_config,
        base_url="https://api.crossref.org/works",
    ):
        super().__init__(tool_config)
        self.base_url = base_url
    def run(self, arguments):
        query = arguments.get("query")
        rows = int(arguments.get("limit", 10))
        # e.g. 'type:journal-article,from-pub-date:2020-01-01'
        filter_str = arguments.get("filter")
        if not query:
            return {"error": "`query` parameter is required."}
        return self._search(query, rows, filter_str)
    def _search(self, query, rows, filter_str):
        # Clamp the requested number of rows to the range 1-100.
        params = {"query": query, "rows": max(1, min(rows, 100))}
        if filter_str:
            params["filter"] = filter_str
        try:
            response = requests.get(self.base_url, params=params, timeout=20)
        except requests.RequestException as e:
            return {
                "error": "Network error calling Crossref API",
                "reason": str(e),
            }
        if response.status_code != 200:
            return {
                "error": f"Crossref API error {response.status_code}",
                "reason": response.reason,
            }
        data = response.json().get("message", {}).get("items", [])
        results = []
        for item in data:
            # Extract title
            title_list = item.get("title") or []
            title = title_list[0] if title_list else None
            # Extract abstract and strip any embedded markup tags
            abstract = item.get("abstract")
            if abstract and isinstance(abstract, str):
                abstract = re.sub(r"<[^>]+>", "", abstract)
                abstract = abstract.strip()
            # Extract author information
            authors = []
            author_list = item.get("author", [])
            if isinstance(author_list, list):
                for author in author_list:
                    if isinstance(author, dict):
                        given = author.get("given", "")
                        family = author.get("family", "")
                        if given and family:
                            authors.append(f"{given} {family}")
                        elif family:
                            authors.append(family)
            # Extract publication year from the issued date-parts
            year = None
            issued = item.get("issued", {}).get("date-parts") or []
            if issued and issued[0]:
                year = issued[0][0]
            # Extract URL and DOI
            url = item.get("URL")
            doi = item.get("DOI")
            # Extract journal information
            container_title = item.get("container-title") or []
            journal = container_title[0] if container_title else None
            # Extract citation count, defaulting to 0 when missing or non-numeric
            citations = item.get("is-referenced-by-count", 0)
            try:
                citations = int(citations)
            except (ValueError, TypeError):
                citations = 0
            # Infer open access status from license metadata; Crossref does not
            # provide a definitive open-access flag, so this is a heuristic.
            open_access = False
            license_info = item.get("license", [])
            if isinstance(license_info, list) and license_info:
                # License information suggests, but does not guarantee,
                # that the work is open access.
                open_access = True
            # Extract keywords from the subject terms
            keywords = []
            subject_list = item.get("subject", [])
            if isinstance(subject_list, list):
                keywords.extend(subject_list)
            # Extract article type
            article_type = item.get("type", "Unknown")
            # Extract publisher
            publisher = item.get("publisher", "Unknown")
            # Handle missing abstract
            if not abstract:
                abstract = "Abstract not available"
            results.append(
                {
                    "title": title or "Title not available",
                    "abstract": abstract,
                    "authors": (
                        authors if authors else "Author information not available"
                    ),
                    "journal": journal or "Journal information not available",
                    "year": year,
                    "doi": doi or "DOI not available",
                    "url": url or "URL not available",
                    "citations": citations,
                    "open_access": open_access,
                    "keywords": keywords if keywords else "Keywords not available",
                    "article_type": article_type,
                    "publisher": publisher,
                    "source": "Crossref",
                    "data_quality": {
                        "has_abstract": bool(
                            abstract and abstract != "Abstract not available"
                        ),
                        "has_authors": bool(authors),
                        "has_journal": bool(journal),
                        "has_year": bool(year),
                        "has_doi": bool(doi),
                        "has_citations": bool(citations and citations > 0),
                        "has_keywords": bool(keywords),
                        "has_url": bool(url),
                    },
                }
            )
        return results
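

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the module above). The shape of
# `tool_config` passed to CrossrefTool is an assumption here; the actual
# configuration schema is defined elsewhere in tooluniverse and may require
# additional fields.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Hypothetical minimal config; adjust to the real tool_config schema.
    tool = CrossrefTool(tool_config={"name": "CrossrefTool"})
    papers = tool.run(
        {
            "query": "CRISPR gene editing",
            "limit": 5,
            # Restrict to journal articles published since 2020.
            "filter": "type:journal-article,from-pub-date:2020-01-01",
        }
    )
    if isinstance(papers, dict) and "error" in papers:
        print("Request failed:", papers)
    else:
        for paper in papers:
            print(f'{paper["year"]}  {paper["title"]}  ({paper["doi"]})')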