Source code for tooluniverse.openalex_tool
import requests
from typing import Any, Dict, Optional
from .base_tool import BaseTool
from .http_utils import request_with_retry
from .tool_registry import register_tool
@register_tool("OpenAlexTool")
class OpenAlexTool(BaseTool):
"""
Tool to retrieve literature from OpenAlex based on search keywords.
"""
def __init__(self, tool_config):
super().__init__(tool_config)
self.base_url = "https://api.openalex.org/works"
def run(self, arguments):
"""Main entry point for the tool."""
search_keywords = arguments.get("search_keywords")
max_results = arguments.get("max_results", 10)
year_from = arguments.get("year_from", None)
year_to = arguments.get("year_to", None)
open_access = arguments.get("open_access", None)
return self.search_literature(
search_keywords, max_results, year_from, year_to, open_access
)
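# Illustrative call (a sketch, not part of the original source; it assumes only that
# BaseTool accepts a plain dict as tool_config):
#
#     tool = OpenAlexTool({"name": "openalex_literature_search"})
#     papers = tool.run({
#         "search_keywords": "CRISPR gene editing",
#         "max_results": 5,
#         "year_from": 2020,
#         "open_access": True,
#     })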
def search_literature(
self,
search_keywords,
max_results=10,
year_from=None,
year_to=None,
open_access=None,
):
"""
Search for literature using OpenAlex API.
Parameters
search_keywords (str): Keywords to search for in title, abstract, and content.
max_results (int): Maximum number of results to return (default: 10).
year_from (int): Start year for publication date filter (optional).
year_to (int): End year for publication date filter (optional).
open_access (bool): Filter for open access papers only (optional).
Returns
list: List of dictionaries containing paper information.
"""
# Build query parameters
params = {
"search": search_keywords,
"per-page": min(max_results, 200), # OpenAlex allows max 200 per page
"sort": "cited_by_count:desc", # Sort by citation count (most cited first)
"mailto": "support@openalex.org", # Polite pool access
}
# Add year filters if provided
filters = []
if year_from is not None and year_to is not None:
filters.append(f"publication_year:{year_from}-{year_to}")
elif year_from is not None:
filters.append(f"from_publication_date:{year_from}-01-01")
elif year_to is not None:
filters.append(f"to_publication_date:{year_to}-12-31")
# Add open access filter if specified
if open_access is True:
filters.append("is_oa:true")
elif open_access is False:
filters.append("is_oa:false")
if filters:
params["filter"] = ",".join(filters)
try:
response = requests.get(self.base_url, params=params, timeout=30)  # avoid hanging on unresponsive connections
response.raise_for_status()
data = response.json()
papers = []
for work in data.get("results", []):
try:
paper_info = self._extract_paper_info(work)
papers.append(paper_info)
except Exception:
# Skip papers with missing data rather than failing completely
continue
print(
f"[OpenAlex] Retrieved {len(papers)} papers for keywords: '{search_keywords}'"
)
return papers
except requests.exceptions.RequestException as e:
return f"Error retrieving data from OpenAlex: {e}"
def _extract_paper_info(self, work):
"""
Extract relevant information from a work object returned by OpenAlex API.
Parameters
work (dict): Work object from OpenAlex API response.
Returns
dict: Formatted paper information.
"""
# Extract title
title = work.get("title", "No title available")
# Extract abstract (display_name from abstract_inverted_index if available)
abstract = None
if work.get("abstract_inverted_index"):
# Reconstruct abstract from inverted index
abstract_dict = work["abstract_inverted_index"]
abstract_words = [""] * 500 # Assume max 500 words
for word, positions in abstract_dict.items():
for pos in positions:
if pos < len(abstract_words):
abstract_words[pos] = word
abstract = " ".join([word for word in abstract_words if word]).strip()
if not abstract:
abstract = "Abstract not available"
# Extract authors
authors = []
for authorship in work.get("authorships", []):
author = authorship.get("author", {})
author_name = author.get("display_name", "Unknown Author")
authors.append(author_name)
# Extract publication year
publication_year = work.get("publication_year", "Year not available")
# Extract organizations/affiliations
organizations = set()
for authorship in work.get("authorships", []):
for institution in authorship.get("institutions", []):
org_name = institution.get("display_name")
if org_name:
organizations.add(org_name)
# Extract additional useful information
primary_location = work.get("primary_location") or {}
source = primary_location.get("source") or {}
venue = source.get("display_name", "Unknown venue")
doi = work.get("doi", "No DOI")
citation_count = work.get("cited_by_count", 0)
open_access_info = work.get("open_access") or {}
open_access = open_access_info.get("is_oa", False)
pdf_url = open_access_info.get("oa_url")
# Extract keywords/concepts
keywords = []
concepts = work.get("concepts", [])
if isinstance(concepts, list):
for concept in concepts:
if isinstance(concept, dict):
concept_name = concept.get("display_name", "")
if concept_name:
keywords.append(concept_name)
# Extract article type
article_type = work.get("type", "Unknown")
# Extract publisher (reuses the source object extracted above)
publisher = source.get("publisher", "Unknown")
return {
"title": title,
"abstract": abstract,
"authors": authors,
"year": publication_year,
"organizations": list(organizations),
"venue": venue,
"doi": doi,
"citation_count": citation_count,
"open_access": open_access,
"pdf_url": pdf_url,
"keywords": keywords if keywords else "Keywords not available",
"article_type": article_type,
"publisher": publisher,
"openalex_id": work.get("id", ""),
"url": work.get("doi") if work.get("doi") else work.get("id", ""),
"data_quality": {
"has_abstract": bool(abstract and abstract != "Abstract not available"),
"has_authors": bool(authors),
"has_venue": bool(venue and venue != "Unknown venue"),
"has_year": bool(
publication_year and publication_year != "Year not available"
),
"has_doi": bool(doi and doi != "No DOI"),
"has_citation_count": bool(citation_count and citation_count > 0),
"has_keywords": bool(keywords),
},
}
def get_paper_by_doi(self, doi):
"""
Retrieve a specific paper by its DOI.
Parameters
doi (str): DOI of the paper to retrieve.
Returns
dict: Paper information or None if not found.
"""
try:
# OpenAlex supports DOI lookup directly
url = f"https://api.openalex.org/works/https://doi.org/{doi}"
params = {"mailto": "support@openalex.org"}
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()
work = response.json()
return self._extract_paper_info(work)
except requests.exceptions.RequestException as e:
print(f"Error retrieving paper by DOI {doi}: {e}")
return None
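# For example, get_paper_by_doi("10.1000/xyz123") requests
# https://api.openalex.org/works/https://doi.org/10.1000/xyz123 and, on success, returns the
# same dictionary shape produced by _extract_paper_info.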
def get_papers_by_author(self, author_name, max_results=10):
"""
Retrieve papers by a specific author.
Parameters
author_name (str): Name of the author to search for.
max_results (int): Maximum number of results to return.
Returns
list: List of papers by the author.
"""
try:
params = {
"filter": f"author.display_name.search:{author_name}",
"per-page": min(max_results, 200),
"sort": "cited_by_count:desc",
"mailto": "support@openalex.org",
}
response = requests.get(self.base_url, params=params, timeout=30)
response.raise_for_status()
data = response.json()
papers = []
for work in data.get("results", []):
paper_info = self._extract_paper_info(work)
papers.append(paper_info)
print(
f"[OpenAlex] Retrieved {len(papers)} papers by author: '{author_name}'"
)
return papers
except requests.exceptions.RequestException as e:
return f"Error retrieving papers by author {author_name}: {e}"
@register_tool("OpenAlexRESTTool")
class OpenAlexRESTTool(BaseTool):
"""
Generic JSON-config driven OpenAlex REST tool.
Notes:
- OpenAlex strongly encourages providing a contact email via the `mailto` query param.
- This tool returns a consistent wrapper: {status, data, url} (plus error fields on failure).
"""
def __init__(self, tool_config):
super().__init__(tool_config)
self.base_url = "https://api.openalex.org"
self.session = requests.Session()
self.session.headers.update({"Accept": "application/json"})
self.timeout = 30
@staticmethod
def _normalize_openalex_id(value: Any) -> Any:
if isinstance(value, str) and "openalex.org/" in value:
return value.rstrip("/").split("/")[-1]
return value
@staticmethod
def _normalize_doi(value: Any) -> Any:
if not isinstance(value, str):
return value
v = value.strip()
if "doi.org/" in v:
return v.split("doi.org/")[-1]
if v.lower().startswith("doi:"):
return v[4:]
return v
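# Examples of the normalization behavior:
#     _normalize_openalex_id("https://openalex.org/W2741809807")  -> "W2741809807"
#     _normalize_doi("https://doi.org/10.1000/xyz123")            -> "10.1000/xyz123"
#     _normalize_doi("doi:10.1000/xyz123")                        -> "10.1000/xyz123"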
def _build_url_and_params(
self, arguments: Dict[str, Any]
) -> tuple[str, Dict[str, Any]]:
fields = self.tool_config.get("fields", {}) or {}
path_tmpl = fields.get("path", "")
if not path_tmpl:
raise ValueError("OpenAlexRESTTool requires fields.path in tool config")
# Replace placeholders in the path.
path = path_tmpl
for k, v in (arguments or {}).items():
if v is None:
continue
if k == "doi":
v = self._normalize_doi(v)
elif k.endswith("_id") or k in {
"openalex_id",
"author_id",
"institution_id",
"concept_id",
"work_id",
}:
v = self._normalize_openalex_id(v)
path = path.replace(f"{{{k}}}", str(v))
url = f"{self.base_url}{path}"
# Build query params (optional).
params: Dict[str, Any] = {}
default_params = fields.get("default_params")
if isinstance(default_params, dict):
params.update(default_params)
param_map = (
fields.get("param_map") if isinstance(fields.get("param_map"), dict) else {}
)
path_params = set(fields.get("path_params") or [])
for k, v in (arguments or {}).items():
if v is None or k in path_params:
continue
api_key = param_map.get(k, k)
params[api_key] = v
# Provide a default mailto unless user overrides.
if "mailto" not in params:
params["mailto"] = "support@openalex.org"
return url, params
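# Illustrative tool config (a sketch; the field names mirror the lookups above, while the
# endpoint path and parameter names are example values):
#
#     {
#         "fields": {
#             "path": "/works/{work_id}",
#             "path_params": ["work_id"],
#             "param_map": {"select_fields": "select"},
#             "default_params": {"per-page": 25}
#         }
#     }
#
# With arguments {"work_id": "https://openalex.org/W2741809807", "select_fields": "id,title"}
# this builds the URL "https://api.openalex.org/works/W2741809807" and the query params
# {"per-page": 25, "select": "id,title", "mailto": "support@openalex.org"}.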
def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
url: Optional[str] = None
try:
url, params = self._build_url_and_params(arguments or {})
resp = request_with_retry(
self.session,
"GET",
url,
params=params,
timeout=self.timeout,
max_attempts=3,
)
final_url = getattr(resp, "url", None) or url
if resp.status_code != 200:
return {
"status": "error",
"error": "OpenAlex API error",
"url": final_url,
"status_code": resp.status_code,
"detail": (resp.text or "")[:500],
}
return {"status": "success", "data": resp.json(), "url": final_url}
except Exception as e:
return {
"status": "error",
"error": f"OpenAlex API error: {str(e)}",
"url": url,
}
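# Illustrative end-to-end usage (a sketch; assumes BaseTool stores the config dict as
# self.tool_config, which the path building above relies on):
#
#     config = {"fields": {"path": "/works/{work_id}", "path_params": ["work_id"]}}
#     tool = OpenAlexRESTTool(config)
#     result = tool.run({"work_id": "W2741809807"})
#     if result["status"] == "success":
#         print(result["data"]["display_name"])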