# Source code for tooluniverse.idigbio_tool
"""
iDigBio tools for ToolUniverse — biodiversity specimen records.
iDigBio aggregates 130M+ digitized biodiversity specimen records (museum, herbarium,
paleo collections) in Darwin Core format. These tools search records by taxon/locality
and retrieve a single record by UUID.
API: https://search.idigbio.org/v2 (public, no authentication, JSON)
"""
import json
from typing import Any, Dict
import requests
from .base_tool import BaseTool
from .tool_registry import register_tool
# Base URL of the public iDigBio v2 API; endpoint paths are appended to it.
IDIGBIO_BASE = "https://search.idigbio.org/v2"
# Darwin-Core query fields users may filter on (passed through to iDigBio's rq).
# _build_rq reads only these names from the tool arguments; any other filter
# has to be supplied through a raw ``rq`` query instead.
# NOTE(review): iDigBio's index term for dwc:recordedBy appears to be
# "collector" -- confirm that "recordedby" is accepted by the API.
_QUERY_FIELDS = (
"scientificname",
"genus",
"family",
"country",
"stateprovince",
"recordedby",
"catalognumber",
"collectioncode",
"phylum",
"class",
"order",
)
def _summarize(item: Dict[str, Any]) -> Dict[str, Any]:
idx = item.get("indexTerms", {}) or {}
data = item.get("data", {}) or {}
return {
"uuid": item.get("uuid"),
"scientific_name": idx.get("scientificname") or data.get("dwc:scientificName"),
"family": idx.get("family"),
"genus": idx.get("genus"),
"taxon_rank": idx.get("taxonrank"),
"country": idx.get("country") or data.get("dwc:country"),
"state_province": idx.get("stateprovince"),
"county": idx.get("county"),
"recorded_by": data.get("dwc:recordedBy"),
"catalog_number": idx.get("catalognumber"),
"collection_code": data.get("dwc:collectionCode"),
"occurrence_id": data.get("dwc:occurrenceID"),
}
def _media_summary(item: Dict[str, Any]) -> Dict[str, Any]:
"""Flatten an iDigBio media (image) record to the most useful fields."""
data = item.get("data", {}) or {}
idx = item.get("indexTerms", {}) or {}
return {
"uuid": item.get("uuid"),
"type": data.get("dcterms:type") or idx.get("type"),
"format": data.get("dcterms:format") or idx.get("format"),
"provider_managed_id": data.get("ac:providerManagedID"),
"creator": data.get("dc:creator"),
"rights_owner": data.get("xmpRights:Owner"),
"usage_terms": data.get("xmpRights:UsageTerms"),
"access_uri": data.get("ac:accessURI") or data.get("ac:goodQualityAccessURI"),
"records": item.get("records") or idx.get("records"),
}
def _build_rq(arguments: Dict[str, Any]) -> Dict[str, str]:
"""Build the iDigBio Darwin-Core query (rq) from per-field args or a raw rq."""
raw = arguments.get("rq")
if isinstance(raw, dict) and raw:
return {k: v for k, v in raw.items() if v is not None}
if isinstance(raw, str) and raw.strip():
try:
parsed = json.loads(raw)
if isinstance(parsed, dict) and parsed:
return parsed
except (ValueError, TypeError):
pass
return {
f: arguments[f].strip()
for f in _QUERY_FIELDS
if isinstance(arguments.get(f), str) and arguments[f].strip()
}
def _idigbio_get(url: str, params: Dict[str, Any], timeout: int):
    """GET an iDigBio endpoint and decode its JSON body.

    Args:
        url: Full endpoint URL.
        params: Query-string parameters for the request.
        timeout: Request timeout in seconds.

    Returns:
        ``(payload, None)`` on success, or ``(None, error_envelope)`` on
        failure, where the envelope is the standard
        ``{"status": "error", "error": ...}`` dict. Timeouts, connection and
        HTTP-status errors, and undecodable bodies are all reported through
        the envelope, so callers never have to catch anything.
    """
    try:
        resp = requests.get(
            url,
            params=params,
            headers={"Accept": "application/json"},
            timeout=timeout,
        )
        resp.raise_for_status()
    except requests.exceptions.Timeout:
        return None, {
            "status": "error",
            "error": f"iDigBio request timed out after {timeout}s",
        }
    except requests.exceptions.RequestException as e:
        return None, {"status": "error", "error": f"iDigBio request failed: {e}"}

    # Decode in a separate ``try``: recent ``requests`` releases raise a JSON
    # error that is both a RequestException and a ValueError, so sharing the
    # handler chain above would report a bad body as a generic request failure
    # and leave the non-JSON branch unreachable.
    try:
        return resp.json(), None
    except ValueError:
        return None, {
            "status": "error",
            "error": "iDigBio returned a non-JSON response",
        }
@register_tool("iDigBioSearchTool")
class iDigBioSearchTool(BaseTool):
    """Search iDigBio specimen records (records / media) and summary facets.

    Dispatch is driven by ``fields.mode`` in the tool config:

    - unset / "records" : /search/records/ specimen occurrence records (default)
    - "media"           : /search/media/ specimen image/media records
    - "summary"         : /summary/count/records/ total, plus
                          /summary/top/records/ facet counts when
                          ``top_fields`` is supplied

    Any other mode value is handled like "records". Every path returns a
    ``{"status": "success" | "error", ...}`` envelope.
    """

    # Shared by the media and summary modes when no usable filter was given.
    _EMPTY_QUERY_ERROR = (
        "Provide at least one query field (e.g. scientificname, "
        "genus, family) or an rq query."
    )

    def __init__(self, tool_config: Dict[str, Any]):
        """Read ``timeout`` (default 30) and ``mode`` from the config's ``fields``."""
        super().__init__(tool_config)
        # ``fields`` may be absent or explicitly null.
        fields = tool_config.get("fields") or {}
        self.timeout = fields.get("timeout", 30)
        self.mode = fields.get("mode", "records")

    @staticmethod
    def _bounded_int(value: Any, default: int, low: int = 1, high: int = 100) -> int:
        """Coerce *value* to an int clamped to [low, high].

        A falsy value (None, 0, "") selects *default*; a value that cannot be
        converted (including infinities) also yields *default*.
        """
        try:
            return max(low, min(int(value or default), high))
        except (TypeError, ValueError, OverflowError):
            return default

    @staticmethod
    def _as_dict(payload: Any) -> Dict[str, Any]:
        """Return *payload* if it is a JSON object, otherwise an empty dict."""
        return payload if isinstance(payload, dict) else {}

    def _limit(self, arguments: Dict[str, Any], default: int = 10) -> int:
        """Return the requested ``limit`` clamped to 1..100, or *default*."""
        return self._bounded_int(arguments.get("limit"), default)

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the search variant selected by ``self.mode``."""
        if self.mode == "summary":
            return self._run_summary(arguments)
        if self.mode == "media":
            return self._run_media(arguments)
        return self._run_records(arguments)

    def _run_search(
        self,
        arguments: Dict[str, Any],
        endpoint: str,
        summarize,
        empty_query_error: str,
    ) -> Dict[str, Any]:
        """Query ``/search/<endpoint>/`` and flatten each hit with *summarize*.

        Args:
            arguments: Tool-call arguments (query fields / ``rq`` / ``limit``).
            endpoint: Path segment under ``/search/``.
            summarize: Callable turning one raw item dict into a flat dict.
            empty_query_error: Message returned when no filter was supplied.
        """
        rq = _build_rq(arguments)
        if not rq:
            return {"status": "error", "error": empty_query_error}

        params = {"rq": json.dumps(rq), "limit": self._limit(arguments)}
        payload, err = _idigbio_get(
            f"{IDIGBIO_BASE}/search/{endpoint}/", params, self.timeout
        )
        if err is not None:
            return err

        payload = self._as_dict(payload)
        items = payload.get("items")
        if not isinstance(items, list):
            items = []
        data = [summarize(it) for it in items if isinstance(it, dict)]
        return {
            "status": "success",
            "data": data,
            "metadata": {
                "total_available": payload.get("itemCount"),
                # Count what is actually returned, not the unfiltered items.
                "returned": len(data),
                "query": rq,
                "source": "iDigBio",
            },
        }

    def _run_records(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search specimen occurrence records."""
        return self._run_search(
            arguments,
            "records",
            _summarize,
            "Provide at least one query field: " + ", ".join(_QUERY_FIELDS),
        )

    def _run_media(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search specimen media (image) records."""
        return self._run_search(
            arguments, "media", _media_summary, self._EMPTY_QUERY_ERROR
        )

    def _run_summary(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Return the total match count and, optionally, top-value facets.

        ``top_fields`` may be a comma-separated string or a list of field
        names; ``count`` (1..100, default 10) is forwarded with the facet
        request. Without ``top_fields`` only the total is fetched.
        """
        rq = _build_rq(arguments)
        if not rq:
            return {"status": "error", "error": self._EMPTY_QUERY_ERROR}
        rq_json = json.dumps(rq)

        # Always return the exact total count for the query.
        count_payload, err = _idigbio_get(
            f"{IDIGBIO_BASE}/summary/count/records/",
            {"rq": rq_json},
            self.timeout,
        )
        if err is not None:
            return err
        total = self._as_dict(count_payload).get("itemCount")

        requested = arguments.get("top_fields")
        if isinstance(requested, list):
            requested = ",".join(str(f) for f in requested if f)
        top_fields = requested.strip() if isinstance(requested, str) else ""

        facets: Dict[str, Any] = {}
        if top_fields:
            count = self._bounded_int(arguments.get("count"), 10)
            top_payload, err = _idigbio_get(
                f"{IDIGBIO_BASE}/summary/top/records/",
                {"rq": rq_json, "top_fields": top_fields, "count": count},
                self.timeout,
            )
            if err is not None:
                return err
            # Keep only the per-field facet objects; drop the scalar total.
            facets = {
                k: v
                for k, v in self._as_dict(top_payload).items()
                if k != "itemCount" and isinstance(v, dict)
            }

        return {
            "status": "success",
            "data": {"itemCount": total, "facets": facets},
            "metadata": {
                "query": rq,
                # The value actually sent, or None when no facets were requested.
                "top_fields": top_fields or None,
                "source": "iDigBio",
            },
        }
@register_tool("iDigBioRecordTool")
class iDigBioRecordTool(BaseTool):
    """Retrieve a single iDigBio specimen record by UUID."""

    def __init__(self, tool_config: Dict[str, Any]):
        """Read ``timeout`` (default 30) from the config's ``fields``."""
        super().__init__(tool_config)
        # ``fields`` may be absent or explicitly null.
        fields = tool_config.get("fields") or {}
        self.timeout = fields.get("timeout", 30)

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Fetch ``/view/records/<uuid>`` and return a flattened record.

        Args:
            arguments: Must contain ``uuid``.

        Returns:
            An error envelope when ``uuid`` is missing or the request fails.
            A success envelope with empty ``data`` when the record does not
            exist (HTTP 404) or the body is not a record object. Otherwise a
            success envelope whose ``data`` is the summary plus the full
            Darwin Core record under ``data["data"]``.
        """
        raw_uuid = arguments.get("uuid")
        # Accept non-string identifiers (e.g. uuid.UUID objects) instead of
        # crashing on ``.strip()``.
        uuid = str(raw_uuid).strip() if raw_uuid else ""
        if not uuid:
            return {"status": "error", "error": "'uuid' is required"}

        try:
            resp = requests.get(
                f"{IDIGBIO_BASE}/view/records/{uuid}",
                headers={"Accept": "application/json"},
                timeout=self.timeout,
            )
            # An unknown UUID is an empty result, not a failure.
            if resp.status_code == 404:
                return {
                    "status": "success",
                    "data": {},
                    "metadata": {
                        "query_uuid": uuid,
                        "note": f"No iDigBio record '{uuid}'.",
                    },
                }
            resp.raise_for_status()
        except requests.exceptions.Timeout:
            return {
                "status": "error",
                "error": f"iDigBio request timed out after {self.timeout}s",
            }
        except requests.exceptions.RequestException as e:
            return {"status": "error", "error": f"iDigBio request failed: {e}"}

        # Decode in a separate ``try``: recent ``requests`` releases raise a
        # JSON error that is both a RequestException and a ValueError, so the
        # handler above would otherwise swallow it with the wrong message.
        try:
            rec = resp.json()
        except ValueError:
            return {"status": "error", "error": "iDigBio returned a non-JSON response"}

        if not isinstance(rec, dict) or not rec.get("uuid"):
            return {"status": "success", "data": {}, "metadata": {"query_uuid": uuid}}

        summary = _summarize(rec)
        summary["data"] = rec.get("data") or {}  # full Darwin Core record
        return {
            "status": "success",
            "data": summary,
            "metadata": {"query_uuid": uuid, "source": "iDigBio"},
        }