Source code for tooluniverse.clinical_society_tools

"""
Clinical Society Guideline Tools (ADA, AHA/ACC, NCCN).

Provides PubMed-based searches for clinical practice guidelines from
the American Diabetes Association (ADA), American Heart Association (AHA),
American College of Cardiology (ACC), and National Comprehensive Cancer
Network (NCCN).  Also provides web scraping for NCCN patient guidelines.
"""

import io
import os
import re
import time
import xml.etree.ElementTree as ET

import pdfplumber
import requests
from bs4 import BeautifulSoup

from .base_tool import BaseTool
from .tool_registry import register_tool

# ---------------------------------------------------------------------------
# Shared helpers
# ---------------------------------------------------------------------------

_UA = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

EUTILS_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"


def _eutils_params(extra=None):
    """Return base params with optional NCBI API key."""
    params = {}
    api_key = os.environ.get("NCBI_API_KEY")
    if api_key:
        params["api_key"] = api_key
    if extra:
        params.update(extra)
    return params


def _rate_limit():
    """Respect NCBI rate limit (3 requests/sec without key, 10 with key)."""
    if os.environ.get("NCBI_API_KEY"):
        time.sleep(0.15)
    else:
        time.sleep(0.4)


def _pubmed_search(query, limit=10, sort="date"):
    """Search PubMed and return list of PMIDs."""
    params = _eutils_params(
        {
            "db": "pubmed",
            "term": query,
            "retmode": "json",
            "retmax": limit,
            "sort": sort,
        }
    )
    resp = requests.get(f"{EUTILS_BASE}/esearch.fcgi", params=params, timeout=20)
    resp.raise_for_status()
    data = resp.json()
    return data.get("esearchresult", {}).get("idlist", [])


def _pubmed_summaries(pmids):
    """Fetch PubMed esummary for a list of PMIDs. Returns dict keyed by PMID."""
    if not pmids:
        return {}
    params = _eutils_params(
        {
            "db": "pubmed",
            "id": ",".join(pmids),
            "retmode": "json",
        }
    )
    resp = requests.get(f"{EUTILS_BASE}/esummary.fcgi", params=params, timeout=20)
    resp.raise_for_status()
    data = resp.json()
    result = data.get("result", {})
    # Remove the 'uids' key if present
    result.pop("uids", None)
    return result


def _pubmed_abstracts(pmids):
    """Fetch abstracts for PMIDs using efetch XML. Returns dict PMID -> abstract."""
    if not pmids:
        return {}
    params = _eutils_params(
        {
            "db": "pubmed",
            "id": ",".join(pmids),
            "retmode": "xml",
            "rettype": "abstract",
        }
    )
    resp = requests.get(f"{EUTILS_BASE}/efetch.fcgi", params=params, timeout=30)
    resp.raise_for_status()

    abstracts = {}
    try:
        root = ET.fromstring(resp.content)
        for article in root.findall(".//PubmedArticle"):
            pmid_elem = article.find(".//PMID")
            if pmid_elem is None:
                continue
            pmid = pmid_elem.text
            # Collect all AbstractText elements (may have multiple for structured)
            abstract_parts = []
            for at in article.findall(".//AbstractText"):
                label = at.get("Label", "")
                # Also gather tail text from child elements
                full = ET.tostring(at, encoding="unicode", method="text").strip()
                if label:
                    abstract_parts.append(f"{label}: {full}")
                else:
                    abstract_parts.append(full)
            abstracts[pmid] = " ".join(abstract_parts) if abstract_parts else ""
    except ET.ParseError:
        pass
    return abstracts


def _build_article_result(pmid, summary, abstract=""):
    """Build a standardized article result dict from esummary + abstract."""
    articleids = summary.get("articleids", [])
    pmc_ids = [a["value"] for a in articleids if a.get("idtype") == "pmc"]
    doi_ids = [a["value"] for a in articleids if a.get("idtype") == "doi"]

    pmc_id = pmc_ids[0] if pmc_ids else None
    doi = (
        doi_ids[0]
        if doi_ids
        else summary.get("elocationid", "").replace("doi: ", "")
        if "doi:" in summary.get("elocationid", "")
        else ""
    )

    # Extract year from pubdate
    pubdate = summary.get("pubdate", "")
    year_match = re.search(r"(\d{4})", pubdate)
    year = int(year_match.group(1)) if year_match else None

    return {
        "title": summary.get("title", ""),
        "abstract": abstract[:2000] if abstract else "",
        "year": year,
        "pmid": pmid,
        "url": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
        "pmc_url": f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc_id}/"
        if pmc_id
        else None,
        "doi": doi,
        "journal": summary.get("source", ""),
        "publication_date": pubdate,
    }


def _search_and_fetch(query, limit=5):
    """Combined search + summary + abstract fetch pipeline."""
    pmids = _pubmed_search(query, limit=limit)
    if not pmids:
        return []

    _rate_limit()
    summaries = _pubmed_summaries(pmids)

    _rate_limit()
    abstracts = _pubmed_abstracts(pmids)

    results = []
    for pmid in pmids:
        if pmid not in summaries:
            continue
        article = _build_article_result(pmid, summaries[pmid], abstracts.get(pmid, ""))
        results.append(article)
    return results


# ---------------------------------------------------------------------------
# ADA Tools
# ---------------------------------------------------------------------------



[docs]
@register_tool("ADAStandardsTool")
class ADAStandardsTool(BaseTool):
    """
    American Diabetes Association (ADA) Standards of Care tools.

    Supports three operations:
    - list_sections: List all sections of the current ADA Standards of Care
    - search: Search ADA guidelines by topic via PubMed
    - get_section: Fetch content of a specific ADA Standards section via PMC
    """


[docs]
    def __init__(self, tool_config):
        super().__init__(tool_config)
        self.operation = tool_config.get("fields", {}).get("operation", "search")



[docs]
    def run(self, arguments):
        try:
            op = self.operation
            if op == "list_sections":
                return self._list_sections(arguments)
            elif op == "search":
                return self._search(arguments)
            elif op == "get_section":
                return self._get_section(arguments)
            return {"status": "error", "error": f"Unknown operation: {op}"}
        except requests.exceptions.RequestException as e:
            return {"status": "error", "error": f"Request failed: {e}"}
        except Exception as e:
            return {"status": "error", "error": f"Error: {e}"}



[docs]
    def _list_sections(self, arguments):
        """List all sections of the current ADA Standards of Care via PubMed."""
        year = arguments.get("year", 2026)
        query = (
            f'"Standards of Care in Diabetes-{year}" AND '
            f'"American Diabetes Association"[Corporate Author]'
        )
        pmids = _pubmed_search(query, limit=25, sort="date")
        if not pmids:
            return []

        _rate_limit()
        summaries = _pubmed_summaries(pmids)

        sections = []
        for pmid in pmids:
            if pmid not in summaries:
                continue
            s = summaries[pmid]
            title = s.get("title", "")
            articleids = s.get("articleids", [])
            pmc_ids = [a["value"] for a in articleids if a.get("idtype") == "pmc"]
            doi_ids = [a["value"] for a in articleids if a.get("idtype") == "doi"]

            # Extract section number from title like "9. Pharmacologic..."
            sec_match = re.match(r"(\d+)\.\s", title)
            section_number = int(sec_match.group(1)) if sec_match else None

            sections.append(
                {
                    "title": title,
                    "section_number": section_number,
                    "pmid": pmid,
                    "url": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
                    "pmc_url": (
                        f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc_ids[0]}/"
                        if pmc_ids
                        else None
                    ),
                    "doi": doi_ids[0] if doi_ids else None,
                    "year": year,
                }
            )

        # Sort by section number (numbered sections first, then unnumbered)
        sections.sort(
            key=lambda x: (x["section_number"] is None, x["section_number"] or 0)
        )
        return sections



[docs]
    def _search(self, arguments):
        """Search ADA guidelines by topic using PubMed."""
        query_text = arguments.get("query", "")
        limit = arguments.get("limit", 5)
        if not query_text:
            return {"status": "error", "error": "query parameter is required"}

        pubmed_query = (
            f'("American Diabetes Association"[Corporate Author]) AND ({query_text})'
        )
        return _search_and_fetch(pubmed_query, limit=limit)



[docs]
    def _get_section(self, arguments):
        """Fetch content of a specific ADA Standards section via PMC."""
        pmid = arguments.get("pmid", "")
        if not pmid:
            return {"status": "error", "error": "pmid parameter is required"}

        pmid = str(pmid)

        # First get summary to find PMC ID
        summaries = _pubmed_summaries([pmid])
        if pmid not in summaries:
            return {"status": "error", "error": f"PMID {pmid} not found"}

        summary = summaries[pmid]
        articleids = summary.get("articleids", [])
        pmc_ids = [a["value"] for a in articleids if a.get("idtype") == "pmc"]

        _rate_limit()
        # Fetch abstract
        abstracts = _pubmed_abstracts([pmid])
        abstract = abstracts.get(pmid, "")

        result = {
            "title": summary.get("title", ""),
            "pmid": pmid,
            "abstract": abstract,
            "journal": summary.get("source", ""),
            "publication_date": summary.get("pubdate", ""),
            "pmc_url": None,
            "full_text_snippet": None,
        }

        # Try to fetch full text from PMC if available
        if pmc_ids:
            pmc_id = pmc_ids[0]
            result["pmc_url"] = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc_id}/"
            _rate_limit()
            try:
                ft_url = f"https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{pmc_id}/unicode"
                ft_resp = requests.get(ft_url, timeout=30)
                if ft_resp.status_code == 200:
                    ft_data = ft_resp.json()
                    # Extract text passages
                    passages = []
                    for doc in ft_data.get("documents", []):
                        for passage in doc.get("passages", []):
                            text = passage.get("text", "")
                            section = passage.get("infons", {}).get("section_type", "")
                            ptype = passage.get("infons", {}).get("type", "")
                            if (
                                text
                                and ptype != "ref"
                                and section not in ("REF", "COMP_INT")
                            ):
                                passages.append(text)
                    if passages:
                        # Return first 5000 chars of combined passages
                        combined = "\n\n".join(passages)
                        result["full_text_snippet"] = combined[:5000]
            except Exception:
                pass  # PMC full text is optional

        return result




# ---------------------------------------------------------------------------
# AHA/ACC Tools
# ---------------------------------------------------------------------------



[docs]
@register_tool("AHAACCGuidelineTool")
class AHAACCGuidelineTool(BaseTool):
    """
    American Heart Association (AHA) and American College of Cardiology (ACC)
    guideline tools. Uses PubMed to search and list clinical practice guidelines.

    Supports three operations:
    - search: Search AHA/ACC guidelines by topic
    - list_aha: List recent AHA guidelines
    - list_acc: List recent ACC guidelines
    """


[docs]
    def __init__(self, tool_config):
        super().__init__(tool_config)
        self.operation = tool_config.get("fields", {}).get("operation", "search")



[docs]
    def run(self, arguments):
        try:
            op = self.operation
            if op == "search":
                return self._search(arguments)
            elif op == "list_aha":
                return self._list_org(arguments, "American Heart Association")
            elif op == "list_acc":
                return self._list_org(arguments, "American College of Cardiology")
            elif op == "get_guideline":
                return self._get_guideline(arguments)
            return {"status": "error", "error": f"Unknown operation: {op}"}
        except requests.exceptions.RequestException as e:
            return {"status": "error", "error": f"Request failed: {e}"}
        except Exception as e:
            return {"status": "error", "error": f"Error: {e}"}



[docs]
    def _search(self, arguments):
        """Search AHA/ACC guidelines by topic."""
        query_text = arguments.get("query", "")
        limit = arguments.get("limit", 5)
        year_from = arguments.get("year_from")

        if not query_text:
            return {"status": "error", "error": "query parameter is required"}

        date_filter = ""
        if year_from:
            date_filter = (
                f' AND ("{year_from}"[Date - Publication] : "3000"[Date - Publication])'
            )

        pubmed_query = (
            f'(("American Heart Association"[Corporate Author] OR '
            f'"American College of Cardiology"[Corporate Author]) AND '
            f'("guideline"[Title] OR "practice guideline"[Publication Type])) '
            f"AND ({query_text}){date_filter}"
        )
        return _search_and_fetch(pubmed_query, limit=limit)



[docs]
    def _get_guideline(self, arguments):
        """Fetch full text of an AHA/ACC guideline from PMC by PMID."""
        pmid = str(arguments.get("pmid", ""))
        if not pmid:
            return {"status": "error", "error": "pmid parameter is required"}

        summaries = _pubmed_summaries([pmid])
        if pmid not in summaries:
            return {"status": "error", "error": f"PMID {pmid} not found in PubMed"}

        summary = summaries[pmid]
        articleids = summary.get("articleids", [])
        pmc_ids = [a["value"] for a in articleids if a.get("idtype") == "pmc"]

        _rate_limit()
        abstracts = _pubmed_abstracts([pmid])

        result = {
            "title": summary.get("title", ""),
            "pmid": pmid,
            "abstract": abstracts.get(pmid, ""),
            "journal": summary.get("source", ""),
            "publication_date": summary.get("pubdate", ""),
            "pmc_url": None,
            "full_text_snippet": None,
            "is_open_access": bool(pmc_ids),
        }

        if pmc_ids:
            pmc_id = pmc_ids[0]
            result["pmc_url"] = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc_id}/"
            _rate_limit()
            try:
                ft_url = (
                    f"https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/"
                    f"pmcoa.cgi/BioC_json/{pmc_id}/unicode"
                )
                ft_resp = requests.get(ft_url, timeout=30)
                if ft_resp.status_code == 200:
                    ft_data = ft_resp.json()
                    passages = []
                    for doc in ft_data.get("documents", []):
                        for passage in doc.get("passages", []):
                            text = passage.get("text", "")
                            section = passage.get("infons", {}).get("section_type", "")
                            ptype = passage.get("infons", {}).get("type", "")
                            if (
                                text
                                and ptype != "ref"
                                and section not in ("REF", "COMP_INT")
                            ):
                                passages.append(text)
                    if passages:
                        result["full_text_snippet"] = "\n\n".join(passages)[:6000]
            except Exception:
                pass

        return result



[docs]
    def _list_org(self, arguments, org_name):
        """List recent guidelines from a specific organization."""
        limit = arguments.get("limit", 10)
        year_from = arguments.get("year_from")

        date_filter = ""
        if year_from:
            date_filter = (
                f' AND ("{year_from}"[Date - Publication] : "3000"[Date - Publication])'
            )

        pubmed_query = (
            f'"{org_name}"[Corporate Author] AND '
            f'("guideline"[Title] OR "practice guideline"[Publication Type])'
            f"{date_filter}"
        )
        return _search_and_fetch(pubmed_query, limit=limit)




# ---------------------------------------------------------------------------
# NCCN Tools
# ---------------------------------------------------------------------------

_NCCN_BASE = "https://www.nccn.org"
_NCCN_GUIDELINES_URL = (
    f"{_NCCN_BASE}/patientresources/patient-resources/guidelines-for-patients"
)



[docs]
@register_tool("NCCNGuidelineTool")
class NCCNGuidelineTool(BaseTool):
    """
    National Comprehensive Cancer Network (NCCN) guideline tools.

    Supports three operations:
    - list_patient: List NCCN Guidelines for Patients (free, scraped from nccn.org)
    - search: Search NCCN content via PubMed (JNCCN journal articles)
    - get_patient: Get content from a specific NCCN patient guideline page
    """


[docs]
    def __init__(self, tool_config):
        super().__init__(tool_config)
        self.operation = tool_config.get("fields", {}).get("operation", "search")
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": _UA})



[docs]
    def run(self, arguments):
        try:
            op = self.operation
            if op == "list_patient":
                return self._list_patient_guidelines(arguments)
            elif op == "search":
                return self._search(arguments)
            elif op == "get_patient":
                return self._get_patient_guideline(arguments)
            return {"status": "error", "error": f"Unknown operation: {op}"}
        except requests.exceptions.RequestException as e:
            return {"status": "error", "error": f"Request failed: {e}"}
        except Exception as e:
            return {"status": "error", "error": f"Error: {e}"}



[docs]
    def _list_patient_guidelines(self, arguments):
        """List all NCCN Guidelines for Patients from nccn.org."""
        time.sleep(1)  # Respectful delay
        resp = self.session.get(_NCCN_GUIDELINES_URL, timeout=30)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, "html.parser")

        # Category mapping: anchor name -> display category
        category_map = {
            "category_1": "Treatment by Cancer Type",
            "category_2": "Detection, Prevention, and Risk Reduction",
            "category_3": "Supportive Care",
            "category_4": "Specific Populations",
        }

        # Build ordered list of (position, category_name) from <a name="category_N">
        # and (position, guideline_link) for each guideline link
        html_text = str(soup)
        category_positions = []
        for anchor_name, cat_label in category_map.items():
            # Find <a name="category_N"> in the HTML
            pattern = f'name="{anchor_name}"'
            idx = html_text.find(pattern)
            if idx >= 0:
                category_positions.append((idx, cat_label))
        category_positions.sort()

        guidelines = []
        seen_urls = set()

        for link in soup.find_all("a", href=True):
            href = link["href"]
            text = link.get_text(strip=True)

            # Only include links to individual guideline pages
            if "patientGuidelineId" not in href:
                continue
            if not text or len(text) < 3:
                continue

            full_url = _NCCN_BASE + href if href.startswith("/") else href
            if full_url in seen_urls:
                continue
            seen_urls.add(full_url)

            # Determine category by finding which category anchor precedes this link
            link_str = str(link)
            link_pos = html_text.find(link_str)
            current_category = "Treatment by Cancer Type"
            for cat_pos, cat_label in category_positions:
                if cat_pos < link_pos:
                    current_category = cat_label
                else:
                    break

            guidelines.append(
                {
                    "cancer_type": text,
                    "url": full_url,
                    "category": current_category,
                }
            )

        return guidelines



[docs]
    def _search(self, arguments):
        """Search NCCN content via PubMed."""
        query_text = arguments.get("query", "")
        limit = arguments.get("limit", 5)
        if not query_text:
            return {"status": "error", "error": "query parameter is required"}

        pubmed_query = (
            f'("National Comprehensive Cancer Network"[Corporate Author] '
            f"OR NCCN[Title]) AND ({query_text})"
        )
        return _search_and_fetch(pubmed_query, limit=limit)



[docs]
    def _get_patient_guideline(self, arguments):
        """Get content from a specific NCCN patient guideline — scrapes detail page
        and extracts text directly from the PDF using pdfplumber."""
        url = arguments.get("url", "")
        if not url:
            return {"status": "error", "error": "url parameter is required"}
        if "nccn.org" not in url:
            return {"status": "error", "error": "URL must be an nccn.org URL"}

        # Rewrite old-style content/english URLs to the current patientresources path
        if "patientGuidelineId" not in url and "patientresources" not in url:
            return {
                "status": "error",
                "error": "URL should come from NCCN_list_patient_guidelines",
            }

        time.sleep(1)
        resp = self.session.get(url, timeout=30)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, "html.parser")

        # Extract title from h2
        cancer_type = ""
        for h2 in soup.find_all("h2"):
            text = h2.get_text(strip=True)
            if text and len(text) > 2:
                cancer_type = text
                break

        # Collect PDF links
        pdf_links = []
        seen_pdfs = set()
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if ".pdf" not in href.lower():
                continue
            full_pdf_url = _NCCN_BASE + href if href.startswith("/") else href
            if full_pdf_url in seen_pdfs:
                continue
            seen_pdfs.add(full_pdf_url)
            label = a.get_text(strip=True) or cancer_type
            pdf_links.append({"label": label, "url": full_pdf_url})

        # Extract version info
        version = ""
        for elem in soup.find_all(string=re.compile(r"\b20\d{2}\b")):
            candidate = elem.strip()
            if re.search(r"\b20\d{2}\b", candidate) and len(candidate) < 60:
                version = candidate
                break

        result = {
            "title": cancer_type or "NCCN Guidelines for Patients",
            "url": url,
            "cancer_type": cancer_type,
            "version": version,
            "pdf_url": pdf_links[0]["url"] if pdf_links else None,
            "pdf_downloads": pdf_links,
            "source": "NCCN Guidelines for Patients",
            "content": None,
        }

        # Download the primary PDF and extract text from first 8 pages
        primary_pdf = pdf_links[0]["url"] if pdf_links else None
        if primary_pdf:
            try:
                time.sleep(0.5)
                pdf_resp = self.session.get(primary_pdf, timeout=30)
                if (
                    pdf_resp.status_code == 200
                    and "pdf" in pdf_resp.headers.get("Content-Type", "").lower()
                ):
                    with pdfplumber.open(io.BytesIO(pdf_resp.content)) as pdf:
                        pages_text = []
                        for page in pdf.pages[:8]:
                            text = page.extract_text()
                            if text and len(text.strip()) > 50:
                                pages_text.append(text.strip())
                    if pages_text:
                        result["content"] = "\n\n---\n\n".join(pages_text)[:6000]
            except Exception:
                pass  # content remains None; pdf_url still returned

        return result