Source code for tooluniverse.fda_pharmacogenomic_biomarkers_tool

from typing import Any, Dict, List
import requests
import re
from tooluniverse.base_tool import BaseTool
from tooluniverse.tool_registry import register_tool


@register_tool("FDAPharmacogenomicBiomarkersTool")
class FDAPharmacogenomicBiomarkersTool(BaseTool):
    """
    Tool to retrieve data from the FDA's Table of Pharmacogenomic Biomarkers
    in Drug Labeling. Fetches the table from the FDA website and provides
    filtering capabilities.
    """

    FDA_URL = "https://www.fda.gov/drugs/science-and-research-drugs/table-pharmacogenomic-biomarkers-drug-labeling"

    # Standard browser headers to avoid 403/404 errors from FDA servers
    HEADERS = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        )
    }
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Executes the tool to retrieve and filter pharmacogenomic biomarkers.

        Args:
            arguments (Dict[str, Any]):
                - drug_name (str, optional): Filter by drug name
                  (case-insensitive partial match).
                - biomarker (str, optional): Filter by biomarker
                  (case-insensitive partial match).
                - limit (int, optional): Maximum number of results to return
                  (default: 10).

        Returns:
            Dict[str, Any]: A dictionary containing the total 'count' of
            matches, the number 'shown' after the limit is applied, and the
            'results' list.
        """
        drug_name_filter = arguments.get("drug_name")
        biomarker_filter = arguments.get("biomarker")
        limit = arguments.get("limit", 10)

        try:
            # TODO: Add caching mechanism if available in the ecosystem.
            # For now, we fetch on every call, or rely on requests-level
            # caching if it is configured globally.
            response = requests.get(self.FDA_URL, headers=self.HEADERS, timeout=30)
            response.raise_for_status()

            records = self._parse_html_table(response.text)

            # Filter results
            filtered_results = []
            for record in records:
                match = True
                if drug_name_filter:
                    if drug_name_filter.lower() not in record.get("Drug", "").lower():
                        match = False
                if match and biomarker_filter:
                    if (
                        biomarker_filter.lower()
                        not in record.get("Biomarker", "").lower()
                    ):
                        match = False
                if match:
                    filtered_results.append(record)

            # Apply limit
            limited_results = filtered_results[:limit]

            return {
                "count": len(filtered_results),
                "shown": len(limited_results),
                "results": limited_results,
            }
        except Exception as e:
            return {"error": f"Failed to retrieve or parse FDA data: {str(e)}"}
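    # Illustrative return shape for run() (hypothetical values, not from a
    # live fetch; real "Labeling Section" strings vary and are elided here):
    #
    # {
    #     "count": 1,
    #     "shown": 1,
    #     "results": [
    #         {
    #             "Drug": "Abacavir",
    #             "TherapeuticArea": "Infectious Diseases",
    #             "Biomarker": "HLA-B",
    #             "LabelingSection": "...",
    #         }
    #     ],
    # }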
    def _parse_html_table(self, html_content: str) -> List[Dict[str, str]]:
        """
        Parses the HTML content to extract the biomarkers table.

        Prefers BeautifulSoup when it is available in the environment (it
        usually is in this project) and falls back to a less robust
        regex-based parser to avoid the hard dependency.
        """
        records = []
        try:
            # Try importing BeautifulSoup; fall back to regex parsing below
            # if it is not installed (BS4 is strongly recommended).
            from bs4 import BeautifulSoup

            soup = BeautifulSoup(html_content, "html.parser")

            # Find the table - usually the first table in the main content,
            # identified here by its header cells, since the FDA page's
            # table carries specific headers.
            tables = soup.find_all("table")
            target_table = None
            for table in tables:
                headers = [th.get_text(strip=True) for th in table.find_all("th")]
                # Partial match check for crucial columns
                if any("Drug" in h for h in headers) and any(
                    "Biomarker" in h for h in headers
                ):
                    target_table = table
                    break

            if target_table:
                # Get the header mapping
                headers = [
                    th.get_text(strip=True) for th in target_table.find_all("th")
                ]
                # Map headers to cleaner keys
                header_map = {
                    "Drug": "Drug",
                    "Therapeutic Area": "TherapeuticArea",
                    "Biomarker": "Biomarker",
                    "Labeling Section": "LabelingSection",
                }

                rows = target_table.find_all("tr")[1:]  # Skip header row
                for row in rows:
                    cells = row.find_all(["td", "th"])
                    if not cells:
                        continue

                    record = {}
                    for i, cell in enumerate(cells):
                        if i < len(headers):
                            original_header = headers[i]
                            # Clean header for mapping (remove special chars)
                            clean_header = (
                                original_header.replace("\xa0", " ")
                                .replace("*", "")
                                .replace("†", "")
                                .strip()
                            )
                            # Clean cell text
                            cell_text = cell.get_text(strip=True)

                            # Find matching key based on partial match
                            key = None
                            for k in header_map:
                                # Check if the configured key is part of the
                                # cleaned actual header (e.g. "Labeling
                                # Section" in "Labeling Sections")
                                if k in clean_header:
                                    key = header_map[k]
                                    break

                            if key:
                                record[key] = cell_text
                            elif clean_header:
                                # Store unmapped columns if header is not empty
                                record[clean_header] = cell_text

                    if record.get("Drug"):  # Only add valid records
                        records.append(record)

            return records

        except ImportError:
            # Fallback regex parsing if BS4 is missing (less robust)
            # Find table rows
            row_pattern = re.compile(r"<tr[^>]*>(.*?)</tr>", re.DOTALL)
            cell_pattern = re.compile(r"<td[^>]*>(.*?)</td>", re.DOTALL)

            matches = row_pattern.findall(html_content)
            for match in matches:
                cells = cell_pattern.findall(match)
                if len(cells) >= 3:  # Assuming at least Drug, Area, Biomarker
                    # Strip any remaining HTML tags from the cell contents
                    clean_cells = [re.sub(r"<[^>]+>", "", c).strip() for c in cells]
                    # This is very brittle; BS4 is preferred. Assumes the
                    # standard FDA column order: Drug, Therapeutic Area,
                    # Biomarker, Labeling Section.
                    if len(clean_cells) >= 4:
                        records.append(
                            {
                                "Drug": clean_cells[0],
                                "TherapeuticArea": clean_cells[1],
                                "Biomarker": clean_cells[2],
                                "LabelingSection": clean_cells[3],
                            }
                        )
            return records
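
# A minimal usage sketch (an illustration, not part of the original module).
# It assumes FDAPharmacogenomicBiomarkersTool can be constructed without
# arguments; if BaseTool requires a tool configuration dict in this project,
# pass it to the constructor accordingly.
if __name__ == "__main__":
    tool = FDAPharmacogenomicBiomarkersTool()
    result = tool.run({"biomarker": "CYP2D6", "limit": 5})
    if "error" in result:
        print(result["error"])
    else:
        # 'count' is the number of matches before the limit was applied;
        # 'shown' is how many records appear in 'results'.
        print(f"Matched {result['count']} record(s), showing {result['shown']}")
        for rec in result["results"]:
            print(f"- {rec.get('Drug')}: {rec.get('Biomarker')}")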