Source code for tooluniverse.fda_pharmacogenomic_biomarkers_tool
import re
from typing import Any, Dict, List

import requests

from tooluniverse.base_tool import BaseTool
from tooluniverse.tool_registry import register_tool


@register_tool("FDAPharmacogenomicBiomarkersTool")
class FDAPharmacogenomicBiomarkersTool(BaseTool):
"""
Tool to retrieve data from the FDA's Table of Pharmacogenomic Biomarkers in Drug Labeling.
Fetches the table from the FDA website and provides filtering capabilities.
"""
FDA_URL = "https://www.fda.gov/drugs/science-and-research-drugs/table-pharmacogenomic-biomarkers-drug-labeling"
# Standard headers to avoid 403/404 errors from FDA servers
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Executes the tool to retrieve and filter pharmacogenomic biomarkers.

        Args:
            arguments (Dict[str, Any]):
                - drug_name (str, optional): Filter by drug name (case-insensitive partial match).
                - biomarker (str, optional): Filter by biomarker (case-insensitive partial match).
                - limit (int, optional): Maximum number of results to return (default: 10).

        Returns:
            Dict[str, Any]: A dictionary with the total 'count' of matches, the
            number 'shown' after the limit is applied, and the 'results' list.
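
        Example (illustrative output shape only; live values vary, so the
        doctest is skipped):
            >>> tool.run({"biomarker": "HLA-B", "limit": 2})  # doctest: +SKIP
            {'count': ..., 'shown': 2, 'results': [{'Drug': 'Abacavir', ...}, ...]}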
"""
drug_name_filter = arguments.get("drug_name")
biomarker_filter = arguments.get("biomarker")
limit = arguments.get("limit", 10)
try:
# TODO: Add caching mechanism if available in the ecosystem
# For now, we fetch every time or rely on potential requests caching if configured globally
response = requests.get(self.FDA_URL, headers=self.HEADERS, timeout=30)
response.raise_for_status()
records = self._parse_html_table(response.text)
# Filter results
filtered_results = []
for record in records:
match = True
if drug_name_filter:
if drug_name_filter.lower() not in record.get("Drug", "").lower():
match = False
if match and biomarker_filter:
if (
biomarker_filter.lower()
not in record.get("Biomarker", "").lower()
):
match = False
if match:
filtered_results.append(record)
# Apply limit
limited_results = filtered_results[:limit]
return {
"count": len(filtered_results),
"shown": len(limited_results),
"results": limited_results,
}
except Exception as e:
return {"error": f"Failed to retrieve or parse FDA data: {str(e)}"}
    def _parse_html_table(self, html_content: str) -> List[Dict[str, str]]:
        """
        Parses the HTML content to extract the biomarkers table.

        Prefers BeautifulSoup when it is available (it usually is in this
        project) and falls back to a far more brittle regex parser otherwise.
        """
        records = []
        try:
            # Prefer BeautifulSoup; an ImportError drops us into the regex
            # fallback below.
            from bs4 import BeautifulSoup

            soup = BeautifulSoup(html_content, "html.parser")

            # Find the target table: the FDA page contains a table whose
            # headers include the crucial "Drug" and "Biomarker" columns.
            tables = soup.find_all("table")
            target_table = None
            for table in tables:
                headers = [th.get_text(strip=True) for th in table.find_all("th")]
                # Partial-match check for the crucial columns
                if any("Drug" in h for h in headers) and any(
                    "Biomarker" in h for h in headers
                ):
                    target_table = table
                    break

            if target_table:
                # Get the header texts
                headers = [
                    th.get_text(strip=True) for th in target_table.find_all("th")
                ]
                # Map headers to cleaner keys
                header_map = {
                    "Drug": "Drug",
                    "Therapeutic Area": "TherapeuticArea",
                    "Biomarker": "Biomarker",
                    "Labeling Section": "LabelingSection",
                }
                rows = target_table.find_all("tr")[1:]  # Skip header row
                for row in rows:
                    cells = row.find_all(["td", "th"])
                    if not cells:
                        continue
                    record = {}
                    for i, cell in enumerate(cells):
                        if i >= len(headers):
                            continue
                        # Clean the header for mapping (strip footnote markers
                        # and non-breaking spaces)
                        clean_header = (
                            headers[i]
                            .replace("\xa0", " ")
                            .replace("*", "")
                            .replace("†", "")
                            .strip()
                        )
                        cell_text = cell.get_text(strip=True)
                        # Find the mapped key by partial match (e.g. "Labeling
                        # Section" matches "Labeling Sections")
                        key = None
                        for k in header_map:
                            if k in clean_header:
                                key = header_map[k]
                                break
                        if key:
                            record[key] = cell_text
                        elif clean_header:
                            # Keep unmapped columns when the header is non-empty
                            record[clean_header] = cell_text
                    if record.get("Drug"):  # Only add valid records
                        records.append(record)
            return records
        except ImportError:
            # Regex fallback when bs4 is missing. This is very brittle, so
            # BeautifulSoup is strongly preferred.
            row_pattern = re.compile(r"<tr[^>]*>(.*?)</tr>", re.DOTALL)
            cell_pattern = re.compile(r"<td[^>]*>(.*?)</td>", re.DOTALL)
            for row_html in row_pattern.findall(html_content):
                cells = cell_pattern.findall(row_html)
                # Strip residual tags from the cell contents
                clean_cells = [re.sub(r"<[^>]+>", "", c).strip() for c in cells]
                # Assume the standard FDA column order: Drug, Therapeutic Area,
                # Biomarker, Labeling Section.
                if len(clean_cells) >= 4:
                    records.append(
                        {
                            "Drug": clean_cells[0],
                            "TherapeuticArea": clean_cells[1],
                            "Biomarker": clean_cells[2],
                            "LabelingSection": clean_cells[3],
                        }
                    )
            return records
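

# A minimal usage sketch, not part of the original module: it fetches the live
# FDA table and prints a few matches. Assumes network access and that BaseTool
# allows construction without extra configuration.
if __name__ == "__main__":
    tool = FDAPharmacogenomicBiomarkersTool()
    result = tool.run({"biomarker": "CYP2D6", "limit": 3})
    if "error" in result:
        print(result["error"])
    else:
        print(f"Showing {result['shown']} of {result['count']} matching records:")
        for row in result["results"]:
            print(f"- {row.get('Drug')}: {row.get('Biomarker')}")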