Source code for tooluniverse.pfam_tool
# pfam_tool.py
"""
Pfam protein families tool for ToolUniverse.
Provides access to Pfam data via the InterPro API (Pfam is now hosted at InterPro):
- Search Pfam families by keyword
- Get detailed Pfam family information (description, counters, clan membership)
- Get proteins containing a Pfam domain (with optional species filter)
- Get Pfam annotations for a specific protein
- List Pfam clans (superfamilies) with search
- Get proteome distribution for a Pfam family
API: https://www.ebi.ac.uk/interpro/api/
No authentication required. Free public access.
"""
import re
import requests
from typing import Dict, Any, Optional
from .base_tool import BaseTool
INTERPRO_BASE_URL = "https://www.ebi.ac.uk/interpro/api"
[docs]
class PfamTool(BaseTool):
    """
    Tool for Pfam protein family queries via the InterPro API.

    The operation to perform is chosen by the ``fields.endpoint`` value of the
    tool configuration. Results and failures are both returned as plain dicts.
    No authentication required.
    """
[docs]
def __init__(self, tool_config: Dict[str, Any]):
    """
    Set up the tool from its configuration dict.

    Args:
        tool_config: Tool configuration. The optional ``timeout`` key
            (default 90, in seconds) bounds each HTTP request, and the
            optional ``fields.endpoint`` key (default ``"search_families"``)
            selects which query this instance performs.
    """
    super().__init__(tool_config)
    endpoint_fields = tool_config.get("fields", {})
    self.endpoint = endpoint_fields.get("endpoint", "search_families")
    self.timeout = tool_config.get("timeout", 90)
[docs]
def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """
    Execute the configured Pfam/InterPro query.

    Exceptions raised while querying are converted into error dicts so the
    caller always receives a dict.

    Args:
        arguments: Endpoint-specific parameters, passed through to
            ``_query`` unchanged.

    Returns:
        The handler's result dict on success, otherwise
        ``{"error": <message>}`` describing what went wrong.
    """
    try:
        return self._query(arguments)
    except ValueError:
        # HTTP 204 (no content) is a success status, so raise_for_status()
        # does not raise for it; the empty body instead makes
        # response.json() fail with a ValueError subclass.
        return {
            "error": "No results found (empty or non-JSON response from InterPro/Pfam API)"
        }
    except requests.exceptions.Timeout:
        return {"error": f"InterPro/Pfam API timed out after {self.timeout}s"}
    except requests.exceptions.ConnectionError:
        return {"error": "Failed to connect to InterPro/Pfam API"}
    except requests.exceptions.HTTPError as e:
        code = e.response.status_code if e.response is not None else "unknown"
        if code == 404:
            # Report whichever identifier the caller supplied.
            param = arguments.get(
                "accession",
                arguments.get("pfam_accession", arguments.get("query", "")),
            )
            return {"error": f"Not found in Pfam/InterPro: {param}"}
        return {"error": f"InterPro/Pfam API HTTP error: {code}"}
    except Exception as e:
        return {"error": f"Unexpected error querying Pfam API: {e}"}
[docs]
def _query(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Route to appropriate endpoint."""
if self.endpoint == "search_families":
return self._search_families(arguments)
elif self.endpoint == "get_family_detail":
return self._get_family_detail(arguments)
elif self.endpoint == "get_family_proteins":
return self._get_family_proteins(arguments)
elif self.endpoint == "get_protein_pfam":
return self._get_protein_pfam(arguments)
elif self.endpoint == "search_clans":
return self._search_clans(arguments)
elif self.endpoint == "get_family_proteomes":
return self._get_family_proteomes(arguments)
else:
return {"error": f"Unknown endpoint: {self.endpoint}"}
[docs]
def _strip_html(self, text: str) -> str:
"""Remove HTML tags from text."""
if not text:
return ""
return re.sub(r"<[^>]+>", "", text).strip()
[docs]
def _search_families(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Search Pfam families by keyword."""
query = arguments.get("query", "")
if not query:
return {
"error": "query parameter is required (e.g., 'kinase', 'zinc finger')"
}
max_results = min(arguments.get("max_results", 20), 100)
url = f"{INTERPRO_BASE_URL}/entry/pfam/"
params = {"search": query, "page_size": max_results}
response = requests.get(url, params=params, timeout=self.timeout)
response.raise_for_status()
data = response.json()
total = data.get("count", 0)
results = data.get("results", [])
families = []
for r in results:
meta = r.get("metadata", {})
families.append(
{
"accession": meta.get("accession", ""),
"name": meta.get("name", ""),
"type": meta.get("type", ""),
"integrated_interpro": meta.get("integrated"),
}
)
return {
"data": {
"query": query,
"total_results": total,
"returned": len(families),
"families": families,
},
"metadata": {
"source": "InterPro API (Pfam family search)",
"query": query,
},
}
[docs]
def _get_family_detail(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Get detailed information about a specific Pfam family."""
pfam_acc = arguments.get("pfam_accession", "")
if not pfam_acc:
return {"error": "pfam_accession parameter is required (e.g., 'PF00001')"}
url = f"{INTERPRO_BASE_URL}/entry/pfam/{pfam_acc}"
response = requests.get(url, timeout=self.timeout)
response.raise_for_status()
data = response.json()
meta = data.get("metadata", {})
name_info = meta.get("name", {})
if isinstance(name_info, dict):
full_name = name_info.get("name", "")
short_name = name_info.get("short", "")
else:
full_name = str(name_info)
short_name = ""
# Extract description text
desc_list = meta.get("description", [])
description = ""
if desc_list and isinstance(desc_list, list):
description = self._strip_html(desc_list[0].get("text", ""))
# Extract counters
counters = meta.get("counters", {})
# Extract set/clan info
set_info = meta.get("set_info", {})
clan_accession = set_info.get("accession") if set_info else None
clan_name = set_info.get("name") if set_info else None
# Extract representative structure
rep_struct = meta.get("representative_structure", {})
# Extract Wikipedia
wiki = meta.get("wikipedia", [])
wikipedia_title = wiki[0].get("title", "") if wiki else None
# Extract literature count
lit = meta.get("literature", {})
literature_count = len(lit) if lit else 0
# Extract GO terms
go_terms = meta.get("go_terms", []) or []
return {
"data": {
"accession": meta.get("accession", ""),
"name": full_name,
"short_name": short_name,
"type": meta.get("type", ""),
"source_database": meta.get("source_database", ""),
"integrated_interpro": meta.get("integrated"),
"description": description[:2000] if description else None,
"clan_accession": clan_accession,
"clan_name": clan_name,
"counters": {
"proteins": counters.get("proteins", 0),
"structures": counters.get("structures", 0),
"taxa": counters.get("taxa", 0),
"proteomes": counters.get("proteomes", 0),
"domain_architectures": counters.get("domain_architectures", 0),
"matches": counters.get("matches", 0),
},
"representative_structure": {
"pdb_id": rep_struct.get("accession"),
"name": rep_struct.get("name"),
}
if rep_struct
else None,
"wikipedia_title": wikipedia_title,
"literature_count": literature_count,
"go_terms": go_terms[:20] if go_terms else [],
},
"metadata": {
"source": "InterPro API (Pfam family detail)",
"pfam_accession": pfam_acc,
},
}
[docs]
def _get_family_proteins(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Get proteins containing a specific Pfam domain, optionally filtered by species."""
pfam_acc = arguments.get("pfam_accession", "")
if not pfam_acc:
return {"error": "pfam_accession parameter is required (e.g., 'PF00001')"}
max_results = min(arguments.get("max_results", 20), 100)
reviewed_only = arguments.get("reviewed_only", True)
tax_id = arguments.get("tax_id", None)
db = "reviewed" if reviewed_only else "uniprot"
if tax_id:
url = f"{INTERPRO_BASE_URL}/protein/{db}/entry/pfam/{pfam_acc}/taxonomy/uniprot/{tax_id}/"
else:
url = f"{INTERPRO_BASE_URL}/protein/{db}/entry/pfam/{pfam_acc}/"
params = {"page_size": max_results}
response = requests.get(url, params=params, timeout=self.timeout)
response.raise_for_status()
data = response.json()
total = data.get("count", 0)
results = data.get("results", [])
proteins = []
for r in results:
meta = r.get("metadata", {})
organism = meta.get("source_organism", {})
entries = r.get("entries", [])
# Get domain positions
domain_positions = []
for entry in entries:
if entry.get("accession", "").upper() == pfam_acc.upper():
for loc in entry.get("entry_protein_locations", []):
for frag in loc.get("fragments", []):
domain_positions.append(
{
"start": frag.get("start"),
"end": frag.get("end"),
}
)
proteins.append(
{
"accession": meta.get("accession", ""),
"name": meta.get("name", ""),
"gene": meta.get("gene"),
"length": meta.get("length"),
"organism": organism.get("scientificName") if organism else None,
"tax_id": organism.get("taxId") if organism else None,
"domain_positions": domain_positions,
"in_alphafold": meta.get("in_alphafold", False),
}
)
return {
"data": {
"pfam_accession": pfam_acc,
"total_proteins": total,
"returned": len(proteins),
"reviewed_only": reviewed_only,
"tax_id_filter": tax_id,
"proteins": proteins,
},
"metadata": {
"source": "InterPro API (Pfam family proteins)",
"pfam_accession": pfam_acc,
},
}
[docs]
def _get_protein_pfam(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Get all Pfam domain annotations for a specific protein."""
accession = arguments.get("accession", "")
if not accession:
return {
"error": "accession parameter is required (UniProt accession, e.g., 'P04637')"
}
url = f"{INTERPRO_BASE_URL}/entry/pfam/protein/uniprot/{accession}"
params = {"page_size": 50}
response = requests.get(url, params=params, timeout=self.timeout)
response.raise_for_status()
data = response.json()
results = data.get("results", [])
domains = []
protein_length = None
for r in results:
meta = r.get("metadata", {})
name_info = meta.get("name", {})
if isinstance(name_info, dict):
name = name_info.get("name", "")
short_name = name_info.get("short", "")
else:
name = str(name_info) if name_info else ""
short_name = ""
proteins = r.get("proteins", [])
for p in proteins:
if protein_length is None:
protein_length = p.get("protein_length")
for loc in p.get("entry_protein_locations", []):
for frag in loc.get("fragments", []):
domains.append(
{
"pfam_accession": meta.get("accession", ""),
"name": name,
"short_name": short_name,
"type": meta.get("type", ""),
"integrated_interpro": meta.get("integrated"),
"start": frag.get("start"),
"end": frag.get("end"),
"score": loc.get("score"),
}
)
# Sort domains by start position
domains.sort(key=lambda d: d.get("start", 0))
return {
"data": {
"accession": accession,
"protein_length": protein_length,
"domain_count": len(domains),
"domains": domains,
},
"metadata": {
"source": "InterPro API (Pfam annotations for protein)",
"accession": accession,
},
}
[docs]
def _search_clans(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """
    Search Pfam clans (superfamilies).

    Args:
        arguments: optional ``query`` (search text; all clans are listed
            when omitted) and optional ``max_results`` (default 20, clamped
            to 1-100).

    Returns:
        ``{"data": {...}, "metadata": {...}}`` where ``data`` holds the
        query label, the total count reported by the API and the returned
        clans (accession, name, source database); or ``{"error": ...}``
        when nothing matched.

    Raises:
        requests.exceptions.RequestException: on network or HTTP failure.
    """
    query = arguments.get("query", "")
    # Treat a missing/None/0 value as the default and keep page_size in 1..100.
    max_results = max(1, min(arguments.get("max_results") or 20, 100))
    url = f"{INTERPRO_BASE_URL}/set/pfam/"
    params = {"page_size": max_results}
    if query:
        params["search"] = query
    response = requests.get(url, params=params, timeout=self.timeout)
    response.raise_for_status()
    if response.status_code == 204:
        # HTTP 204 (no content) carries no JSON body, so report it explicitly
        # instead of letting response.json() fail.
        return {"error": "No results found"}
    data = response.json()
    clans = []
    for r in data.get("results") or []:
        meta = r.get("metadata") or {}
        clans.append(
            {
                "accession": meta.get("accession", ""),
                "name": meta.get("name", ""),
                "source_database": meta.get("source_database", ""),
            }
        )
    return {
        "data": {
            "query": query if query else "(all clans)",
            "total_results": data.get("count", 0),
            "returned": len(clans),
            "clans": clans,
        },
        "metadata": {
            "source": "InterPro API (Pfam clan search)",
            "query": query,
        },
    }
[docs]
def _get_family_proteomes(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Get proteome distribution for a Pfam family."""
pfam_acc = arguments.get("pfam_accession", "")
if not pfam_acc:
return {"error": "pfam_accession parameter is required (e.g., 'PF00001')"}
max_results = min(arguments.get("max_results", 20), 100)
url = f"{INTERPRO_BASE_URL}/proteome/uniprot/entry/pfam/{pfam_acc}/"
params = {"page_size": max_results}
response = requests.get(url, params=params, timeout=self.timeout)
response.raise_for_status()
data = response.json()
total = data.get("count", 0)
results = data.get("results", [])
proteomes = []
for r in results:
meta = r.get("metadata", {})
proteomes.append(
{
"proteome_accession": meta.get("accession", ""),
"organism_name": meta.get("name", ""),
"taxonomy_id": meta.get("taxonomy"),
"is_reference": meta.get("is_reference", False),
}
)
return {
"data": {
"pfam_accession": pfam_acc,
"total_proteomes": total,
"returned": len(proteomes),
"proteomes": proteomes,
},
"metadata": {
"source": "InterPro API (Pfam family proteomes)",
"pfam_accession": pfam_acc,
},
}