Source code for tooluniverse.pfam_tool

# pfam_tool.py
"""
Pfam protein families tool for ToolUniverse.

Provides access to Pfam data via the InterPro API (Pfam is now hosted at InterPro):
- Search Pfam families by keyword
- Get detailed Pfam family information (description, counters, clan membership)
- Get proteins containing a Pfam domain (with optional species filter)
- Get Pfam annotations for a specific protein
- List Pfam clans (superfamilies) with search
- Get proteome distribution for a Pfam family

API: https://www.ebi.ac.uk/interpro/api/
No authentication required. Free public access.
"""

import re
import requests
from typing import Dict, Any, Optional
from .base_tool import BaseTool


INTERPRO_BASE_URL = "https://www.ebi.ac.uk/interpro/api"


[docs] class PfamTool(BaseTool): """ Tool for Pfam protein family queries via the InterPro API. No authentication required. """
[docs] def __init__(self, tool_config: Dict[str, Any]): super().__init__(tool_config) self.timeout = tool_config.get("timeout", 90) fields = tool_config.get("fields", {}) self.endpoint = fields.get("endpoint", "search_families")
[docs] def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Execute the Pfam API call.""" try: return self._query(arguments) except requests.exceptions.Timeout: return {"error": f"InterPro/Pfam API timed out after {self.timeout}s"} except requests.exceptions.ConnectionError: return {"error": "Failed to connect to InterPro/Pfam API"} except requests.exceptions.HTTPError as e: code = e.response.status_code if e.response is not None else "unknown" if code == 404: param = arguments.get( "accession", arguments.get("pfam_accession", arguments.get("query", "")), ) return {"error": f"Not found in Pfam/InterPro: {param}"} if code == 204: return {"error": "No results found"} return {"error": f"InterPro/Pfam API HTTP error: {code}"} except Exception as e: return {"error": f"Unexpected error querying Pfam API: {str(e)}"}
[docs] def _query(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Route to appropriate endpoint.""" if self.endpoint == "search_families": return self._search_families(arguments) elif self.endpoint == "get_family_detail": return self._get_family_detail(arguments) elif self.endpoint == "get_family_proteins": return self._get_family_proteins(arguments) elif self.endpoint == "get_protein_pfam": return self._get_protein_pfam(arguments) elif self.endpoint == "search_clans": return self._search_clans(arguments) elif self.endpoint == "get_family_proteomes": return self._get_family_proteomes(arguments) else: return {"error": f"Unknown endpoint: {self.endpoint}"}
[docs] def _strip_html(self, text: str) -> str: """Remove HTML tags from text.""" if not text: return "" return re.sub(r"<[^>]+>", "", text).strip()
[docs] def _search_families(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Search Pfam families by keyword.""" query = arguments.get("query", "") if not query: return { "error": "query parameter is required (e.g., 'kinase', 'zinc finger')" } max_results = min(arguments.get("max_results", 20), 100) url = f"{INTERPRO_BASE_URL}/entry/pfam/" params = {"search": query, "page_size": max_results} response = requests.get(url, params=params, timeout=self.timeout) response.raise_for_status() data = response.json() total = data.get("count", 0) results = data.get("results", []) families = [] for r in results: meta = r.get("metadata", {}) families.append( { "accession": meta.get("accession", ""), "name": meta.get("name", ""), "type": meta.get("type", ""), "integrated_interpro": meta.get("integrated"), } ) return { "data": { "query": query, "total_results": total, "returned": len(families), "families": families, }, "metadata": { "source": "InterPro API (Pfam family search)", "query": query, }, }
[docs] def _get_family_detail(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get detailed information about a specific Pfam family.""" pfam_acc = arguments.get("pfam_accession", "") if not pfam_acc: return {"error": "pfam_accession parameter is required (e.g., 'PF00001')"} url = f"{INTERPRO_BASE_URL}/entry/pfam/{pfam_acc}" response = requests.get(url, timeout=self.timeout) response.raise_for_status() data = response.json() meta = data.get("metadata", {}) name_info = meta.get("name", {}) if isinstance(name_info, dict): full_name = name_info.get("name", "") short_name = name_info.get("short", "") else: full_name = str(name_info) short_name = "" # Extract description text desc_list = meta.get("description", []) description = "" if desc_list and isinstance(desc_list, list): description = self._strip_html(desc_list[0].get("text", "")) # Extract counters counters = meta.get("counters", {}) # Extract set/clan info set_info = meta.get("set_info", {}) clan_accession = set_info.get("accession") if set_info else None clan_name = set_info.get("name") if set_info else None # Extract representative structure rep_struct = meta.get("representative_structure", {}) # Extract Wikipedia wiki = meta.get("wikipedia", []) wikipedia_title = wiki[0].get("title", "") if wiki else None # Extract literature count lit = meta.get("literature", {}) literature_count = len(lit) if lit else 0 # Extract GO terms go_terms = meta.get("go_terms", []) or [] return { "data": { "accession": meta.get("accession", ""), "name": full_name, "short_name": short_name, "type": meta.get("type", ""), "source_database": meta.get("source_database", ""), "integrated_interpro": meta.get("integrated"), "description": description[:2000] if description else None, "clan_accession": clan_accession, "clan_name": clan_name, "counters": { "proteins": counters.get("proteins", 0), "structures": counters.get("structures", 0), "taxa": counters.get("taxa", 0), "proteomes": counters.get("proteomes", 0), "domain_architectures": counters.get("domain_architectures", 0), "matches": counters.get("matches", 0), }, "representative_structure": { "pdb_id": rep_struct.get("accession"), "name": rep_struct.get("name"), } if rep_struct else None, "wikipedia_title": wikipedia_title, "literature_count": literature_count, "go_terms": go_terms[:20] if go_terms else [], }, "metadata": { "source": "InterPro API (Pfam family detail)", "pfam_accession": pfam_acc, }, }
[docs] def _get_family_proteins(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get proteins containing a specific Pfam domain, optionally filtered by species.""" pfam_acc = arguments.get("pfam_accession", "") if not pfam_acc: return {"error": "pfam_accession parameter is required (e.g., 'PF00001')"} max_results = min(arguments.get("max_results", 20), 100) reviewed_only = arguments.get("reviewed_only", True) tax_id = arguments.get("tax_id", None) db = "reviewed" if reviewed_only else "uniprot" if tax_id: url = f"{INTERPRO_BASE_URL}/protein/{db}/entry/pfam/{pfam_acc}/taxonomy/uniprot/{tax_id}/" else: url = f"{INTERPRO_BASE_URL}/protein/{db}/entry/pfam/{pfam_acc}/" params = {"page_size": max_results} response = requests.get(url, params=params, timeout=self.timeout) response.raise_for_status() data = response.json() total = data.get("count", 0) results = data.get("results", []) proteins = [] for r in results: meta = r.get("metadata", {}) organism = meta.get("source_organism", {}) entries = r.get("entries", []) # Get domain positions domain_positions = [] for entry in entries: if entry.get("accession", "").upper() == pfam_acc.upper(): for loc in entry.get("entry_protein_locations", []): for frag in loc.get("fragments", []): domain_positions.append( { "start": frag.get("start"), "end": frag.get("end"), } ) proteins.append( { "accession": meta.get("accession", ""), "name": meta.get("name", ""), "gene": meta.get("gene"), "length": meta.get("length"), "organism": organism.get("scientificName") if organism else None, "tax_id": organism.get("taxId") if organism else None, "domain_positions": domain_positions, "in_alphafold": meta.get("in_alphafold", False), } ) return { "data": { "pfam_accession": pfam_acc, "total_proteins": total, "returned": len(proteins), "reviewed_only": reviewed_only, "tax_id_filter": tax_id, "proteins": proteins, }, "metadata": { "source": "InterPro API (Pfam family proteins)", "pfam_accession": pfam_acc, }, }
[docs] def _get_protein_pfam(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get all Pfam domain annotations for a specific protein.""" accession = arguments.get("accession", "") if not accession: return { "error": "accession parameter is required (UniProt accession, e.g., 'P04637')" } url = f"{INTERPRO_BASE_URL}/entry/pfam/protein/uniprot/{accession}" params = {"page_size": 50} response = requests.get(url, params=params, timeout=self.timeout) response.raise_for_status() data = response.json() results = data.get("results", []) domains = [] protein_length = None for r in results: meta = r.get("metadata", {}) name_info = meta.get("name", {}) if isinstance(name_info, dict): name = name_info.get("name", "") short_name = name_info.get("short", "") else: name = str(name_info) if name_info else "" short_name = "" proteins = r.get("proteins", []) for p in proteins: if protein_length is None: protein_length = p.get("protein_length") for loc in p.get("entry_protein_locations", []): for frag in loc.get("fragments", []): domains.append( { "pfam_accession": meta.get("accession", ""), "name": name, "short_name": short_name, "type": meta.get("type", ""), "integrated_interpro": meta.get("integrated"), "start": frag.get("start"), "end": frag.get("end"), "score": loc.get("score"), } ) # Sort domains by start position domains.sort(key=lambda d: d.get("start", 0)) return { "data": { "accession": accession, "protein_length": protein_length, "domain_count": len(domains), "domains": domains, }, "metadata": { "source": "InterPro API (Pfam annotations for protein)", "accession": accession, }, }
[docs] def _search_clans(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Search Pfam clans (superfamilies).""" query = arguments.get("query", "") max_results = min(arguments.get("max_results", 20), 100) url = f"{INTERPRO_BASE_URL}/set/pfam/" params = {"page_size": max_results} if query: params["search"] = query response = requests.get(url, params=params, timeout=self.timeout) response.raise_for_status() data = response.json() total = data.get("count", 0) results = data.get("results", []) clans = [] for r in results: meta = r.get("metadata", {}) clans.append( { "accession": meta.get("accession", ""), "name": meta.get("name", ""), "source_database": meta.get("source_database", ""), } ) return { "data": { "query": query if query else "(all clans)", "total_results": total, "returned": len(clans), "clans": clans, }, "metadata": { "source": "InterPro API (Pfam clan search)", "query": query, }, }
[docs] def _get_family_proteomes(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get proteome distribution for a Pfam family.""" pfam_acc = arguments.get("pfam_accession", "") if not pfam_acc: return {"error": "pfam_accession parameter is required (e.g., 'PF00001')"} max_results = min(arguments.get("max_results", 20), 100) url = f"{INTERPRO_BASE_URL}/proteome/uniprot/entry/pfam/{pfam_acc}/" params = {"page_size": max_results} response = requests.get(url, params=params, timeout=self.timeout) response.raise_for_status() data = response.json() total = data.get("count", 0) results = data.get("results", []) proteomes = [] for r in results: meta = r.get("metadata", {}) proteomes.append( { "proteome_accession": meta.get("accession", ""), "organism_name": meta.get("name", ""), "taxonomy_id": meta.get("taxonomy"), "is_reference": meta.get("is_reference", False), } ) return { "data": { "pfam_accession": pfam_acc, "total_proteomes": total, "returned": len(proteomes), "proteomes": proteomes, }, "metadata": { "source": "InterPro API (Pfam family proteomes)", "pfam_accession": pfam_acc, }, }