# Source code for tooluniverse.humanbase_tool
import networkx as nx
import requests
import urllib.parse
from .base_tool import BaseTool
from .tool_registry import register_tool
[docs]
@register_tool("HumanBaseTool")
class HumanBaseTool(BaseTool):
"""
Tool to retrieve protein-protein interactions and biological processes from HumanBase.

The pipeline implemented by the methods of this class is:
1. Resolve each input gene name to an official symbol through the MyGene.info query API.
2. Map each symbol to an Entrez Gene ID through the NCBI E-utilities esearch endpoint.
3. Query the HumanBase API (hb.flatironinstitute.org) for the tissue-specific network
and for the Gene Ontology biological-process terms annotated to those genes.
4. Return either a (networkx.Graph, list) pair or a formatted text report.
"""
[docs]
def run(self, arguments):
"""Main entry point for the tool."""
gene_list = arguments.get("gene_list")
tissue = arguments.get("tissue", "brain")
max_node = arguments.get("max_node", 10)
interaction = arguments.get("interaction", None)
string_mode = arguments.get("string_mode", True)
graph, bp_collection = self.humanbase_ppi_retrieve(
gene_list, tissue, max_node, interaction
)
if string_mode:
return self._convert_to_string(graph, bp_collection, gene_list, tissue)
else:
return graph, bp_collection
[docs]
def get_official_gene_name(self, gene_name):
    """
    Retrieve the official gene symbol for a given gene name or synonym using the MyGene.info API.

    Parameters:
        gene_name (str): The gene name or synonym to query.
    Returns:
        str: The official gene symbol if one is resolved. On a non-200 response,
        an empty hit list, or a top hit without a symbol, a human-readable
        error message is returned instead (no exception is raised for these).
    """
    # Let requests build and encode the query string; the timeout keeps a
    # stalled connection from blocking the tool indefinitely.
    response = requests.get(
        "https://mygene.info/v3/query",
        params={"q": gene_name, "fields": "symbol,alias", "species": "human"},
        timeout=30,
    )
    if response.status_code != 200:
        return f"Error querying MyGene.info API: {response.status_code}"

    hits = response.json().get("hits", [])
    if not hits:
        return f"No data found for: {gene_name}. Please check the gene name and try again."

    # Prefer a hit whose symbol or one of whose aliases matches exactly
    # (case-insensitively).
    wanted = gene_name.upper()
    symbol = None
    for hit in hits:
        candidate = hit.get("symbol", "")
        aliases = hit.get("alias", [])
        # A single alias may arrive as a plain string; iterating it would
        # compare individual characters instead of the alias itself.
        if isinstance(aliases, str):
            aliases = [aliases]
        if candidate.upper() == wanted or any(
            wanted == alias.upper() for alias in aliases
        ):
            symbol = candidate
            break

    # If no exact match is found (or it carries no symbol), use the top hit.
    if not symbol:
        symbol = hits[0].get("symbol", None)

    if not symbol:
        return f"No official gene symbol found for: {gene_name}. Please ensure it is correct."

    print(
        f"[humanbase_tool] Using the official gene name: '{symbol}' instead of {gene_name}",
        flush=True,
    )
    return symbol
[docs]
def get_entrez_ids(self, gene_names):
    """
    Convert gene names to Entrez IDs using NCBI Entrez API.

    Each name is first normalized with ``get_official_gene_name`` and then
    looked up with an esearch query restricted to human genes.

    Parameters:
        gene_names (list | str): Gene names to convert; a single string is
            treated as a one-element list.
    Returns:
        list: Entrez IDs (as strings) in input order, with ``None`` for a gene
        whose response contains no ``<Id>`` element. If any esearch request
        returns a non-200 status, an error message string is returned instead
        of the list.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(gene_names, str):
        gene_names = [gene_names]

    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    official_names = [self.get_official_gene_name(gene) for gene in gene_names]

    entrez_ids = []
    for gene in official_names:
        params = {
            "db": "gene",
            "term": gene + "[gene] AND Homo sapiens[orgn]",
            "retmode": "xml",
            "retmax": "1",  # only the first result is used
        }
        # The timeout keeps a stalled connection from blocking the tool.
        response = requests.get(url, params=params, timeout=30)
        if response.status_code != 200:
            return f"Error fetching data for gene: {gene}. Please check whether the gene uses official gene name."

        # Take the text between the first <Id> and the </Id> that follows it.
        _, opened, remainder = response.text.partition("<Id>")
        entrez_id, closed, _ = remainder.partition("</Id>")
        entrez_ids.append(entrez_id if opened and closed else None)
    return entrez_ids
[docs]
def humanbase_ppi_retrieve(self, genes, tissue, max_node=10, interaction=None):
    """
    Retrieve protein-protein interactions and biological processes from HumanBase.

    Retrieval is best-effort: request failures and malformed payloads are
    printed and whatever was collected so far is returned.

    Parameters:
        genes (list): List of gene names to analyze.
        tissue (str): Tissue type for tissue-specific interactions; spaces and
            underscores are replaced by hyphens and the name is lower-cased.
        max_node (int): Maximum number of nodes to retrieve.
        interaction (str): Specific interaction type to filter by. When empty
            or not one of the known types, all known types are requested.
    Returns:
        tuple: (NetworkX Graph of interactions, list of biological process
        titles or None when none were retrieved)
    """
    request_timeout = 30  # seconds; keeps a stalled call from hanging the tool
    G = nx.Graph()
    bp_collection = None

    entrez_ids = self.get_entrez_ids(genes)
    # get_entrez_ids reports an HTTP failure by returning a message string;
    # joining that string below would splice single characters into the URL.
    if isinstance(entrez_ids, str):
        print(f"Error resolving Entrez IDs: {entrez_ids}")
        return G, bp_collection
    # Genes without a match come back as None and cannot be joined into a URL.
    entrez_ids = [str(entrez_id) for entrez_id in entrez_ids if entrez_id]
    if not entrez_ids:
        print(f"[{genes}] No Entrez IDs could be resolved; skipping HumanBase queries.")
        return G, bp_collection

    tissue = tissue.replace(" ", "-").replace("_", "-").lower()
    interaction_types = [
        "co-expression",
        "interaction",
        "tf-binding",
        "gsea-microrna-targets",
        "gsea-perturbations",
    ]
    if not interaction or interaction not in interaction_types:
        # Expands into repeated "datatypes=" query parameters in the URL below.
        interaction = "&datatypes=".join(interaction_types)
    gene_id = "&entrez=".join(entrez_ids)

    network_url = f"https://hb.flatironinstitute.org/api/integrations/{tissue}/network/?datatypes={interaction}&entrez={gene_id}&node_size={max_node}"
    edge_type_url = "https://hb.flatironinstitute.org/api/integrations/{tissue}/evidence/?limit=20&source={source}&target={target}"

    # Retrieve tissue-specific PPI
    try:
        response = requests.get(network_url, timeout=request_timeout)
        response.raise_for_status()
        data = response.json()
        gene_records = data.get("genes", [])
        G.add_nodes_from(
            (
                g["standard_name"],
                {"entrez": g["entrez"], "description": g["description"]},
            )
            for g in gene_records
        )
        for e in data.get("edges", []):
            # Edge endpoints are indices into the "genes" list of the payload.
            source = gene_records[e["source"]]["standard_name"]
            target = gene_records[e["target"]]["standard_name"]
            edge_info = {}
            try:
                edge_response = requests.get(
                    edge_type_url.format(
                        tissue=tissue,
                        source=G.nodes[source]["entrez"],
                        target=G.nodes[target]["entrez"],
                    ),
                    timeout=request_timeout,
                )
                edge_response.raise_for_status()
                edge_info = {
                    t["title"]: t["weight"]
                    for t in edge_response.json().get("datatypes", [])
                }
            except requests.exceptions.RequestException as exc:
                # Keep the edge (without evidence details) rather than
                # discarding it and every edge after it.
                print(f"Error retrieving evidence for {source} - {target}: {exc}")
            G.add_edge(source, target, weight=e["weight"], interaction=edge_info)
    except requests.exceptions.RequestException as exc:
        print(f"Error retrieving PPI data: {exc}")
    except (KeyError, IndexError, TypeError, ValueError) as exc:
        print(f"Unexpected HumanBase network response format: {exc}")

    # Check gene ontology (biological process) terms annotated to the genes
    bp_url = f"https://hb.flatironinstitute.org/api/terms/annotated/?database=gene-ontology-bp&entrez={gene_id}&max_term_size=20"
    try:
        response = requests.get(bp_url, timeout=request_timeout)
        response.raise_for_status()
        data = response.json()
        if data:
            bp_collection = [bp_entity["title"] for bp_entity in data]
        else:
            print(f"[{entrez_ids}] No Gene Ontology Process recorded.")
    except requests.exceptions.RequestException as exc:
        print(f"Error retrieving biological process data: {exc}")
    except (KeyError, TypeError, ValueError) as exc:
        print(f"Unexpected HumanBase biological process response format: {exc}")
    return G, bp_collection
[docs]
def _convert_to_string(self, graph, bp_collection, original_genes, tissue):
    """
    Convert NetworkX graph and biological processes to string representation.

    Parameters:
        graph (networkx.Graph): The network graph. Node attributes "entrez" and
            "description" and edge attributes "weight" and "interaction" are
            shown when present.
        bp_collection (list): List of biological processes (may be None or empty).
        original_genes (list | str): Original gene list provided by user; a
            single string is treated as a one-element list.
        tissue (str): Tissue type used for analysis.
    Returns:
        str: Comprehensive string representation of the network data, with
        sections joined by newlines.
    """
    # ', '.join() on a bare string would split it into characters, and on
    # None it would raise.
    if original_genes is None:
        original_genes = []
    elif isinstance(original_genes, str):
        original_genes = [original_genes]

    num_nodes = graph.number_of_nodes()
    num_edges = graph.number_of_edges()

    # Header information and network summary
    output = [
        "🧬 HUMANBASE PROTEIN-PROTEIN INTERACTION NETWORK",
        "=" * 50,
        f"Query Genes: {', '.join(str(gene) for gene in original_genes)}",
        f"Tissue: {tissue.capitalize()}",
        f"Analysis Date: {self._get_current_timestamp()}",
        "",
        "📊 NETWORK SUMMARY",
        "-" * 20,
        f"Total Proteins: {num_nodes}",
        f"Total Interactions: {num_edges}",
    ]
    if num_nodes > 0:
        output.append(f"Network Density: {nx.density(graph):.3f}")
    output.append("")

    # Node information
    if num_nodes > 0:
        output.append("🔗 PROTEIN NODES")
        output.append("-" * 15)
        for i, (node, data) in enumerate(graph.nodes(data=True), 1):
            entrez_id = data.get("entrez", "N/A")
            description = data.get("description", "No description available")
            output.append(f"{i:2d}. {node} (Entrez: {entrez_id})")
            output.append(f" Description: {description}")
            output.append(f" Connections: {graph.degree(node)}")
            output.append("")

    # Edge information
    if num_edges > 0:
        output.append("⚡ PROTEIN INTERACTIONS")
        output.append("-" * 22)
        for i, (source, target, data) in enumerate(graph.edges(data=True), 1):
            interaction_info = data.get("interaction", {})
            output.append(f"{i:2d}. {source} ↔ {target}")
            output.append(f" Weight: {data.get('weight', 'N/A')}")
            if interaction_info:
                output.append(" Evidence Types:")
                for evidence_type, evidence_weight in interaction_info.items():
                    output.append(f" • {evidence_type}: {evidence_weight}")
            else:
                output.append(
                    " Evidence Types: No detailed information available"
                )
            output.append("")

    # Biological processes
    output.append("🧬 ASSOCIATED BIOLOGICAL PROCESSES")
    output.append("-" * 35)
    if bp_collection:
        output.append(f"Total Processes: {len(bp_collection)}")
        output.append("")
        output.extend(
            f"{i:2d}. {process}" for i, process in enumerate(bp_collection, 1)
        )
    else:
        output.append("No biological processes found for this gene set.")
    output.append("")

    # Network analysis summary (needs at least two nodes)
    if num_nodes > 1:
        output.append("📈 NETWORK ANALYSIS")
        output.append("-" * 18)

        # Most connected proteins: top five by degree
        ranked = sorted(graph.degree(), key=lambda pair: pair[1], reverse=True)
        output.append("Most Connected Proteins:")
        for i, (node, degree) in enumerate(ranked[:5], 1):
            output.append(f" {i}. {node}: {degree} connections")
        output.append("")

        # Connectivity
        is_connected = nx.is_connected(graph)
        output.append(
            f"Network Connectivity: {'Fully connected' if is_connected else 'Disconnected components'}"
        )
        if is_connected:
            # Path-based metrics are optional extras; a failure in networkx
            # skips these two lines instead of losing the whole report.
            try:
                diameter = nx.diameter(graph)
                avg_path_length = nx.average_shortest_path_length(graph)
                output.append(f"Network Diameter: {diameter}")
                output.append(f"Average Path Length: {avg_path_length:.2f}")
            except (nx.NetworkXException, ZeroDivisionError):
                pass

        # Clustering (also optional, skipped on failure)
        try:
            clustering = nx.average_clustering(graph)
            output.append(f"Average Clustering: {clustering:.3f}")
        except (nx.NetworkXException, ZeroDivisionError):
            pass
        output.append("")

    # Footer
    output.extend(
        [
            "📝 NOTES",
            "-" * 8,
            "• Interaction weights represent confidence scores from HumanBase",
            "• Evidence types indicate the source of interaction data",
            "• Biological processes are derived from Gene Ontology annotations",
            "• Network analysis metrics help understand protein relationship patterns",
        ]
    )
    return "\n".join(output)
[docs]
def _get_current_timestamp(self):
"""Get current timestamp for the report."""
from datetime import datetime
return datetime.now().strftime("%Y-%m-%d %H:%M:%S")