Source code for tooluniverse.humanbase_tool

import networkx as nx
import requests
import urllib.parse
from .base_tool import BaseTool
from .tool_registry import register_tool


@register_tool("HumanBaseTool")
class HumanBaseTool(BaseTool):
    """
    Tool to retrieve protein-protein interactions and biological processes
    from HumanBase.
    """

    def __init__(self, tool_config):
        super().__init__(tool_config)

    def run(self, arguments):
        """Main entry point for the tool."""
        gene_list = arguments.get("gene_list")
        tissue = arguments.get("tissue", "brain")
        max_node = arguments.get("max_node", 10)
        interaction = arguments.get("interaction", None)
        string_mode = arguments.get("string_mode", True)
        graph, bp_collection = self.humanbase_ppi_retrieve(
            gene_list, tissue, max_node, interaction
        )
        if string_mode:
            return self._convert_to_string(graph, bp_collection, gene_list, tissue)
        else:
            return graph, bp_collection

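    # Illustrative usage of run() -- a sketch, not part of the class API.
    # The tool_config contents and gene names below are assumed placeholders;
    # an empty config may not satisfy BaseTool in practice.
    #
    #     tool = HumanBaseTool(tool_config={})
    #     report = tool.run({
    #         "gene_list": ["TP53", "MDM2"],
    #         "tissue": "brain",
    #         "max_node": 10,
    #     })
    #     print(report)  # string report, since string_mode defaults to True
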
    def get_official_gene_name(self, gene_name):
        """
        Retrieve the official gene symbol for a given gene name or synonym
        using the MyGene.info API (same approach as the EnrichrTool method).

        Parameters:
            gene_name (str): The gene name or synonym to query.

        Returns:
            str: The official gene symbol if found; otherwise, an error message.
        """
        # URL-encode the gene_name to handle special characters
        encoded_gene_name = urllib.parse.quote(gene_name)
        url = f"https://mygene.info/v3/query?q={encoded_gene_name}&fields=symbol,alias&species=human"
        response = requests.get(url)
        if response.status_code != 200:
            return f"Error querying MyGene.info API: {response.status_code}"

        data = response.json()
        hits = data.get("hits", [])
        if not hits:
            return f"No data found for: {gene_name}. Please check the gene name and try again."

        # Attempt to find an exact match in the official symbol or among aliases.
        for hit in hits:
            symbol = hit.get("symbol", "")
            if symbol.upper() == gene_name.upper():
                print(
                    f"[humanbase_tool] Using the official gene name: '{symbol}' instead of {gene_name}",
                    flush=True,
                )
                return symbol
            aliases = hit.get("alias", [])
            if any(gene_name.upper() == alias.upper() for alias in aliases):
                print(
                    f"[humanbase_tool] Using the official gene name: '{symbol}' instead of {gene_name}",
                    flush=True,
                )
                return symbol

        # If no exact match is found, return the symbol of the top hit.
        top_hit = hits[0]
        symbol = top_hit.get("symbol", None)
        if symbol:
            print(
                f"[humanbase_tool] Using the official gene name: '{symbol}' instead of {gene_name}",
                flush=True,
            )
            return symbol
        else:
            return f"No official gene symbol found for: {gene_name}. Please ensure it is correct."

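    # Illustrative example of what get_official_gene_name() works with
    # (response shape abbreviated and shown only as an assumption about the
    # MyGene.info payload): querying the synonym "p53" hits
    #     https://mygene.info/v3/query?q=p53&fields=symbol,alias&species=human
    # and a typical hit resembles {"symbol": "TP53", "alias": ["P53", ...]},
    # so the alias comparison above resolves "p53" to the official symbol "TP53".
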
    def get_entrez_ids(self, gene_names):
        """
        Convert gene names to Entrez IDs using the NCBI Entrez API.

        Parameters:
            gene_names (list): List of gene names to convert.

        Returns:
            list: List of Entrez IDs corresponding to the gene names.
        """
        # Define the NCBI Entrez API URL for querying gene information
        url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

        # Initialize a list to store Entrez IDs
        entrez_ids = []
        gene_names = [self.get_official_gene_name(gene) for gene in gene_names]

        # Loop over each gene name in the input list
        for gene in gene_names:
            # Define the parameters for the API request
            params = {
                "db": "gene",  # Specify the database to search in (gene)
                "term": gene + "[gene] AND Homo sapiens[orgn]",  # Query term with organism filter
                "retmode": "xml",  # Request the output in XML format
                "retmax": "1",  # We only want the first result
            }

            # Send the request to the Entrez API
            response = requests.get(url, params=params)

            # Check if the response was successful
            if response.status_code == 200:
                # Parse the XML response
                xml_data = response.text

                # Find the Entrez Gene ID in the XML response
                start_idx = xml_data.find("<Id>")
                end_idx = xml_data.find("</Id>")
                if start_idx != -1 and end_idx != -1:
                    # Extract and append the Entrez Gene ID to the list
                    entrez_id = xml_data[start_idx + 4 : end_idx]
                    entrez_ids.append(entrez_id)
                else:
                    # If no Entrez ID is found, append None
                    entrez_ids.append(None)
            else:
                # Handle any errors in the API request
                return f"Error fetching data for gene: {gene}. Please check whether the gene uses its official name."

        return entrez_ids

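    # Illustrative, abbreviated example of the esearch XML scanned above for
    # the term "TP53[gene] AND Homo sapiens[orgn]" (snippet shown only for
    # illustration):
    #     <eSearchResult>...<IdList><Id>7157</Id></IdList>...</eSearchResult>
    # The substring search between "<Id>" and "</Id>" extracts "7157".
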
    def humanbase_ppi_retrieve(self, genes, tissue, max_node=10, interaction=None):
        """
        Retrieve protein-protein interactions and biological processes from HumanBase.

        Parameters:
            genes (list): List of gene names to analyze.
            tissue (str): Tissue type for tissue-specific interactions.
            max_node (int): Maximum number of nodes to retrieve.
            interaction (str): Specific interaction type to filter by.

        Returns:
            tuple: (NetworkX Graph of interactions, list of biological processes)
        """
        genes = self.get_entrez_ids(genes)
        tissue = tissue.replace(" ", "-").replace("_", "-").lower()

        interaction_types = [
            "co-expression",
            "interaction",
            "tf-binding",
            "gsea-microrna-targets",
            "gsea-perturbations",
        ]
        if not interaction or interaction not in interaction_types:
            interaction = "&datatypes=".join(interaction_types)
        gene_id = "&entrez=".join(genes)

        G = nx.Graph()
        bp_collection = None
        network_url = f"https://hb.flatironinstitute.org/api/integrations/{tissue}/network/?datatypes={interaction}&entrez={gene_id}&node_size={max_node}"
        edge_type_url = "https://hb.flatironinstitute.org/api/integrations/{tissue}/evidence/?limit=20&source={source}&target={target}"

        # Retrieve the tissue-specific PPI network
        try:
            response = requests.get(network_url)
            response.raise_for_status()
            data = response.json()
            if "genes" in data.keys():
                G.add_nodes_from(
                    [
                        (
                            g["standard_name"],
                            {"entrez": g["entrez"], "description": g["description"]},
                        )
                        for g in data["genes"]
                    ]
                )
            if "edges" in data.keys():
                for e in data["edges"]:
                    source = data["genes"][e["source"]]["standard_name"]
                    target = data["genes"][e["target"]]["standard_name"]
                    weight = e["weight"]
                    edge_response = requests.get(
                        edge_type_url.format(
                            tissue=tissue,
                            source=G.nodes[source]["entrez"],
                            target=G.nodes[target]["entrez"],
                        )
                    )
                    edge_response.raise_for_status()
                    edge_data = edge_response.json()
                    edge_info = {
                        t["title"]: t["weight"] for t in edge_data["datatypes"]
                    }
                    G.add_edge(source, target, weight=weight, interaction=edge_info)
        except requests.exceptions.RequestException as exc:
            print(f"Error retrieving PPI data: {exc}")

        # Retrieve Gene Ontology biological process terms associated with the genes
        bp_url = f"https://hb.flatironinstitute.org/api/terms/annotated/?database=gene-ontology-bp&entrez={gene_id}&max_term_size=20"
        try:
            response = requests.get(bp_url)
            response.raise_for_status()
            data = response.json()
            if len(data) > 0:
                # Grab the top 20 common pathways
                bp_collection = [bp_entity["title"] for bp_entity in data]
            else:
                print(f"[{genes}] No Gene Ontology biological process recorded.")
        except requests.exceptions.RequestException as exc:
            print(f"Error retrieving biological process data: {exc}")

        return G, bp_collection

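    # Illustrative example of a network URL built above, assuming the genes
    # resolve to Entrez IDs ["7157", "4193"], tissue "brain", max_node 10, and
    # no single interaction filter (so all datatypes are joined):
    #     https://hb.flatironinstitute.org/api/integrations/brain/network/
    #         ?datatypes=co-expression&datatypes=interaction&datatypes=tf-binding
    #         &datatypes=gsea-microrna-targets&datatypes=gsea-perturbations
    #         &entrez=7157&entrez=4193&node_size=10
    # (wrapped here for readability; the actual request uses a single line)
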
    def _convert_to_string(self, graph, bp_collection, original_genes, tissue):
        """
        Convert NetworkX graph and biological processes to string representation.

        Parameters:
            graph (networkx.Graph): The network graph.
            bp_collection (list): List of biological processes.
            original_genes (list): Original gene list provided by user.
            tissue (str): Tissue type used for analysis.

        Returns:
            str: Comprehensive string representation of the network data.
        """
        output = []

        # Header information
        output.append("🧬 HUMANBASE PROTEIN-PROTEIN INTERACTION NETWORK")
        output.append("=" * 50)
        output.append(f"Query Genes: {', '.join(original_genes)}")
        output.append(f"Tissue: {tissue.capitalize()}")
        output.append(f"Analysis Date: {self._get_current_timestamp()}")
        output.append("")

        # Network summary
        num_nodes = graph.number_of_nodes()
        num_edges = graph.number_of_edges()
        output.append("📊 NETWORK SUMMARY")
        output.append("-" * 20)
        output.append(f"Total Proteins: {num_nodes}")
        output.append(f"Total Interactions: {num_edges}")
        if num_nodes > 0:
            density = nx.density(graph)
            output.append(f"Network Density: {density:.3f}")
        output.append("")

        # Node information
        if num_nodes > 0:
            output.append("🔗 PROTEIN NODES")
            output.append("-" * 15)
            for i, (node, data) in enumerate(graph.nodes(data=True), 1):
                entrez_id = data.get("entrez", "N/A")
                description = data.get("description", "No description available")
                degree = graph.degree(node)
                output.append(f"{i:2d}. {node} (Entrez: {entrez_id})")
                output.append(f"    Description: {description}")
                output.append(f"    Connections: {degree}")
            output.append("")

        # Edge information
        if num_edges > 0:
            output.append("⚡ PROTEIN INTERACTIONS")
            output.append("-" * 22)
            for i, (source, target, data) in enumerate(graph.edges(data=True), 1):
                weight = data.get("weight", "N/A")
                interaction_info = data.get("interaction", {})
                output.append(f"{i:2d}. {source} ↔ {target}")
                output.append(f"    Weight: {weight}")
                if interaction_info:
                    output.append("    Evidence Types:")
                    for evidence_type, evidence_weight in interaction_info.items():
                        output.append(f"      • {evidence_type}: {evidence_weight}")
                else:
                    output.append(
                        "    Evidence Types: No detailed information available"
                    )
            output.append("")

        # Biological processes
        if bp_collection:
            output.append("🧬 ASSOCIATED BIOLOGICAL PROCESSES")
            output.append("-" * 35)
            output.append(f"Total Processes: {len(bp_collection)}")
            output.append("")
            for i, process in enumerate(bp_collection, 1):
                output.append(f"{i:2d}. {process}")
            output.append("")
        else:
            output.append("🧬 ASSOCIATED BIOLOGICAL PROCESSES")
            output.append("-" * 35)
            output.append("No biological processes found for this gene set.")
            output.append("")

        # Network analysis summary
        if num_nodes > 1:
            output.append("📈 NETWORK ANALYSIS")
            output.append("-" * 18)

            # Most connected proteins
            if num_nodes > 0:
                degrees = [(node, graph.degree(node)) for node in graph.nodes()]
                degrees.sort(key=lambda x: x[1], reverse=True)
                output.append("Most Connected Proteins:")
                for i, (node, degree) in enumerate(degrees[:5], 1):
                    output.append(f"  {i}. {node}: {degree} connections")
                output.append("")

            # Connectivity
            is_connected = nx.is_connected(graph)
            output.append(
                f"Network Connectivity: {'Fully connected' if is_connected else 'Disconnected components'}"
            )
            if is_connected and num_nodes > 1:
                try:
                    diameter = nx.diameter(graph)
                    avg_path_length = nx.average_shortest_path_length(graph)
                    output.append(f"Network Diameter: {diameter}")
                    output.append(f"Average Path Length: {avg_path_length:.2f}")
                except Exception:
                    pass

            # Clustering
            try:
                clustering = nx.average_clustering(graph)
                output.append(f"Average Clustering: {clustering:.3f}")
            except Exception:
                pass
            output.append("")

        # Footer
        output.append("📝 NOTES")
        output.append("-" * 8)
        output.append(
            "• Interaction weights represent confidence scores from HumanBase"
        )
        output.append("• Evidence types indicate the source of interaction data")
        output.append(
            "• Biological processes are derived from Gene Ontology annotations"
        )
        output.append(
            "• Network analysis metrics help understand protein relationship patterns"
        )

        return "\n".join(output)

    def _get_current_timestamp(self):
        """Get current timestamp for the report."""
        from datetime import datetime

        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
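
# Minimal end-to-end sketch for graph mode (assumptions: an empty tool_config
# is accepted by BaseTool, and network access to HumanBase, MyGene.info, and
# NCBI is available):
#
#     tool = HumanBaseTool(tool_config={})
#     graph, processes = tool.run({
#         "gene_list": ["TP53", "MDM2"],
#         "tissue": "liver",
#         "string_mode": False,  # return (graph, processes) instead of a report
#     })
#     print(graph.number_of_nodes(), graph.number_of_edges())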