Source code for tooluniverse.enrichr_tool

import json
import requests
import urllib.parse
import networkx as nx
from .base_tool import BaseTool
from .tool_registry import register_tool


@register_tool("EnrichrTool")
class EnrichrTool(BaseTool):
    """
    Tool to perform gene enrichment analysis using Enrichr.

    Gene names are first normalised to official symbols through MyGene.info,
    the resulting list is submitted to Enrichr, the top terms of each
    requested library are fetched, and a gene/term graph is built from them.
    The tool reports the weighted paths between the first two genes and the
    weighted paths from every gene to every node of that graph.
    """

    # Libraries queried when the caller does not supply "libs".
    DEFAULT_LIBS = (
        "WikiPathways_2024_Human",
        "Reactome_Pathways_2024",
        "MSigDB_Hallmark_2020",
        "GO_Molecular_Function_2023",
        "GO_Biological_Process_2023",
    )

    # Number of leading terms kept from each library's result list.
    TOP_TERMS_PER_LIBRARY = 5

    # Seconds allowed for each HTTP call; `requests` applies no timeout unless
    # one is passed explicitly.
    REQUEST_TIMEOUT = 30

    MYGENE_QUERY_URL = "https://mygene.info/v3/query"

    # Value returned by submit_gene_list() on failure. Kept as a named constant
    # so enrichr_api() can recognise it instead of using it as a list ID.
    _SUBMIT_ERROR = "Error submitting gene list to Enrichr"

    def __init__(self, tool_config):
        """
        Initialise the tool and record the Enrichr endpoints.

        Parameters:
            tool_config: Configuration object forwarded unchanged to BaseTool.
        """
        super().__init__(tool_config)
        # Constants
        self.enrichr_url = "https://maayanlab.cloud/Enrichr/addList"
        self.enrichment_url = "https://maayanlab.cloud/Enrichr/enrich"

    def run(self, arguments):
        """
        Main entry point for the tool.

        Parameters:
            arguments (dict): Reads "gene_list" (the genes to analyse) and the
                optional "libs" (Enrichr library names). A missing or None
                "libs" selects DEFAULT_LIBS; an explicit empty list is kept.

        Returns:
            tuple: Whatever enrichr_api() returns.
        """
        genes = arguments.get("gene_list")
        libs = arguments.get("libs")
        if libs is None:
            libs = list(self.DEFAULT_LIBS)
        return self.enrichr_api(genes, libs)

    def _lookup_official_symbol(self, gene_name):
        """
        Query MyGene.info for the official symbol of a gene name or synonym.

        Parameters:
            gene_name (str): The gene name or synonym to query.

        Returns:
            tuple: (symbol, None) when a symbol was found, otherwise
            (None, message) where message describes the failure.
        """
        # Passing `params` lets requests do the URL encoding of the query.
        response = requests.get(
            self.MYGENE_QUERY_URL,
            params={"q": gene_name, "fields": "symbol,alias", "species": "human"},
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            return None, f"Error querying MyGene.info API: {response.status_code}"

        hits = response.json().get("hits", [])
        if not hits:
            return (
                None,
                f"No data found for: {gene_name}. Please check the gene name and try again.",
            )

        wanted = gene_name.upper()
        symbol = None
        # Prefer a hit whose symbol or one of whose aliases matches exactly.
        for hit in hits:
            candidate = hit.get("symbol", "")
            if not candidate:
                continue
            aliases = hit.get("alias", [])
            # A single alias may arrive as a bare string; iterating it
            # directly would compare individual characters.
            if isinstance(aliases, str):
                aliases = [aliases]
            if candidate.upper() == wanted or any(
                wanted == alias.upper() for alias in aliases
            ):
                symbol = candidate
                break

        # If no exact match is found, fall back to the symbol of the top hit.
        if symbol is None:
            symbol = hits[0].get("symbol", None)

        if not symbol:
            return (
                None,
                f"No official gene symbol found for: {gene_name}. Please ensure it is correct.",
            )

        print(
            f"[enrichr_api] Using the official gene name: '{symbol}' instead of {gene_name}",
            flush=True,
        )
        return symbol, None

    def get_official_gene_name(self, gene_name):
        """
        Retrieve the official gene symbol for a given gene name or synonym
        using the MyGene.info API.

        Parameters:
            gene_name (str): The gene name or synonym to query.

        Returns:
            str: The official gene symbol if found; otherwise a human-readable
            error message. No exception is raised for a failed lookup, so the
            caller cannot tell the two apart from the type alone.
        """
        symbol, error = self._lookup_official_symbol(gene_name)
        return symbol if error is None else error

    def submit_gene_list(self, gene_list):
        """
        Submit the gene list to Enrichr and return the user list ID.

        Parameters:
            gene_list (str): Newline-separated string of gene names.

        Returns:
            The "userListId" value from Enrichr's JSON response, or the string
            "Error submitting gene list to Enrichr" when the request fails.
        """
        # (None, value) tuples make requests send plain multipart form fields
        # rather than file uploads.
        payload = {
            "list": (None, gene_list),
            "description": (None, f"Gene list for {gene_list}"),
        }
        response = requests.post(
            self.enrichr_url, files=payload, timeout=self.REQUEST_TIMEOUT
        )
        if not response.ok:
            return self._SUBMIT_ERROR
        return response.json()["userListId"]

    def get_enrichment_results(self, user_list_id, library):
        """
        Fetch enrichment results for a specific library.

        Parameters:
            user_list_id: The user list ID returned by submit_gene_list().
            library (str): The name of the enrichment library.

        Returns:
            dict: The decoded JSON response on success.
            str: An error message when the request fails.
        """
        response = requests.get(
            self.enrichment_url,
            params={"userListId": user_list_id, "backgroundType": library},
            timeout=self.REQUEST_TIMEOUT,
        )
        if not response.ok:
            return f"Error fetching enrichment results for {library}"
        return response.json()

    def build_graph(self, genes, enrichment_results):
        """
        Initialize and build the graph with gene nodes and enriched terms.

        Parameters:
            genes (list): List of gene names.
            enrichment_results (dict): Maps library name to a list of term
                rows. Index 1 of a row is used as the term name, index 4 as
                the edge weight and index 5 as the genes linked to the term.

        Returns:
            networkx.Graph: Graph with one "gene" node per input gene, one
            "term" node per row, and a weighted edge for every input gene
            listed in a row.
        """
        G = nx.Graph()
        gene_set = set(genes)  # constant-time membership tests below

        # Add gene nodes
        for gene in genes:
            G.add_node(gene, type="gene")

        # Add enriched terms and edges
        for library, rows in enrichment_results.items():
            for row in rows:
                term_name = row[1]
                # NOTE(review): index 4 is presumably Enrichr's combined
                # score - confirm against the Enrichr API documentation.
                weight = round(row[4], 2)
                G.add_node(term_name, type="term", library=library)
                for gene in row[5]:
                    if gene in gene_set:
                        G.add_edge(gene, term_name, weight=weight)
        return G

    def rank_paths_by_weight(self, G, source, target):
        """
        Find and rank paths between source and target based on total edge weight.

        Parameters:
            G (networkx.Graph): The graph to search.
            source (str): The source node.
            target (str): The target node.

        Returns:
            list: List of tuples (path, weight) sorted by weight descending.
            An edge without a "weight" attribute counts as 1.
        """
        ranked = []
        for path in nx.all_simple_paths(G, source=source, target=target):
            total_weight = sum(
                G[left][right].get("weight", 1)
                for left, right in zip(path, path[1:])
            )
            ranked.append((path, total_weight))
        # list.sort is stable, so equally weighted paths keep discovery order.
        ranked.sort(key=lambda item: item[1], reverse=True)
        return ranked

    def rank_paths_to_term(self, G, gene, term):
        """
        Find and rank paths from a gene to a specified term based on total
        edge weight.

        Parameters:
            G (networkx.Graph): The graph to search.
            gene (str): The source gene.
            term (str): The target term.

        Returns:
            list or None: List of tuples (path, weight) sorted by weight
            descending, or None if no paths.
        """
        ranked = self.rank_paths_by_weight(G, gene, term)
        return ranked or None

    def enrichr_api(self, genes, libs):
        """
        Main API function to perform gene enrichment analysis.

        Parameters:
            genes (list): List of gene names; at least two are required.
            libs (list): List of enrichment libraries to use.

        Returns:
            tuple: (connected_path, connections). connected_path maps
            "Path: [...]" to "Total Weight: ..." for every path between the
            first two genes. connections maps "Connectivity: gene - node" to
            the ranked (path, weight) list for each gene/node pair that is
            connected.

        Raises:
            ValueError: If fewer than two genes are supplied.
            RuntimeError: If Enrichr rejects the gene list submission.
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(genes, str):
            genes = [genes]
        genes = list(genes or [])

        # Validate before any network traffic; name resolution maps names one
        # to one, so the count cannot change afterwards.
        if len(genes) < 2:
            raise ValueError(
                "At least two genes are required to rank paths between genes."
            )

        # Convert each gene to its official name. A failed lookup keeps the
        # caller's spelling so that an error message is never submitted to
        # Enrichr as if it were a gene.
        official_genes = []
        for gene in genes:
            symbol, error = self._lookup_official_symbol(gene)
            if error is not None:
                print(f"[Enrichr] {error} Keeping '{gene}' as provided.", flush=True)
                symbol = gene
            official_genes.append(symbol)
        genes = official_genes
        print("Official gene names:", genes)

        # Prepare the gene list for Enrichr submission
        user_list_id = self.submit_gene_list("\n".join(genes))
        if user_list_id == self._SUBMIT_ERROR:
            raise RuntimeError(
                f"{self._SUBMIT_ERROR}; cannot run enrichment for {genes}."
            )

        # Retrieve enrichment results for each specified library
        enrichment_results = {}
        for library in libs:
            results = self.get_enrichment_results(user_list_id, library)
            if not isinstance(results, dict):
                # A failed fetch yields an error string; skip this library
                # rather than failing on `.get` below.
                print(f"[Enrichr] {results}", flush=True)
                enrichment_results[library] = []
                continue
            # Keep the top results; a missing library key means no terms.
            enrichment_results[library] = results.get(library, [])[
                : self.TOP_TERMS_PER_LIBRARY
            ]

        # Build the graph from the gene list and enrichment results
        G = self.build_graph(genes, enrichment_results)

        # Rank paths from the first gene to the second
        connected_path = {
            f"Path: {path}": f"Total Weight: {weight}"
            for path, weight in self.rank_paths_by_weight(G, genes[0], genes[1])
        }

        # Compute connectivity data for each gene and graph node
        connections = {}
        for gene in genes:
            for node in G.nodes:
                paths_to_node = self.rank_paths_to_term(G, gene, node)
                if paths_to_node is not None:
                    connections[f"Connectivity: {gene} - {node}"] = paths_to_node

        # Check for empty outputs and print helper messages
        if not connected_path:
            print(
                f"[Enrichr] No ranked paths were found between the gene pair {genes}."
            )
        if not connections:
            print(
                f"[Enrichr] No connection between genes and terms in the enriched graph of {genes}."
            )

        return connected_path, connections