# Source code for tooluniverse.meme_tool

"""
MEME Suite Tool

Provides access to the MEME Suite motif analysis web services at
https://meme-suite.org/meme/

The MEME Suite is the gold-standard toolkit for sequence motif analysis.
This tool wraps three core programs:

- FIMO (Find Individual Motif Occurrences): Scan sequences for known TF
  binding motifs from databases like JASPAR and HOCOMOCO. Returns binding
  sites with p-values, scores, and genomic coordinates.

- MEME (Multiple Em for Motif Elicitation): De novo motif discovery from
  a set of input sequences. Finds overrepresented sequence patterns that
  may correspond to TF binding sites, splice sites, or other regulatory
  elements.

- TOMTOM: Compare a query motif against a database of known motifs
  (JASPAR, HOCOMOCO, CIS-BP, etc.) to identify the transcription factor
  most likely to bind the discovered motif.

Additionally provides a local database listing endpoint that catalogs
available motif databases by category (no remote API call needed).

API pattern:
1. POST multipart form to https://meme-suite.org/meme/tools/{program}
2. Parse job ID from the HTML verification response
3. Poll status via GET .../info/status?service={PROGRAM}&id={job_id}&xml=1
   Status values: pending, active, done, failed, expired, unknown
4. Retrieve TSV/text results from .../opal-jobs/{job_id}/{output_file}

No authentication required. Free academic service.
"""

import re
import time
import requests
from typing import Dict, Any, List, Optional
from .base_tool import BaseTool
from .tool_registry import register_tool

# Base URL for all MEME Suite endpoints: tools/{program} for submission,
# info/status for polling, opal-jobs/{job_id}/ for result retrieval.
MEME_BASE_URL = "https://meme-suite.org/meme"

# Motif database categories available on MEME Suite
# category_id -> {name, description, db_count}
# NOTE(review): the "count" values look like a snapshot of the meme-suite.org
# listings at authoring time — verify against the live site periodically.
MOTIF_DB_CATEGORIES = [
    {
        "id": 1,
        "name": "Eukaryote DNA",
        "count": 14,
        "description": "Curated eukaryotic TF motif collections including HOCOMOCO, Swiss Regulon, UniPROBE",
    },
    {
        "id": 2,
        "name": "Prokaryote DNA",
        "count": 5,
        "description": "Prokaryotic TF motif databases including RegTransBase and DPInteract",
    },
    {
        "id": 3,
        "name": "Methylcytosine DNA",
        "count": 1,
        "description": "Methylcytosine-aware TF binding motifs",
    },
    {
        "id": 4,
        "name": "JASPAR NON-REDUNDANT DNA",
        "count": 51,
        "description": "JASPAR CORE non-redundant TF binding profiles (2014-2026). Recommended for TF motif scanning.",
    },
    {
        "id": 5,
        "name": "JASPAR REDUNDANT DNA",
        "count": 51,
        "description": "JASPAR CORE redundant TF binding profiles (2014-2026)",
    },
    {
        "id": 6,
        "name": "JASPAR COLLECTIONS DNA",
        "count": 10,
        "description": "JASPAR specialized collections (PBM, CNE, POLII, PHYLOFACTS, SPLICE, etc.)",
    },
    {
        "id": 7,
        "name": "HOCOMOCO DNA",
        "count": 4,
        "description": "HOCOMOCO v12 human and mouse ortholog TF binding models. High-quality curated collection.",
    },
    {
        "id": 8,
        "name": "TFBSshape DNA",
        "count": 3,
        "description": "TFBSshape TF binding site shape-based motifs",
    },
    {
        "id": 9,
        "name": "CIS-BP 2.00 DNA",
        "count": 729,
        "description": "CIS-BP 2.00 single-species TF binding motifs (729 species)",
    },
    {
        "id": 10,
        "name": "CIS-BP 1.02 DNA",
        "count": 321,
        "description": "CIS-BP 1.02 single-species TF binding motifs (321 species)",
    },
    {
        "id": 11,
        "name": "ARABIDOPSIS DNA",
        "count": 2,
        "description": "Arabidopsis thaliana TF binding motifs (AthaMap, AGRIS)",
    },
    {
        "id": 12,
        "name": "ECOLI DNA",
        "count": 2,
        "description": "Escherichia coli TF binding motifs (DPInteract, RegTransBase)",
    },
    {
        "id": 13,
        "name": "FLY DNA",
        "count": 6,
        "description": "Drosophila melanogaster TF binding motifs (DMMPMM, FlyFactorSurvey, etc.)",
    },
    {
        "id": 14,
        "name": "HUMAN DNA",
        "count": 4,
        "description": "Human TF binding motifs (TRANSFAC, Zhao2011, Wei2010, Jolma2013)",
    },
    {
        "id": 15,
        "name": "MALARIA DNA",
        "count": 1,
        "description": "Plasmodium falciparum TF binding motifs",
    },
    {
        "id": 16,
        "name": "MOUSE DNA",
        "count": 3,
        "description": "Mus musculus TF binding motifs (UniPROBE, Chen2008)",
    },
    {
        "id": 17,
        "name": "WORM DNA",
        "count": 1,
        "description": "Caenorhabditis elegans TF binding motifs",
    },
    {
        "id": 18,
        "name": "YEAST DNA",
        "count": 4,
        "description": "Saccharomyces cerevisiae TF binding motifs (SCPD, MacIsaac, etc.)",
    },
    {
        "id": 19,
        "name": "CISBP-RNA RNA",
        "count": 729,
        "description": "CIS-BP RNA binding protein motifs (single species)",
    },
    {
        "id": 21,
        "name": "RNA",
        "count": 3,
        "description": "RNA binding protein motifs (Ray2013, etc.)",
    },
]

# Popular JASPAR database listings
# Shortcut name -> form parameters: "category" feeds the target_motifs_source
# form field and "listing" feeds target_motifs_db_listing in _tomtom_compare.
JASPAR_DB_LISTINGS = {
    "JASPAR2026_vertebrates": {
        "category": 4,
        "listing": 22,
        "description": "JASPAR CORE 2026 vertebrates (non-redundant)",
    },
    "JASPAR2026_all": {
        "category": 4,
        "listing": 21,
        "description": "JASPAR CORE 2026 all species (non-redundant)",
    },
    "JASPAR2024_vertebrates": {
        "category": 4,
        "listing": 30,
        "description": "JASPAR CORE 2024 vertebrates (non-redundant)",
    },
    "HOCOMOCO_v12": {
        "category": 7,
        "listing": 1,
        "description": "HOCOMOCO v12 human and mouse CORE motifs",
    },
}


def _build_meme_motif_text(motif_name, matrix_rows, alphabet="ACGT"):
    """Build a minimal MEME-format motif string from a probability matrix."""
    w = len(matrix_rows)
    header = (
        "MEME version 5\n\n"
        "ALPHABET= {}\n\n"
        "strands: + -\n\n"
        "Background letter frequencies\n"
        "A 0.25 C 0.25 G 0.25 T 0.25\n\n"
        "MOTIF {}\n"
        "letter-probability matrix: alength= {} w= {} nsites= 100 E= 0\n"
    ).format(alphabet, motif_name, len(alphabet), w)
    rows = "\n".join(" ".join("{:.2f}".format(v) for v in row) for row in matrix_rows)
    return header + rows


@register_tool("MEMETool")
class MEMETool(BaseTool):
    """
    Tool for motif analysis using the MEME Suite web services.

    Supported operations (dispatched via the ``operation`` argument):
    - fimo_scan: Scan sequences for known TF binding motifs
    - discover_motifs: De novo motif discovery with MEME
    - tomtom_compare: Compare motifs against known databases
    - list_databases: List available motif databases (local, no API call)

    All operations except list_databases submit jobs to meme-suite.org
    and poll for results.
    """

    def __init__(self, tool_config):
        """Initialize the tool and a browser-like requests session.

        The MEME Suite web forms are served for browsers, so a desktop
        User-Agent and Accept headers are set on the shared session.
        """
        super().__init__(tool_config)
        self.parameter = tool_config.get("parameter", {})
        self.required = self.parameter.get("required", [])
        self.session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": (
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/120.0.0.0 Safari/537.36"
                ),
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.5",
            }
        )

    def run(self, arguments):
        """Execute the MEME Suite tool with given arguments.

        Returns a dict with "status" of "success" or "error"; network
        failures are mapped to structured error responses rather than
        raising.
        """
        operation = arguments.get("operation")
        if not operation:
            return {"status": "error", "error": "Missing required parameter: operation"}
        operation_handlers = {
            "fimo_scan": self._fimo_scan,
            "discover_motifs": self._discover_motifs,
            "tomtom_compare": self._tomtom_compare,
            "list_databases": self._list_databases,
        }
        handler = operation_handlers.get(operation)
        if not handler:
            return {
                "status": "error",
                "error": "Unknown operation: {}. Available: {}".format(
                    operation, list(operation_handlers.keys())
                ),
            }
        try:
            return handler(arguments)
        except requests.exceptions.Timeout:
            return {
                "status": "error",
                "error": "MEME Suite request timed out. The server may be busy.",
            }
        except requests.exceptions.ConnectionError:
            return {
                "status": "error",
                "error": "Could not connect to MEME Suite. Service may be temporarily unavailable.",
            }
        except Exception as e:
            # Boundary catch-all: tool results must always be a status dict.
            return {
                "status": "error",
                "error": "MEME Suite error: {}".format(str(e)),
            }

    # ─── FIMO ──────────────────────────────────────────────────────────

    def _fimo_scan(self, arguments):
        """
        Scan sequences for known TF binding motifs using FIMO.

        Submits a FIMO job with inline motifs (MEME format) or database
        motifs, polls for completion, and returns parsed TSV results.
        """
        sequences = arguments.get("sequences")
        if not sequences:
            return {
                "status": "error",
                "error": "Missing required parameter: sequences (FASTA format)",
            }
        # Build motif input
        motif_text = arguments.get("motif_text")
        if not motif_text:
            return {
                "status": "error",
                "error": "Missing required parameter: motif_text (MEME format motif)",
            }
        pvalue_threshold = arguments.get("pvalue_threshold", 0.0001)
        scan_rc = arguments.get("scan_rc", True)
        # Build form data; (None, value) tuples make requests send plain
        # multipart form fields instead of file uploads.
        form_data = {
            "motifs_source": (None, "text"),
            "motifs_alphabet": (None, "dna"),
            "motifs_text": (None, motif_text),
            "sequences_source": (None, "text"),
            "sequences_text": (None, sequences),
            "output_pv": (None, str(pvalue_threshold)),
            "email": (None, "tooluniverse@example.com"),
            "description": (None, "FIMO scan via ToolUniverse"),
            "background_source": (None, "uniform"),
            "search": (None, "Start Search"),
        }
        if not scan_rc:
            form_data["norc"] = (None, "1")
        # Submit
        try:
            resp = self.session.post(
                "{}/tools/fimo".format(MEME_BASE_URL),
                files=form_data,
                timeout=120,
            )
        except requests.exceptions.ReadTimeout:
            return {"status": "error", "error": "FIMO submission timed out"}
        if resp.status_code != 200:
            return {
                "status": "error",
                "error": "FIMO submission returned HTTP {}".format(resp.status_code),
            }
        # Extract job ID
        job_id = self._extract_job_id(resp.text)
        if not job_id:
            # Check for a form-validation error in the HTML response
            error_msg = self._extract_form_error(resp.text)
            if error_msg:
                return {"status": "error", "error": "FIMO error: {}".format(error_msg)}
            return {
                "status": "error",
                "error": "Failed to extract FIMO job ID from response",
            }
        service = "FIMO"
        # Poll for completion
        status = self._poll_job(service, job_id)
        if status == "failed":
            error_detail = self._get_job_error(job_id)
            return {
                "status": "error",
                "error": "FIMO job failed: {}".format(error_detail or "unknown error"),
            }
        if status != "done":
            return {
                "status": "error",
                "error": "FIMO job did not complete (status: {})".format(status),
            }
        # Fetch TSV results
        tsv_url = "{}/opal-jobs/{}/fimo.tsv".format(MEME_BASE_URL, job_id)
        try:
            tsv_resp = self.session.get(tsv_url, timeout=30, allow_redirects=True)
        except Exception as e:
            return {
                "status": "error",
                "error": "Failed to fetch FIMO results: {}".format(str(e)),
            }
        if tsv_resp.status_code != 200:
            return {
                "status": "error",
                "error": "FIMO results returned HTTP {}".format(tsv_resp.status_code),
            }
        # Parse TSV
        hits = self._parse_fimo_tsv(tsv_resp.text)
        return {
            "status": "success",
            "data": {
                "hits": hits,
                "total_hits": len(hits),
                "job_id": job_id,
                "pvalue_threshold": pvalue_threshold,
                "result_url": "{}/opal-jobs/{}/fimo.html".format(MEME_BASE_URL, job_id),
            },
        }

    # ─── MEME (de novo discovery) ──────────────────────────────────────

    def _discover_motifs(self, arguments):
        """
        Run de novo motif discovery using MEME.

        Requires multiple FASTA sequences (>= 2) as input. Discovers
        overrepresented sequence patterns.
        """
        sequences = arguments.get("sequences")
        if not sequences:
            return {
                "status": "error",
                "error": "Missing required parameter: sequences (FASTA format)",
            }
        nmotifs = arguments.get("nmotifs", 3)
        minw = arguments.get("minw", 6)
        maxw = arguments.get("maxw", 50)
        distribution = arguments.get("distribution", "zoops")
        scan_rc = arguments.get("scan_rc", True)
        # MEME background_source is numeric for Markov model order
        form_data = {
            "disc_mode": (None, "classic"),
            "alphabet_custom": (None, "0"),
            "sequences_source": (None, "text"),
            "sequences_text": (None, sequences),
            "dist": (None, distribution),
            "nmotifs": (None, str(nmotifs)),
            "email": (None, "tooluniverse@example.com"),
            "description": (None, "MEME discovery via ToolUniverse"),
            "background_source": (None, "0"),
            "minw": (None, str(minw)),
            "maxw": (None, str(maxw)),
            "search": (None, "Start Search"),
        }
        if not scan_rc:
            form_data["norc"] = (None, "1")
        try:
            resp = self.session.post(
                "{}/tools/meme".format(MEME_BASE_URL),
                files=form_data,
                timeout=120,
            )
        except requests.exceptions.ReadTimeout:
            return {"status": "error", "error": "MEME submission timed out"}
        if resp.status_code != 200:
            return {
                "status": "error",
                "error": "MEME submission returned HTTP {}".format(resp.status_code),
            }
        job_id = self._extract_job_id(resp.text)
        if not job_id:
            error_msg = self._extract_form_error(resp.text)
            if error_msg:
                return {"status": "error", "error": "MEME error: {}".format(error_msg)}
            return {
                "status": "error",
                "error": "Failed to extract MEME job ID from response",
            }
        service = "MEME"
        # MEME can take longer - poll with longer max wait
        status = self._poll_job(service, job_id, max_wait=600, interval=10)
        if status == "failed":
            error_detail = self._get_job_error(job_id)
            return {
                "status": "error",
                "error": "MEME job failed: {}".format(error_detail or "unknown error"),
            }
        if status != "done":
            return {
                "status": "error",
                "error": "MEME job did not complete (status: {}). "
                "De novo discovery may need more time for large inputs.".format(status),
            }
        # Fetch text output
        txt_url = "{}/opal-jobs/{}/meme.txt".format(MEME_BASE_URL, job_id)
        try:
            txt_resp = self.session.get(txt_url, timeout=30, allow_redirects=True)
        except Exception as e:
            return {
                "status": "error",
                "error": "Failed to fetch MEME results: {}".format(str(e)),
            }
        if txt_resp.status_code != 200:
            return {
                "status": "error",
                "error": "MEME results returned HTTP {}".format(txt_resp.status_code),
            }
        # Parse MEME text output
        motifs = self._parse_meme_text(txt_resp.text)
        return {
            "status": "success",
            "data": {
                "motifs": motifs,
                "total_motifs": len(motifs),
                "job_id": job_id,
                "parameters": {
                    "nmotifs": nmotifs,
                    "minw": minw,
                    "maxw": maxw,
                    "distribution": distribution,
                },
                "result_url": "{}/opal-jobs/{}/meme.html".format(MEME_BASE_URL, job_id),
            },
        }

    # ─── TOMTOM ────────────────────────────────────────────────────────

    def _tomtom_compare(self, arguments):
        """
        Compare a query motif against a database of known motifs using TOMTOM.

        The query motif must be in MEME format. Compares against a selected
        target database (JASPAR, HOCOMOCO, etc.).
        """
        query_motif = arguments.get("query_motif")
        if not query_motif:
            return {
                "status": "error",
                "error": "Missing required parameter: query_motif (MEME format)",
            }
        # Target database (shortcut name -> category/listing form values)
        target_db = arguments.get("target_db", "JASPAR2026_vertebrates")
        db_info = JASPAR_DB_LISTINGS.get(target_db)
        if not db_info:
            return {
                "status": "error",
                "error": "Unknown target_db '{}'. Available: {}".format(
                    target_db, list(JASPAR_DB_LISTINGS.keys())
                ),
            }
        evalue_threshold = arguments.get("evalue_threshold", 0.5)
        comparison_function = arguments.get("comparison_function", "pearson")
        form_data = {
            "query_motifs_source": (None, "text"),
            "query_motifs_alphabet": (None, "dna"),
            "query_motifs_text": (None, query_motif),
            "target_motifs_source": (None, str(db_info["category"])),
            "target_motifs_db_listing": (None, str(db_info["listing"])),
            "instant_run": (None, "1"),
            "comparison_function": (None, comparison_function),
            "thresh_type": (None, "evalue"),
            "thresh": (None, str(evalue_threshold)),
            "complete_scoring": (None, "1"),
            "email": (None, "tooluniverse@example.com"),
            "description": (None, "TOMTOM comparison via ToolUniverse"),
            "search": (None, "Start Search"),
        }
        try:
            resp = self.session.post(
                "{}/tools/tomtom".format(MEME_BASE_URL),
                files=form_data,
                timeout=120,
            )
        except requests.exceptions.ReadTimeout:
            return {"status": "error", "error": "TOMTOM submission timed out"}
        if resp.status_code != 200:
            return {
                "status": "error",
                "error": "TOMTOM submission returned HTTP {}".format(resp.status_code),
            }
        # Check for immediate form-validation error before looking for a job ID
        error_msg = self._extract_form_error(resp.text)
        if error_msg:
            return {"status": "error", "error": "TOMTOM error: {}".format(error_msg)}
        job_id = self._extract_job_id(resp.text)
        if not job_id:
            return {
                "status": "error",
                "error": "Failed to extract TOMTOM job ID from response",
            }
        service = "TOMTOM"
        # TOMTOM runs on short queue, should be fast
        status = self._poll_job(service, job_id, max_wait=300, interval=5)
        if status == "failed":
            error_detail = self._get_job_error(job_id)
            return {
                "status": "error",
                "error": "TOMTOM job failed: {}".format(
                    error_detail or "unknown error"
                ),
            }
        if status != "done":
            return {
                "status": "error",
                "error": "TOMTOM job did not complete (status: {})".format(status),
            }
        # Fetch TSV results
        tsv_url = "{}/opal-jobs/{}/tomtom.tsv".format(MEME_BASE_URL, job_id)
        try:
            tsv_resp = self.session.get(tsv_url, timeout=30, allow_redirects=True)
        except Exception as e:
            return {
                "status": "error",
                "error": "Failed to fetch TOMTOM results: {}".format(str(e)),
            }
        if tsv_resp.status_code != 200:
            return {
                "status": "error",
                "error": "TOMTOM results returned HTTP {}".format(tsv_resp.status_code),
            }
        # Parse TSV
        matches = self._parse_tomtom_tsv(tsv_resp.text)
        return {
            "status": "success",
            "data": {
                "matches": matches,
                "total_matches": len(matches),
                "job_id": job_id,
                "target_database": target_db,
                "evalue_threshold": evalue_threshold,
                "result_url": "{}/opal-jobs/{}/tomtom.html".format(
                    MEME_BASE_URL, job_id
                ),
            },
        }

    # ─── List databases ────────────────────────────────────────────────

    def _list_databases(self, arguments):
        """List available motif databases on MEME Suite (local data, no API call)."""
        category_filter = arguments.get("category_filter")
        categories = MOTIF_DB_CATEGORIES
        if category_filter:
            # Case-insensitive substring match on name or description;
            # lowercase the filter once instead of per category.
            needle = category_filter.lower()
            categories = [
                c
                for c in categories
                if needle in c["name"].lower() or needle in c["description"].lower()
            ]
        # Also include the shortcut database names
        db_shortcuts = [
            {
                "name": k,
                "description": v["description"],
                "category_id": v["category"],
                "listing_id": v["listing"],
            }
            for k, v in JASPAR_DB_LISTINGS.items()
        ]
        return {
            "status": "success",
            "data": {
                "categories": categories,
                "total_categories": len(categories),
                "database_shortcuts": db_shortcuts,
                "note": "Use category IDs with FIMO/TOMTOM. Database shortcuts can be used directly with the target_db parameter in tomtom_compare.",
            },
        }

    # ─── Shared helpers ────────────────────────────────────────────────

    def _extract_job_id(self, html):
        """Extract job ID from the MEME Suite verification response.

        Returns the job ID string, or None when no ID is present.
        """
        # Pattern: "id": "appFIMO_5.5.91772090284184-1332521127"
        match = re.search(r'"id":\s*"(app[^"]+)"', html)
        if match:
            return match.group(1)
        return None

    def _extract_form_error(self, html):
        """Extract error message from a MEME Suite form error response.

        Returns a "; "-joined, tag-stripped message, or None if no error
        markers are recognized in the HTML.
        """
        if "Problems with request" in html:
            errors = re.findall(r"<li>(.*?)</li>", html, re.S)
            if errors:
                return "; ".join(re.sub(r"<[^>]+>", "", e).strip() for e in errors)
        # Also check for HTTP 500 servlet errors
        msg_match = re.search(r"<b>Message</b>\s*(.*?)</p>", html, re.S)
        if msg_match:
            return re.sub(r"<[^>]+>", "", msg_match.group(1)).strip()
        return None

    def _poll_job(self, service, job_id, max_wait=300, interval=5):
        """
        Poll MEME Suite job status until completion.

        Status values: pending, active, done, failed, expired, unknown.
        Returns the terminal status, or "timeout" if max_wait elapses first.
        """
        status_url = "{}/info/status?service={}&id={}&xml=1".format(
            MEME_BASE_URL, service, job_id
        )
        elapsed = 0
        while elapsed < max_wait:
            try:
                resp = self.session.get(status_url, timeout=30)
                if resp.status_code == 200:
                    # Parse XML status
                    status_match = re.search(r"<status>(.*?)</status>", resp.text)
                    if status_match:
                        status = status_match.group(1).strip()
                        if status in ("done", "failed", "expired", "unknown"):
                            return status
            except Exception:
                pass  # Network blip, retry
            time.sleep(interval)
            elapsed += interval
        return "timeout"

    def _get_job_error(self, job_id):
        """Fetch error details from a failed job's index.html.

        Prefers <li> items that look like errors; falls back to all list
        items. Returns None when nothing useful can be retrieved.
        """
        url = "{}/opal-jobs/{}/index.html".format(MEME_BASE_URL, job_id)
        try:
            resp = self.session.get(url, timeout=30, allow_redirects=True)
            if resp.status_code == 200:
                # Extract error messages from <li> tags
                errors = re.findall(r"<li>(.*?)</li>", resp.text, re.S)
                if errors:
                    clean = [
                        re.sub(r"<[^>]+>", "", e).strip()
                        for e in errors
                        if "Error" in e
                        or "invalid" in e.lower()
                        or "failed" in e.lower()
                    ]
                    if clean:
                        return "; ".join(clean)
                    return "; ".join(
                        re.sub(r"<[^>]+>", "", e).strip() for e in errors
                    )
        except Exception:
            pass  # Best-effort only; caller handles None
        return None

    def _parse_fimo_tsv(self, tsv_text):
        """Parse FIMO TSV output into structured hits.

        Skips comment/header lines and silently drops malformed rows.
        """
        hits = []
        for line in tsv_text.strip().split("\n"):
            if line.startswith("#") or line.startswith("motif_id"):
                continue
            parts = line.split("\t")
            if len(parts) < 10:
                continue
            try:
                hit = {
                    "motif_id": parts[0],
                    "motif_alt_id": parts[1] if parts[1] else None,
                    "sequence_name": parts[2],
                    "start": int(parts[3]),
                    "stop": int(parts[4]),
                    "strand": parts[5],
                    "score": float(parts[6]),
                    "pvalue": float(parts[7]),
                    "qvalue": float(parts[8]),
                    "matched_sequence": parts[9],
                }
                hits.append(hit)
            except (ValueError, IndexError):
                continue
        return hits

    def _parse_tomtom_tsv(self, tsv_text):
        """Parse TOMTOM TSV output into structured matches.

        Skips comment/header lines and silently drops malformed rows.
        """
        matches = []
        for line in tsv_text.strip().split("\n"):
            if line.startswith("#") or line.startswith("Query_ID"):
                continue
            parts = line.split("\t")
            if len(parts) < 10:
                continue
            try:
                match = {
                    "query_id": parts[0],
                    "target_id": parts[1],
                    "optimal_offset": int(parts[2]),
                    "pvalue": float(parts[3]),
                    "evalue": float(parts[4]),
                    "qvalue": float(parts[5]),
                    "overlap": int(parts[6]),
                    "query_consensus": parts[7],
                    "target_consensus": parts[8],
                    "orientation": parts[9],
                }
                matches.append(match)
            except (ValueError, IndexError):
                continue
        return matches

    def _parse_meme_text(self, text):
        """Parse MEME text output to extract discovered motifs.

        Extracts per-motif summary lines (consensus, width, sites, llr,
        E-value) and, when present, the letter-probability matrices that
        follow them. Matrices are paired with motifs by order of appearance.
        """
        motifs = []
        # Find all MOTIF blocks
        # Pattern: MOTIF <consensus> MEME-<N> width = <w> sites = <s> llr = <llr> E-value = <e>
        motif_pattern = re.compile(
            r"MOTIF\s+(\S+)\s+MEME-(\d+)\s+width\s*=\s*(\d+)\s+"
            r"sites\s*=\s*(\d+)\s+llr\s*=\s*(\d+)\s+E-value\s*=\s*(\S+)"
        )
        # Find the simplified probability matrix sections
        matrix_pattern = re.compile(
            r"letter-probability matrix:.*?alength=\s*(\d+)\s+w=\s*(\d+).*?\n"
            r"((?:\s*[\d.]+(?:\s+[\d.]+)*\s*\n)+)",
            re.MULTILINE,
        )
        # NOTE: a previously-compiled "sites sorted" pattern was never used
        # (its result was discarded); it has been removed.
        for match in motif_pattern.finditer(text):
            consensus = match.group(1)
            motif_num = int(match.group(2))
            width = int(match.group(3))
            sites = int(match.group(4))
            llr = int(match.group(5))
            evalue_str = match.group(6)
            try:
                evalue = float(evalue_str)
            except ValueError:
                evalue = None
            motif_entry = {
                "motif_number": motif_num,
                "consensus": consensus,
                "width": width,
                "sites": sites,
                "log_likelihood_ratio": llr,
                "evalue": evalue,
            }
            motifs.append(motif_entry)
        # Try to extract probability matrices, pairing them positionally
        for i, matrix_match in enumerate(matrix_pattern.finditer(text)):
            if i < len(motifs):
                rows_text = matrix_match.group(3).strip()
                matrix = []
                for row_line in rows_text.split("\n"):
                    row_line = row_line.strip()
                    if row_line:
                        try:
                            matrix.append([float(x) for x in row_line.split()])
                        except ValueError:
                            continue
                if matrix:
                    motifs[i]["probability_matrix"] = matrix
        return motifs