Source code for tooluniverse.ampsphere_record_tool

"""AMPSphere single-AMP record + sequence-match tools (live REST, keyless).

AMPSphere (Big Data Biology Lab) is a global survey of antimicrobial peptides
(AMPs) computationally predicted from publicly available metagenomes and
metaproteomes — 863,498 non-redundant AMPs grouped into SPHERE families. The
public API at https://ampsphere-api.big-data-biology.org is keyless (no login,
no token) and returns JSON.

This module adds two tools that complement the catalog-browse / family tools in
``ampsphere_tool.py`` (the disjoint single-record and exact-match capabilities):

- ``AMPSphereGetAmpTool`` (AMPSphere_get_amp): full record for one AMP
  accession (sequence, family, physicochemical properties, QC flags, predicted
  secondary structure, gene/sample provenance) via /v1/amps/{accession}.
- ``AMPSphereSequenceMatchTool`` (AMPSphere_sequence_match): exact-sequence
  membership test — does this peptide already exist in AMPSphere? — via
  /v1/search/sequence-match.

API behavior (verified live): an invalid AMP accession returns HTTP 500 with
body ``{"detail": "invalid accession received."}``; the sequence-match endpoint
is case-sensitive (lowercase input misses), so the query is uppercased and
whitespace-stripped, and a non-member returns ``{"query": ..., "result": null}``.
"""

from typing import Any, Dict, Optional, Tuple
from urllib.parse import quote

import requests

from .base_tool import BaseTool
from .tool_registry import register_tool

_BASE_URL = "https://ampsphere-api.big-data-biology.org/v1"
_TIMEOUT = 30
_HEADERS = {"Accept": "application/json"}
_SOURCE = "AMPSphere (Big Data Biology Lab, ampsphere.big-data-biology.org)"


def _err(message: str, **extra: Any) -> Dict[str, Any]:
    out: Dict[str, Any] = {"status": "error", "error": message}
    out.update(extra)
    return out


def _detail(resp: requests.Response) -> str:
    """Extract the API's JSON ``detail`` message if present, else raw text."""
    try:
        body = resp.json()
        if isinstance(body, dict) and body.get("detail"):
            return str(body["detail"])[:200]
    except ValueError:
        pass
    return (resp.text or "")[:200]


def _request(
    url: str, params: Optional[Dict[str, Any]] = None
) -> Tuple[Optional[Any], Optional[Dict[str, Any]]]:
    """Issue a GET against AMPSphere and return ``(payload, error_dict)``.

    On success the decoded JSON body is returned with ``None`` as the error.
    On any failure (transport error, non-200 status, undecodable body) the
    payload is ``None`` and the second element is an ``_err`` dict.
    """
    try:
        resp = requests.get(url, params=params, headers=_HEADERS, timeout=_TIMEOUT)
    except requests.exceptions.RequestException as exc:
        return None, _err(f"Request to AMPSphere failed: {exc}", url=url)

    if resp.status_code == 200:
        try:
            payload = resp.json()
        except ValueError:
            return None, _err(
                "AMPSphere returned a non-JSON response",
                url=resp.url,
                response_snippet=(resp.text or "")[:200],
            )
        return payload, None

    detail = _detail(resp)
    # An "invalid accession" detail on a non-200 response is how AMPSphere
    # reports a missing record; give it a clean not-found message instead of
    # the bare HTTP status.
    if "invalid accession" in detail.lower():
        message = "AMPSphere has no record for the requested accession."
    else:
        message = f"AMPSphere returned HTTP {resp.status_code}"
    return None, _err(message, url=resp.url, response_snippet=detail)


@register_tool(
    "AMPSphereGetAmpTool",
    config={
        "name": "AMPSphere_get_amp",
        "type": "AMPSphereGetAmpTool",
        "description": (
            "Get the full AMPSphere record for one antimicrobial peptide (AMP) "
            "by accession. AMPSphere (Big Data Biology Lab) is a global survey "
            "of 863,498 AMPs predicted from metagenomes/metaproteomes. Returns "
            "the amino-acid sequence, SPHERE family, length, physicochemical "
            "properties (molecular_weight, isoelectric_point, charge, "
            "aromaticity, instability_index, gravy), quality-control flags "
            "(Antifam, RNAcode, metaproteomes, metatranscriptomes, coordinates; "
            "each Passed/Failed/Not tested), predicted secondary_structure "
            "(helix/turn/sheet fractions), and a metadata.data[] array of "
            "gene/sample provenance rows (GMSC gene accession, gene sequence, "
            "sample, habitat, microbial source, geography). Keyless public API."
        ),
        "parameter": {
            "type": "object",
            "properties": {
                "accession": {
                    "type": "string",
                    "description": (
                        "AMPSphere AMP accession of the form 'AMP10.XXX_XXX'. "
                        "Example: 'AMP10.000_000' (sequence "
                        "KKVKSIFKKALAMMGENEVKAWGIGIK, family SPHERE-III.001_493)."
                    ),
                }
            },
            "required": ["accession"],
        },
        "return_schema": {
            "oneOf": [
                {
                    "type": "object",
                    "description": "Successful AMP lookup.",
                    "properties": {
                        "status": {"type": "string", "enum": ["success"]},
                        "data": {
                            "type": "object",
                            "description": "Full AMPSphere AMP record.",
                            "properties": {
                                "accession": {"type": "string"},
                                "sequence": {"type": "string"},
                                "family": {"type": ["string", "null"]},
                                "length": {"type": ["integer", "null"]},
                                "molecular_weight": {"type": ["number", "null"]},
                                "isoelectric_point": {"type": ["number", "null"]},
                                "charge": {"type": ["number", "null"]},
                                "aromaticity": {"type": ["number", "null"]},
                                "instability_index": {"type": ["number", "null"]},
                                "gravy": {"type": ["number", "null"]},
                                "Antifam": {"type": ["string", "null"]},
                                "RNAcode": {"type": ["string", "null"]},
                                "metaproteomes": {"type": ["string", "null"]},
                                "metatranscriptomes": {"type": ["string", "null"]},
                                "coordinates": {"type": ["string", "null"]},
                                "num_genes": {"type": ["integer", "null"]},
                                "secondary_structure": {
                                    "type": ["object", "null"],
                                    "properties": {
                                        "helix": {"type": ["number", "null"]},
                                        "turn": {"type": ["number", "null"]},
                                        "sheet": {"type": ["number", "null"]},
                                    },
                                },
                                "metadata": {"type": ["object", "null"]},
                            },
                        },
                        "metadata": {
                            "type": "object",
                            "properties": {
                                "source": {"type": "string"},
                                "url": {"type": "string"},
                                "accession": {"type": "string"},
                                "family": {"type": ["string", "null"]},
                                "length": {"type": ["integer", "null"]},
                                "gene_count": {"type": "integer"},
                            },
                        },
                    },
                    "required": ["status", "data"],
                },
                {
                    "type": "object",
                    "description": "Error result.",
                    "properties": {
                        "status": {"type": "string", "enum": ["error"]},
                        "error": {"type": "string"},
                        "url": {"type": "string"},
                        "response_snippet": {"type": "string"},
                    },
                    "required": ["status", "error"],
                },
            ]
        },
        "test_examples": [
            {"accession": "AMP10.000_000"},
            {"accession": "AMP10.000_001"},
        ],
        "label": ["AMPSphere", "Antimicrobial Peptide", "AMP", "Metagenome", "Peptide"],
        "metadata": {
            "tags": [
                "antimicrobial peptide",
                "AMP",
                "AMPSphere",
                "metagenome",
                "metaproteome",
                "physicochemical",
                "peptide",
            ],
            "estimated_execution_time": "1-3 seconds",
        },
    },
)
class AMPSphereGetAmpTool(BaseTool):
    """Fetch a single AMPSphere AMP record by accession."""

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Look up one AMP via ``/amps/{accession}``.

        Args:
            arguments: Tool arguments; the ``accession`` entry is required.
                A missing mapping (``None``) is treated as empty.

        Returns:
            On success, ``{"status": "success", "data": <API record>,
            "metadata": {...}}`` where the metadata repeats the record's
            accession, family and length and adds ``gene_count`` (the length
            of ``metadata.data`` in the record when that is a list, else 0).
            On failure, an error dict with ``"status": "error"``.
        """
        raw = (arguments or {}).get("accession")
        if raw is None or str(raw).strip() == "":
            return _err("accession is required (e.g. 'AMP10.000_000').")
        accession = str(raw).strip()

        # Percent-encode the accession before placing it in the URL path so
        # characters such as "/", "?", "#" or ".." in caller-supplied input
        # cannot retarget the request to another endpoint or add a query
        # string. Well-formed accessions (letters, digits, "." and "_") are
        # left unchanged by quote().
        url = f"{_BASE_URL}/amps/{quote(accession, safe='')}"

        payload, error = _request(url)
        if error is not None:
            return error
        if not isinstance(payload, dict) or not payload.get("accession"):
            return _err(f"No AMPSphere record for accession {accession!r}.", url=url)

        meta = payload.get("metadata")
        gene_rows = meta.get("data") if isinstance(meta, dict) else None
        return {
            "status": "success",
            "data": payload,
            "metadata": {
                "source": _SOURCE,
                "url": url,
                "accession": payload.get("accession"),
                "family": payload.get("family"),
                "length": payload.get("length"),
                "gene_count": len(gene_rows) if isinstance(gene_rows, list) else 0,
            },
        }
@register_tool(
    "AMPSphereSequenceMatchTool",
    config={
        "name": "AMPSphere_sequence_match",
        "type": "AMPSphereSequenceMatchTool",
        "description": (
            "Check whether an exact amino-acid sequence already exists in "
            "AMPSphere (the global survey of 863,498 metagenomic antimicrobial "
            "peptides) and, if so, return its AMPSphere accession. This is an "
            "exact-match membership test, not a homology search (for homology "
            "use AMPSphere's /search/mmseqs or /search/hmmer endpoints). "
            "Returns {query, result, matched}: result is the AMPSphere "
            "accession (e.g. 'AMP10.000_000') when the sequence is a catalog "
            "member, or null when it is not present. The match is case-"
            "insensitive here (input is uppercased before querying). Keyless "
            "public API."
        ),
        "parameter": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": (
                        "Amino-acid sequence (single-letter code) to test for "
                        "exact membership. Example: "
                        "'KKVKSIFKKALAMMGENEVKAWGIGIK' -> AMP10.000_000."
                    ),
                }
            },
            "required": ["query"],
        },
        "return_schema": {
            "oneOf": [
                {
                    "type": "object",
                    "description": "Successful exact-match test.",
                    "properties": {
                        "status": {"type": "string", "enum": ["success"]},
                        "data": {
                            "type": "object",
                            "properties": {
                                "query": {"type": "string"},
                                "result": {"type": ["string", "null"]},
                                "matched": {"type": "boolean"},
                            },
                        },
                        "metadata": {
                            "type": "object",
                            "properties": {
                                "source": {"type": "string"},
                                "url": {"type": "string"},
                                "accession": {"type": ["string", "null"]},
                            },
                        },
                    },
                    "required": ["status", "data"],
                },
                {
                    "type": "object",
                    "description": "Error result.",
                    "properties": {
                        "status": {"type": "string", "enum": ["error"]},
                        "error": {"type": "string"},
                        "url": {"type": "string"},
                        "response_snippet": {"type": "string"},
                    },
                    "required": ["status", "error"],
                },
            ]
        },
        "test_examples": [
            {"query": "KKVKSIFKKALAMMGENEVKAWGIGIK"},
            {"query": "ACDEFGHIKLMNPQRSTVWYACDEFG"},
        ],
        "label": ["AMPSphere", "Antimicrobial Peptide", "AMP", "Sequence", "Peptide"],
        "metadata": {
            "tags": [
                "antimicrobial peptide",
                "AMP",
                "AMPSphere",
                "sequence match",
                "exact match",
                "peptide",
            ],
            "estimated_execution_time": "1-3 seconds",
        },
    },
)
class AMPSphereSequenceMatchTool(BaseTool):
    """Test whether an exact peptide sequence is present in AMPSphere."""

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Query ``/search/sequence-match`` for the supplied sequence.

        Args:
            arguments: Tool arguments; the ``query`` entry is required.
                A missing mapping (``None``) is treated as empty.

        Returns:
            On success, ``{"status": "success", "data": {"query", "result",
            "matched"}, "metadata": {...}}`` where ``matched`` is the
            truthiness of the API's ``result``. On failure, an error dict
            with ``"status": "error"``.
        """
        supplied = (arguments or {}).get("query")
        text = "" if supplied is None else str(supplied)
        if not text.strip():
            return _err("query (an amino-acid sequence) is required.")

        # Per the module notes the endpoint is case-sensitive, so send one
        # contiguous uppercase string with all whitespace removed.
        sequence = "".join(text.split()).upper()
        if not sequence:
            # Defensive guard: input that passed the blank check above
            # always leaves at least one character here.
            return _err("query contains no sequence characters.")

        endpoint = f"{_BASE_URL}/search/sequence-match"
        payload, error = _request(endpoint, {"query": sequence})
        if error is not None:
            return error
        if not isinstance(payload, dict):
            return _err("Unexpected AMPSphere response shape", url=endpoint)

        hit = payload.get("result")
        match_data = {
            "query": payload.get("query", sequence),
            "result": hit,
            "matched": bool(hit),
        }
        provenance = {
            "source": _SOURCE,
            "url": endpoint,
            "accession": hit,
        }
        return {"status": "success", "data": match_data, "metadata": provenance}