Source code for tooluniverse.conoserver_tool
"""ConoServer conopeptide tools (live, keyless).
ConoServer (https://www.conoserver.org) is the reference database of cone-snail
venom peptides (conotoxins/conopeptides). It exposes no per-record JSON/REST
endpoint; the only structured access is the bulk protein export at
``/download/conoserver_protein.xml.gz`` (~8,500 entries). That XML embeds HTML
named entities (``&alpha;``, ``&beta;`` ...), stray control characters, and the
occasional mojibake, so it is not well-formed and stock ``xml.etree`` rejects it.
This module sanitizes those constructs and parses with lxml's recovering parser.
Two tools:
- ``ConoServerGetConopeptideTool`` (ConoServer_get_conopeptide): one record by ID.
- ``ConoServerSearchConopeptidesTool`` (ConoServer_search_conopeptides): filter by
name / sequence / pharmacological family / gene superfamily / cysteine
framework / organism / class.
"""
import gzip
import re
from functools import lru_cache
from html.entities import name2codepoint
from typing import Any, Dict, List, Optional
import requests
from lxml import etree
from .base_tool import BaseTool
from .tool_registry import register_tool
# Bulk protein export (gzip-compressed XML) that both tools load their data from.
_URL = "https://www.conoserver.org/download/conoserver_protein.xml.gz"
# Passed as ``timeout=`` to ``requests.get`` when downloading the export.
_TIMEOUT = 30
# Entity names the sanitizer keeps as entities instead of converting to characters.
_XML_PREDEFINED = {"amp", "lt", "gt", "quot", "apos"}
# Matches ``&name;`` and captures the entity name.
_NAMED_ENTITY = re.compile(r"&([a-zA-Z][a-zA-Z0-9]*);")
# Matches an ``&`` that does not begin one of the predefined entities or a
# decimal / hexadecimal numeric character reference.
_BARE_AMP = re.compile(r"&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#x[0-9a-fA-F]+);)")
# Matches control characters below U+0020 other than tab, line feed and
# carriage return.
_INVALID_XML_CTRL = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
# ConoServer's own XML uses these (misspelled) tag names verbatim.
_FRAMEWORK_TAG = "cysteineFramewrok"
_PI_TAG = "isoelecticPoint"
def _err(message: str, **extra: Any) -> Dict[str, Any]:
out: Dict[str, Any] = {"status": "error", "error": message}
out.update(extra)
return out
def _sanitize_xml(text: str) -> str:
    """Make ConoServer's not-well-formed XML parseable.

    Three passes are applied, in this order:

    1. HTML named entities (e.g. ``&alpha;``) are replaced by the unicode
       character they denote. The five predefined XML entities are left
       untouched, and so are names that ``html.entities.name2codepoint`` does
       not know.
    2. Every ``&`` that still does not begin a predefined entity or a numeric
       character reference is escaped to ``&amp;``.
    3. Control characters matched by ``_INVALID_XML_CTRL`` are removed.

    Args:
        text: The decoded XML document.

    Returns:
        The sanitized document text.
    """

    def _replace_named(match: "re.Match[str]") -> str:
        name = match.group(1)
        if name in _XML_PREDEFINED:
            return match.group(0)
        codepoint = name2codepoint.get(name)
        # Unknown names stay verbatim; their leading ``&`` is escaped by the
        # bare-ampersand pass below.
        return match.group(0) if codepoint is None else chr(codepoint)

    text = _NAMED_ENTITY.sub(_replace_named, text)
    # The replacement has to be the escaped form: substituting a literal "&"
    # would leave every bare ampersand exactly as it was.
    text = _BARE_AMP.sub("&amp;", text)
    return _INVALID_XML_CTRL.sub("", text)
def _text(elem, tag: str) -> Optional[str]:
value = elem.findtext(tag)
if value is None:
return None
value = value.strip()
return value or None
def _parse_entry(elem) -> Dict[str, Any]:
    """Flatten a single ConoServer ``<entry>`` element into a JSON-able dict.

    Scalar fields are read through ``_text`` (stripped text, ``None`` when
    absent or blank). Three fields are lists:

    - ``alternative_names``: non-blank ``alternativeNames/altName`` texts.
    - ``sequence_modifications``: one dict per
      ``sequenceModifications/modification`` element, holding the values of
      ``mod.get`` for ``position``, ``symbol`` and ``name``.
    - ``references``: one dict per direct ``reference`` child.

    Args:
        elem: The ``<entry>`` element.

    Returns:
        The record as a dict of strings, ``None`` values and lists.
    """

    def field(tag: str) -> Optional[str]:
        return _text(elem, tag)

    alt_names = []
    for node in elem.findall("./alternativeNames/altName"):
        label = (node.text or "").strip()
        if label:
            alt_names.append(label)

    modification_attrs = ("position", "symbol", "name")
    modifications: List[Dict[str, Any]] = [
        {attr: mod.get(attr) for attr in modification_attrs}
        for mod in elem.findall("./sequenceModifications/modification")
    ]

    # Each output key equals the child tag it is read from.
    reference_tags = ("authors", "year", "title", "journal", "volume", "pages", "pmid")
    references: List[Dict[str, Any]] = [
        {tag: _text(ref, tag) for tag in reference_tags}
        for ref in elem.findall("reference")
    ]

    return {
        "id": field("id"),
        "name": field("name"),
        "alternative_names": alt_names,
        "class": field("class"),
        "gene_superfamily": field("geneSuperfamily"),
        "cysteine_framework": field(_FRAMEWORK_TAG),
        "pharmacological_family": field("pharmacologicalFamily"),
        "organism_latin": field("organismLatin"),
        "organism_diet": field("organismDiet"),
        "organism_region": field("organismRegion"),
        "sequence": field("sequence"),
        "sequence_modifications": modifications,
        "sequence_evidence": field("sequenceEvidence"),
        "average_mass": field("averageMass"),
        "monoisotopic_mass": field("monoisotopicMass"),
        "isoelectric_point": field(_PI_TAG),
        "extinction_coefficient": field("extinctionCoefficient"),
        "references": references,
    }
@lru_cache(maxsize=1)
def _load_entries() -> List[Dict[str, Any]]:
    """Fetch the bulk ConoServer protein export and parse it into records.

    The download is gunzipped, decoded as UTF-8 (undecodable bytes replaced),
    sanitized, and parsed with lxml's recovering parser. Every ``entry`` child
    of the root is then converted with ``_parse_entry``.

    The result is memoized by ``lru_cache``. Because only returned values are
    stored, a call that raises leaves nothing cached and the next call retries.

    Returns:
        One dict per ``<entry>`` element.

    Raises:
        ValueError: If the parser yields no root element.
        Exception: Whatever the HTTP request, status check or decompression
            raises is propagated unchanged.
    """
    response = requests.get(_URL, timeout=_TIMEOUT)
    response.raise_for_status()

    decoded = gzip.decompress(response.content).decode("utf-8", "replace")
    cleaned = _sanitize_xml(decoded)

    recovering_parser = etree.XMLParser(
        recover=True, huge_tree=True, resolve_entities=False
    )
    document = etree.fromstring(cleaned.encode("utf-8"), parser=recovering_parser)
    if document is None:
        raise ValueError("ConoServer XML could not be parsed")

    return list(map(_parse_entry, document.findall("entry")))
[docs]
@register_tool(
    "ConoServerGetConopeptideTool",
    config={
        "name": "ConoServer_get_conopeptide",
        "type": "ConoServerGetConopeptideTool",
        "description": (
            "Get a full ConoServer conopeptide (cone-snail venom peptide) record "
            "by its ConoServer protein ID (e.g. P00001). Returns the sequence, "
            "post-translational modifications, cysteine framework, gene "
            "superfamily, pharmacological family, source Conus species/diet/"
            "region, average and monoisotopic mass, pI, extinction coefficient, "
            "and literature references. Data from the keyless ConoServer bulk "
            "protein export (conoserver.org)."
        ),
        "parameter": {
            "type": "object",
            "properties": {
                "conoserver_id": {
                    "type": "string",
                    "description": (
                        "ConoServer protein ID, e.g. 'P00001' (alpha-conotoxin "
                        "SI, sequence ICCNPACGPKYSCX, from Conus striatus). "
                        "Case-insensitive."
                    ),
                }
            },
            "required": ["conoserver_id"],
        },
    },
)
class ConoServerGetConopeptideTool(BaseTool):
    """Look up one ConoServer conopeptide record by its ID."""

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Return the record whose ID matches ``arguments["conoserver_id"]``.

        The ID is stringified, stripped and compared case-insensitively against
        every loaded record; the first match wins.

        Args:
            arguments: Tool arguments; ``conoserver_id`` is required.

        Returns:
            A ``"status": "success"`` dict carrying the record under ``data``
            and a short summary under ``metadata``, or an error dict when the
            ID is missing/blank, the data cannot be loaded, or no record has
            that ID.
        """
        requested = arguments.get("conoserver_id")
        wanted = "" if requested is None else str(requested).strip().upper()
        if not wanted:
            return _err("conoserver_id is required")

        try:
            entries = _load_entries()
        except Exception as exc:  # network / decompress / parse
            return _err(f"Failed to load ConoServer data: {exc}", url=_URL)

        hit = next(
            (
                candidate
                for candidate in entries
                if (candidate.get("id") or "").upper() == wanted
            ),
            None,
        )
        if hit is None:
            return _err(
                f"No ConoServer conopeptide found with id {wanted!r}", url=_URL
            )

        summary = {
            "source": "ConoServer",
            "url": _URL,
            "id": hit.get("id"),
            "name": hit.get("name"),
            "sequence": hit.get("sequence"),
            "organism_latin": hit.get("organism_latin"),
        }
        return {"status": "success", "data": hit, "metadata": summary}
# Maps each search-tool argument name to the key of the parsed record dict
# whose value the argument is substring-matched against.
_SUBSTRING_FILTERS = {
    "name": "name",
    "sequence": "sequence",
    "pharmacological_family": "pharmacological_family",
    "gene_superfamily": "gene_superfamily",
    "cysteine_framework": "cysteine_framework",
    "organism": "organism_latin",
    "conopeptide_class": "class",
}
[docs]
@register_tool(
    "ConoServerSearchConopeptidesTool",
    config={
        "name": "ConoServer_search_conopeptides",
        "type": "ConoServerSearchConopeptidesTool",
        "description": (
            "Search ConoServer conopeptides (cone-snail venom peptides) by one or "
            "more case-insensitive substring filters: name, sequence, "
            "pharmacological_family (e.g. 'alpha conotoxin'), gene_superfamily "
            "(e.g. 'A superfamily'), cysteine_framework (e.g. 'I'), organism "
            "(Conus species, e.g. 'Conus geographus'), or conopeptide_class. "
            "Returns matching records (sequence, modifications, masses, organism, "
            "references). Keyless ConoServer bulk export (conoserver.org)."
        ),
        "parameter": {
            "type": "object",
            "properties": {
                "name": {"type": "string", "description": "Peptide name substring."},
                "sequence": {
                    "type": "string",
                    "description": "Amino-acid sequence substring (e.g. 'GCCS').",
                },
                "pharmacological_family": {
                    "type": "string",
                    "description": "e.g. 'alpha conotoxin', 'omega conotoxin'.",
                },
                "gene_superfamily": {
                    "type": "string",
                    "description": "e.g. 'A superfamily', 'O1 superfamily'.",
                },
                "cysteine_framework": {
                    "type": "string",
                    "description": "Cysteine framework, e.g. 'I', 'III', 'VI/VII'.",
                },
                "organism": {
                    "type": "string",
                    "description": "Source Conus species, e.g. 'Conus geographus'.",
                },
                "conopeptide_class": {
                    "type": "string",
                    "description": "Conopeptide class, e.g. 'conotoxin'.",
                },
                "limit": {
                    "type": "integer",
                    "description": "Max records to return (default 25, max 200).",
                },
            },
        },
    },
)
class ConoServerSearchConopeptidesTool(BaseTool):
    """Filter ConoServer conopeptides by case-insensitive substring matches."""

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Return the records that satisfy every supplied filter.

        Each argument named in ``_SUBSTRING_FILTERS`` that is present and not
        blank becomes a lowercase needle; a record matches when every needle
        occurs in the lowercased value of its mapped field (a missing field is
        treated as an empty string).

        Args:
            arguments: Tool arguments. At least one filter is required.
                ``limit`` is optional, defaults to 25 and is clamped to
                1..200; a value that cannot be converted to an integer
                (including an infinite float) falls back to 25.

        Returns:
            A ``"status": "success"`` dict whose ``data`` holds the total match
            ``count``, the number ``returned`` and the ``results`` (at most
            ``limit`` records), or an error dict when no filter is given or the
            data cannot be loaded. ``metadata.filters`` lists exactly the
            filters that were applied, with the values as the caller passed
            them.
        """
        needles: Dict[str, str] = {}  # record field -> lowercase needle
        applied: Dict[str, Any] = {}  # argument name -> value as supplied
        for arg_name, field in _SUBSTRING_FILTERS.items():
            value = arguments.get(arg_name)
            if value is None:
                continue
            needle = str(value).strip().lower()
            if needle:
                needles[field] = needle
                # Recorded here so the metadata can never disagree with the
                # filters that actually ran.
                applied[arg_name] = value
        if not needles:
            return _err(
                "Provide at least one filter: " + ", ".join(_SUBSTRING_FILTERS.keys())
            )

        try:
            limit = int(arguments.get("limit", 25))
        except (TypeError, ValueError, OverflowError):
            # OverflowError: int() of an infinite float.
            limit = 25
        limit = max(1, min(200, limit))

        try:
            entries = _load_entries()
        except Exception as exc:  # network / decompress / parse
            return _err(f"Failed to load ConoServer data: {exc}", url=_URL)

        matched = [
            entry
            for entry in entries
            if all(
                needle in (entry.get(field) or "").lower()
                for field, needle in needles.items()
            )
        ]
        page = matched[:limit]
        return {
            "status": "success",
            "data": {
                "count": len(matched),
                "returned": len(page),
                "results": page,
            },
            "metadata": {
                "source": "ConoServer",
                "url": _URL,
                "filters": applied,
                "total_matched": len(matched),
                "limit": limit,
            },
        }