# Source code for tooluniverse.cellosaurus_tool
import os
import re
import requests
from typing import Any, Dict, List, Tuple
from difflib import SequenceMatcher
from .base_tool import BaseTool
from .tool_registry import register_tool
@register_tool("CellosaurusSearchTool")
class CellosaurusSearchTool(BaseTool):
    """Search Cellosaurus cell lines through the ``/search/cell-line`` endpoint.

    The HTTP timeout (seconds) is read once, at construction time, from the
    ``CELLOSAURUS_TIMEOUT`` environment variable and defaults to 30.
    """

    def __init__(self, tool_config, base_url="https://api.cellosaurus.org"):
        """Store the API base URL and the request timeout.

        Args:
            tool_config: Tool configuration forwarded to ``BaseTool``.
            base_url: Root URL of the Cellosaurus API.

        Raises:
            ValueError: If ``CELLOSAURUS_TIMEOUT`` is set to a non-integer.
        """
        super().__init__(tool_config)
        self.base_url = base_url
        self.timeout_seconds = int(os.environ.get("CELLOSAURUS_TIMEOUT", "30"))

    def run(self, arguments):
        """Validate ``arguments`` and run the search.

        Args:
            arguments: Mapping with the required key ``q`` (query string) and
                the optional keys ``offset`` (default 0) and ``size``
                (default 20).

        Returns:
            The dict produced by ``_search_cell_lines``, or
            ``{"error": ...}`` when ``q`` is missing or blank.
        """
        q = arguments.get("q")
        offset = arguments.get("offset", 0)
        size = arguments.get("size", 20)
        # A whitespace-only query would be stripped to "" further down and
        # sent to the API as an empty search, so treat it as missing.
        if not q or (isinstance(q, str) and not q.strip()):
            return {"error": "`q` parameter is required."}
        return self._search_cell_lines(q, offset, size)

    def _search_cell_lines(self, query, offset, size):
        """Query the ``/search/cell-line`` endpoint and unwrap the response.

        Args:
            query: Search expression; surrounding whitespace is stripped.
            offset: Value sent as the ``offset`` request parameter.
            size: Value sent as the ``size`` request parameter.

        Returns:
            On success a dict with ``success``, ``results`` and ``query``.
            ``results["total"]`` is the number of cell lines contained in
            *this* response, not a server-side total. On failure a dict with
            a single ``error`` key; no exception propagates to the caller.
        """
        try:
            cleaned_query = query.strip()
            # Tolerate a configured base URL that ends with a slash.
            url = f"{self.base_url.rstrip('/')}/search/cell-line"
            resp = requests.get(
                url,
                params={"q": cleaned_query, "offset": offset, "size": size},
                headers={"Accept": "application/json"},
                timeout=self.timeout_seconds,
            )
            resp.raise_for_status()
            data = resp.json()

            # The list is read from data["Cellosaurus"]["cell-line-list"];
            # when either key is absent the result is an empty list.
            cell_lines = []
            if isinstance(data, dict) and "Cellosaurus" in data:
                cellosaurus_data = data["Cellosaurus"]
                if "cell-line-list" in cellosaurus_data:
                    cell_lines = cellosaurus_data["cell-line-list"]

            return {
                "success": True,
                "results": {
                    "cell_lines": cell_lines,
                    "total": len(cell_lines),
                    "offset": offset,
                    "size": size,
                },
                "query": cleaned_query,
            }
        except requests.HTTPError as http_err:
            status = getattr(http_err.response, "status_code", None)
            return {"error": f"HTTP {status}: {http_err}"}
        except Exception as e:
            # Tool boundary: report any other failure (network error, invalid
            # JSON, non-string query, ...) as an error payload.
            return {"error": str(e)}
@register_tool("CellosaurusQueryConverterTool")
class CellosaurusQueryConverterTool(BaseTool):
    """Convert natural language queries to Solr syntax for the Cellosaurus API.

    The conversion is heuristic: every word and every pair of adjacent words
    of the query is compared against the keywords, descriptions and short
    names of the Cellosaurus search fields, and the best-scoring field (if it
    scores above 0.4) yields one ``field:value`` clause. All clauses are
    joined with ``AND``.
    """

    # Words that act as boolean operators; they are never used as search
    # values during term extraction.
    _BOOLEAN_OPERATOR_WORDS = frozenset({"and", "or", "not"})

    def __init__(self, tool_config):
        """Build the field table and the regex substitution tables.

        Args:
            tool_config: Tool configuration forwarded to ``BaseTool``.
        """
        super().__init__(tool_config)
        # Complete Cellosaurus field definitions from official API
        # documentation
        # https://api.cellosaurus.org/api-fields
        # Insertion order matters: it breaks ties between equally scored
        # matches in ``_map_term_to_field``.
        self.cellosaurus_fields = {
            "id": {
                "short_name": "-",
                "description": (
                    "Recommended name. Most frequently the name of the cell "
                    "line as provided in the original publication."
                ),
                "keywords": [
                    "name", "cell line", "cellline", "recommended", "publication",
                ],
            },
            "sy": {
                "short_name": "-",
                "description": (
                    "List of synonyms. We try to list all the different "
                    "synonyms for the cell line, including alternative use of "
                    "lower and upper cases characters. Misspellings are not "
                    'included in synonyms (see the "misspelling" tag).'
                ),
                "keywords": [
                    "synonym", "synonyms", "alias", "alternative", "different names",
                ],
            },
            "idsy": {
                "short_name": "-",
                "description": (
                    "Recommended name with all its synonyms. Concatenation of "
                    "ID and SY."
                ),
                "keywords": ["name", "synonyms", "combined", "concatenation"],
            },
            "ac": {
                "short_name": "-",
                "description": (
                    "Primary accession. It is the unique identifier of the "
                    "cell line. It is normally stable across Cellosaurus "
                    "versions but when two entries are merged, one of the two "
                    "accessions stays primary while the second one becomes "
                    "secondary (see ACAS)"
                ),
                "keywords": ["accession", "primary", "identifier", "unique", "stable"],
            },
            "acas": {
                "short_name": "-",
                "description": (
                    "Primary and secondary accession. Secondary accession are "
                    "former primary accession kept here to ensure the access "
                    "to a cell line via old identifiers."
                ),
                "keywords": [
                    "accession", "primary", "secondary", "former", "old identifiers",
                ],
            },
            "dr": {
                "short_name": "-",
                "description": (
                    "Cross-references to external resources: cell catalogs, "
                    "databases, resources listing cell lines as samples or to "
                    "ontologies. A cross-reference has two parts: the short "
                    "name of the resource (i.e. CCLE) and an identifier used "
                    "to locate a particular entry of the resource related to "
                    "the cell line. For a formal description of all the "
                    "resources referred to in Cellosaurus, see here ."
                ),
                "keywords": [
                    "cross-reference", "external", "catalog", "database",
                    "atcc", "dsmz", "ccle", "ecacc",
                ],
            },
            "ref": {
                "short_name": "-",
                "description": (
                    "Publication references. Mostly publications describing "
                    "the establishment of a cell line or its "
                    "characterization. Can be journal articles, book "
                    "chapters, patents and theses. Contains the "
                    "cross-reference of the publication, its title, authors "
                    "(or group/consortium) and citation elements."
                ),
                "keywords": [
                    "reference", "publication", "paper", "article",
                    "journal", "book", "patent", "thesis",
                ],
            },
            "rx": {
                "short_name": "-",
                "description": (
                    "Publication cross-reference. A unique identifier "
                    "allowing access the publication online. The "
                    "cross-reference has two parts: the shortname of the "
                    "online resource (i.e. PubMed, DOI, PMCID, CLPUB) "
                    "and an identifier used to locate the particular "
                    "publication related to the cell line. For a formal "
                    "description of all the resources referred to in "
                    "Cellosaurus, see here ."
                ),
                "keywords": [
                    "cross-reference", "online", "pubmed", "doi",
                    "pmcid", "clpub", "patent", "identifier",
                ],
            },
            "ra": {
                "short_name": "-",
                "description": (
                    "Publication authors. List of authors of a publication "
                    "referenced in a cell line entry."
                ),
                "keywords": ["author", "authors", "written by", "publication"],
            },
            "rt": {
                "short_name": "-",
                "description": (
                    "Publication title. Title of a publication referenced in "
                    "cell line entry."
                ),
                "keywords": ["title", "paper", "article", "publication"],
            },
            "rl": {
                "short_name": "-",
                "description": (
                    "Publication citation elements. Citation elements of a "
                    "publication referenced in a cell line entry."
                ),
                "keywords": ["citation", "cite", "reference", "elements"],
            },
            "ww": {
                "short_name": "-",
                "description": "Web page related to the cell line",
                "keywords": ["website", "web page", "homepage", "url"],
            },
            "anc": {
                "short_name": "genome-ancestry",
                "description": (
                    "Estimated ethnic ancestry of the donor of a human cell "
                    "line based on the analysis of its genome."
                ),
                "keywords": [
                    "ancestry", "ethnic", "genetic background", "genome", "donor",
                ],
            },
            "hla": {
                "short_name": "-",
                "description": (
                    "HLA typing information. Alleles identified on the MHC "
                    "type I and type II genes of the donor of a human cell "
                    "line."
                ),
                "keywords": ["hla", "mhc", "typing", "alleles", "genes", "donor"],
            },
            "reg": {
                "short_name": "registration",
                "description": (
                    "Official list, or register in which the cell line is "
                    "registered."
                ),
                "keywords": ["registration", "registered", "official", "register"],
            },
            "var": {
                "short_name": "sequence-variation",
                "description": (
                    "Important sequence variations of the cell line compared "
                    "to the reference genome of the species."
                ),
                "keywords": [
                    "variation", "mutation", "sequence", "variant", "snv", "indel",
                ],
            },
            "anec": {
                "short_name": "anecdotal",
                "description": (
                    "Anecdotal details regarding the cell line (its origin, "
                    "its name or any other particularity)."
                ),
                "keywords": ["anecdotal", "story", "history", "background", "origin"],
            },
            "biot": {
                "short_name": "biotechnology",
                "description": (
                    "Type of use of the cell line in a biotechnological " "context."
                ),
                "keywords": [
                    "biotechnology", "biotech", "production", "manufacturing", "use",
                ],
            },
            "breed": {
                "short_name": "-",
                "description": (
                    "Breed or subspecies an animal cell line is derived from "
                    "with breed identifiers from FlyBase_Strain, RS and VBO."
                ),
                "keywords": ["breed", "subspecies", "animal", "flybase", "strain"],
            },
            "caution": {
                "short_name": "-",
                "description": (
                    "Errors, inconsistencies, ambiguities regarding the "
                    "origin or other aspects of the cell line."
                ),
                "keywords": [
                    "caution", "warning", "error", "inconsistency", "ambiguity",
                ],
            },
            "cell": {
                "short_name": "cell-type",
                "description": ("Cell type from which the cell line is derived."),
                "keywords": ["cell type", "cell", "derived", "type"],
            },
            "char": {
                "short_name": "characteristics",
                "description": (
                    "Production process or specific biological properties of "
                    "the cell line."
                ),
                "keywords": [
                    "characteristics", "properties", "biological", "production",
                    "process", "cancer", "tumor", "malignant",
                ],
            },
            "donor": {
                "short_name": "donor-info",
                "description": (
                    "Miscellaneous information relevant to the donor of the "
                    "cell line."
                ),
                "keywords": ["donor", "patient", "miscellaneous", "information"],
            },
            "site": {
                "short_name": "derived-from-site",
                "description": (
                    "Body part (tissue or organ) the cell line is derived " "from."
                ),
                "keywords": [
                    "site", "tissue", "organ", "body part", "derived", "lung",
                    "breast", "colon", "skin", "blood", "bone", "brain",
                ],
            },
            "disc": {
                "short_name": "discontinued",
                "description": (
                    "Discontinuation status of the cell line in a cell line "
                    "catalog."
                ),
                "keywords": [
                    "discontinued", "unavailable", "no longer available", "status",
                ],
            },
            "time": {
                "short_name": "doubling-time",
                "description": "Population doubling-time of the cell line.",
                "keywords": [
                    "doubling time", "doubling", "population", "time", "hours",
                ],
            },
            "from": {
                "short_name": "-",
                "description": (
                    "Laboratory, research institute, university having "
                    "established the cell line."
                ),
                "keywords": [
                    "laboratory", "lab", "institute", "university", "established",
                ],
            },
            "group": {
                "short_name": "-",
                "description": (
                    "Specific group the cell line belongs to (example: fish "
                    "cell lines, vaccine production cell lines)."
                ),
                "keywords": [
                    "group", "fish cell lines", "vaccine production",
                    "stem cell", "embryonic",
                ],
            },
            "kar": {
                "short_name": "karyotype",
                "description": (
                    "Information relevant to the chromosomes of a cell line "
                    "(often to describe chromosomal abnormalities)."
                ),
                "keywords": [
                    "karyotype", "chromosome", "chromosomal", "abnormalities",
                    "defects",
                ],
            },
            "ko": {
                "short_name": "knockout",
                "description": (
                    "Gene(s) knocked-out in the cell line and method to "
                    "obtain the KO."
                ),
                "keywords": ["knockout", "ko", "gene", "knocked-out"],
            },
            "msi": {
                "short_name": "-",
                "description": "Microsatellite instability degree.",
                "keywords": ["msi", "microsatellite instability"],
            },
            "misc": {
                "short_name": "miscellaneous",
                "description": "Miscellaneous remarks about the cell line.",
                "keywords": [
                    "miscellaneous", "other", "additional", "notes", "remarks",
                ],
            },
            "miss": {
                "short_name": "misspelling",
                "description": (
                    "Identified misspelling(s) of the cell line name with in "
                    "some case the specific publication or external resource "
                    "entry where it appears."
                ),
                "keywords": ["misspelling", "misspelled", "typo"],
            },
            "mabi": {
                "short_name": "mab-isotype",
                "description": (
                    "Monoclonal antibody isotype. Examples: IgG2a, kappa; "
                    "IgM, lambda."
                ),
                "keywords": [
                    "isotype", "igg", "igm", "iga", "ige", "monoclonal antibody",
                ],
            },
            "mabt": {
                "short_name": "mab-target",
                "description": (
                    "Monoclonal antibody target molecule. Generally a "
                    "specific protein or chemical compound."
                ),
                "keywords": [
                    "antibody", "mab", "target", "targeting", "protein", "molecule",
                ],
            },
            "omics": {
                "short_name": "-",
                "description": ('"Omics" study(ies) carried out on the cell line.'),
                "keywords": [
                    "omics", "genomics", "transcriptomics", "proteomics",
                    "metabolomics", "study",
                ],
            },
            "part": {
                "short_name": "part-of",
                "description": (
                    "The cell line belongs to a specific panel or collection "
                    "of cell lines."
                ),
                "keywords": ["part", "panel", "collection", "belongs to"],
            },
            "pop": {
                "short_name": "population",
                "description": (
                    "Ethnic group, nationality of the individual from which "
                    "the cell line was sampled."
                ),
                "keywords": [
                    "population", "ethnic", "nationality", "caucasian",
                    "african", "asian",
                ],
            },
            "prob": {
                "short_name": "problematic",
                "description": (
                    "Known problem(s) related to the cell line: contaminated, "
                    "misidentified, misclassified cell line or appearing in a "
                    "retracted paper."
                ),
                "keywords": [
                    "problematic", "contaminated", "misidentified", "problem",
                    "retracted",
                ],
            },
            "res": {
                "short_name": "resistance",
                "description": (
                    "Selected to be resistant to some chemical compound "
                    "(like a drug used in chemotherapy) or toxin. with a "
                    "cross-reference to either ChEBI, DrugBank, NCIt or "
                    "UniProtKB."
                ),
                "keywords": [
                    "resistance", "resistant", "drug", "chemotherapy", "toxin",
                    "cisplatin", "doxorubicin",
                ],
            },
            "sen": {
                "short_name": "senescence",
                "description": "When a finite cell line will senesce.",
                "keywords": ["senescence", "senescent", "finite"],
            },
            "int": {
                "short_name": "integrated",
                "description": (
                    "Genetic element(s) integrated in the cell line: gene "
                    "name and identifier in CGNC, FlyBase, FPbase, HGNC, MGI, "
                    "RGD, UniProtKB, and VGNC."
                ),
                "keywords": ["integrated", "genetic element", "gene"],
            },
            "tfor": {
                "short_name": "transformant",
                "description": (
                    "What caused the cell line to be transformed: generally a "
                    "virus (with a cross-reference to NCBI taxon identifier), "
                    "a chemical compound (with a cross-reference to ChEBI) or "
                    "a form of irradiation (with a cross-reference to NCIt)."
                ),
                "keywords": [
                    "transformant", "transformation", "virus", "chemical",
                    "irradiation",
                ],
            },
            "vir": {
                "short_name": "virology",
                "description": (
                    "Susceptibility of the cell line to viral infection, "
                    "presence of integrated viruses or any other "
                    "virology-related information."
                ),
                "keywords": [
                    "virology", "viral", "virus", "susceptibility", "infection",
                ],
            },
            "cc": {
                "short_name": "-",
                "description": (
                    "Comment(s). Any content described in fields "
                    "genome-ancestry, hla, registration, sequence-variation, "
                    "anecdotal, biotechnology, breed, caution, "
                    "characteristics, discontinued, donor-info, "
                    "doubling-time, from, group, karyotype, knockout, "
                    "miscellaneous, misspelling, mab-isotype, mab-target, "
                    "msi, omics, population, problematic, resistance, "
                    "senescence, transfected, transformant, virology."
                ),
                "keywords": ["comment", "note", "remark", "observation"],
            },
            "str": {
                "short_name": "-",
                "description": "Short tandem repeat profile.",
                "keywords": [
                    "str", "short tandem repeat", "microsatellite", "profile",
                ],
            },
            "di": {
                "short_name": "-",
                "description": (
                    "Disease(s) suffered by the individual from which the "
                    "cell line originated with its NCI Thesaurus or ORDO "
                    "identifier."
                ),
                "keywords": [
                    "disease", "condition", "leukemia", "lymphoma", "carcinoma",
                    "sarcoma",
                ],
            },
            "din": {
                "short_name": "-",
                "description": (
                    "Disease(s) suffered by the individual from which the "
                    "cell line originated, restricted to diseases having a "
                    "NCI Thesaurus identifier."
                ),
                "keywords": ["disease", "nci", "thesaurus"],
            },
            "dio": {
                "short_name": "-",
                "description": (
                    "Disease(s) suffered by the individual from which the "
                    "cell line originated, restricted to diseases having an "
                    "ORDO identifier."
                ),
                "keywords": ["disease", "ordo"],
            },
            "ox": {
                "short_name": "-",
                "description": (
                    "Species of the individual from which the cell line "
                    "originates with its NCBI taxon identifier."
                ),
                "keywords": [
                    "species", "organism", "human", "mouse", "rat", "ncbi", "taxon",
                ],
            },
            "sx": {
                "short_name": "-",
                "description": (
                    "Sex of the individual from which the cell line " "originates."
                ),
                "keywords": ["sex", "gender", "male", "female", "man", "woman"],
            },
            "ag": {
                "short_name": "-",
                "description": (
                    "Age at sampling time of the individual from which the "
                    "cell line was established."
                ),
                "keywords": ["age", "aged", "years", "months", "days", "sampling"],
            },
            "oi": {
                "short_name": "-",
                "description": (
                    "Cell line(s) originating from same individual (sister "
                    "cell lines)."
                ),
                "keywords": ["sister", "sibling", "related", "same individual"],
            },
            "hi": {
                "short_name": "-",
                "description": (
                    "Parent cell line from which the cell line originates."
                ),
                "keywords": ["parent", "derived from"],
            },
            "ch": {
                "short_name": "-",
                "description": (
                    "Cell line(s) originated from the cell line (child cell "
                    "lines)."
                ),
                "keywords": ["child", "derived", "subclone"],
            },
            "ca": {
                "short_name": "-",
                "description": (
                    "Category to which a cell line belongs, one of 14 defined "
                    "terms. Example: cancer cell line, hybridoma, transformed "
                    "cell line."
                ),
                "keywords": [
                    "category", "cancer cell line", "hybridoma", "transformed",
                    "primary", "immortalized",
                ],
            },
            "dt": {
                "short_name": "-",
                "description": (
                    "Creation date, last modification date and version number "
                    "of the cell line Cellosaurus entry."
                ),
                "keywords": ["date", "creation", "modification", "version"],
            },
            "dtc": {
                "short_name": "-",
                "description": ("Creation date of the cell line Cellosaurus entry."),
                "keywords": ["created", "creation", "established", "founded"],
            },
            "dtu": {
                "short_name": "-",
                "description": (
                    "Last modification date of the cell line Cellosaurus " "entry."
                ),
                "keywords": ["modified", "modification", "updated", "changed"],
            },
            "dtv": {
                "short_name": "-",
                "description": ("Version number of the cell line Cellosaurus entry."),
                "keywords": ["version", "v"],
            },
        }
        # Special species mappings with NCBI taxon IDs
        self.species_mappings = {
            "human": "ox:9606",
            "homo sapiens": "ox:9606",
            "mouse": "ox:10090",
            "mus musculus": "ox:10090",
            "rat": "ox:10116",
            "rattus norvegicus": "ox:10116",
        }
        # Boolean operator patterns. The symbol alternatives sit outside the
        # \b anchors: "&", "|" and "!" are non-word characters, so a pattern
        # such as r"\b&\b" cannot match a symbol surrounded by spaces.
        self.boolean_patterns = [
            (r"\band\b|&&?", " AND "),
            (r"\bor\b|\|\|?", " OR "),
            (r"\bnot\b|!", " NOT "),
        ]
        # Wildcard patterns: "*" and "?" are replaced by themselves, i.e.
        # these substitutions leave the query unchanged.
        self.wildcard_patterns = [
            (r"\*", "*"),
            (r"\?", "?"),
        ]
        # Range query patterns: "5 to 10 days" / "between 5 and 10 days"
        # become "[5 TO 10] days".
        self.range_patterns = [
            (
                r"\b(\d+)\s*(?:to|-)\s*(\d+)\s*(hours?|days?|years?)\b",
                r"[\1 TO \2] \3",
            ),
            (
                r"\bbetween\s+(\d+)\s+and\s+(\d+)\s*(hours?|days?|years?)\b",
                r"[\1 TO \2] \3",
            ),
        ]

    def run(self, arguments):
        """Validate ``arguments`` and convert the query.

        Args:
            arguments: Mapping with the required key ``query`` and the
                optional boolean ``include_explanation`` (default ``True``).

        Returns:
            The dict produced by ``_convert_query``, or ``{"error": ...}``
            when ``query`` is missing or empty.
        """
        query = arguments.get("query")
        include_explanation = arguments.get("include_explanation", True)
        if not query:
            return {"error": "`query` parameter is required."}
        return self._convert_query(query, include_explanation)

    def _calculate_similarity(self, term: str, text: str) -> float:
        """Return the case-insensitive ``SequenceMatcher`` ratio of two strings."""
        return SequenceMatcher(None, term.lower(), text.lower()).ratio()

    def _map_term_to_field(self, term: str) -> List[Tuple[str, float, str]]:
        """Score ``term`` against every Cellosaurus field.

        Candidate matches come from, in this order: an exact field-tag match,
        an exact species-mapping match (both scored 1.0), and then for each
        field its keywords (when keyword and term contain one another), its
        description and its short name (when the similarity exceeds 0.3).

        Args:
            term: A lower- or mixed-case word or phrase.

        Returns:
            ``(field_tag, score, reason)`` tuples sorted by descending score;
            equally scored entries keep their insertion order.
        """
        matches = []
        term_lower = term.lower()

        # Direct field tag matches
        if term_lower in self.cellosaurus_fields:
            matches.append((term_lower, 1.0, "direct_field_tag"))

        # Species mappings (only the tag part, e.g. "ox", is recorded here)
        if term_lower in self.species_mappings:
            field_tag = self.species_mappings[term_lower].split(":")[0]
            matches.append((field_tag, 1.0, "species_mapping"))

        for field_tag, field_info in self.cellosaurus_fields.items():
            # Keywords: substring containment in either direction
            for keyword in field_info["keywords"]:
                keyword_lower = keyword.lower()
                if keyword_lower in term_lower or term_lower in keyword_lower:
                    similarity = self._calculate_similarity(term, keyword)
                    matches.append(
                        (field_tag, similarity, "keyword_match:" + keyword)
                    )

            # Description similarity
            desc_similarity = self._calculate_similarity(
                term, field_info["description"]
            )
            if desc_similarity > 0.3:  # Threshold for description matching
                matches.append((field_tag, desc_similarity, "description_match"))

            # Short name similarity ("-" marks a field without a short name)
            short_name = field_info["short_name"]
            if short_name != "-":
                short_similarity = self._calculate_similarity(term, short_name)
                if short_similarity > 0.3:
                    matches.append(
                        (
                            field_tag,
                            short_similarity,
                            "short_name_match:" + short_name,
                        )
                    )

        # Highest score first; list.sort is stable, so ties keep their order.
        matches.sort(key=lambda match: match[1], reverse=True)
        return matches

    def _extract_field_terms(self, query: str) -> List[Tuple[str, str, float, str]]:
        """Extract ``(field, value, confidence, reason)`` terms from ``query``.

        Every word and every pair of adjacent words is mapped with
        ``_map_term_to_field``; the best match is kept when its score exceeds
        0.4. Species names become a complete clause such as ``ox:9606`` with
        an empty value. The boolean operator words ``and``/``or``/``not`` are
        operators rather than search values, so they (and phrases containing
        them) are skipped.

        Args:
            query: Natural language query; it is lower-cased internally.

        Returns:
            Terms in order of first appearance, de-duplicated on
            ``(field, value)``.
        """
        words = re.findall(r"\b\w+\b", query.lower())
        operators = self._BOOLEAN_OPERATOR_WORDS

        # Single words first, then two-word phrases, as candidate terms.
        candidates = [word for word in words if word not in operators]
        candidates.extend(
            f"{first} {second}"
            for first, second in zip(words, words[1:])
            if first not in operators and second not in operators
        )

        unique_terms = []
        seen = set()
        for term in candidates:
            if len(term) < 2:  # Skip very short terms
                continue
            field_matches = self._map_term_to_field(term)
            # Take the best match only if its confidence is high enough
            if not field_matches or field_matches[0][1] <= 0.4:
                continue
            field_tag, confidence, reason = field_matches[0]
            if field_tag == "ox" and term in self.species_mappings:
                # Use the full species mapping (e.g., "ox:9606")
                field_tag, value = self.species_mappings[term], ""
            else:
                value = term
            key = (field_tag, value.lower())
            if key not in seen:
                seen.add(key)
                unique_terms.append((field_tag, value, confidence, reason))
        return unique_terms

    def _apply_boolean_operators(self, query: str) -> str:
        """Convert natural language boolean operators to Solr syntax.

        ``and``/``&``, ``or``/``|`` and ``not``/``!`` become ``AND``, ``OR``
        and ``NOT``. Whitespace is collapsed afterwards so that applying the
        method repeatedly does not accumulate spaces around the operators.
        """
        result = query
        for pattern, replacement in self.boolean_patterns:
            result = re.sub(pattern, replacement, result, flags=re.IGNORECASE)
        return re.sub(r"\s+", " ", result).strip()

    def _apply_range_queries(self, query: str) -> str:
        """Convert natural language ranges to Solr range syntax."""
        result = query
        for pattern, replacement in self.range_patterns:
            result = re.sub(pattern, replacement, result, flags=re.IGNORECASE)
        return result

    def _apply_wildcards(self, query: str) -> str:
        """Apply ``wildcard_patterns``; with the default table this is a no-op."""
        result = query
        for pattern, replacement in self.wildcard_patterns:
            result = re.sub(pattern, replacement, result, flags=re.IGNORECASE)
        return result

    @staticmethod
    def _format_field_value(value: str) -> str:
        """Return ``value`` in a form that is safe after ``field:``.

        Values containing whitespace are emitted as a double-quoted phrase
        (only ``"`` and ``\\`` need escaping inside the quotes); otherwise
        every Solr special character is backslash-escaped.
        """
        if re.search(r"\s", value):
            escaped = re.sub(r'(["\\])', r"\\\1", value)
            return f'"{escaped}"'
        return re.sub(r'([+\-&|!(){}[\]^"~*?:\\/])', r"\\\1", value)

    def _construct_solr_query(
        self, terms: List[Tuple[str, str, float, str]], original_query: str
    ) -> str:
        """Construct the final Solr query from extracted terms.

        Args:
            terms: Output of ``_extract_field_terms``.
            original_query: Fallback text, used only when ``terms`` is empty.

        Returns:
            A single clause, several clauses joined with ``AND`` inside
            parentheses, or - without terms - ``original_query`` with boolean
            operators normalised.
        """
        if not terms:
            # If no field-specific terms found, return original query as
            # general search
            return self._apply_boolean_operators(original_query.strip())

        field_queries = []
        for field_tag, value, _confidence, _reason in terms:
            if ":" in field_tag:
                # Special field mappings already include a value
                # (like ox:9606); an additional value is ANDed on the same
                # field.
                if value:
                    base_tag = field_tag.split(":")[0]
                    field_queries.append(
                        f"{field_tag} AND "
                        f"{base_tag}:{self._format_field_value(value)}"
                    )
                else:
                    field_queries.append(field_tag)
            else:
                field_queries.append(
                    f"{field_tag}:{self._format_field_value(value)}"
                )

        # Join with AND by default (most restrictive)
        if len(field_queries) == 1:
            return field_queries[0]
        return f"({' AND '.join(field_queries)})"

    def _validate_solr_query(self, query: str) -> Tuple[bool, str]:
        """Run basic syntax checks on a Solr query.

        Checks that the numbers of ``(``/``)`` and ``[``/``]`` agree and that
        the query does not end with a colon (empty field value).

        Returns:
            ``(is_valid, message)``.
        """
        try:
            # Check for balanced parentheses
            if query.count("(") != query.count(")"):
                return False, f"Unbalanced parentheses in query: {query}"
            # Check for balanced brackets in ranges
            if query.count("[") != query.count("]"):
                return False, f"Unbalanced brackets in range query: {query}"
            # Check for empty field queries
            if re.search(r":\s*$", query):
                return False, f"Empty field value in query: {query}"
            return True, "Valid Solr query"
        except Exception as e:
            return False, f"Query validation error: {str(e)}"

    def _convert_query(
        self, natural_query: str, include_explanation: bool = True
    ) -> Dict[str, Any]:
        """Convert a natural language query to Solr syntax.

        Args:
            natural_query: The user's query text.
            include_explanation: When true, add an ``explanation`` entry that
                lists the extracted terms, the transformation names and all
                available field tags.

        Returns:
            A dict with ``success``, ``original_query``, ``solr_query``,
            ``is_valid`` and ``validation_message`` (plus ``explanation``);
            on an unexpected error a dict with ``success`` False and
            ``error``.
        """
        try:
            # Normalize input
            normalized_query = natural_query.lower()
            # Replace common conjunctions with spaces
            normalized_query = re.sub(
                r"\b(with|from|in|of|for)\b", " ", normalized_query
            )
            # Remove extra whitespace
            normalized_query = re.sub(r"\s+", " ", normalized_query)

            # The transformed text is only used as the fallback query when no
            # field-specific term is found.
            processed_query = self._apply_boolean_operators(normalized_query)
            processed_query = self._apply_range_queries(processed_query)
            processed_query = self._apply_wildcards(processed_query)

            # NOTE(review): terms are extracted from the raw query, not the
            # normalized one; keywords such as "derived from" rely on the
            # stop words still being present - confirm this is intended.
            terms = self._extract_field_terms(natural_query)

            solr_query = self._construct_solr_query(terms, processed_query)
            is_valid, validation_msg = self._validate_solr_query(solr_query)

            result = {
                "success": True,
                "original_query": natural_query,
                "solr_query": solr_query,
                "is_valid": is_valid,
                "validation_message": validation_msg,
            }
            if include_explanation:
                result["explanation"] = {
                    "extracted_terms": [
                        {
                            "field": field_tag,
                            "value": value,
                            "confidence": confidence,
                            "match_reason": reason,
                        }
                        for field_tag, value, confidence, reason in terms
                    ],
                    "transformations_applied": [
                        "boolean_operators",
                        "range_queries",
                        "wildcards",
                    ],
                    "available_fields": list(self.cellosaurus_fields.keys()),
                }
            return result
        except Exception as e:
            return {
                "success": False,
                "error": f"Query conversion failed: {e}",
                "original_query": natural_query,
            }
@register_tool("CellosaurusGetCellLineInfoTool")
class CellosaurusGetCellLineInfoTool(BaseTool):
    """Fetch detailed information about one cell line by accession number.

    The HTTP timeout (seconds) is read once, at construction time, from the
    ``CELLOSAURUS_TIMEOUT`` environment variable and defaults to 30.
    """

    # Response formats accepted by ``_get_cell_line_info``.
    VALID_FORMATS = ("json", "xml", "txt", "fasta")

    # Field tags accepted in the ``fields`` argument.
    VALID_FIELDS = frozenset(
        {
            "id", "sy", "idsy", "ac", "acas", "dr", "ref", "rx", "ra", "rt",
            "rl", "ww", "anc", "hla", "reg", "var", "anec", "biot", "breed",
            "caution", "cell", "char", "donor", "site", "disc", "time", "from",
            "group", "kar", "ko", "msi", "misc", "miss", "mabi", "mabt",
            "omics", "part", "pop", "prob", "res", "sen", "int", "tfor", "vir",
            "cc", "str", "di", "din", "dio", "ox", "sx", "ag", "oi", "hi",
            "ch", "ca", "dt", "dtc", "dtu", "dtv",
        }
    )

    def __init__(self, tool_config, base_url="https://api.cellosaurus.org"):
        """Store the API base URL and the request timeout.

        Args:
            tool_config: Tool configuration forwarded to ``BaseTool``.
            base_url: Root URL of the Cellosaurus API.

        Raises:
            ValueError: If ``CELLOSAURUS_TIMEOUT`` is set to a non-integer.
        """
        super().__init__(tool_config)
        self.base_url = base_url
        self.timeout_seconds = int(os.environ.get("CELLOSAURUS_TIMEOUT", "30"))

    def run(self, arguments):
        """Validate ``arguments`` and fetch the cell line.

        Args:
            arguments: Mapping with the required key ``accession`` and the
                optional keys ``format`` (default ``"json"``) and ``fields``
                (list of field tags).

        Returns:
            The dict produced by ``_get_cell_line_info``, or
            ``{"error": ...}`` when ``accession`` is missing or empty.
        """
        accession = arguments.get("accession")
        format_type = arguments.get("format", "json")
        fields = arguments.get("fields")
        if not accession:
            return {"error": "`accession` parameter is required."}
        return self._get_cell_line_info(accession, format_type, fields)

    def _get_cell_line_info(self, accession, format_type, fields):
        """Get detailed cell line information by accession number.

        Args:
            accession: Accession string; must start with ``CVCL_``.
            format_type: One of ``VALID_FORMATS``.
            fields: ``None`` or a list of tags from ``VALID_FIELDS``; sent to
                the API as a comma-separated ``fields`` parameter.

        Returns:
            On success a dict with ``success``, ``accession``, ``data`` and
            ``format``; ``data`` is the first entry of the response's
            ``cell-line-list`` for JSON and the raw response text otherwise.
            On failure a dict with a single ``error`` key; no exception
            propagates to the caller.
        """
        # Local import: percent-encoding is only needed by this method.
        from urllib.parse import quote

        try:
            # Validate accession format
            # (Cellosaurus accessions start with CVCL_)
            if not accession.startswith("CVCL_"):
                return {
                    "error": "Accession must start with 'CVCL_' (Cellosaurus format)"
                }

            if format_type not in self.VALID_FORMATS:
                return {
                    "error": f"Format must be one of: {', '.join(self.VALID_FORMATS)}"
                }

            if fields is not None:
                if not isinstance(fields, list):
                    return {"error": "Fields must be a list of field names"}
                invalid_fields = set(fields) - self.VALID_FIELDS
                if invalid_fields:
                    # Sorted so the message is stable from run to run (set
                    # iteration order is not).
                    return {
                        "error": f"Invalid fields: {sorted(invalid_fields, key=str)}"
                    }

            params = {"format": format_type}
            if fields:
                params["fields"] = ",".join(fields)

            # Percent-encode the caller-supplied accession so it cannot add
            # path segments or a query string; well-formed accessions are
            # left unchanged.
            url = f"{self.base_url}/cell-line/{quote(accession, safe='')}"
            # NOTE(review): for "txt"/"fasta" this yields "application/txt" /
            # "application/fasta", which are not registered media types;
            # presumably the ``format`` parameter decides the output - confirm.
            headers = {"Accept": f"application/{format_type}"}
            resp = requests.get(
                url,
                params=params,
                headers=headers,
                timeout=self.timeout_seconds,
            )
            resp.raise_for_status()

            if format_type != "json":
                # For non-JSON formats, return the raw content
                return {
                    "success": True,
                    "accession": accession,
                    "data": resp.text,
                    "format": format_type,
                }

            data = resp.json()
            # Extract cell line data from nested structure
            cell_line_data = None
            if isinstance(data, dict) and "Cellosaurus" in data:
                cellosaurus_data = data["Cellosaurus"]
                if (
                    "cell-line-list" in cellosaurus_data
                    and cellosaurus_data["cell-line-list"]
                ):
                    cell_line_data = cellosaurus_data["cell-line-list"][0]
            if not cell_line_data:
                return {
                    "error": f"No cell line data found for accession {accession}"
                }

            if fields:
                # NOTE(review): keeps only top-level keys equal to a requested
                # tag; assumes the JSON entry is keyed by field tag - verify
                # against a real API response.
                cell_line_data = {
                    field: cell_line_data[field]
                    for field in fields
                    if field in cell_line_data
                }

            return {
                "success": True,
                "accession": accession,
                "data": cell_line_data,
                "format": format_type,
            }
        except requests.HTTPError as http_err:
            status = getattr(http_err.response, "status_code", None)
            if status == 404:
                return {"error": f"Cell line with accession {accession} not found"}
            return {"error": f"HTTP {status}: {http_err}"}
        except Exception as e:
            # Tool boundary: report any other failure as an error payload.
            return {"error": str(e)}