Source code for tooluniverse.cellosaurus_tool

import os
import re
import requests
from typing import Any, Dict, List, Tuple
from difflib import SequenceMatcher

from .base_tool import BaseTool
from .tool_registry import register_tool


@register_tool("CellosaurusSearchTool")
class CellosaurusSearchTool(BaseTool):
    """Search Cellosaurus cell lines through the official REST API.

    The tool issues a single GET request against ``/search/cell-line`` and
    returns the ``cell-line-list`` found in the JSON response. It never
    raises: every failure is reported as ``{"error": "..."}``.
    """

    def __init__(self, tool_config, base_url="https://api.cellosaurus.org"):
        """Store the API base URL and read the request timeout.

        Args:
            tool_config: Tool configuration forwarded to ``BaseTool``.
            base_url: Root URL of the Cellosaurus API.

        The timeout (seconds) comes from the ``CELLOSAURUS_TIMEOUT``
        environment variable and defaults to 30.
        """
        super().__init__(tool_config)
        self.base_url = base_url
        self.timeout_seconds = int(os.environ.get("CELLOSAURUS_TIMEOUT", "30"))

    def run(self, arguments):
        """Run a search.

        Args:
            arguments: Mapping with ``q`` (required query string) and the
                optional paging values ``offset`` (default 0) and ``size``
                (default 20).

        Returns:
            The result dictionary of ``_search_cell_lines``, or an error
            dictionary when ``q`` is missing or blank.
        """
        q = arguments.get("q")
        offset = arguments.get("offset", 0)
        size = arguments.get("size", 20)

        # A whitespace-only string would otherwise be stripped down to an
        # empty query further below, so treat it like a missing parameter.
        if not q or (isinstance(q, str) and not q.strip()):
            return {"error": "`q` parameter is required."}

        return self._search_cell_lines(q, offset, size)

    def _search_cell_lines(self, query, offset, size):
        """Search Cellosaurus cell lines using the /search/cell-line endpoint.

        Args:
            query: Search expression sent as the ``q`` parameter (stripped).
            offset: Index of the first record to return.
            size: Maximum number of records to return.

        Returns:
            On success ``{"success": True, "results": {...}, "query": ...}``
            where ``results["total"]`` is the number of records contained in
            this response (not the overall hit count). On failure
            ``{"error": message}``.
        """
        try:
            clean_query = query.strip()
            # The endpoint paginates with the Solr-style names ``start`` and
            # ``rows``; ``offset``/``size`` are only this tool's own
            # argument names and are echoed back unchanged in the result.
            params = {"q": clean_query, "start": offset, "rows": size}
            url = f"{self.base_url}/search/cell-line"
            headers = {"Accept": "application/json"}

            resp = requests.get(
                url,
                params=params,
                headers=headers,
                timeout=self.timeout_seconds,
            )
            resp.raise_for_status()
            data = resp.json()

            # Expected shape: {"Cellosaurus": {"cell-line-list": [...]}}.
            # Anything else yields an empty result instead of an exception.
            cell_lines = []
            if isinstance(data, dict):
                cellosaurus_data = data.get("Cellosaurus")
                if (
                    isinstance(cellosaurus_data, dict)
                    and "cell-line-list" in cellosaurus_data
                ):
                    cell_lines = cellosaurus_data["cell-line-list"]

            return {
                "success": True,
                "results": {
                    "cell_lines": cell_lines,
                    "total": len(cell_lines),
                    "offset": offset,
                    "size": size,
                },
                "query": clean_query,
            }
        except requests.HTTPError as http_err:
            status = getattr(http_err.response, "status_code", None)
            return {"error": f"HTTP {status}: {http_err}"}
        except Exception as e:
            # Tool boundary: report every other failure (network errors,
            # invalid JSON, non-string query) as an error result.
            return {"error": str(e)}
@register_tool("CellosaurusQueryConverterTool")
class CellosaurusQueryConverterTool(BaseTool):
    """Convert natural language queries to Solr syntax for Cellosaurus API.

    The conversion is heuristic: every word and every pair of adjacent words
    of the query is compared against the keywords, short names and
    descriptions of the Cellosaurus fields; the best match above a
    confidence threshold becomes a ``field:value`` clause and all clauses
    are joined with ``AND``. When nothing matches, the query is returned as
    a general search with its boolean operators normalised.
    """

    # Boolean operator words are query syntax, never search values.
    _BOOLEAN_WORDS = frozenset({"and", "or", "not"})
    # Filler words that ``_convert_query`` strips from the query; they carry
    # no search meaning of their own.
    _FILLER_WORDS = frozenset({"with", "from", "in", "of", "for"})

    def __init__(self, tool_config):
        """Build the field catalogue and the rewrite pattern tables.

        Args:
            tool_config: Tool configuration forwarded to ``BaseTool``.
        """
        super().__init__(tool_config)

        # Complete Cellosaurus field definitions from official API
        # documentation
        # https://api.cellosaurus.org/api-fields
        #
        # Each entry maps a field tag to its API short name ("-" when there
        # is none), its description and the keywords used for matching.
        self.cellosaurus_fields = {
            "id": {
                "short_name": "-",
                "description": (
                    "Recommended name. Most frequently the name of the cell "
                    "line as provided in the original publication."
                ),
                "keywords": [
                    "name",
                    "cell line",
                    "cellline",
                    "recommended",
                    "publication",
                ],
            },
            "sy": {
                "short_name": "-",
                "description": (
                    "List of synonyms. We try to list all the different "
                    "synonyms for the cell line, including alternative use of "
                    "lower and upper cases characters. Misspellings are not "
                    'included in synonyms (see the "misspelling" tag).'
                ),
                "keywords": [
                    "synonym",
                    "synonyms",
                    "alias",
                    "alternative",
                    "different names",
                ],
            },
            "idsy": {
                "short_name": "-",
                "description": (
                    "Recommended name with all its synonyms. Concatenation of "
                    "ID and SY."
                ),
                "keywords": [
                    "name",
                    "synonyms",
                    "combined",
                    "concatenation",
                ],
            },
            "ac": {
                "short_name": "-",
                "description": (
                    "Primary accession. It is the unique identifier of the "
                    "cell line. It is normally stable across Cellosaurus "
                    "versions but when two entries are merged, one of the two "
                    "accessions stays primary while the second one becomes "
                    "secondary (see ACAS)"
                ),
                "keywords": [
                    "accession",
                    "primary",
                    "identifier",
                    "unique",
                    "stable",
                ],
            },
            "acas": {
                "short_name": "-",
                "description": (
                    "Primary and secondary accession. Secondary accession are "
                    "former primary accession kept here to ensure the access "
                    "to a cell line via old identifiers."
                ),
                "keywords": [
                    "accession",
                    "primary",
                    "secondary",
                    "former",
                    "old identifiers",
                ],
            },
            "dr": {
                "short_name": "-",
                "description": (
                    "Cross-references to external resources: cell catalogs, "
                    "databases, resources listing cell lines as samples or to "
                    "ontologies. A cross-reference has two parts: the short "
                    "name of the resource (i.e. CCLE) and an identifier used "
                    "to locate a particular entry of the resource related to "
                    "the cell line. For a formal description of all the "
                    "resources referred to in Cellosaurus, see here ."
                ),
                "keywords": [
                    "cross-reference",
                    "external",
                    "catalog",
                    "database",
                    "atcc",
                    "dsmz",
                    "ccle",
                    "ecacc",
                ],
            },
            "ref": {
                "short_name": "-",
                "description": (
                    "Publication references. Mostly publications describing "
                    "the establishment of a cell line or its "
                    "characterization. Can be journal articles, book "
                    "chapters, patents and theses. Contains the "
                    "cross-reference of the publication, its title, authors "
                    "(or group/consortium) and citation elements."
                ),
                "keywords": [
                    "reference",
                    "publication",
                    "paper",
                    "article",
                    "journal",
                    "book",
                    "patent",
                    "thesis",
                ],
            },
            "rx": {
                "short_name": "-",
                "description": (
                    "Publication cross-reference. A unique identifier "
                    "allowing access the publication online. The "
                    "cross-reference has two parts: the shortname of the "
                    "online resource (i.e. PubMed, DOI, PMCID, CLPUB) "
                    "and an identifier used to locate the particular "
                    "publication related to the cell line. For a formal "
                    "description of all the resources referred to in "
                    "Cellosaurus, see here ."
                ),
                "keywords": [
                    "cross-reference",
                    "online",
                    "pubmed",
                    "doi",
                    "pmcid",
                    "clpub",
                    "patent",
                    "identifier",
                ],
            },
            "ra": {
                "short_name": "-",
                "description": (
                    "Publication authors. List of authors of a publication "
                    "referenced in a cell line entry."
                ),
                "keywords": [
                    "author",
                    "authors",
                    "written by",
                    "publication",
                ],
            },
            "rt": {
                "short_name": "-",
                "description": (
                    "Publication title. Title of a publication referenced in "
                    "cell line entry."
                ),
                "keywords": ["title", "paper", "article", "publication"],
            },
            "rl": {
                "short_name": "-",
                "description": (
                    "Publication citation elements. Citation elements of a "
                    "publication referenced in a cell line entry."
                ),
                "keywords": ["citation", "cite", "reference", "elements"],
            },
            "ww": {
                "short_name": "-",
                "description": "Web page related to the cell line",
                "keywords": ["website", "web page", "homepage", "url"],
            },
            "anc": {
                "short_name": "genome-ancestry",
                "description": (
                    "Estimated ethnic ancestry of the donor of a human cell "
                    "line based on the analysis of its genome."
                ),
                "keywords": [
                    "ancestry",
                    "ethnic",
                    "genetic background",
                    "genome",
                    "donor",
                ],
            },
            "hla": {
                "short_name": "-",
                "description": (
                    "HLA typing information. Alleles identified on the MHC "
                    "type I and type II genes of the donor of a human cell "
                    "line."
                ),
                "keywords": [
                    "hla",
                    "mhc",
                    "typing",
                    "alleles",
                    "genes",
                    "donor",
                ],
            },
            "reg": {
                "short_name": "registration",
                "description": (
                    "Official list, or register in which the cell line is "
                    "registered."
                ),
                "keywords": [
                    "registration",
                    "registered",
                    "official",
                    "register",
                ],
            },
            "var": {
                "short_name": "sequence-variation",
                "description": (
                    "Important sequence variations of the cell line compared "
                    "to the reference genome of the species."
                ),
                "keywords": [
                    "variation",
                    "mutation",
                    "sequence",
                    "variant",
                    "snv",
                    "indel",
                ],
            },
            "anec": {
                "short_name": "anecdotal",
                "description": (
                    "Anecdotal details regarding the cell line (its origin, "
                    "its name or any other particularity)."
                ),
                "keywords": [
                    "anecdotal",
                    "story",
                    "history",
                    "background",
                    "origin",
                ],
            },
            "biot": {
                "short_name": "biotechnology",
                "description": (
                    "Type of use of the cell line in a biotechnological "
                    "context."
                ),
                "keywords": [
                    "biotechnology",
                    "biotech",
                    "production",
                    "manufacturing",
                    "use",
                ],
            },
            "breed": {
                "short_name": "-",
                "description": (
                    "Breed or subspecies an animal cell line is derived from "
                    "with breed identifiers from FlyBase_Strain, RS and VBO."
                ),
                "keywords": [
                    "breed",
                    "subspecies",
                    "animal",
                    "flybase",
                    "strain",
                ],
            },
            "caution": {
                "short_name": "-",
                "description": (
                    "Errors, inconsistencies, ambiguities regarding the "
                    "origin or other aspects of the cell line."
                ),
                "keywords": [
                    "caution",
                    "warning",
                    "error",
                    "inconsistency",
                    "ambiguity",
                ],
            },
            "cell": {
                "short_name": "cell-type",
                "description": ("Cell type from which the cell line is derived."),
                "keywords": ["cell type", "cell", "derived", "type"],
            },
            "char": {
                "short_name": "characteristics",
                "description": (
                    "Production process or specific biological properties of "
                    "the cell line."
                ),
                "keywords": [
                    "characteristics",
                    "properties",
                    "biological",
                    "production",
                    "process",
                    "cancer",
                    "tumor",
                    "malignant",
                ],
            },
            "donor": {
                "short_name": "donor-info",
                "description": (
                    "Miscellaneous information relevant to the donor of the "
                    "cell line."
                ),
                "keywords": [
                    "donor",
                    "patient",
                    "miscellaneous",
                    "information",
                ],
            },
            "site": {
                "short_name": "derived-from-site",
                "description": (
                    "Body part (tissue or organ) the cell line is derived "
                    "from."
                ),
                "keywords": [
                    "site",
                    "tissue",
                    "organ",
                    "body part",
                    "derived",
                    "lung",
                    "breast",
                    "colon",
                    "skin",
                    "blood",
                    "bone",
                    "brain",
                ],
            },
            "disc": {
                "short_name": "discontinued",
                "description": (
                    "Discontinuation status of the cell line in a cell line "
                    "catalog."
                ),
                "keywords": [
                    "discontinued",
                    "unavailable",
                    "no longer available",
                    "status",
                ],
            },
            "time": {
                "short_name": "doubling-time",
                "description": "Population doubling-time of the cell line.",
                "keywords": [
                    "doubling time",
                    "doubling",
                    "population",
                    "time",
                    "hours",
                ],
            },
            "from": {
                "short_name": "-",
                "description": (
                    "Laboratory, research institute, university having "
                    "established the cell line."
                ),
                "keywords": [
                    "laboratory",
                    "lab",
                    "institute",
                    "university",
                    "established",
                ],
            },
            "group": {
                "short_name": "-",
                "description": (
                    "Specific group the cell line belongs to (example: fish "
                    "cell lines, vaccine production cell lines)."
                ),
                "keywords": [
                    "group",
                    "fish cell lines",
                    "vaccine production",
                    "stem cell",
                    "embryonic",
                ],
            },
            "kar": {
                "short_name": "karyotype",
                "description": (
                    "Information relevant to the chromosomes of a cell line "
                    "(often to describe chromosomal abnormalities)."
                ),
                "keywords": [
                    "karyotype",
                    "chromosome",
                    "chromosomal",
                    "abnormalities",
                    "defects",
                ],
            },
            "ko": {
                "short_name": "knockout",
                "description": (
                    "Gene(s) knocked-out in the cell line and method to "
                    "obtain the KO."
                ),
                "keywords": ["knockout", "ko", "gene", "knocked-out"],
            },
            "msi": {
                "short_name": "-",
                "description": "Microsatellite instability degree.",
                "keywords": ["msi", "microsatellite instability"],
            },
            "misc": {
                "short_name": "miscellaneous",
                "description": "Miscellaneous remarks about the cell line.",
                "keywords": [
                    "miscellaneous",
                    "other",
                    "additional",
                    "notes",
                    "remarks",
                ],
            },
            "miss": {
                "short_name": "misspelling",
                "description": (
                    "Identified misspelling(s) of the cell line name with in "
                    "some case the specific publication or external resource "
                    "entry where it appears."
                ),
                "keywords": ["misspelling", "misspelled", "typo"],
            },
            "mabi": {
                "short_name": "mab-isotype",
                "description": (
                    "Monoclonal antibody isotype. Examples: IgG2a, kappa; "
                    "IgM, lambda."
                ),
                "keywords": [
                    "isotype",
                    "igg",
                    "igm",
                    "iga",
                    "ige",
                    "monoclonal antibody",
                ],
            },
            "mabt": {
                "short_name": "mab-target",
                "description": (
                    "Monoclonal antibody target molecule. Generally a "
                    "specific protein or chemical compound."
                ),
                "keywords": [
                    "antibody",
                    "mab",
                    "target",
                    "targeting",
                    "protein",
                    "molecule",
                ],
            },
            "omics": {
                "short_name": "-",
                "description": ('"Omics" study(ies) carried out on the cell line.'),
                "keywords": [
                    "omics",
                    "genomics",
                    "transcriptomics",
                    "proteomics",
                    "metabolomics",
                    "study",
                ],
            },
            "part": {
                "short_name": "part-of",
                "description": (
                    "The cell line belongs to a specific panel or collection "
                    "of cell lines."
                ),
                "keywords": ["part", "panel", "collection", "belongs to"],
            },
            "pop": {
                "short_name": "population",
                "description": (
                    "Ethnic group, nationality of the individual from which "
                    "the cell line was sampled."
                ),
                "keywords": [
                    "population",
                    "ethnic",
                    "nationality",
                    "caucasian",
                    "african",
                    "asian",
                ],
            },
            "prob": {
                "short_name": "problematic",
                "description": (
                    "Known problem(s) related to the cell line: contaminated, "
                    "misidentified, misclassified cell line or appearing in a "
                    "retracted paper."
                ),
                "keywords": [
                    "problematic",
                    "contaminated",
                    "misidentified",
                    "problem",
                    "retracted",
                ],
            },
            "res": {
                "short_name": "resistance",
                "description": (
                    "Selected to be resistant to some chemical compound "
                    "(like a drug used in chemotherapy) or toxin. with a "
                    "cross-reference to either ChEBI, DrugBank, NCIt or "
                    "UniProtKB."
                ),
                "keywords": [
                    "resistance",
                    "resistant",
                    "drug",
                    "chemotherapy",
                    "toxin",
                    "cisplatin",
                    "doxorubicin",
                ],
            },
            "sen": {
                "short_name": "senescence",
                "description": "When a finite cell line will senesce.",
                "keywords": ["senescence", "senescent", "finite"],
            },
            "int": {
                "short_name": "integrated",
                "description": (
                    "Genetic element(s) integrated in the cell line: gene "
                    "name and identifier in CGNC, FlyBase, FPbase, HGNC, MGI, "
                    "RGD, UniProtKB, and VGNC."
                ),
                "keywords": ["integrated", "genetic element", "gene"],
            },
            "tfor": {
                "short_name": "transformant",
                "description": (
                    "What caused the cell line to be transformed: generally a "
                    "virus (with a cross-reference to NCBI taxon identifier), "
                    "a chemical compound (with a cross-reference to ChEBI) or "
                    "a form of irradiation (with a cross-reference to NCIt)."
                ),
                "keywords": [
                    "transformant",
                    "transformation",
                    "virus",
                    "chemical",
                    "irradiation",
                ],
            },
            "vir": {
                "short_name": "virology",
                "description": (
                    "Susceptibility of the cell line to viral infection, "
                    "presence of integrated viruses or any other "
                    "virology-related information."
                ),
                "keywords": [
                    "virology",
                    "viral",
                    "virus",
                    "susceptibility",
                    "infection",
                ],
            },
            "cc": {
                "short_name": "-",
                "description": (
                    "Comment(s). Any content described in fields "
                    "genome-ancestry, hla, registration, sequence-variation, "
                    "anecdotal, biotechnology, breed, caution, "
                    "characteristics, discontinued, donor-info, "
                    "doubling-time, from, group, karyotype, knockout, "
                    "miscellaneous, misspelling, mab-isotype, mab-target, "
                    "msi, omics, population, problematic, resistance, "
                    "senescence, transfected, transformant, virology."
                ),
                "keywords": ["comment", "note", "remark", "observation"],
            },
            "str": {
                "short_name": "-",
                "description": "Short tandem repeat profile.",
                "keywords": [
                    "str",
                    "short tandem repeat",
                    "microsatellite",
                    "profile",
                ],
            },
            "di": {
                "short_name": "-",
                "description": (
                    "Disease(s) suffered by the individual from which the "
                    "cell line originated with its NCI Thesaurus or ORDO "
                    "identifier."
                ),
                "keywords": [
                    "disease",
                    "condition",
                    "leukemia",
                    "lymphoma",
                    "carcinoma",
                    "sarcoma",
                ],
            },
            "din": {
                "short_name": "-",
                "description": (
                    "Disease(s) suffered by the individual from which the "
                    "cell line originated, restricted to diseases having a "
                    "NCI Thesaurus identifier."
                ),
                "keywords": ["disease", "nci", "thesaurus"],
            },
            "dio": {
                "short_name": "-",
                "description": (
                    "Disease(s) suffered by the individual from which the "
                    "cell line originated, restricted to diseases having an "
                    "ORDO identifier."
                ),
                "keywords": ["disease", "ordo"],
            },
            "ox": {
                "short_name": "-",
                "description": (
                    "Species of the individual from which the cell line "
                    "originates with its NCBI taxon identifier."
                ),
                "keywords": [
                    "species",
                    "organism",
                    "human",
                    "mouse",
                    "rat",
                    "ncbi",
                    "taxon",
                ],
            },
            "sx": {
                "short_name": "-",
                "description": (
                    "Sex of the individual from which the cell line "
                    "originates."
                ),
                "keywords": [
                    "sex",
                    "gender",
                    "male",
                    "female",
                    "man",
                    "woman",
                ],
            },
            "ag": {
                "short_name": "-",
                "description": (
                    "Age at sampling time of the individual from which the "
                    "cell line was established."
                ),
                "keywords": [
                    "age",
                    "aged",
                    "years",
                    "months",
                    "days",
                    "sampling",
                ],
            },
            "oi": {
                "short_name": "-",
                "description": (
                    "Cell line(s) originating from same individual (sister "
                    "cell lines)."
                ),
                "keywords": [
                    "sister",
                    "sibling",
                    "related",
                    "same individual",
                ],
            },
            "hi": {
                "short_name": "-",
                "description": (
                    "Parent cell line from which the cell line originates."
                ),
                "keywords": ["parent", "derived from"],
            },
            "ch": {
                "short_name": "-",
                "description": (
                    "Cell line(s) originated from the cell line (child cell "
                    "lines)."
                ),
                "keywords": ["child", "derived", "subclone"],
            },
            "ca": {
                "short_name": "-",
                "description": (
                    "Category to which a cell line belongs, one of 14 defined "
                    "terms. Example: cancer cell line, hybridoma, transformed "
                    "cell line."
                ),
                "keywords": [
                    "category",
                    "cancer cell line",
                    "hybridoma",
                    "transformed",
                    "primary",
                    "immortalized",
                ],
            },
            "dt": {
                "short_name": "-",
                "description": (
                    "Creation date, last modification date and version number "
                    "of the cell line Cellosaurus entry."
                ),
                "keywords": ["date", "creation", "modification", "version"],
            },
            "dtc": {
                "short_name": "-",
                "description": ("Creation date of the cell line Cellosaurus entry."),
                "keywords": [
                    "created",
                    "creation",
                    "established",
                    "founded",
                ],
            },
            "dtu": {
                "short_name": "-",
                "description": (
                    "Last modification date of the cell line Cellosaurus "
                    "entry."
                ),
                "keywords": [
                    "modified",
                    "modification",
                    "updated",
                    "changed",
                ],
            },
            "dtv": {
                "short_name": "-",
                "description": ("Version number of the cell line Cellosaurus entry."),
                "keywords": ["version", "v"],
            },
        }

        # Special species mappings with NCBI taxon IDs
        self.species_mappings = {
            "human": "ox:9606",
            "homo sapiens": "ox:9606",
            "mouse": "ox:10090",
            "mus musculus": "ox:10090",
            "rat": "ox:10116",
            "rattus norvegicus": "ox:10116",
        }

        # Boolean operator patterns. The word forms need ``\b`` so that
        # "and" inside "band" is left alone; the symbol forms must NOT be
        # wrapped in ``\b`` because a word boundary never exists between a
        # space and "&", "|" or "!". Surrounding whitespace is consumed so
        # that applying the patterns repeatedly does not pile up spaces.
        self.boolean_patterns = [
            (r"\s*(?:\band\b|&+)\s*", " AND "),
            (r"\s*(?:\bor\b|\|+)\s*", " OR "),
            (r"\s*(?:\bnot\b|!)\s*", " NOT "),
        ]

        # Wildcard patterns ("*" and "?" are replaced by themselves, i.e.
        # they pass through unchanged)
        self.wildcard_patterns = [
            (r"\*", "*"),
            (r"\?", "?"),
        ]

        # Range query patterns: "N to M <unit>", "N-M <unit>" and
        # "between N and M <unit>" become "[N TO M] <unit>"
        self.range_patterns = [
            (
                r"\b(\d+)\s*(?:to|-)\s*(\d+)\s*(hours?|days?|years?)\b",
                r"[\1 TO \2] \3",
            ),
            (
                r"\bbetween\s+(\d+)\s+and\s+(\d+)\s*(hours?|days?|years?)\b",
                r"[\1 TO \2] \3",
            ),
        ]

    def run(self, arguments):
        """Convert the query given in ``arguments``.

        Args:
            arguments: Mapping with ``query`` (required) and the optional
                flag ``include_explanation`` (default ``True``).

        Returns:
            The result dictionary of ``_convert_query`` or an error
            dictionary when ``query`` is missing or empty.
        """
        query = arguments.get("query")
        include_explanation = arguments.get("include_explanation", True)

        if not query:
            return {"error": "`query` parameter is required."}

        return self._convert_query(query, include_explanation)

    def _calculate_similarity(self, term: str, text: str) -> float:
        """Return the case-insensitive SequenceMatcher ratio of two strings."""
        return SequenceMatcher(None, term.lower(), text.lower()).ratio()

    def _map_term_to_field(self, term: str) -> List[Tuple[str, float, str]]:
        """Map a natural language term to candidate Cellosaurus fields.

        Args:
            term: A word or phrase taken from the user query.

        Returns:
            ``(field_tag, score, reason)`` tuples sorted by descending
            score. A term that equals a field tag or a species name scores
            1.0; keyword hits (substring in either direction) score their
            string similarity; description and short-name candidates are
            kept only above a similarity of 0.3.
        """
        matches = []
        term_lower = term.lower()

        # Direct field tag matches
        if term_lower in self.cellosaurus_fields:
            matches.append((term_lower, 1.0, "direct_field_tag"))

        # Species mappings ("ox:9606" -> field tag "ox")
        if term_lower in self.species_mappings:
            field_tag = self.species_mappings[term_lower].split(":")[0]
            matches.append((field_tag, 1.0, "species_mapping"))

        for field_tag, field_info in self.cellosaurus_fields.items():
            # Keyword matching
            for keyword in field_info["keywords"]:
                keyword_lower = keyword.lower()
                if keyword_lower in term_lower or term_lower in keyword_lower:
                    similarity = self._calculate_similarity(term, keyword)
                    matches.append(
                        (field_tag, similarity, "keyword_match:" + keyword)
                    )

            # Description similarity
            desc_similarity = self._calculate_similarity(
                term, field_info["description"]
            )
            if desc_similarity > 0.3:  # Threshold for description matching
                matches.append((field_tag, desc_similarity, "description_match"))

            # Short name similarity ("-" means the field has no short name)
            short_name = field_info["short_name"]
            if short_name != "-":
                short_similarity = self._calculate_similarity(term, short_name)
                if short_similarity > 0.3:
                    matches.append(
                        (
                            field_tag,
                            short_similarity,
                            "short_name_match:" + short_name,
                        )
                    )

        # Highest score first; the sort is stable, so ties keep the
        # insertion order above.
        matches.sort(key=lambda match: match[1], reverse=True)
        return matches

    def _extract_field_terms(self, query: str) -> List[Tuple[str, str, float, str]]:
        """Extract field-specific terms from the query using semantic mapping.

        Candidate terms are the lower-cased words of the query plus every
        pair of adjacent words. Boolean operator words and filler words are
        not search values and are skipped, as is any word pair that spans a
        boolean operator.

        Args:
            query: The user query.

        Returns:
            De-duplicated ``(field_tag, value, confidence, reason)`` tuples
            in order of first appearance. For a recognised species the
            field tag already contains the taxon clause (e.g. ``ox:9606``)
            and ``value`` is empty.
        """
        words = re.findall(r"\b\w+\b", query.lower())
        phrases = [f"{first} {second}" for first, second in zip(words, words[1:])]

        terms = []
        for term in words + phrases:
            if len(term) < 2:  # Skip very short terms
                continue

            term_words = term.split()
            if len(term_words) == 1:
                # Without this guard "from" matches the field tag ``from``
                # directly and operator words are fuzzy-matched to fields.
                if term in self._BOOLEAN_WORDS or term in self._FILLER_WORDS:
                    continue
            elif not self._BOOLEAN_WORDS.isdisjoint(term_words):
                # A phrase cannot extend across a boolean operator.
                continue

            field_matches = self._map_term_to_field(term)
            # Take the best match if confidence is high enough
            if not field_matches or field_matches[0][1] <= 0.4:
                continue

            field_tag, confidence, reason = field_matches[0]
            if field_tag == "ox" and term in self.species_mappings:
                # Use the full species mapping (e.g., "ox:9606")
                field_tag = self.species_mappings[term]
                value = ""
            else:
                value = term
            terms.append((field_tag, value, confidence, reason))

        # Remove duplicates while preserving order
        seen = set()
        unique_terms = []
        for field_tag, value, confidence, reason in terms:
            key = (field_tag, value.lower())
            if key not in seen:
                seen.add(key)
                unique_terms.append((field_tag, value, confidence, reason))

        return unique_terms

    def _apply_boolean_operators(self, query: str) -> str:
        """Convert natural language boolean operators to Solr syntax.

        Word and symbol operators are replaced by `` AND ``, `` OR `` and
        `` NOT ``. The result is stripped; applying the method to its own
        output returns it unchanged.
        """
        result = query
        for pattern, replacement in self.boolean_patterns:
            result = re.sub(pattern, replacement, result, flags=re.IGNORECASE)
        return result.strip()

    def _apply_range_queries(self, query: str) -> str:
        """Convert natural language ranges to Solr range syntax."""
        result = query
        for pattern, replacement in self.range_patterns:
            result = re.sub(pattern, replacement, result, flags=re.IGNORECASE)
        return result

    def _apply_wildcards(self, query: str) -> str:
        """Apply the wildcard pattern table (currently a pass-through)."""
        result = query
        for pattern, replacement in self.wildcard_patterns:
            result = re.sub(pattern, replacement, result, flags=re.IGNORECASE)
        return result

    @staticmethod
    def _format_field_value(value: str) -> str:
        """Escape Solr special characters and quote multi-word values.

        Without the quotes a clause such as ``site:body part`` binds only
        the first word to the field.
        """
        escaped = re.sub(r'([+\-&|!(){}[\]^"~*?:\\/])', r"\\\1", value)
        if re.search(r"\s", escaped):
            return f'"{escaped}"'
        return escaped

    def _construct_solr_query(
        self, terms: List[Tuple[str, str, float, str]], original_query: str
    ) -> str:
        """Construct the final Solr query from extracted terms.

        Args:
            terms: Output of ``_extract_field_terms``.
            original_query: Fallback text used when ``terms`` is empty.

        Returns:
            The single clause, the clauses joined with ``AND`` inside
            parentheses, or the fallback query with boolean operators
            normalised.
        """
        if not terms:
            # If no field-specific terms found, return original query as
            # general search
            return self._apply_boolean_operators(original_query.strip())

        field_queries = []
        for field_tag, value, _confidence, _reason in terms:
            if ":" in field_tag:
                # Field tags that already include a value (like ox:9606)
                if value:
                    bare_tag = field_tag.split(":")[0]
                    field_queries.append(
                        f"{field_tag} AND "
                        f"{bare_tag}:{self._format_field_value(value)}"
                    )
                else:
                    field_queries.append(field_tag)
            else:
                field_queries.append(
                    f"{field_tag}:{self._format_field_value(value)}"
                )

        # Join with AND by default (most restrictive)
        if len(field_queries) == 1:
            return field_queries[0]
        return f"({' AND '.join(field_queries)})"

    def _validate_solr_query(self, query: str) -> Tuple[bool, str]:
        """Basic validation of Solr query syntax.

        Checks balanced parentheses, balanced square brackets and that the
        query does not end in a field tag without a value.

        Returns:
            ``(is_valid, message)``.
        """
        try:
            if query.count("(") != query.count(")"):
                return False, f"Unbalanced parentheses in query: {query}"

            if query.count("[") != query.count("]"):
                return False, f"Unbalanced brackets in range query: {query}"

            if re.search(r":\s*$", query):
                return False, f"Empty field value in query: {query}"

            return True, "Valid Solr query"
        except Exception as e:
            return False, f"Query validation error: {str(e)}"

    def _convert_query(
        self, natural_query: str, include_explanation: bool = True
    ) -> Dict[str, Any]:
        """Convert natural language query to Solr syntax.

        Args:
            natural_query: The user query.
            include_explanation: When true, add the extracted terms, the
                names of the applied transformations and the list of known
                field tags under ``"explanation"``.

        Returns:
            ``{"success": True, "original_query", "solr_query", "is_valid",
            "validation_message"[, "explanation"]}``, or
            ``{"success": False, "error", "original_query"}`` if anything
            raised.
        """
        try:
            # The normalised text is only the fallback query used when no
            # field term is found; term extraction works on the original
            # query and skips filler words itself.
            normalized_query = natural_query.lower()
            normalized_query = re.sub(
                r"\b(with|from|in|of|for)\b", " ", normalized_query
            )
            normalized_query = re.sub(r"\s+", " ", normalized_query)

            processed_query = self._apply_boolean_operators(normalized_query)
            processed_query = self._apply_range_queries(processed_query)
            processed_query = self._apply_wildcards(processed_query)

            terms = self._extract_field_terms(natural_query)
            solr_query = self._construct_solr_query(terms, processed_query)
            is_valid, validation_msg = self._validate_solr_query(solr_query)

            result = {
                "success": True,
                "original_query": natural_query,
                "solr_query": solr_query,
                "is_valid": is_valid,
                "validation_message": validation_msg,
            }

            if include_explanation:
                result["explanation"] = {
                    "extracted_terms": [
                        {
                            "field": field_tag,
                            "value": value,
                            "confidence": confidence,
                            "match_reason": reason,
                        }
                        for field_tag, value, confidence, reason in terms
                    ],
                    "transformations_applied": [
                        "boolean_operators",
                        "range_queries",
                        "wildcards",
                    ],
                    "available_fields": list(self.cellosaurus_fields.keys()),
                }

            return result

        except Exception as e:
            return {
                "success": False,
                "error": f"Query conversion failed: {e}",
                "original_query": natural_query,
            }
@register_tool("CellosaurusGetCellLineInfoTool")
class CellosaurusGetCellLineInfoTool(BaseTool):
    """Fetch detailed information about one cell line by accession number.

    The tool issues a single GET request against ``/cell-line/{accession}``.
    It never raises: every failure is reported as ``{"error": "..."}``.
    """

    # Response formats accepted by this tool.
    _VALID_FORMATS = ("json", "xml", "txt", "fasta")

    # Valid Cellosaurus field tags.
    _VALID_FIELDS = frozenset(
        {
            "id", "sy", "idsy", "ac", "acas", "dr", "ref", "rx", "ra", "rt",
            "rl", "ww", "anc", "hla", "reg", "var", "anec", "biot", "breed",
            "caution", "cell", "char", "donor", "site", "disc", "time",
            "from", "group", "kar", "ko", "msi", "misc", "miss", "mabi",
            "mabt", "omics", "part", "pop", "prob", "res", "sen", "int",
            "tfor", "vir", "cc", "str", "di", "din", "dio", "ox", "sx", "ag",
            "oi", "hi", "ch", "ca", "dt", "dtc", "dtu", "dtv",
        }
    )

    def __init__(self, tool_config, base_url="https://api.cellosaurus.org"):
        """Store the API base URL and read the request timeout.

        Args:
            tool_config: Tool configuration forwarded to ``BaseTool``.
            base_url: Root URL of the Cellosaurus API.

        The timeout (seconds) comes from the ``CELLOSAURUS_TIMEOUT``
        environment variable and defaults to 30.
        """
        super().__init__(tool_config)
        self.base_url = base_url
        self.timeout_seconds = int(os.environ.get("CELLOSAURUS_TIMEOUT", "30"))

    def run(self, arguments):
        """Look up a cell line.

        Args:
            arguments: Mapping with ``accession`` (required), ``format``
                (default ``"json"``) and the optional list ``fields``.

        Returns:
            The result dictionary of ``_get_cell_line_info`` or an error
            dictionary when ``accession`` is missing or empty.
        """
        accession = arguments.get("accession")
        format_type = arguments.get("format", "json")
        fields = arguments.get("fields")

        if not accession:
            return {"error": "`accession` parameter is required."}

        return self._get_cell_line_info(accession, format_type, fields)

    def _get_cell_line_info(self, accession, format_type, fields):
        """Get detailed cell line information by accession number.

        Args:
            accession: Cellosaurus accession; must be ``CVCL_`` followed by
                letters or digits.
            format_type: One of ``json``, ``xml``, ``txt``, ``fasta``.
            fields: ``None`` or a list of field tags restricting the
                returned content.

        Returns:
            ``{"success": True, "accession", "data", "format"}`` where
            ``data`` is the first record of ``cell-line-list`` for JSON and
            the raw response text for every other format; otherwise
            ``{"error": message}``.
        """
        try:
            # Validate accession format
            # (Cellosaurus accessions start with CVCL_)
            if not accession.startswith("CVCL_"):
                return {
                    "error": (
                        "Accession must start with 'CVCL_' "
                        "(Cellosaurus format)"
                    )
                }
            # The accession is interpolated into the URL path below, so
            # reject anything ("/", "?", "#", whitespace, ...) that could
            # change which resource is requested.
            if not re.fullmatch(r"CVCL_[A-Za-z0-9]+", accession):
                return {
                    "error": (
                        "Accession must be 'CVCL_' followed by letters or "
                        "digits (Cellosaurus format)"
                    )
                }

            if format_type not in self._VALID_FORMATS:
                return {
                    "error": (
                        "Format must be one of: "
                        f"{', '.join(self._VALID_FORMATS)}"
                    )
                }

            if fields is not None:
                if not isinstance(fields, list):
                    return {"error": "Fields must be a list of field names"}

                # Report unknown tags in the order they were given.
                invalid_fields = [
                    field
                    for field in dict.fromkeys(fields)
                    if field not in self._VALID_FIELDS
                ]
                if invalid_fields:
                    return {"error": f"Invalid fields: {invalid_fields}"}

            params = {"format": format_type}
            if fields:
                params["fields"] = ",".join(fields)

            url = f"{self.base_url}/cell-line/{accession}"
            # NOTE(review): "application/txt" and "application/fasta" are
            # not registered media types; presumably the ``format`` query
            # parameter is what selects the representation - confirm.
            headers = {"Accept": f"application/{format_type}"}

            resp = requests.get(
                url,
                params=params,
                headers=headers,
                timeout=self.timeout_seconds,
            )
            resp.raise_for_status()

            if format_type != "json":
                # For non-JSON formats, return the raw content
                return {
                    "success": True,
                    "accession": accession,
                    "data": resp.text,
                    "format": format_type,
                }

            data = resp.json()

            # Expected shape: {"Cellosaurus": {"cell-line-list": [record]}}
            cell_line_data = None
            if isinstance(data, dict):
                cellosaurus_data = data.get("Cellosaurus")
                if isinstance(cellosaurus_data, dict):
                    cell_line_list = cellosaurus_data.get("cell-line-list")
                    if cell_line_list:
                        cell_line_data = cell_line_list[0]

            if not cell_line_data:
                return {
                    "error": (
                        "No cell line data found for accession "
                        f"{accession}"
                    )
                }

            if fields:
                # The request already carried ``fields``. Keys of the
                # returned record that literally equal a requested tag are
                # kept as before; when none does, the record is returned as
                # received instead of being reduced to an empty dict.
                filtered_data = {
                    field: cell_line_data[field]
                    for field in fields
                    if field in cell_line_data
                }
                if filtered_data:
                    cell_line_data = filtered_data

            return {
                "success": True,
                "accession": accession,
                "data": cell_line_data,
                "format": format_type,
            }

        except requests.HTTPError as http_err:
            status = getattr(http_err.response, "status_code", None)
            if status == 404:
                return {"error": f"Cell line with accession {accession} not found"}
            return {"error": f"HTTP {status}: {http_err}"}
        except Exception as e:
            # Tool boundary: report every other failure (network errors,
            # invalid JSON, wrongly typed arguments) as an error result.
            return {"error": str(e)}