From 0d1746e2513e5c9bc52061f6ccfa33ede4f8f2ba Mon Sep 17 00:00:00 2001 From: Robert Baldwin Date: Fri, 20 Feb 2026 18:25:34 +0000 Subject: [PATCH 1/8] a --- src/formats/ncbi.rs | 198 ++++++++++++++++++++++++++++++++++++++++---- src/python.rs | 123 +++++++++++++++++++++++++++ 2 files changed, 306 insertions(+), 15 deletions(-) diff --git a/src/formats/ncbi.rs b/src/formats/ncbi.rs index c3b8522..aa56511 100644 --- a/src/formats/ncbi.rs +++ b/src/formats/ncbi.rs @@ -23,12 +23,12 @@ pub fn load>(ncbi_directory: P) -> TaxonomyResult = Vec::new(); let mut parents: Vec = Vec::new(); let mut ranks: Vec = Vec::new(); + let mut data: Vec> = Vec::new(); let mut tax_to_idx: HashMap = HashMap::new(); for (ix, line) in BufReader::new(nodes_file).lines().enumerate() { let mut fields: Vec = line?.split("\t|\t").map(|x| x.to_string()).collect(); - if fields.len() < 10 { - // should be at least 14 + if fields.len() < 12 { return Err(Error::new(ErrorKind::ImportError { line: ix, msg: "Not enough fields in nodes.dmp; bad line?".to_owned(), @@ -36,11 +36,60 @@ pub fn load>(ncbi_directory: P) -> TaxonomyResult>(ncbi_directory: P) -> TaxonomyResult = vec![String::new(); tax_ids.len()]; for (ix, line) in BufReader::new(names_file).lines().enumerate() { let mut fields: Vec = line?.split("\t|\t").map(|x| x.to_string()).collect(); - if fields.len() > 10 { - // should only be 5 + if fields.len() < 4 { return Err(Error::new(ErrorKind::ImportError { line: ix, - msg: "Too many fields in names.dmp".to_owned(), + msg: "Not enough fields in names.dmp".to_owned(), })); } let tax_id = fields.remove(0).trim().to_string(); - let name = fields.remove(0).trim().to_string(); - let name_class = fields.remove(1); - if name_class.starts_with("scientific name") { - let name = name.to_string(); - names[tax_to_idx[&*tax_id]] = name; + let name_txt = fields.remove(0).trim().to_string(); + let unique_name = fields.remove(0).trim().to_string(); + let name_class = fields.remove(0).trim().trim_end_matches("\t|").to_string(); + + if let Some(&idx) = tax_to_idx.get(&tax_id) { + if name_class == "scientific name" { + names[idx] = name_txt.clone(); + } + + // Store all name types in data + let name_key = format!("name_{}", name_class.replace(" ", "_")); + data[idx].insert(name_key, serde_json::Value::String(name_txt)); + + if !unique_name.is_empty() { + let unique_key = format!("unique_name_{}", name_class.replace(" ", "_")); + data[idx].insert(unique_key, serde_json::Value::String(unique_name)); + } } } - let gt = - GeneralTaxonomy::from_arrays(tax_ids, parent_ids, Some(names), Some(ranks), None, None)?; + let gt = GeneralTaxonomy::from_arrays( + tax_ids, + parent_ids, + Some(names), + Some(ranks), + None, + Some(data), + )?; gt.validate_uniqueness()?; Ok(gt) } @@ -106,14 +172,88 @@ where .map(|(x, _)| format!("{}", x)) .unwrap_or_default() }; + + // Extract data fields + let node_data = tax.data(key.clone())?; + let embl_code = node_data + .get("embl_code") + .and_then(|v| v.as_str()) + .unwrap_or(""); + let division_id = node_data + .get("division_id") + .and_then(|v| v.as_str()) + .unwrap_or("0"); + let inherited_div_flag = node_data + .get("inherited_div_flag") + .and_then(|v| v.as_str()) + .unwrap_or("0"); + let genetic_code_id = node_data + .get("genetic_code_id") + .and_then(|v| v.as_str()) + .unwrap_or("1"); + let inherited_GC_flag = node_data + .get("inherited_GC_flag") + .and_then(|v| v.as_str()) + .unwrap_or("0"); + let mitochondrial_genetic_code_id = node_data + .get("mitochondrial_genetic_code_id") + .and_then(|v| v.as_str()) + .unwrap_or("0"); + let inherited_MGC_flag = node_data + .get("inherited_MGC_flag") + .and_then(|v| v.as_str()) + .unwrap_or("0"); + let GenBank_hidden_flag = node_data + .get("GenBank_hidden_flag") + .and_then(|v| v.as_str()) + .unwrap_or("0"); + let hidden_subtree_root_flag = node_data + .get("hidden_subtree_root_flag") + .and_then(|v| v.as_str()) + .unwrap_or("0"); + let comments = node_data + .get("comments") + .and_then(|v| v.as_str()) + .unwrap_or(""); + + // Write scientific name name_writer .write_all(format!("{}\t|\t{}\t|\t\t|\tscientific name\t|\n", &key, name).as_bytes())?; + + // Write all alternative names from data + for (data_key, value) in node_data.iter() { + if data_key.starts_with("name_") && data_key != "name_scientific_name" { + let name_class = data_key.strip_prefix("name_").unwrap().replace("_", " "); + let name_txt = value.as_str().unwrap_or(""); + let unique_name_key = format!("unique_name_{}", data_key.strip_prefix("name_").unwrap()); + let unique_name = node_data + .get(&unique_name_key) + .and_then(|v| v.as_str()) + .unwrap_or(""); + name_writer.write_all( + format!("{}\t|\t{}\t|\t{}\t|\t{}\t|\n", &key, name_txt, unique_name, name_class) + .as_bytes(), + )?; + } + } + + // Write nodes.dmp entry with all fields node_writer.write_all( format!( - "{}\t|\t{}\t|\t{}\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\n", + "{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\n", &key, parent, rank.to_ncbi_rank(), + embl_code, + division_id, + inherited_div_flag, + genetic_code_id, + inherited_GC_flag, + mitochondrial_genetic_code_id, + inherited_MGC_flag, + GenBank_hidden_flag, + hidden_subtree_root_flag, + comments, ) .as_bytes(), )?; @@ -219,6 +359,20 @@ mod tests { Some(("561", 1.)) ); + // Check that node data fields are loaded + let data_562 = Taxonomy::<&str>::data(&tax, "562").unwrap(); + assert_eq!( + data_562.get("genetic_code_id").and_then(|v| v.as_str()), + Some("11") + ); + + // Check that alternative names are stored + assert!(data_562.contains_key("name_common_name")); + assert_eq!( + data_562.get("name_common_name").and_then(|v| v.as_str()), + Some("E. coli") + ); + let out = path.join("out"); save::<&str, _, _>(&tax, &out).unwrap(); @@ -258,5 +412,19 @@ mod tests { Taxonomy::<&str>::children(&tax2, "561").unwrap(), vec!["562"] ); + + // Check that data fields are preserved through save/load cycle + let data_562_after = Taxonomy::<&str>::data(&tax2, "562").unwrap(); + assert_eq!( + data_562_after.get("genetic_code_id").and_then(|v| v.as_str()), + Some("11") + ); + + // Check that alternative names are preserved + assert!(data_562_after.contains_key("name_common_name")); + assert_eq!( + data_562_after.get("name_common_name").and_then(|v| v.as_str()), + Some("E. coli") + ); } } diff --git a/src/python.rs b/src/python.rs index ba104fd..940cd58 100644 --- a/src/python.rs +++ b/src/python.rs @@ -124,6 +124,78 @@ impl TaxonomyNode { self.id, self.rank, self.name )) } + + /// get_data(self) + /// -- + /// + /// Get all extra data fields as a dictionary. + /// + /// Returns: + /// dict: Dictionary containing all additional fields from the taxonomy + fn get_data(&self, py: Python<'_>) -> PyResult { + let pydict = PyDict::new(py); + for (key, val) in self.extra.iter() { + pydict.set_item(key, json_value_to_pyobject(val))?; + } + Ok(pydict.to_object(py)) + } + + /// get_data_keys(self) + /// -- + /// + /// Get list of all available data field keys. + /// + /// Returns: + /// list: List of field names available in the extra data + fn get_data_keys(&self, py: Python<'_>) -> PyResult { + let pylist = PyList::empty(py); + for key in self.extra.keys() { + pylist.append(key)?; + } + Ok(pylist.to_object(py)) + } + + /// get(self, key: str, default=None) + /// -- + /// + /// Get a data field value with optional default. + /// + /// Args: + /// key: Field name to retrieve + /// default: Default value if field doesn't exist (default: None) + /// + /// Returns: + /// Value of the field or default if not found + fn get(&self, key: &str, default: Option<&PyAny>, py: Python<'_>) -> PyResult { + if self.extra.contains_key(key) { + Ok(json_value_to_pyobject(self.extra.get(key).unwrap())) + } else if let Some(def) = default { + Ok(def.to_object(py)) + } else { + Ok(py.None()) + } + } + + /// Convenience properties for common NCBI fields + #[getter] + fn genetic_code_id(&self, py: Python<'_>) -> PyResult { + self.get("genetic_code_id", None, py) + } + + #[getter] + fn embl_code(&self, py: Python<'_>) -> PyResult { + self.get("embl_code", None, py) + } + + #[getter] + fn division_id(&self, py: Python<'_>) -> PyResult { + self.get("division_id", None, py) + } + + #[getter] + fn mitochondrial_genetic_code_id(&self, py: Python<'_>) -> PyResult { + self.get("mitochondrial_genetic_code_id", None, py) + } } /// The Taxonomy object provides the primary interface for exploring a @@ -206,6 +278,15 @@ impl Taxonomy { /// /// Load a Taxonomy from a directory. /// The directory must contain the `nodes.dmp` and `names.dmp` files. + /// + /// All fields from both nodes.dmp and names.dmp are loaded and accessible + /// via the node's data methods or dict-like interface. + /// + /// Args: + /// dump_dir: Path to directory containing NCBI taxonomy dump files + /// + /// Returns: + /// Taxonomy: Loaded taxonomy with all NCBI fields accessible #[classmethod] fn from_ncbi(_cls: &PyType, dump_dir: &str) -> PyResult { let tax = py_try!(ncbi::load(dump_dir)); @@ -393,6 +474,48 @@ impl Taxonomy { Ok(res) } + /// data(self, tax_id: str) + /// -- + /// + /// Get all extra data fields for a taxonomy node as a dictionary. + /// + /// Args: + /// tax_id: The taxonomy ID to look up + /// + /// Returns: + /// dict: Dictionary containing all additional fields from the taxonomy + fn data(&self, tax_id: &str, py: Python<'_>) -> PyResult { + let data = py_try!(self.tax.data(tax_id)); + let pydict = PyDict::new(py); + for (key, val) in data.iter() { + pydict.set_item(key, json_value_to_pyobject(val))?; + } + Ok(pydict.to_object(py)) + } + + /// get_field(self, tax_id: str, field: str, default=None) + /// -- + /// + /// Get a specific data field for a taxonomy node. + /// + /// Args: + /// tax_id: The taxonomy ID to look up + /// field: Field name to retrieve + /// default: Default value if field doesn't exist (default: None) + /// + /// Returns: + /// Value of the field or default if not found + fn get_field(&self, tax_id: &str, field: &str, default: Option<&PyAny>, py: Python<'_>) -> PyResult { + let data = py_try!(self.tax.data(tax_id)); + if let Some(value) = data.get(field) { + Ok(json_value_to_pyobject(value)) + } else if let Some(def) = default { + Ok(def.to_object(py)) + } else { + Ok(py.None()) + } + } + /// internal_index(self, tax_id: str) /// -- /// From e1fe38f4b52ac57cbcf456058fb73aa067467c61 Mon Sep 17 00:00:00 2001 From: Robert Baldwin Date: Fri, 20 Feb 2026 20:58:26 +0000 Subject: [PATCH 2/8] added helper --- src/formats/ncbi.rs | 82 ++++++++++++++++++++++++++++++++------------- 1 file changed, 58 insertions(+), 24 deletions(-) diff --git a/src/formats/ncbi.rs b/src/formats/ncbi.rs index aa56511..9fdf406 100644 --- a/src/formats/ncbi.rs +++ b/src/formats/ncbi.rs @@ -149,6 +149,47 @@ pub fn load>(ncbi_directory: P) -> TaxonomyResult, + tax_id: &str, + parent: &str, + rank: &str, + embl_code: &str, + division_id: &str, + inherited_div: &str, + genetic_code: &str, + inherited_gc: &str, + mito_gc: &str, + inherited_mgc: &str, + genbank_hidden: &str, + subtree_hidden: &str, + comments: &str, +) -> std::io::Result<()> { + write!( + writer, + "{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\n", + tax_id, parent, rank, embl_code, division_id, inherited_div, + genetic_code, inherited_gc, mito_gc, inherited_mgc, + genbank_hidden, subtree_hidden, comments + ) +} + +/// Helper function to write a single row to names.dmp +fn write_names_row( + writer: &mut BufWriter, + tax_id: &str, + name: &str, + unique_name: &str, + name_class: &str, +) -> std::io::Result<()> { + write!( + writer, + "{}\t|\t{}\t|\t{}\t|\t{}\t|\n", + tax_id, name, unique_name, name_class + ) +} + pub fn save<'t, T: 't, P: AsRef, X: Taxonomy<'t, T>>( tax: &'t X, out_dir: P, @@ -217,8 +258,7 @@ where .unwrap_or(""); // Write scientific name - name_writer - .write_all(format!("{}\t|\t{}\t|\t\t|\tscientific name\t|\n", &key, name).as_bytes())?; + write_names_row(&mut name_writer, &format!("{}", &key), &name, "", "scientific name")?; // Write all alternative names from data for (data_key, value) in node_data.iter() { @@ -230,32 +270,26 @@ where .get(&unique_name_key) .and_then(|v| v.as_str()) .unwrap_or(""); - name_writer.write_all( - format!("{}\t|\t{}\t|\t{}\t|\t{}\t|\n", &key, name_txt, unique_name, name_class) - .as_bytes(), - )?; + write_names_row(&mut name_writer, &format!("{}", &key), name_txt, unique_name, &name_class)?; } } // Write nodes.dmp entry with all fields - node_writer.write_all( - format!( - "{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\n", - &key, - parent, - rank.to_ncbi_rank(), - embl_code, - division_id, - inherited_div_flag, - genetic_code_id, - inherited_GC_flag, - mitochondrial_genetic_code_id, - inherited_MGC_flag, - GenBank_hidden_flag, - hidden_subtree_root_flag, - comments, - ) - .as_bytes(), + write_nodes_row( + &mut node_writer, + &format!("{}", &key), + &parent, + rank.to_ncbi_rank(), + embl_code, + division_id, + inherited_div_flag, + genetic_code_id, + inherited_GC_flag, + mitochondrial_genetic_code_id, + inherited_MGC_flag, + GenBank_hidden_flag, + hidden_subtree_root_flag, + comments, )?; } From 00963e66a0ba0adaa3a3e316ade689999c5b5698 Mon Sep 17 00:00:00 2001 From: Robert Baldwin Date: Fri, 20 Feb 2026 21:23:23 +0000 Subject: [PATCH 3/8] a --- src/formats/ncbi.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/formats/ncbi.rs b/src/formats/ncbi.rs index 9fdf406..d5a3b5f 100644 --- a/src/formats/ncbi.rs +++ b/src/formats/ncbi.rs @@ -47,7 +47,7 @@ pub fn load>(ncbi_directory: P) -> TaxonomyResult>(ncbi_directory: P) -> TaxonomyResult Date: Fri, 20 Feb 2026 22:58:39 +0000 Subject: [PATCH 4/8] added json support --- src/formats/json.rs | 168 ++++++++++++++++++++++++++++++++++++++++++++ src/python.rs | 19 +++++ taxonomy.pyi | 71 +++++++++++++++++++ 3 files changed, 258 insertions(+) diff --git a/src/formats/json.rs b/src/formats/json.rs index 9d1d81a..2b4a9bd 100644 --- a/src/formats/json.rs +++ b/src/formats/json.rs @@ -369,6 +369,80 @@ where Ok(()) } +/// Memory-efficient streaming tree export that writes JSON incrementally +/// instead of building the entire tree structure in memory first. +pub fn save_tree_streaming<'t, W: Write, T: 't, X: Taxonomy<'t, T>>( + mut writer: W, + taxonomy: &'t X, + root_node: Option, +) -> TaxonomyResult<()> +where + T: Clone + Debug + Display + Eq + Hash + PartialEq, +{ + let tax_id = root_node + .or_else(|| { + if taxonomy.is_empty() { + None + } else { + Some(taxonomy.root()) + } + }) + .ok_or(Error::new(ErrorKind::InvalidTaxonomy( + "Taxonomy must have a root node.".to_string(), + )))?; + + write_node_streaming(taxonomy, tax_id, &mut writer)?; + writer.flush()?; + Ok(()) +} + +fn write_node_streaming<'t, W: Write, T: 't>( + tax: &'t impl Taxonomy<'t, T>, + tax_id: T, + writer: &mut W, +) -> TaxonomyResult<()> +where + T: Clone + Debug + Display + Eq + Hash + PartialEq, +{ + write!(writer, "{{")?; + + // Write id + write!(writer, "\"id\":")?; + to_writer(&mut *writer, &tax_id.to_string())?; + + // Write name + write!(writer, ",\"name\":")?; + to_writer(&mut *writer, &tax.name(tax_id.clone())?)?; + + // Write rank + write!(writer, ",\"rank\":")?; + to_writer(&mut *writer, tax.rank(tax_id.clone())?.to_ncbi_rank())?; + + // Write extra data fields + let data = tax.data(tax_id.clone())?; + for (key, value) in data.iter() { + write!(writer, ",{}", serde_json::to_string(key)?)?; + write!(writer, ":")?; + to_writer(&mut *writer, value)?; + } + + // Write children + let children = tax.children(tax_id)?; + if !children.is_empty() { + write!(writer, ",\"children\":[")?; + for (i, child) in children.into_iter().enumerate() { + if i > 0 { + write!(writer, ",")?; + } + write_node_streaming(tax, child, writer)?; + } + write!(writer, "]")?; + } + + write!(writer, "}}")?; + Ok(()) +} + fn serialize_as_tree<'t, T: 't>( taxonomy: &'t impl Taxonomy<'t, T>, root_node: Option, @@ -720,4 +794,98 @@ mod tests { let example = r#"{"id": "1", "rank": null, "name": ""}"#; assert!(load(Cursor::new(example), None).is_ok()); } + + #[test] + fn streaming_tree_matches_regular_tree() { + // Create a taxonomy with multiple nodes and extra data + let example = r#"{ + "id": "1", + "name": "root", + "rank": "no rank", + "custom_field": "value1", + "children": [ + { + "id": "2", + "name": "Bacteria", + "rank": "superkingdom", + "genetic_code_id": "11", + "children": [ + { + "id": "562", + "name": "Escherichia coli", + "rank": "species", + "genetic_code_id": "11", + "embl_code": "EC", + "division_id": "0", + "name_common_name": "E. coli" + } + ] + } + ] + }"#; + + let tax = load(Cursor::new(example), None).unwrap(); + + // Save using regular method + let mut regular_output = Vec::new(); + save::<_, &str, _>(&mut regular_output, &tax, JsonFormat::Tree, None).unwrap(); + + // Save using streaming method + let mut streaming_output = Vec::new(); + save_tree_streaming::<_, &str, _>(&mut streaming_output, &tax, None).unwrap(); + + // Parse both outputs + let regular_json: Value = serde_json::from_slice(®ular_output).unwrap(); + let streaming_json: Value = serde_json::from_slice(&streaming_output).unwrap(); + + // They should be equivalent + assert_eq!(regular_json, streaming_json); + + // Verify we can reload from streaming output + let tax2 = load(Cursor::new(&streaming_output[..]), None).unwrap(); + assert_eq!(Taxonomy::<&str>::len(&tax2), 3); + assert_eq!(Taxonomy::<&str>::name(&tax2, "562").unwrap(), "Escherichia coli"); + + // Verify extra data is preserved + let data = Taxonomy::<&str>::data(&tax2, "562").unwrap(); + assert_eq!(data.get("genetic_code_id").and_then(|v| v.as_str()), Some("11")); + assert_eq!(data.get("embl_code").and_then(|v| v.as_str()), Some("EC")); + assert_eq!(data.get("name_common_name").and_then(|v| v.as_str()), Some("E. coli")); + } + + #[test] + fn streaming_handles_large_data_fields() { + // Test that streaming handles nodes with lots of extra fields (like NCBI data) + let example = r#"{ + "id": "562", + "name": "Escherichia coli", + "rank": "species", + "genetic_code_id": "11", + "division_id": "0", + "inherited_div_flag": "1", + "mitochondrial_genetic_code_id": "0", + "inherited_MGC_flag": "1", + "GenBank_hidden_flag": "0", + "hidden_subtree_root_flag": "0", + "comments": "", + "name_scientific_name": "Escherichia coli", + "name_common_name": "E. coli", + "name_synonym": "Bacterium coli", + "name_authority": "Escherichia coli (Migula 1895) Castellani and Chalmers 1919" + }"#; + + let tax = load(Cursor::new(example), None).unwrap(); + + let mut streaming_output = Vec::new(); + save_tree_streaming::<_, &str, _>(&mut streaming_output, &tax, None).unwrap(); + + // Reload and verify all fields preserved + let tax2 = load(Cursor::new(&streaming_output[..]), None).unwrap(); + let data = Taxonomy::<&str>::data(&tax2, "562").unwrap(); + + assert_eq!(data.get("genetic_code_id").and_then(|v| v.as_str()), Some("11")); + assert_eq!(data.get("name_common_name").and_then(|v| v.as_str()), Some("E. coli")); + assert_eq!(data.get("name_synonym").and_then(|v| v.as_str()), Some("Bacterium coli")); + assert_eq!(data.get("comments").and_then(|v| v.as_str()), Some("")); + } } diff --git a/src/python.rs b/src/python.rs index 940cd58..5b830cf 100644 --- a/src/python.rs +++ b/src/python.rs @@ -329,6 +329,25 @@ impl Taxonomy { Ok(PyBytes::new(py, &bytes).into()) } + /// to_json_tree_streaming(self) + /// -- + /// + /// Export a Taxonomy as a JSON-encoded byte string in a tree format. + /// This version uses streaming/incremental writing for much better memory + /// efficiency with large taxonomies (e.g., full NCBI taxonomy with 2.5M+ nodes). + /// + /// Unlike to_json_tree(), which builds the entire tree structure in memory + /// before serialization, this writes JSON directly as it traverses the tree. + fn to_json_tree_streaming(&self, py: Python<'_>) -> PyResult { + let mut bytes = Vec::new(); + py_try!(json::save_tree_streaming::<_, &str, _>( + &mut bytes, + &self.tax, + None + )); + Ok(PyBytes::new(py, &bytes).into()) + } + /// to_json_node_links(self) /// -- /// diff --git a/taxonomy.pyi b/taxonomy.pyi index 91eb779..25ee7ce 100644 --- a/taxonomy.pyi +++ b/taxonomy.pyi @@ -19,6 +19,38 @@ class TaxonomyNode: def __eq__(self, other: object) -> bool: ... def __ne__(self, other: object) -> bool: ... + def get_data(self) -> dict: + """Get all extra data fields as a dictionary.""" + ... + + def get_data_keys(self) -> List[str]: + """Get list of all available data field keys.""" + ... + + def get(self, key: str, default: Any = None) -> Any: + """Get a data field value with optional default.""" + ... + + @property + def genetic_code_id(self) -> Optional[str]: + """NCBI genetic code ID.""" + ... + + @property + def embl_code(self) -> Optional[str]: + """NCBI EMBL code.""" + ... + + @property + def division_id(self) -> Optional[str]: + """NCBI division ID.""" + ... + + @property + def mitochondrial_genetic_code_id(self) -> Optional[str]: + """NCBI mitochondrial genetic code ID.""" + ... + class Taxonomy: """ The Taxonomy object provides the primary interface for exploring a @@ -67,6 +99,19 @@ class Taxonomy: """Export a Taxonomy as a JSON-encoded byte string in a tree format""" ... + def to_json_tree_streaming(self) -> bytes: + """ + Export a Taxonomy as a JSON-encoded byte string in a tree format. + + This version uses streaming/incremental writing for much better memory + efficiency with large taxonomies (e.g., full NCBI taxonomy with 2.5M+ nodes). + + Unlike to_json_tree(), which builds the entire tree structure in memory + before serialization, this writes JSON directly as it traverses the tree. + Use this for large NCBI taxonomies to avoid high memory usage. + """ + ... + def to_json_node_links(self) -> bytes: """Export a Taxonomy as a JSON-encoded byte string in a node link format""" ... @@ -123,6 +168,32 @@ class Taxonomy: """ ... + def data(self, tax_id: str) -> dict: + """ + Get all extra data fields for a taxonomy node as a dictionary. + + Args: + tax_id: The taxonomy ID to look up + + Returns: + Dictionary containing all additional fields from the taxonomy + """ + ... + + def get_field(self, tax_id: str, field: str, default: Any = None) -> Any: + """ + Get a specific data field for a taxonomy node. + + Args: + tax_id: The taxonomy ID to look up + field: Field name to retrieve + default: Default value if field doesn't exist (default: None) + + Returns: + Value of the field or default if not found + """ + ... + def internal_index(self, tax_id: str) -> int: """Return the internal integer ID generated by the taxonomy library""" ... From 374dcf1893fc76215faf7baaa49d9356ccb47416 Mon Sep 17 00:00:00 2001 From: Robert Baldwin Date: Fri, 20 Feb 2026 23:08:51 +0000 Subject: [PATCH 5/8] comments --- src/python.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/python.rs b/src/python.rs index 5b830cf..2c5fc05 100644 --- a/src/python.rs +++ b/src/python.rs @@ -318,6 +318,9 @@ impl Taxonomy { /// -- /// /// Export a Taxonomy as a JSON-encoded byte string in a tree format + /// Suggest using to_json_tree_streaming instead for large trees + /// with additional data available in the tree, this function may now OOM for users + /// perhaps limit this function to outputting the few fields previously available fn to_json_tree(&self, py: Python<'_>) -> PyResult { let mut bytes = Vec::new(); py_try!(json::save::<_, &str, _>( From b143758ee6897e5b5a854557d024782f293a47b0 Mon Sep 17 00:00:00 2001 From: Robert Baldwin Date: Fri, 20 Feb 2026 23:24:37 +0000 Subject: [PATCH 6/8] resolved newick format --- src/python.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python.rs b/src/python.rs index 2c5fc05..e2b5daa 100644 --- a/src/python.rs +++ b/src/python.rs @@ -381,7 +381,7 @@ impl Taxonomy { /// to_newick(self) /// -- - /// + /// Uses taxonomy ID for node names (e.g, for E. coli the node name is 562) /// Export a Taxonomy as a Newick-encoded byte string. fn to_newick(&self, py: Python<'_>) -> PyResult { let mut bytes = Vec::new(); From 03b308e7ace1d7080ad7b7e6167c0e2619521f07 Mon Sep 17 00:00:00 2001 From: Robert Baldwin Date: Fri, 20 Feb 2026 23:31:05 +0000 Subject: [PATCH 7/8] a --- src/formats/json.rs | 30 ++++++++++++---- src/formats/ncbi.rs | 86 ++++++++++++++++++++++++++++++++++++--------- src/python.rs | 12 ++++--- 3 files changed, 102 insertions(+), 26 deletions(-) diff --git a/src/formats/json.rs b/src/formats/json.rs index 2b4a9bd..5839b1e 100644 --- a/src/formats/json.rs +++ b/src/formats/json.rs @@ -844,13 +844,22 @@ mod tests { // Verify we can reload from streaming output let tax2 = load(Cursor::new(&streaming_output[..]), None).unwrap(); assert_eq!(Taxonomy::<&str>::len(&tax2), 3); - assert_eq!(Taxonomy::<&str>::name(&tax2, "562").unwrap(), "Escherichia coli"); + assert_eq!( + Taxonomy::<&str>::name(&tax2, "562").unwrap(), + "Escherichia coli" + ); // Verify extra data is preserved let data = Taxonomy::<&str>::data(&tax2, "562").unwrap(); - assert_eq!(data.get("genetic_code_id").and_then(|v| v.as_str()), Some("11")); + assert_eq!( + data.get("genetic_code_id").and_then(|v| v.as_str()), + Some("11") + ); assert_eq!(data.get("embl_code").and_then(|v| v.as_str()), Some("EC")); - assert_eq!(data.get("name_common_name").and_then(|v| v.as_str()), Some("E. coli")); + assert_eq!( + data.get("name_common_name").and_then(|v| v.as_str()), + Some("E. coli") + ); } #[test] @@ -883,9 +892,18 @@ mod tests { let tax2 = load(Cursor::new(&streaming_output[..]), None).unwrap(); let data = Taxonomy::<&str>::data(&tax2, "562").unwrap(); - assert_eq!(data.get("genetic_code_id").and_then(|v| v.as_str()), Some("11")); - assert_eq!(data.get("name_common_name").and_then(|v| v.as_str()), Some("E. coli")); - assert_eq!(data.get("name_synonym").and_then(|v| v.as_str()), Some("Bacterium coli")); + assert_eq!( + data.get("genetic_code_id").and_then(|v| v.as_str()), + Some("11") + ); + assert_eq!( + data.get("name_common_name").and_then(|v| v.as_str()), + Some("E. coli") + ); + assert_eq!( + data.get("name_synonym").and_then(|v| v.as_str()), + Some("Bacterium coli") + ); assert_eq!(data.get("comments").and_then(|v| v.as_str()), Some("")); } } diff --git a/src/formats/ncbi.rs b/src/formats/ncbi.rs index d5a3b5f..0011b0e 100644 --- a/src/formats/ncbi.rs +++ b/src/formats/ncbi.rs @@ -47,7 +47,12 @@ pub fn load>(ncbi_directory: P) -> TaxonomyResult>(ncbi_directory: P) -> TaxonomyResult>(ncbi_directory: P) -> TaxonomyResult::data(&tax2, "562").unwrap(); assert_eq!( - data_562_after.get("genetic_code_id").and_then(|v| v.as_str()), + data_562_after + .get("genetic_code_id") + .and_then(|v| v.as_str()), Some("11") ); // Check that alternative names are preserved assert!(data_562_after.contains_key("name_common_name")); assert_eq!( - data_562_after.get("name_common_name").and_then(|v| v.as_str()), + data_562_after + .get("name_common_name") + .and_then(|v| v.as_str()), Some("E. coli") ); } diff --git a/src/python.rs b/src/python.rs index e2b5daa..27e5b42 100644 --- a/src/python.rs +++ b/src/python.rs @@ -344,9 +344,7 @@ impl Taxonomy { fn to_json_tree_streaming(&self, py: Python<'_>) -> PyResult { let mut bytes = Vec::new(); py_try!(json::save_tree_streaming::<_, &str, _>( - &mut bytes, - &self.tax, - None + &mut bytes, &self.tax, None )); Ok(PyBytes::new(py, &bytes).into()) } @@ -527,7 +525,13 @@ impl Taxonomy { /// /// Returns: /// Value of the field or default if not found - fn get_field(&self, tax_id: &str, field: &str, default: Option<&PyAny>, py: Python<'_>) -> PyResult { + fn get_field( + &self, + tax_id: &str, + field: &str, + default: Option<&PyAny>, + py: Python<'_>, + ) -> PyResult { let data = py_try!(self.tax.data(tax_id)); if let Some(value) = data.get(field) { Ok(json_value_to_pyobject(value)) From 4bd9740f4f88e49f95a103b8157afad1d5504a82 Mon Sep 17 00:00:00 2001 From: Robert Baldwin Date: Sat, 21 Feb 2026 21:20:05 +0000 Subject: [PATCH 8/8] fixed json teest --- src/formats/json.rs | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/formats/json.rs b/src/formats/json.rs index 5839b1e..8e2dd68 100644 --- a/src/formats/json.rs +++ b/src/formats/json.rs @@ -426,18 +426,16 @@ where to_writer(&mut *writer, value)?; } - // Write children + // Write children (always include, even if empty, to match regular format) let children = tax.children(tax_id)?; - if !children.is_empty() { - write!(writer, ",\"children\":[")?; - for (i, child) in children.into_iter().enumerate() { - if i > 0 { - write!(writer, ",")?; - } - write_node_streaming(tax, child, writer)?; + write!(writer, ",\"children\":[")?; + for (i, child) in children.into_iter().enumerate() { + if i > 0 { + write!(writer, ",")?; } - write!(writer, "]")?; + write_node_streaming(tax, child, writer)?; } + write!(writer, "]")?; write!(writer, "}}")?; Ok(())