From 0d1746e2513e5c9bc52061f6ccfa33ede4f8f2ba Mon Sep 17 00:00:00 2001
From: Robert Baldwin <rbaldwin@bugseq.com>
Date: Fri, 20 Feb 2026 18:25:34 +0000
Subject: [PATCH 1/8] Load and save all NCBI dump fields; expose node data to Python

---
 src/formats/ncbi.rs | 198 ++++++++++++++++++++++++++++++++++++++++----
 src/python.rs       | 123 +++++++++++++++++++++++++++
 2 files changed, 306 insertions(+), 15 deletions(-)
diff --git a/src/formats/ncbi.rs b/src/formats/ncbi.rs
index c3b8522..aa56511 100644
--- a/src/formats/ncbi.rs
+++ b/src/formats/ncbi.rs
@@ -23,12 +23,12 @@ pub fn load<P: AsRef<Path>>(ncbi_directory: P) -> TaxonomyResult<GeneralTaxonomy
     let mut tax_ids: Vec<String> = Vec::new();
     let mut parents: Vec<String> = Vec::new();
     let mut ranks: Vec<TaxRank> = Vec::new();
+    let mut data: Vec<HashMap<String, serde_json::Value>> = Vec::new();
     let mut tax_to_idx: HashMap<String, usize> = HashMap::new();
 
     for (ix, line) in BufReader::new(nodes_file).lines().enumerate() {
         let mut fields: Vec<String> = line?.split("\t|\t").map(|x| x.to_string()).collect();
-        if fields.len() < 10 {
-            // should be at least 14
+        if fields.len() < 12 {
             return Err(Error::new(ErrorKind::ImportError {
                 line: ix,
                 msg: "Not enough fields in nodes.dmp; bad line?".to_owned(),
@@ -36,11 +36,60 @@ pub fn load<P: AsRef<Path>>(ncbi_directory: P) -> TaxonomyResult<GeneralTaxonomy
         }
         let tax_id = fields.remove(0).trim().to_string();
         let parent_tax_id = fields.remove(0).trim().to_string();
-        let rank = fields.remove(0);
+        let rank = fields.remove(0).trim().to_string();
+        let embl_code = fields.remove(0).trim().to_string();
+        let division_id = fields.remove(0).trim().to_string();
+        let inherited_div_flag = fields.remove(0).trim().to_string();
+        let genetic_code_id = fields.remove(0).trim().to_string();
+        let inherited_GC_flag = fields.remove(0).trim().to_string();
+        let mitochondrial_genetic_code_id = fields.remove(0).trim().to_string();
+        let inherited_MGC_flag = fields.remove(0).trim().to_string();
+        let GenBank_hidden_flag = fields.remove(0).trim().to_string();
+        let hidden_subtree_root_flag = fields.remove(0).trim().to_string();
+        let comments = if !fields.is_empty() {
+            fields.remove(0).trim().trim_end_matches("\t|").to_string()
+        } else {
+            String::new()
+        };
 
         tax_ids.push(tax_id.clone());
         parents.push(parent_tax_id.to_string());
         ranks.push(TaxRank::from_str(&rank)?);
+
+        // Store all node fields in data HashMap
+        let mut node_data = HashMap::new();
+        if !embl_code.is_empty() {
+            node_data.insert("embl_code".to_string(), serde_json::Value::String(embl_code));
+        }
+        if !division_id.is_empty() {
+            node_data.insert("division_id".to_string(), serde_json::Value::String(division_id));
+        }
+        if !inherited_div_flag.is_empty() {
+            node_data.insert("inherited_div_flag".to_string(), serde_json::Value::String(inherited_div_flag));
+        }
+        if !genetic_code_id.is_empty() {
+            node_data.insert("genetic_code_id".to_string(), serde_json::Value::String(genetic_code_id));
+        }
+        if !inherited_GC_flag.is_empty() {
+            node_data.insert("inherited_GC_flag".to_string(), serde_json::Value::String(inherited_GC_flag));
+        }
+        if !mitochondrial_genetic_code_id.is_empty() {
+            node_data.insert("mitochondrial_genetic_code_id".to_string(), serde_json::Value::String(mitochondrial_genetic_code_id));
+        }
+        if !inherited_MGC_flag.is_empty() {
+            node_data.insert("inherited_MGC_flag".to_string(), serde_json::Value::String(inherited_MGC_flag));
+        }
+        if !GenBank_hidden_flag.is_empty() {
+            node_data.insert("GenBank_hidden_flag".to_string(), serde_json::Value::String(GenBank_hidden_flag));
+        }
+        if !hidden_subtree_root_flag.is_empty() {
+            node_data.insert("hidden_subtree_root_flag".to_string(), serde_json::Value::String(hidden_subtree_root_flag));
+        }
+        if !comments.is_empty() {
+            node_data.insert("comments".to_string(), serde_json::Value::String(comments));
+        }
+
+        data.push(node_data);
         tax_to_idx.insert(tax_id, ix);
     }
 
@@ -57,28 +106,45 @@ pub fn load<P: AsRef<Path>>(ncbi_directory: P) -> TaxonomyResult<GeneralTaxonomy
         }
     }
 
-    // And then grab their names by their idx
+    // Grab scientific names and store all other name types in data
     let mut names: Vec<String> = vec![String::new(); tax_ids.len()];
     for (ix, line) in BufReader::new(names_file).lines().enumerate() {
         let mut fields: Vec<String> = line?.split("\t|\t").map(|x| x.to_string()).collect();
-        if fields.len() > 10 {
-            // should only be 5
+        if fields.len() < 4 {
             return Err(Error::new(ErrorKind::ImportError {
                 line: ix,
-                msg: "Too many fields in names.dmp".to_owned(),
+                msg: "Not enough fields in names.dmp".to_owned(),
             }));
         }
         let tax_id = fields.remove(0).trim().to_string();
-        let name = fields.remove(0).trim().to_string();
-        let name_class = fields.remove(1);
-        if name_class.starts_with("scientific name") {
-            let name = name.to_string();
-            names[tax_to_idx[&*tax_id]] = name;
+        let name_txt = fields.remove(0).trim().to_string();
+        let unique_name = fields.remove(0).trim().to_string();
+        let name_class = fields.remove(0).trim().trim_end_matches("\t|").to_string();
+
+        if let Some(&idx) = tax_to_idx.get(&tax_id) {
+            if name_class == "scientific name" {
+                names[idx] = name_txt.clone();
+            }
+
+            // Store all name types in data
+            let name_key = format!("name_{}", name_class.replace(" ", "_"));
+            data[idx].insert(name_key, serde_json::Value::String(name_txt));
+
+            if !unique_name.is_empty() {
+                let unique_key = format!("unique_name_{}", name_class.replace(" ", "_"));
+                data[idx].insert(unique_key, serde_json::Value::String(unique_name));
+            }
         }
     }
 
-    let gt =
-        GeneralTaxonomy::from_arrays(tax_ids, parent_ids, Some(names), Some(ranks), None, None)?;
+    let gt = GeneralTaxonomy::from_arrays(
+        tax_ids,
+        parent_ids,
+        Some(names),
+        Some(ranks),
+        None,
+        Some(data),
+    )?;
     gt.validate_uniqueness()?;
     Ok(gt)
 }
@@ -106,14 +172,88 @@ where
                 .map(|(x, _)| format!("{}", x))
                 .unwrap_or_default()
         };
+
+        // Extract data fields
+        let node_data = tax.data(key.clone())?;
+        let embl_code = node_data
+            .get("embl_code")
+            .and_then(|v| v.as_str())
+            .unwrap_or("");
+        let division_id = node_data
+            .get("division_id")
+            .and_then(|v| v.as_str())
+            .unwrap_or("0");
+        let inherited_div_flag = node_data
+            .get("inherited_div_flag")
+            .and_then(|v| v.as_str())
+            .unwrap_or("0");
+        let genetic_code_id = node_data
+            .get("genetic_code_id")
+            .and_then(|v| v.as_str())
+            .unwrap_or("1");
+        let inherited_GC_flag = node_data
+            .get("inherited_GC_flag")
+            .and_then(|v| v.as_str())
+            .unwrap_or("0");
+        let mitochondrial_genetic_code_id = node_data
+            .get("mitochondrial_genetic_code_id")
+            .and_then(|v| v.as_str())
+            .unwrap_or("0");
+        let inherited_MGC_flag = node_data
+            .get("inherited_MGC_flag")
+            .and_then(|v| v.as_str())
+            .unwrap_or("0");
+        let GenBank_hidden_flag = node_data
+            .get("GenBank_hidden_flag")
+            .and_then(|v| v.as_str())
+            .unwrap_or("0");
+        let hidden_subtree_root_flag = node_data
+            .get("hidden_subtree_root_flag")
+            .and_then(|v| v.as_str())
+            .unwrap_or("0");
+        let comments = node_data
+            .get("comments")
+            .and_then(|v| v.as_str())
+            .unwrap_or("");
+
+        // Write scientific name
         name_writer
             .write_all(format!("{}\t|\t{}\t|\t\t|\tscientific name\t|\n", &key, name).as_bytes())?;
+
+        // Write all alternative names from data
+        for (data_key, value) in node_data.iter() {
+            if data_key.starts_with("name_") && data_key != "name_scientific_name" {
+                let name_class = data_key.strip_prefix("name_").unwrap().replace("_", " ");
+                let name_txt = value.as_str().unwrap_or("");
+                let unique_name_key = format!("unique_name_{}", data_key.strip_prefix("name_").unwrap());
+                let unique_name = node_data
+                    .get(&unique_name_key)
+                    .and_then(|v| v.as_str())
+                    .unwrap_or("");
+                name_writer.write_all(
+                    format!("{}\t|\t{}\t|\t{}\t|\t{}\t|\n", &key, name_txt, unique_name, name_class)
+                        .as_bytes(),
+                )?;
+            }
+        }
+
+        // Write nodes.dmp entry with all fields
         node_writer.write_all(
             format!(
-                "{}\t|\t{}\t|\t{}\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\n",
+                "{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\n",
                 &key,
                 parent,
                 rank.to_ncbi_rank(),
+                embl_code,
+                division_id,
+                inherited_div_flag,
+                genetic_code_id,
+                inherited_GC_flag,
+                mitochondrial_genetic_code_id,
+                inherited_MGC_flag,
+                GenBank_hidden_flag,
+                hidden_subtree_root_flag,
+                comments,
             )
             .as_bytes(),
         )?;
@@ -219,6 +359,20 @@ mod tests {
             Some(("561", 1.))
         );
 
+        // Check that node data fields are loaded
+        let data_562 = Taxonomy::<&str>::data(&tax, "562").unwrap();
+        assert_eq!(
+            data_562.get("genetic_code_id").and_then(|v| v.as_str()),
+            Some("11")
+        );
+
+        // Check that alternative names are stored
+        assert!(data_562.contains_key("name_common_name"));
+        assert_eq!(
+            data_562.get("name_common_name").and_then(|v| v.as_str()),
+            Some("E. coli")
+        );
+
         let out = path.join("out");
         save::<&str, _, _>(&tax, &out).unwrap();
 
@@ -258,5 +412,19 @@ mod tests {
             Taxonomy::<&str>::children(&tax2, "561").unwrap(),
             vec!["562"]
         );
+
+        // Check that data fields are preserved through save/load cycle
+        let data_562_after = Taxonomy::<&str>::data(&tax2, "562").unwrap();
+        assert_eq!(
+            data_562_after.get("genetic_code_id").and_then(|v| v.as_str()),
+            Some("11")
+        );
+
+        // Check that alternative names are preserved
+        assert!(data_562_after.contains_key("name_common_name"));
+        assert_eq!(
+            data_562_after.get("name_common_name").and_then(|v| v.as_str()),
+            Some("E. coli")
+        );
     }
 }
diff --git a/src/python.rs b/src/python.rs
index ba104fd..940cd58 100644
--- a/src/python.rs
+++ b/src/python.rs
@@ -124,6 +124,78 @@ impl TaxonomyNode {
             self.id, self.rank, self.name
         ))
     }
+
+    /// get_data(self)
+    /// --
+    ///
+    /// Get all extra data fields as a dictionary.
+    ///
+    /// Returns:
+    ///     dict: Dictionary containing all additional fields from the taxonomy
+    fn get_data(&self, py: Python<'_>) -> PyResult<PyObject> {
+        let pydict = PyDict::new(py);
+        for (key, val) in self.extra.iter() {
+            pydict.set_item(key, json_value_to_pyobject(val))?;
+        }
+        Ok(pydict.to_object(py))
+    }
+
+    /// get_data_keys(self)
+    /// --
+    ///
+    /// Get list of all available data field keys.
+    ///
+    /// Returns:
+    ///     list: List of field names available in the extra data
+    fn get_data_keys(&self, py: Python<'_>) -> PyResult<PyObject> {
+        let pylist = PyList::empty(py);
+        for key in self.extra.keys() {
+            pylist.append(key)?;
+        }
+        Ok(pylist.to_object(py))
+    }
+
+    /// get(self, key: str, default=None)
+    /// --
+    ///
+    /// Get a data field value with optional default.
+    ///
+    /// Args:
+    ///     key: Field name to retrieve
+    ///     default: Default value if field doesn't exist (default: None)
+    ///
+    /// Returns:
+    ///     Value of the field or default if not found
+    fn get(&self, key: &str, default: Option<&PyAny>, py: Python<'_>) -> PyResult<PyObject> {
+        if self.extra.contains_key(key) {
+            Ok(json_value_to_pyobject(self.extra.get(key).unwrap()))
+        } else if let Some(def) = default {
+            Ok(def.to_object(py))
+        } else {
+            Ok(py.None())
+        }
+    }
+
+    /// Convenience properties for common NCBI fields
+    #[getter]
+    fn genetic_code_id(&self, py: Python<'_>) -> PyResult<PyObject> {
+        self.get("genetic_code_id", None, py)
+    }
+
+    #[getter]
+    fn embl_code(&self, py: Python<'_>) -> PyResult<PyObject> {
+        self.get("embl_code", None, py)
+    }
+
+    #[getter]
+    fn division_id(&self, py: Python<'_>) -> PyResult<PyObject> {
+        self.get("division_id", None, py)
+    }
+
+    #[getter]
+    fn mitochondrial_genetic_code_id(&self, py: Python<'_>) -> PyResult<PyObject> {
+        self.get("mitochondrial_genetic_code_id", None, py)
+    }
 }
 
 /// The Taxonomy object provides the primary interface for exploring a
@@ -206,6 +278,15 @@ impl Taxonomy {
     ///
     /// Load a Taxonomy from a directory.
     /// The directory must contain the `nodes.dmp` and `names.dmp` files.
+    ///
+    /// All fields from both nodes.dmp and names.dmp are loaded and accessible
+    /// via the node's data methods or dict-like interface.
+    ///
+    /// Args:
+    ///     dump_dir: Path to directory containing NCBI taxonomy dump files
+    ///
+    /// Returns:
+    ///     Taxonomy: Loaded taxonomy with all NCBI fields accessible
     #[classmethod]
     fn from_ncbi(_cls: &PyType, dump_dir: &str) -> PyResult<Taxonomy> {
         let tax = py_try!(ncbi::load(dump_dir));
@@ -393,6 +474,48 @@ impl Taxonomy {
         Ok(res)
     }
 
+    /// data(self, tax_id: str)
+    /// --
+    ///
+    /// Get all extra data fields for a taxonomy node as a dictionary.
+    ///
+    /// Args:
+    ///     tax_id: The taxonomy ID to look up
+    ///
+    /// Returns:
+    ///     dict: Dictionary containing all additional fields from the taxonomy
+    fn data(&self, tax_id: &str, py: Python<'_>) -> PyResult<PyObject> {
+        let data = py_try!(self.tax.data(tax_id));
+        let pydict = PyDict::new(py);
+        for (key, val) in data.iter() {
+            pydict.set_item(key, json_value_to_pyobject(val))?;
+        }
+        Ok(pydict.to_object(py))
+    }
+
+    /// get_field(self, tax_id: str, field: str, default=None)
+    /// --
+    ///
+    /// Get a specific data field for a taxonomy node.
+    ///
+    /// Args:
+    ///     tax_id: The taxonomy ID to look up
+    ///     field: Field name to retrieve
+    ///     default: Default value if field doesn't exist (default: None)
+    ///
+    /// Returns:
+    ///     Value of the field or default if not found
+    fn get_field(&self, tax_id: &str, field: &str, default: Option<&PyAny>, py: Python<'_>) -> PyResult<PyObject> {
+        let data = py_try!(self.tax.data(tax_id));
+        if let Some(value) = data.get(field) {
+            Ok(json_value_to_pyobject(value))
+        } else if let Some(def) = default {
+            Ok(def.to_object(py))
+        } else {
+            Ok(py.None())
+        }
+    }
+
     /// internal_index(self, tax_id: str)
     /// --
     ///

From e1fe38f4b52ac57cbcf456058fb73aa067467c61 Mon Sep 17 00:00:00 2001
From: Robert Baldwin <rbaldwin@bugseq.com>
Date: Fri, 20 Feb 2026 20:58:26 +0000
Subject: [PATCH 2/8] Add row-writing helpers for nodes.dmp and names.dmp

---
 src/formats/ncbi.rs | 82 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 58 insertions(+), 24 deletions(-)

diff --git a/src/formats/ncbi.rs b/src/formats/ncbi.rs
index aa56511..9fdf406 100644
--- a/src/formats/ncbi.rs
+++ b/src/formats/ncbi.rs
@@ -149,6 +149,47 @@ pub fn load<P: AsRef<Path>>(ncbi_directory: P) -> TaxonomyResult<GeneralTaxonomy
     Ok(gt)
 }
 
+/// Helper function to write a single row to nodes.dmp
+fn write_nodes_row(
+    writer: &mut BufWriter<std::fs::File>,
+    tax_id: &str,
+    parent: &str,
+    rank: &str,
+    embl_code: &str,
+    division_id: &str,
+    inherited_div: &str,
+    genetic_code: &str,
+    inherited_gc: &str,
+    mito_gc: &str,
+    inherited_mgc: &str,
+    genbank_hidden: &str,
+    subtree_hidden: &str,
+    comments: &str,
+) -> std::io::Result<()> {
+    write!(
+        writer,
+        "{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\n",
+        tax_id, parent, rank, embl_code, division_id, inherited_div,
+        genetic_code, inherited_gc, mito_gc, inherited_mgc,
+        genbank_hidden, subtree_hidden, comments
+    )
+}
+
+/// Helper function to write a single row to names.dmp
+fn write_names_row(
+    writer: &mut BufWriter<std::fs::File>,
+    tax_id: &str,
+    name: &str,
+    unique_name: &str,
+    name_class: &str,
+) -> std::io::Result<()> {
+    write!(
+        writer,
+        "{}\t|\t{}\t|\t{}\t|\t{}\t|\n",
+        tax_id, name, unique_name, name_class
+    )
+}
+
 pub fn save<'t, T: 't, P: AsRef<Path>, X: Taxonomy<'t, T>>(
     tax: &'t X,
     out_dir: P,
@@ -217,8 +258,7 @@ where
             .unwrap_or("");
 
         // Write scientific name
-        name_writer
-            .write_all(format!("{}\t|\t{}\t|\t\t|\tscientific name\t|\n", &key, name).as_bytes())?;
+        write_names_row(&mut name_writer, &format!("{}", &key), &name, "", "scientific name")?;
 
         // Write all alternative names from data
         for (data_key, value) in node_data.iter() {
@@ -230,32 +270,26 @@ where
                     .get(&unique_name_key)
                     .and_then(|v| v.as_str())
                     .unwrap_or("");
-                name_writer.write_all(
-                    format!("{}\t|\t{}\t|\t{}\t|\t{}\t|\n", &key, name_txt, unique_name, name_class)
-                        .as_bytes(),
-                )?;
+                write_names_row(&mut name_writer, &format!("{}", &key), name_txt, unique_name, &name_class)?;
             }
         }
 
         // Write nodes.dmp entry with all fields
-        node_writer.write_all(
-            format!(
-                "{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\t{}\t|\n",
-                &key,
-                parent,
-                rank.to_ncbi_rank(),
-                embl_code,
-                division_id,
-                inherited_div_flag,
-                genetic_code_id,
-                inherited_GC_flag,
-                mitochondrial_genetic_code_id,
-                inherited_MGC_flag,
-                GenBank_hidden_flag,
-                hidden_subtree_root_flag,
-                comments,
-            )
-            .as_bytes(),
+        write_nodes_row(
+            &mut node_writer,
+            &format!("{}", &key),
+            &parent,
+            rank.to_ncbi_rank(),
+            embl_code,
+            division_id,
+            inherited_div_flag,
+            genetic_code_id,
+            inherited_GC_flag,
+            mitochondrial_genetic_code_id,
+            inherited_MGC_flag,
+            GenBank_hidden_flag,
+            hidden_subtree_root_flag,
+            comments,
         )?;
     }
 

From 00963e66a0ba0adaa3a3e316ade689999c5b5698 Mon Sep 17 00:00:00 2001
From: Robert Baldwin <rbaldwin@bugseq.com>
Date: Fri, 20 Feb 2026 21:23:23 +0000
Subject: [PATCH 3/8] Fix trailing '|' trimming of NCBI comments and name class

---
 src/formats/ncbi.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/formats/ncbi.rs b/src/formats/ncbi.rs
index 9fdf406..d5a3b5f 100644
--- a/src/formats/ncbi.rs
+++ b/src/formats/ncbi.rs
@@ -47,7 +47,7 @@ pub fn load<P: AsRef<Path>>(ncbi_directory: P) -> TaxonomyResult<GeneralTaxonomy
         let GenBank_hidden_flag = fields.remove(0).trim().to_string();
         let hidden_subtree_root_flag = fields.remove(0).trim().to_string();
         let comments = if !fields.is_empty() {
-            fields.remove(0).trim().trim_end_matches("\t|").to_string()
+            fields.remove(0).trim().trim_end_matches('|').trim().to_string()
         } else {
             String::new()
         };
@@ -119,7 +119,7 @@ pub fn load<P: AsRef<Path>>(ncbi_directory: P) -> TaxonomyResult<GeneralTaxonomy
         let tax_id = fields.remove(0).trim().to_string();
         let name_txt = fields.remove(0).trim().to_string();
         let unique_name = fields.remove(0).trim().to_string();
-        let name_class = fields.remove(0).trim().trim_end_matches("\t|").to_string();
+        let name_class = fields.remove(0).trim().trim_end_matches('|').trim().to_string();
 
         if let Some(&idx) = tax_to_idx.get(&tax_id) {
             if name_class == "scientific name" {

From bd9f14ec2c4f98c0504f54cdbb03df9973bb3445 Mon Sep 17 00:00:00 2001
From: Robert Baldwin <rbaldwin@bugseq.com>
Date: Fri, 20 Feb 2026 22:58:39 +0000
Subject: [PATCH 4/8] Add streaming JSON tree export with Python binding

---
 src/formats/json.rs | 168 ++++++++++++++++++++++++++++++++++++++++++++
 src/python.rs       |  19 +++++
 taxonomy.pyi        |  71 +++++++++++++++++++
 3 files changed, 258 insertions(+)

diff --git a/src/formats/json.rs b/src/formats/json.rs
index 9d1d81a..2b4a9bd 100644
--- a/src/formats/json.rs
+++ b/src/formats/json.rs
@@ -369,6 +369,80 @@ where
     Ok(())
 }
 
+/// Memory-efficient streaming tree export that writes JSON incrementally
+/// instead of building the entire tree structure in memory first.
+pub fn save_tree_streaming<'t, W: Write, T: 't, X: Taxonomy<'t, T>>(
+    mut writer: W,
+    taxonomy: &'t X,
+    root_node: Option<T>,
+) -> TaxonomyResult<()>
+where
+    T: Clone + Debug + Display + Eq + Hash + PartialEq,
+{
+    let tax_id = root_node
+        .or_else(|| {
+            if taxonomy.is_empty() {
+                None
+            } else {
+                Some(taxonomy.root())
+            }
+        })
+        .ok_or(Error::new(ErrorKind::InvalidTaxonomy(
+            "Taxonomy must have a root node.".to_string(),
+        )))?;
+
+    write_node_streaming(taxonomy, tax_id, &mut writer)?;
+    writer.flush()?;
+    Ok(())
+}
+
+fn write_node_streaming<'t, W: Write, T: 't>(
+    tax: &'t impl Taxonomy<'t, T>,
+    tax_id: T,
+    writer: &mut W,
+) -> TaxonomyResult<()>
+where
+    T: Clone + Debug + Display + Eq + Hash + PartialEq,
+{
+    write!(writer, "{{")?;
+
+    // Write id
+    write!(writer, "\"id\":")?;
+    to_writer(&mut *writer, &tax_id.to_string())?;
+
+    // Write name
+    write!(writer, ",\"name\":")?;
+    to_writer(&mut *writer, &tax.name(tax_id.clone())?)?;
+
+    // Write rank
+    write!(writer, ",\"rank\":")?;
+    to_writer(&mut *writer, tax.rank(tax_id.clone())?.to_ncbi_rank())?;
+
+    // Write extra data fields
+    let data = tax.data(tax_id.clone())?;
+    for (key, value) in data.iter() {
+        write!(writer, ",{}", serde_json::to_string(key)?)?;
+        write!(writer, ":")?;
+        to_writer(&mut *writer, value)?;
+    }
+
+    // Write children
+    let children = tax.children(tax_id)?;
+    if !children.is_empty() {
+        write!(writer, ",\"children\":[")?;
+        for (i, child) in children.into_iter().enumerate() {
+            if i > 0 {
+                write!(writer, ",")?;
+            }
+            write_node_streaming(tax, child, writer)?;
+        }
+        write!(writer, "]")?;
+    }
+
+    write!(writer, "}}")?;
+    Ok(())
+}
+
 fn serialize_as_tree<'t, T: 't>(
     taxonomy: &'t impl Taxonomy<'t, T>,
     root_node: Option<T>,
@@ -720,4 +794,98 @@ mod tests {
         let example = r#"{"id": "1", "rank": null, "name": ""}"#;
         assert!(load(Cursor::new(example), None).is_ok());
     }
+
+    #[test]
+    fn streaming_tree_matches_regular_tree() {
+        // Create a taxonomy with multiple nodes and extra data
+        let example = r#"{
+            "id": "1",
+            "name": "root",
+            "rank": "no rank",
+            "custom_field": "value1",
+            "children": [
+                {
+                    "id": "2",
+                    "name": "Bacteria",
+                    "rank": "superkingdom",
+                    "genetic_code_id": "11",
+                    "children": [
+                        {
+                            "id": "562",
+                            "name": "Escherichia coli",
+                            "rank": "species",
+                            "genetic_code_id": "11",
+                            "embl_code": "EC",
+                            "division_id": "0",
+                            "name_common_name": "E. coli"
+                        }
+                    ]
+                }
+            ]
+        }"#;
+
+        let tax = load(Cursor::new(example), None).unwrap();
+
+        // Save using regular method
+        let mut regular_output = Vec::new();
+        save::<_, &str, _>(&mut regular_output, &tax, JsonFormat::Tree, None).unwrap();
+
+        // Save using streaming method
+        let mut streaming_output = Vec::new();
+        save_tree_streaming::<_, &str, _>(&mut streaming_output, &tax, None).unwrap();
+
+        // Parse both outputs
+        let regular_json: Value = serde_json::from_slice(&regular_output).unwrap();
+        let streaming_json: Value = serde_json::from_slice(&streaming_output).unwrap();
+
+        // They should be equivalent
+        assert_eq!(regular_json, streaming_json);
+
+        // Verify we can reload from streaming output
+        let tax2 = load(Cursor::new(&streaming_output[..]), None).unwrap();
+        assert_eq!(Taxonomy::<&str>::len(&tax2), 3);
+        assert_eq!(Taxonomy::<&str>::name(&tax2, "562").unwrap(), "Escherichia coli");
+
+        // Verify extra data is preserved
+        let data = Taxonomy::<&str>::data(&tax2, "562").unwrap();
+        assert_eq!(data.get("genetic_code_id").and_then(|v| v.as_str()), Some("11"));
+        assert_eq!(data.get("embl_code").and_then(|v| v.as_str()), Some("EC"));
+        assert_eq!(data.get("name_common_name").and_then(|v| v.as_str()), Some("E. coli"));
+    }
+
+    #[test]
+    fn streaming_handles_large_data_fields() {
+        // Test that streaming handles nodes with lots of extra fields (like NCBI data)
+        let example = r#"{
+            "id": "562",
+            "name": "Escherichia coli",
+            "rank": "species",
+            "genetic_code_id": "11",
+            "division_id": "0",
+            "inherited_div_flag": "1",
+            "mitochondrial_genetic_code_id": "0",
+            "inherited_MGC_flag": "1",
+            "GenBank_hidden_flag": "0",
+            "hidden_subtree_root_flag": "0",
+            "comments": "",
+            "name_scientific_name": "Escherichia coli",
+            "name_common_name": "E. coli",
+            "name_synonym": "Bacterium coli",
+            "name_authority": "Escherichia coli (Migula 1895) Castellani and Chalmers 1919"
+        }"#;
+
+        let tax = load(Cursor::new(example), None).unwrap();
+
+        let mut streaming_output = Vec::new();
+        save_tree_streaming::<_, &str, _>(&mut streaming_output, &tax, None).unwrap();
+
+        // Reload and verify all fields preserved
+        let tax2 = load(Cursor::new(&streaming_output[..]), None).unwrap();
+        let data = Taxonomy::<&str>::data(&tax2, "562").unwrap();
+
+        assert_eq!(data.get("genetic_code_id").and_then(|v| v.as_str()), Some("11"));
+        assert_eq!(data.get("name_common_name").and_then(|v| v.as_str()), Some("E. coli"));
+        assert_eq!(data.get("name_synonym").and_then(|v| v.as_str()), Some("Bacterium coli"));
+        assert_eq!(data.get("comments").and_then(|v| v.as_str()), Some(""));
+    }
 }
diff --git a/src/python.rs b/src/python.rs
index 940cd58..5b830cf 100644
--- a/src/python.rs
+++ b/src/python.rs
@@ -329,6 +329,25 @@ impl Taxonomy {
         Ok(PyBytes::new(py, &bytes).into())
     }
 
+    /// to_json_tree_streaming(self)
+    /// --
+    ///
+    /// Export a Taxonomy as a JSON-encoded byte string in a tree format.
+    /// This version uses streaming/incremental writing for much better memory
+    /// efficiency with large taxonomies (e.g., full NCBI taxonomy with 2.5M+ nodes).
+    ///
+    /// Unlike to_json_tree(), which builds the entire tree structure in memory
+    /// before serialization, this writes JSON directly as it traverses the tree.
+    fn to_json_tree_streaming(&self, py: Python<'_>) -> PyResult<PyObject> {
+        let mut bytes = Vec::new();
+        py_try!(json::save_tree_streaming::<_, &str, _>(
+            &mut bytes,
+            &self.tax,
+            None
+        ));
+        Ok(PyBytes::new(py, &bytes).into())
+    }
+
     /// to_json_node_links(self)
     /// --
     ///
diff --git a/taxonomy.pyi b/taxonomy.pyi
index 91eb779..25ee7ce 100644
--- a/taxonomy.pyi
+++ b/taxonomy.pyi
@@ -19,6 +19,38 @@ class TaxonomyNode:
     def __eq__(self, other: object) -> bool: ...
     def __ne__(self, other: object) -> bool: ...
 
+    def get_data(self) -> dict:
+        """Get all extra data fields as a dictionary."""
+        ...
+
+    def get_data_keys(self) -> List[str]:
+        """Get list of all available data field keys."""
+        ...
+
+    def get(self, key: str, default: Any = None) -> Any:
+        """Get a data field value with optional default."""
+        ...
+
+    @property
+    def genetic_code_id(self) -> Optional[str]:
+        """NCBI genetic code ID."""
+        ...
+
+    @property
+    def embl_code(self) -> Optional[str]:
+        """NCBI EMBL code."""
+        ...
+
+    @property
+    def division_id(self) -> Optional[str]:
+        """NCBI division ID."""
+        ...
+
+    @property
+    def mitochondrial_genetic_code_id(self) -> Optional[str]:
+        """NCBI mitochondrial genetic code ID."""
+        ...
+
 class Taxonomy:
     """
     The Taxonomy object provides the primary interface for exploring a
@@ -67,6 +99,19 @@ class Taxonomy:
         """Export a Taxonomy as a JSON-encoded byte string in a tree format"""
         ...
 
+    def to_json_tree_streaming(self) -> bytes:
+        """
+        Export a Taxonomy as a JSON-encoded byte string in a tree format.
+
+        This version uses streaming/incremental writing for much better memory
+        efficiency with large taxonomies (e.g., full NCBI taxonomy with 2.5M+ nodes).
+
+        Unlike to_json_tree(), which builds the entire tree structure in memory
+        before serialization, this writes JSON directly as it traverses the tree.
+        Use this for large NCBI taxonomies to avoid high memory usage.
+        """
+        ...
+
     def to_json_node_links(self) -> bytes:
         """Export a Taxonomy as a JSON-encoded byte string in a node link format"""
         ...
@@ -123,6 +168,32 @@ class Taxonomy:
         """
         ...
 
+    def data(self, tax_id: str) -> dict:
+        """
+        Get all extra data fields for a taxonomy node as a dictionary.
+
+        Args:
+            tax_id: The taxonomy ID to look up
+
+        Returns:
+            Dictionary containing all additional fields from the taxonomy
+        """
+        ...
+
+    def get_field(self, tax_id: str, field: str, default: Any = None) -> Any:
+        """
+        Get a specific data field for a taxonomy node.
+
+        Args:
+            tax_id: The taxonomy ID to look up
+            field: Field name to retrieve
+            default: Default value if field doesn't exist (default: None)
+
+        Returns:
+            Value of the field or default if not found
+        """
+        ...
+
     def internal_index(self, tax_id: str) -> int:
         """Return the internal integer ID generated by the taxonomy library"""
         ...

From 374dcf1893fc76215faf7baaa49d9356ccb47416 Mon Sep 17 00:00:00 2001
From: Robert Baldwin <rbaldwin@bugseq.com>
Date: Fri, 20 Feb 2026 23:08:51 +0000
Subject: [PATCH 5/8] comments

---
 src/python.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/python.rs b/src/python.rs
index 5b830cf..2c5fc05 100644
--- a/src/python.rs
+++ b/src/python.rs
@@ -318,6 +318,9 @@ impl Taxonomy {
     /// --
     ///
     /// Export a Taxonomy as a JSON-encoded byte string in a tree format
+    /// For large trees, prefer to_json_tree_streaming instead: now that nodes
+    /// carry additional data, this function may run out of memory (OOM).
+    /// TODO: consider limiting this function to the few fields previously available.
     fn to_json_tree(&self, py: Python<'_>) -> PyResult<PyObject> {
         let mut bytes = Vec::new();
         py_try!(json::save::<_, &str, _>(

From b143758ee6897e5b5a854557d024782f293a47b0 Mon Sep 17 00:00:00 2001
From: Robert Baldwin <rbaldwin@bugseq.com>
Date: Fri, 20 Feb 2026 23:24:37 +0000
Subject: [PATCH 6/8] resolved newick format

---
 src/python.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/python.rs b/src/python.rs
index 2c5fc05..e2b5daa 100644
--- a/src/python.rs
+++ b/src/python.rs
@@ -381,7 +381,7 @@ impl Taxonomy {
 
     /// to_newick(self)
     /// --
-    ///
+    /// Uses taxonomy ID for node names (e.g., for E. coli the node name is 562).
     /// Export a Taxonomy as a Newick-encoded byte string.
     fn to_newick(&self, py: Python<'_>) -> PyResult<PyObject> {
         let mut bytes = Vec::new();

From 03b308e7ace1d7080ad7b7e6167c0e2619521f07 Mon Sep 17 00:00:00 2001
From: Robert Baldwin <rbaldwin@bugseq.com>
Date: Fri, 20 Feb 2026 23:31:05 +0000
Subject: [PATCH 7/8] a

---
 src/formats/json.rs | 30 ++++++++++++----
 src/formats/ncbi.rs | 86 ++++++++++++++++++++++++++++++++++++---------
 src/python.rs       | 12 ++++---
 3 files changed, 102 insertions(+), 26 deletions(-)

diff --git a/src/formats/json.rs b/src/formats/json.rs
index 2b4a9bd..5839b1e 100644
--- a/src/formats/json.rs
+++ b/src/formats/json.rs
@@ -844,13 +844,22 @@ mod tests {
         // Verify we can reload from streaming output
         let tax2 = load(Cursor::new(&streaming_output[..]), None).unwrap();
         assert_eq!(Taxonomy::<&str>::len(&tax2), 3);
-        assert_eq!(Taxonomy::<&str>::name(&tax2, "562").unwrap(), "Escherichia coli");
+        assert_eq!(
+            Taxonomy::<&str>::name(&tax2, "562").unwrap(),
+            "Escherichia coli"
+        );
 
         // Verify extra data is preserved
         let data = Taxonomy::<&str>::data(&tax2, "562").unwrap();
-        assert_eq!(data.get("genetic_code_id").and_then(|v| v.as_str()), Some("11"));
+        assert_eq!(
+            data.get("genetic_code_id").and_then(|v| v.as_str()),
+            Some("11")
+        );
         assert_eq!(data.get("embl_code").and_then(|v| v.as_str()), Some("EC"));
-        assert_eq!(data.get("name_common_name").and_then(|v| v.as_str()), Some("E. coli"));
+        assert_eq!(
+            data.get("name_common_name").and_then(|v| v.as_str()),
+            Some("E. coli")
+        );
     }
 
     #[test]
@@ -883,9 +892,18 @@ mod tests {
         let tax2 = load(Cursor::new(&streaming_output[..]), None).unwrap();
         let data = Taxonomy::<&str>::data(&tax2, "562").unwrap();
 
-        assert_eq!(data.get("genetic_code_id").and_then(|v| v.as_str()), Some("11"));
-        assert_eq!(data.get("name_common_name").and_then(|v| v.as_str()), Some("E. coli"));
-        assert_eq!(data.get("name_synonym").and_then(|v| v.as_str()), Some("Bacterium coli"));
+        assert_eq!(
+            data.get("genetic_code_id").and_then(|v| v.as_str()),
+            Some("11")
+        );
+        assert_eq!(
+            data.get("name_common_name").and_then(|v| v.as_str()),
+            Some("E. coli")
+        );
+        assert_eq!(
+            data.get("name_synonym").and_then(|v| v.as_str()),
+            Some("Bacterium coli")
+        );
         assert_eq!(data.get("comments").and_then(|v| v.as_str()), Some(""));
     }
 }
diff --git a/src/formats/ncbi.rs b/src/formats/ncbi.rs
index d5a3b5f..0011b0e 100644
--- a/src/formats/ncbi.rs
+++ b/src/formats/ncbi.rs
@@ -47,7 +47,12 @@ pub fn load<P: AsRef<Path>>(ncbi_directory: P) -> TaxonomyResult<GeneralTaxonomy
         let GenBank_hidden_flag = fields.remove(0).trim().to_string();
         let hidden_subtree_root_flag = fields.remove(0).trim().to_string();
         let comments = if !fields.is_empty() {
-            fields.remove(0).trim().trim_end_matches('|').trim().to_string()
+            fields
+                .remove(0)
+                .trim()
+                .trim_end_matches('|')
+                .trim()
+                .to_string()
         } else {
             String::new()
         };
@@ -59,31 +64,58 @@ pub fn load<P: AsRef<Path>>(ncbi_directory: P) -> TaxonomyResult<GeneralTaxonomy
         // Store all node fields in data HashMap
         let mut node_data = HashMap::new();
         if !embl_code.is_empty() {
-            node_data.insert("embl_code".to_string(), serde_json::Value::String(embl_code));
+            node_data.insert(
+                "embl_code".to_string(),
+                serde_json::Value::String(embl_code),
+            );
         }
         if !division_id.is_empty() {
-            node_data.insert("division_id".to_string(), serde_json::Value::String(division_id));
+            node_data.insert(
+                "division_id".to_string(),
+                serde_json::Value::String(division_id),
+            );
         }
         if !inherited_div_flag.is_empty() {
-            node_data.insert("inherited_div_flag".to_string(), serde_json::Value::String(inherited_div_flag));
+            node_data.insert(
+                "inherited_div_flag".to_string(),
+                serde_json::Value::String(inherited_div_flag),
+            );
         }
         if !genetic_code_id.is_empty() {
-            node_data.insert("genetic_code_id".to_string(), serde_json::Value::String(genetic_code_id));
+            node_data.insert(
+                "genetic_code_id".to_string(),
+                serde_json::Value::String(genetic_code_id),
+            );
         }
         if !inherited_GC_flag.is_empty() {
-            node_data.insert("inherited_GC_flag".to_string(), serde_json::Value::String(inherited_GC_flag));
+            node_data.insert(
+                "inherited_GC_flag".to_string(),
+                serde_json::Value::String(inherited_GC_flag),
+            );
         }
         if !mitochondrial_genetic_code_id.is_empty() {
-            node_data.insert("mitochondrial_genetic_code_id".to_string(), serde_json::Value::String(mitochondrial_genetic_code_id));
+            node_data.insert(
+                "mitochondrial_genetic_code_id".to_string(),
+                serde_json::Value::String(mitochondrial_genetic_code_id),
+            );
         }
         if !inherited_MGC_flag.is_empty() {
-            node_data.insert("inherited_MGC_flag".to_string(), serde_json::Value::String(inherited_MGC_flag));
+            node_data.insert(
+                "inherited_MGC_flag".to_string(),
+                serde_json::Value::String(inherited_MGC_flag),
+            );
         }
         if !GenBank_hidden_flag.is_empty() {
-            node_data.insert("GenBank_hidden_flag".to_string(), serde_json::Value::String(GenBank_hidden_flag));
+            node_data.insert(
+                "GenBank_hidden_flag".to_string(),
+                serde_json::Value::String(GenBank_hidden_flag),
+            );
         }
         if !hidden_subtree_root_flag.is_empty() {
-            node_data.insert("hidden_subtree_root_flag".to_string(), serde_json::Value::String(hidden_subtree_root_flag));
+            node_data.insert(
+                "hidden_subtree_root_flag".to_string(),
+                serde_json::Value::String(hidden_subtree_root_flag),
+            );
         }
         if !comments.is_empty() {
             node_data.insert("comments".to_string(), serde_json::Value::String(comments));
@@ -119,7 +151,12 @@ pub fn load<P: AsRef<Path>>(ncbi_directory: P) -> TaxonomyResult<GeneralTaxonomy
         let tax_id = fields.remove(0).trim().to_string();
         let name_txt = fields.remove(0).trim().to_string();
         let unique_name = fields.remove(0).trim().to_string();
-        let name_class = fields.remove(0).trim().trim_end_matches('|').trim().to_string();
+        let name_class = fields
+            .remove(0)
+            .trim()
+            .trim_end_matches('|')
+            .trim()
+            .to_string();
 
         if let Some(&idx) = tax_to_idx.get(&tax_id) {
             if name_class == "scientific name" {
@@ -258,19 +295,32 @@ where
             .unwrap_or("");
 
         // Write scientific name
-        write_names_row(&mut name_writer, &format!("{}", &key), &name, "", "scientific name")?;
+        write_names_row(
+            &mut name_writer,
+            &format!("{}", &key),
+            &name,
+            "",
+            "scientific name",
+        )?;
 
         // Write all alternative names from data
         for (data_key, value) in node_data.iter() {
             if data_key.starts_with("name_") && data_key != "name_scientific_name" {
                 let name_class = data_key.strip_prefix("name_").unwrap().replace("_", " ");
                 let name_txt = value.as_str().unwrap_or("");
-                let unique_name_key = format!("unique_name_{}", data_key.strip_prefix("name_").unwrap());
+                let unique_name_key =
+                    format!("unique_name_{}", data_key.strip_prefix("name_").unwrap());
                 let unique_name = node_data
                     .get(&unique_name_key)
                     .and_then(|v| v.as_str())
                     .unwrap_or("");
-                write_names_row(&mut name_writer, &format!("{}", &key), name_txt, unique_name, &name_class)?;
+                write_names_row(
+                    &mut name_writer,
+                    &format!("{}", &key),
+                    name_txt,
+                    unique_name,
+                    &name_class,
+                )?;
             }
         }
 
@@ -450,14 +500,18 @@ mod tests {
         // Check that data fields are preserved through save/load cycle
         let data_562_after = Taxonomy::<&str>::data(&tax2, "562").unwrap();
         assert_eq!(
-            data_562_after.get("genetic_code_id").and_then(|v| v.as_str()),
+            data_562_after
+                .get("genetic_code_id")
+                .and_then(|v| v.as_str()),
             Some("11")
         );
 
         // Check that alternative names are preserved
         assert!(data_562_after.contains_key("name_common_name"));
         assert_eq!(
-            data_562_after.get("name_common_name").and_then(|v| v.as_str()),
+            data_562_after
+                .get("name_common_name")
+                .and_then(|v| v.as_str()),
             Some("E. coli")
         );
     }
diff --git a/src/python.rs b/src/python.rs
index e2b5daa..27e5b42 100644
--- a/src/python.rs
+++ b/src/python.rs
@@ -344,9 +344,7 @@ impl Taxonomy {
     fn to_json_tree_streaming(&self, py: Python<'_>) -> PyResult<PyObject> {
         let mut bytes = Vec::new();
         py_try!(json::save_tree_streaming::<_, &str, _>(
-            &mut bytes,
-            &self.tax,
-            None
+            &mut bytes, &self.tax, None
         ));
         Ok(PyBytes::new(py, &bytes).into())
     }
@@ -527,7 +525,13 @@ impl Taxonomy {
     ///
     /// Returns:
     ///     Value of the field or default if not found
-    fn get_field(&self, tax_id: &str, field: &str, default: Option<&PyAny>, py: Python<'_>) -> PyResult<PyObject> {
+    fn get_field(
+        &self,
+        tax_id: &str,
+        field: &str,
+        default: Option<&PyAny>,
+        py: Python<'_>,
+    ) -> PyResult<PyObject> {
         let data = py_try!(self.tax.data(tax_id));
         if let Some(value) = data.get(field) {
             Ok(json_value_to_pyobject(value))

From 4bd9740f4f88e49f95a103b8157afad1d5504a82 Mon Sep 17 00:00:00 2001
From: Robert Baldwin <rbaldwin@bugseq.com>
Date: Sat, 21 Feb 2026 21:20:05 +0000
Subject: [PATCH 8/8] fixed json test

---
 src/formats/json.rs | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/formats/json.rs b/src/formats/json.rs
index 5839b1e..8e2dd68 100644
--- a/src/formats/json.rs
+++ b/src/formats/json.rs
@@ -426,18 +426,16 @@ where
         to_writer(&mut *writer, value)?;
     }
 
-    // Write children
+    // Write children (always include, even if empty, to match regular format)
     let children = tax.children(tax_id)?;
-    if !children.is_empty() {
-        write!(writer, ",\"children\":[")?;
-        for (i, child) in children.into_iter().enumerate() {
-            if i > 0 {
-                write!(writer, ",")?;
-            }
-            write_node_streaming(tax, child, writer)?;
+    write!(writer, ",\"children\":[")?;
+    for (i, child) in children.into_iter().enumerate() {
+        if i > 0 {
+            write!(writer, ",")?;
         }
-        write!(writer, "]")?;
+        write_node_streaming(tax, child, writer)?;
     }
+    write!(writer, "]")?;
 
     write!(writer, "}}")?;
     Ok(())