From 1784c51747737ee79033368ac22b510d88479a1f Mon Sep 17 00:00:00 2001 From: dts Date: Wed, 29 Apr 2026 12:10:49 +0200 Subject: [PATCH 1/2] fix: harden storage codec against silent truncation and OOB writes Three independent safety fixes in the storage path; all are defensive checks at boundaries that previously cast unconditionally. 1. Section encoder (atompack/src/storage/soa.rs) - write_section() now returns Result and rejects keys > 255 bytes (key_len is a u8 field) and payloads > u32::MAX (payload_len is u32). Previously these would silently truncate and produce an unreadable record. - n_sections is accumulated as usize and checked against u16::MAX before writing the on-disk u16. Previously the cast wrapped silently on pathological inputs. 2. Record-size index fields (atompack/src/storage/mod.rs) - The four `compressed.len() as u32` / `uncompressed_size as u32` casts in append_soa_records and append_owned_soa_records now go through a small helper that errors if the size exceeds u32::MAX instead of silently corrupting the index for >4 GiB single records. 3. Per-molecule slot length check (atompack-py/src/database_flat.rs) - The flat-batch hot loop validates the slot_bytes schema against the first molecule, but the per-molecule slot memcpy did not re-validate subsequent molecules' payload lengths. A malformed later molecule could OOB-write into an adjacent buffer slot from a parallel rayon thread. Added a length assertion before the memcpy. Tests: - New test_overlong_property_key_rejected_at_write covers fix #1 end-to-end. - All existing tests (123) pass unchanged, confirming no behavior regression on the happy path. --- atompack-py/src/database_flat.rs | 13 ++++++ atompack-py/tests/test_database.py | 12 ++++++ atompack/src/storage/mod.rs | 21 ++++++++-- atompack/src/storage/soa.rs | 65 ++++++++++++++++++++++-------- 4 files changed, 90 insertions(+), 21 deletions(-) diff --git a/atompack-py/src/database_flat.rs b/atompack-py/src/database_flat.rs index c02b187..6dcd835 100644 --- a/atompack-py/src/database_flat.rs +++ b/atompack-py/src/database_flat.rs @@ -199,6 +199,19 @@ pub(super) fn get_molecules_flat_soa_impl<'py>( expected ))); } + } else if schema_entry.slot_bytes != 0 + && sec.payload.len() != schema_entry.slot_bytes + { + // Per-molecule slot: schema is set from molecule 0; if a later + // molecule's payload disagrees, the memcpy below would OOB-write + // into an adjacent slot from a parallel rayon thread. + return Err(invalid_data(format!( + "Per-molecule section '{}' has invalid payload length {} for molecule {} (schema slot is {})", + sec.key, + sec.payload.len(), + i, + schema_entry.slot_bytes + ))); } if schema_entry.slot_bytes == 0 { diff --git a/atompack-py/tests/test_database.py b/atompack-py/tests/test_database.py index 7fbb075..8c5a453 100644 --- a/atompack-py/tests/test_database.py +++ b/atompack-py/tests/test_database.py @@ -577,3 +577,15 @@ def test_get_molecules_flat_empty(tmp_path: Path) -> None: assert batch["n_atoms"].shape == (0,) assert batch["positions"].shape == (0, 3) assert batch["atomic_numbers"].shape == (0,) + + +def test_overlong_property_key_rejected_at_write(tmp_path: Path) -> None: + # Section keys are stored with a u8 length field; >255-byte keys would + # silently truncate before this fix, producing an unreadable record. + # The encoder now errors clearly instead. + mol = _make_molecule(-1.0) + mol.set_property("x" * 256, 1.0) + + db = atompack.Database(str(tmp_path / "overlong.atp")) + with pytest.raises(ValueError, match=r"max is 255|too long|255 bytes"): + db.add_molecule(mol) diff --git a/atompack/src/storage/mod.rs b/atompack/src/storage/mod.rs index fe9e1be..811abad 100644 --- a/atompack/src/storage/mod.rs +++ b/atompack/src/storage/mod.rs @@ -97,6 +97,19 @@ impl SharedMmapBytes { } } +/// Convert a record byte length to the on-disk `u32` index field, erroring +/// instead of silently truncating when the record exceeds 4 GiB. +fn record_size_u32(len: usize, what: &str) -> Result { + len.try_into().map_err(|_| { + Error::InvalidData(format!( + "{} size {} bytes exceeds u32::MAX ({}); index field cannot represent it", + what, + len, + u32::MAX + )) + }) +} + pub struct AtomDatabase { path: PathBuf, compression: CompressionType, @@ -344,7 +357,7 @@ impl AtomDatabase { let compressed_records: Vec<(Vec, u32, u32)> = records .par_iter() .map(|(bytes, num_atoms)| { - let uncompressed_size = bytes.len() as u32; + let uncompressed_size = record_size_u32(bytes.len(), "uncompressed record")?; let compressed = compress(bytes, compression)?; Ok((compressed, uncompressed_size, *num_atoms)) }) @@ -361,7 +374,7 @@ impl AtomDatabase { new_indices.push(MoleculeIndex { offset, - compressed_size: compressed_data.len() as u32, + compressed_size: record_size_u32(compressed_data.len(), "compressed record")?, uncompressed_size, num_atoms, }); @@ -390,7 +403,7 @@ impl AtomDatabase { let compressed_records: Vec<(Vec, u32, u32)> = records .into_par_iter() .map(|(bytes, num_atoms)| { - let uncompressed_size = bytes.len() as u32; + let uncompressed_size = record_size_u32(bytes.len(), "uncompressed record")?; let compressed = compress(&bytes, compression)?; Ok((compressed, uncompressed_size, num_atoms)) }) @@ -406,7 +419,7 @@ impl AtomDatabase { new_indices.push(MoleculeIndex { offset, - compressed_size: compressed_data.len() as u32, + compressed_size: record_size_u32(compressed_data.len(), "compressed record")?, uncompressed_size, num_atoms, }); diff --git a/atompack/src/storage/soa.rs b/atompack/src/storage/soa.rs index 6741929..8433a9f 100644 --- a/atompack/src/storage/soa.rs +++ b/atompack/src/storage/soa.rs @@ -1,13 +1,37 @@ use super::*; /// Write a single tagged section: [kind:u8][key_len:u8][key][type_tag:u8][payload_len:u32][payload] -fn write_section(buf: &mut Vec, kind: u8, key: &str, type_tag: u8, payload: &[u8]) { +/// +/// Errors instead of silently truncating when the key exceeds the 255-byte +/// `key_len: u8` field or the payload exceeds the `u32` length field. +fn write_section( + buf: &mut Vec, + kind: u8, + key: &str, + type_tag: u8, + payload: &[u8], +) -> Result<()> { + let key_len: u8 = key.len().try_into().map_err(|_| { + Error::InvalidData(format!( + "Section key '{}...' is {} bytes; max is 255", + &key[..32.min(key.len())], + key.len() + )) + })?; + let payload_len: u32 = payload.len().try_into().map_err(|_| { + Error::InvalidData(format!( + "Section '{}' payload is {} bytes; max is u32::MAX", + key, + payload.len() + )) + })?; buf.push(kind); - buf.push(key.len() as u8); + buf.push(key_len); buf.extend_from_slice(key.as_bytes()); buf.push(type_tag); - buf.extend_from_slice(&(payload.len() as u32).to_le_bytes()); + buf.extend_from_slice(&payload_len.to_le_bytes()); buf.extend_from_slice(payload); + Ok(()) } fn property_value_type_tag(value: &PropertyValue) -> u8 { @@ -236,7 +260,7 @@ pub(super) fn serialize_molecule_soa(molecule: &Molecule) -> Result> { } buf.extend_from_slice(&molecule.atomic_numbers); - let mut n_sections: u16 = 0; + let mut n_sections: usize = 0; if molecule.charges.is_some() { n_sections += 1; } @@ -261,21 +285,28 @@ pub(super) fn serialize_molecule_soa(molecule: &Molecule) -> Result> { if molecule.velocities.is_some() { n_sections += 1; } - n_sections += molecule.atom_properties.len() as u16; - n_sections += molecule.properties.len() as u16; - buf.extend_from_slice(&n_sections.to_le_bytes()); + n_sections += molecule.atom_properties.len(); + n_sections += molecule.properties.len(); + let n_sections_u16: u16 = n_sections.try_into().map_err(|_| { + Error::InvalidData(format!( + "Molecule has {} sections; on-disk format limit is {}", + n_sections, + u16::MAX + )) + })?; + buf.extend_from_slice(&n_sections_u16.to_le_bytes()); if let Some(ref charges) = molecule.charges { let mut payload = Vec::with_capacity(charges.len() * 8); extend_f64(&mut payload, charges); - write_section(&mut buf, KIND_BUILTIN, "charges", TYPE_F64_ARRAY, &payload); + write_section(&mut buf, KIND_BUILTIN, "charges", TYPE_F64_ARRAY, &payload)?; } if let Some(ref cell) = molecule.cell { let mut payload = Vec::with_capacity(72); for row in cell { extend_f64(&mut payload, row); } - write_section(&mut buf, KIND_BUILTIN, "cell", TYPE_MAT3X3_F64, &payload); + write_section(&mut buf, KIND_BUILTIN, "cell", TYPE_MAT3X3_F64, &payload)?; } if let Some(energy) = molecule.energy { write_section( @@ -284,28 +315,28 @@ pub(super) fn serialize_molecule_soa(molecule: &Molecule) -> Result> { "energy", TYPE_FLOAT, &energy.to_le_bytes(), - ); + )?; } if let Some(ref forces) = molecule.forces { let mut payload = Vec::with_capacity(forces.len() * 12); for f in forces { extend_f32(&mut payload, f); } - write_section(&mut buf, KIND_BUILTIN, "forces", TYPE_VEC3_F32, &payload); + write_section(&mut buf, KIND_BUILTIN, "forces", TYPE_VEC3_F32, &payload)?; } if let Some(ref name) = molecule.name { - write_section(&mut buf, KIND_BUILTIN, "name", TYPE_STRING, name.as_bytes()); + write_section(&mut buf, KIND_BUILTIN, "name", TYPE_STRING, name.as_bytes())?; } if let Some(ref pbc) = molecule.pbc { let payload = [pbc[0] as u8, pbc[1] as u8, pbc[2] as u8]; - write_section(&mut buf, KIND_BUILTIN, "pbc", TYPE_BOOL3, &payload); + write_section(&mut buf, KIND_BUILTIN, "pbc", TYPE_BOOL3, &payload)?; } if let Some(ref stress) = molecule.stress { let mut payload = Vec::with_capacity(72); for row in stress { extend_f64(&mut payload, row); } - write_section(&mut buf, KIND_BUILTIN, "stress", TYPE_MAT3X3_F64, &payload); + write_section(&mut buf, KIND_BUILTIN, "stress", TYPE_MAT3X3_F64, &payload)?; } if let Some(ref velocities) = molecule.velocities { let mut payload = Vec::with_capacity(velocities.len() * 12); @@ -318,7 +349,7 @@ pub(super) fn serialize_molecule_soa(molecule: &Molecule) -> Result> { "velocities", TYPE_VEC3_F32, &payload, - ); + )?; } let mut atom_keys: Vec<&String> = molecule.atom_properties.keys().collect(); @@ -332,7 +363,7 @@ pub(super) fn serialize_molecule_soa(molecule: &Molecule) -> Result> { key, property_value_type_tag(value), &payload, - ); + )?; } let mut prop_keys: Vec<&String> = molecule.properties.keys().collect(); @@ -346,7 +377,7 @@ pub(super) fn serialize_molecule_soa(molecule: &Molecule) -> Result> { key, property_value_type_tag(value), &payload, - ); + )?; } Ok(buf) From 02b7f370ebc20a8c3ec999bc2ac58f7cfe51251f Mon Sep 17 00:00:00 2001 From: dts Date: Wed, 29 Apr 2026 14:46:12 +0200 Subject: [PATCH 2/2] test: cover fix #3 + add type_tag schema check Independent review caught two follow-ups for the storage-safety PR: - get_molecules_flat now also validates that subsequent molecules' section type_tag matches the schema (taken from molecule 0). Previously a same-key-same-size-different-dtype mismatch would have silently reinterpreted bytes downstream as a different dtype. - Two new tests cover fix #3 (per-mol slot length mismatch, the unsafe memcpy hazard) and the new type-tag check. Built without a multi-GiB corpus by using two molecules with same-key but different per-molecule property dtypes/lengths. --- atompack-py/src/database_flat.rs | 8 ++++++ atompack-py/tests/test_database.py | 43 ++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/atompack-py/src/database_flat.rs b/atompack-py/src/database_flat.rs index 6dcd835..3395120 100644 --- a/atompack-py/src/database_flat.rs +++ b/atompack-py/src/database_flat.rs @@ -187,6 +187,14 @@ pub(super) fn get_molecules_flat_soa_impl<'py>( i, sec.key ))); } + if sec.type_tag != schema_entry.type_tag { + // Same key + kind but different type tag would silently + // reinterpret the bytes downstream as a different dtype. + return Err(invalid_data(format!( + "Section '{}' type tag mismatch at molecule {}: schema is {}, got {}", + sec.key, i, schema_entry.type_tag, sec.type_tag + ))); + } if schema_entry.per_atom { let expected = n.checked_mul(schema_entry.elem_bytes).ok_or_else(|| { invalid_data(format!("Section '{}' payload length overflow", sec.key)) diff --git a/atompack-py/tests/test_database.py b/atompack-py/tests/test_database.py index 8c5a453..34067ea 100644 --- a/atompack-py/tests/test_database.py +++ b/atompack-py/tests/test_database.py @@ -589,3 +589,46 @@ def test_overlong_property_key_rejected_at_write(tmp_path: Path) -> None: db = atompack.Database(str(tmp_path / "overlong.atp")) with pytest.raises(ValueError, match=r"max is 255|too long|255 bytes"): db.add_molecule(mol) + + +def test_get_molecules_flat_rejects_per_mol_slot_mismatch(tmp_path: Path) -> None: + # get_molecules_flat derives slot_bytes from the first molecule's section + # payload. A later molecule with the same key but different payload length + # would have OOB-written into an adjacent buffer slot from a parallel + # rayon thread before this fix; the new length check catches it cleanly. + mol1 = _make_molecule(-1.0) + mol1.set_property("vec", np.array([1.0, 2.0], dtype=np.float64)) # 16 bytes + + mol2 = _make_molecule(-2.0) + mol2.set_property("vec", np.array([1.0, 2.0, 3.0], dtype=np.float64)) # 24 bytes + + path = tmp_path / "uneven.atp" + db = atompack.Database(str(path)) + db.add_molecule(mol1) + db.add_molecule(mol2) + db.flush() + + db_r = atompack.Database.open(str(path)) + with pytest.raises(ValueError, match=r"invalid payload length|schema slot"): + db_r.get_molecules_flat([0, 1]) + + +def test_get_molecules_flat_rejects_per_mol_type_tag_mismatch(tmp_path: Path) -> None: + # If the same key shows up with a different dtype across molecules, the + # bytes would have been reinterpreted as the schema's dtype downstream. + # The new type-tag check catches this before the memcpy. + mol1 = _make_molecule(-1.0) + mol1.set_property("scalar", 1.5) # TYPE_FLOAT (f64) + + mol2 = _make_molecule(-2.0) + mol2.set_property("scalar", 7) # TYPE_INT (i64), same 8-byte size + + path = tmp_path / "tagmix.atp" + db = atompack.Database(str(path)) + db.add_molecule(mol1) + db.add_molecule(mol2) + db.flush() + + db_r = atompack.Database.open(str(path)) + with pytest.raises(ValueError, match=r"type tag mismatch"): + db_r.get_molecules_flat([0, 1])