From 9905394856660ba1832eeb499117b76e93598586 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Buschk=C3=A4mper?= Date: Wed, 15 Apr 2026 12:41:31 +0000 Subject: [PATCH 1/3] Removed unneeded sqlite index. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Björn Buschkämper --- .../energon/flavors/webdataset/indexing.py | 5 +- tests/test_webdataset_indexing.py | 86 +++++++++++++++++++ 2 files changed, 87 insertions(+), 4 deletions(-) create mode 100644 tests/test_webdataset_indexing.py diff --git a/src/megatron/energon/flavors/webdataset/indexing.py b/src/megatron/energon/flavors/webdataset/indexing.py index 6317a2be..9dca7fc3 100644 --- a/src/megatron/energon/flavors/webdataset/indexing.py +++ b/src/megatron/energon/flavors/webdataset/indexing.py @@ -219,10 +219,7 @@ def close(self): if self.enable_sample_tables: # Create the index after adding all the samples for better speed - # Index on sample_key for fast lookups - self.db.execute( - "CREATE UNIQUE INDEX IF NOT EXISTS idx_samples_sample_key ON samples(sample_key)" - ) + # sample_key uniqueness already creates an implicit SQLite index via the table schema. # Create index on the samples table. Help the planner if it chooses `samples` as the probe side of the join self.db.execute( diff --git a/tests/test_webdataset_indexing.py b/tests/test_webdataset_indexing.py new file mode 100644 index 00000000..db6bca98 --- /dev/null +++ b/tests/test_webdataset_indexing.py @@ -0,0 +1,86 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: BSD-3-Clause + +import sqlite3 +import tempfile +import unittest +from pathlib import Path + +from megatron.energon.epathlib import EPath +from megatron.energon.flavors.webdataset.indexing import ( + DuplicateSampleKeyError, + SqliteIndexReader, + SqliteIndexWriter, +) + + +class TestSqliteIndexWriter(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.sqlite_path = Path(self.temp_dir.name) / "index.sqlite" + + def tearDown(self): + self.temp_dir.cleanup() + + def test_close_does_not_create_duplicate_sample_key_index(self): + writer = SqliteIndexWriter(EPath(self.sqlite_path)) + writer.append_sample( + tar_file_id=0, + sample_key="sample-0", + sample_index=0, + byte_offset=0, + byte_size=16, + ) + writer.close() + + with sqlite3.connect(self.sqlite_path) as db: + sample_indexes = { + row[1]: row + for row in db.execute("PRAGMA index_list(samples)") + } + + self.assertIn("sqlite_autoindex_samples_1", sample_indexes) + self.assertIn("idx_samples_by_tar_and_idx", sample_indexes) + self.assertNotIn("idx_samples_sample_key", sample_indexes) + + def test_sample_key_lookup_still_works_with_implicit_unique_index(self): + writer = SqliteIndexWriter(EPath(self.sqlite_path)) + writer.append_sample( + tar_file_id=3, + sample_key="sample-lookup", + sample_index=7, + byte_offset=128, + byte_size=64, + ) + writer.close() + + reader = SqliteIndexReader(EPath(self.sqlite_path)) + try: + pointer = reader.get_sample_pointer_by_key("sample-lookup") + finally: + reader.close() + + self.assertEqual(pointer.tar_file_id, 3) + self.assertEqual(pointer.byte_offset, 128) + self.assertEqual(pointer.byte_size, 64) + + def test_duplicate_sample_key_still_raises(self): + writer = SqliteIndexWriter(EPath(self.sqlite_path)) + writer.append_sample( + tar_file_id=0, + sample_key="sample-dup", + sample_index=0, + byte_offset=0, + byte_size=16, + ) + + with self.assertRaises(DuplicateSampleKeyError): + writer.append_sample( + tar_file_id=0, + sample_key="sample-dup", + sample_index=1, + byte_offset=16, + byte_size=16, + ) + + writer.close() From c8bb7ac67ef7256cd56106974ad126fc60d2a31b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Buschk=C3=A4mper?= Date: Wed, 15 Apr 2026 15:47:19 +0200 Subject: [PATCH 2/3] Improved unit test by removing hardcoded auto-generated index name. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Björn Buschkämper --- tests/test_webdataset_indexing.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tests/test_webdataset_indexing.py b/tests/test_webdataset_indexing.py index db6bca98..b45d90dc 100644 --- a/tests/test_webdataset_indexing.py +++ b/tests/test_webdataset_indexing.py @@ -38,10 +38,26 @@ def test_close_does_not_create_duplicate_sample_key_index(self): row[1]: row for row in db.execute("PRAGMA index_list(samples)") } + unique_sample_key_indexes = [] + for index_name, row in sample_indexes.items(): + is_unique = bool(row[2]) + if not is_unique: + continue + index_columns = [ + index_info_row[2] + for index_info_row in db.execute( + f"PRAGMA index_info('{index_name}')" + ) + ] + if "sample_key" in index_columns: + unique_sample_key_indexes.append(index_name) - self.assertIn("sqlite_autoindex_samples_1", sample_indexes) self.assertIn("idx_samples_by_tar_and_idx", sample_indexes) self.assertNotIn("idx_samples_sample_key", sample_indexes) + self.assertTrue( + unique_sample_key_indexes, + "Expected a UNIQUE index on samples that includes sample_key", + ) def test_sample_key_lookup_still_works_with_implicit_unique_index(self): writer = SqliteIndexWriter(EPath(self.sqlite_path)) From 92fc14a2b5c9251a524c7292b69f83b4134fcbba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Buschk=C3=A4mper?= Date: Mon, 4 May 2026 13:15:44 +0000 Subject: [PATCH 3/3] Remove unneeded unit test for webdataset indexing. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Björn Buschkämper --- tests/test_webdataset_indexing.py | 102 ------------------------------ 1 file changed, 102 deletions(-) delete mode 100644 tests/test_webdataset_indexing.py diff --git a/tests/test_webdataset_indexing.py b/tests/test_webdataset_indexing.py deleted file mode 100644 index b45d90dc..00000000 --- a/tests/test_webdataset_indexing.py +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. -# SPDX-License-Identifier: BSD-3-Clause - -import sqlite3 -import tempfile -import unittest -from pathlib import Path - -from megatron.energon.epathlib import EPath -from megatron.energon.flavors.webdataset.indexing import ( - DuplicateSampleKeyError, - SqliteIndexReader, - SqliteIndexWriter, -) - - -class TestSqliteIndexWriter(unittest.TestCase): - def setUp(self): - self.temp_dir = tempfile.TemporaryDirectory() - self.sqlite_path = Path(self.temp_dir.name) / "index.sqlite" - - def tearDown(self): - self.temp_dir.cleanup() - - def test_close_does_not_create_duplicate_sample_key_index(self): - writer = SqliteIndexWriter(EPath(self.sqlite_path)) - writer.append_sample( - tar_file_id=0, - sample_key="sample-0", - sample_index=0, - byte_offset=0, - byte_size=16, - ) - writer.close() - - with sqlite3.connect(self.sqlite_path) as db: - sample_indexes = { - row[1]: row - for row in db.execute("PRAGMA index_list(samples)") - } - unique_sample_key_indexes = [] - for index_name, row in sample_indexes.items(): - is_unique = bool(row[2]) - if not is_unique: - continue - index_columns = [ - index_info_row[2] - for index_info_row in db.execute( - f"PRAGMA index_info('{index_name}')" - ) - ] - if "sample_key" in index_columns: - unique_sample_key_indexes.append(index_name) - - self.assertIn("idx_samples_by_tar_and_idx", sample_indexes) - self.assertNotIn("idx_samples_sample_key", sample_indexes) - self.assertTrue( - unique_sample_key_indexes, - "Expected a UNIQUE index on samples that includes sample_key", - ) - - def test_sample_key_lookup_still_works_with_implicit_unique_index(self): - writer = SqliteIndexWriter(EPath(self.sqlite_path)) - writer.append_sample( - tar_file_id=3, - sample_key="sample-lookup", - sample_index=7, - byte_offset=128, - byte_size=64, - ) - writer.close() - - reader = SqliteIndexReader(EPath(self.sqlite_path)) - try: - pointer = reader.get_sample_pointer_by_key("sample-lookup") - finally: - reader.close() - - self.assertEqual(pointer.tar_file_id, 3) - self.assertEqual(pointer.byte_offset, 128) - self.assertEqual(pointer.byte_size, 64) - - def test_duplicate_sample_key_still_raises(self): - writer = SqliteIndexWriter(EPath(self.sqlite_path)) - writer.append_sample( - tar_file_id=0, - sample_key="sample-dup", - sample_index=0, - byte_offset=0, - byte_size=16, - ) - - with self.assertRaises(DuplicateSampleKeyError): - writer.append_sample( - tar_file_id=0, - sample_key="sample-dup", - sample_index=1, - byte_offset=16, - byte_size=16, - ) - - writer.close()