From fa41a842f946aeed0f33ab3bd89ceb61e9a45977 Mon Sep 17 00:00:00 2001 From: lbathen Date: Sat, 2 May 2026 09:26:21 -0700 Subject: [PATCH] Fix shard symlink name collisions across datasets in split dirs --- src/nemotron/data_prep/utils/splits.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/nemotron/data_prep/utils/splits.py b/src/nemotron/data_prep/utils/splits.py index 2b8cd88b8..f3ae5dfda 100644 --- a/src/nemotron/data_prep/utils/splits.py +++ b/src/nemotron/data_prep/utils/splits.py @@ -175,8 +175,12 @@ def realize_packed_shards_into_split_dirs( logger.warning(f"Shard file not found: {parquet_path_str}") continue - # Create symlink in split dir - link_path = split_dir / parquet_path.name + # Prefix symlink with dataset name to avoid collisions: multiple + # datasets produce identically-named shards (shard_000000.parquet, etc.), + # so bare filenames cause later datasets to overwrite earlier ones. + # Path convention: .../datasets/{dataset_name}/{plan_hash}/shard_NNNNNN.parquet + dataset_name = parquet_path.parent.parent.name + link_path = split_dir / f"{dataset_name}_{parquet_path.name}" if link_path.exists() or link_path.is_symlink(): # Remove existing link/file to update