diff --git a/src/nemotron/data_prep/utils/splits.py b/src/nemotron/data_prep/utils/splits.py index 2b8cd88b8..f3ae5dfda 100644 --- a/src/nemotron/data_prep/utils/splits.py +++ b/src/nemotron/data_prep/utils/splits.py @@ -175,8 +175,12 @@ def realize_packed_shards_into_split_dirs( logger.warning(f"Shard file not found: {parquet_path_str}") continue - # Create symlink in split dir - link_path = split_dir / parquet_path.name + # Prefix symlink with dataset name to avoid collisions: multiple + # datasets produce identically-named shards (shard_000000.parquet, etc.), + # so bare filenames cause later datasets to overwrite earlier ones. + # Path convention: .../datasets/{dataset_name}/{plan_hash}/shard_NNNNNN.parquet + dataset_name = parquet_path.parent.parent.name + link_path = split_dir / f"{dataset_name}_{parquet_path.name}" if link_path.exists() or link_path.is_symlink(): # Remove existing link/file to update