diff --git a/src/datasets/load.py b/src/datasets/load.py index 01b71f8ebdb..0973093c544 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -68,13 +68,13 @@ from .naming import camelcase_to_snakecase, snakecase_to_camelcase from .packaged_modules import ( _ALL_ALLOWED_EXTENSIONS, + _ALL_METADATA_FILENAMES, _EXTENSION_TO_MODULE, _MODULE_TO_EXTENSIONS, _MODULE_TO_METADATA_EXTENSIONS, _MODULE_TO_METADATA_FILE_NAMES, _PACKAGED_DATASETS_MODULES, ) -from .packaged_modules.folder_based_builder.folder_based_builder import FolderBasedBuilder from .splits import Split from .utils import _dataset_viewer from .utils.file_utils import ( @@ -225,7 +225,7 @@ def infer_module_for_data_files_list( - dict of builder kwargs """ extensions_counter = Counter( - ("." + suffix.lower(), xbasename(filepath) in FolderBasedBuilder.METADATA_FILENAMES) + ("." + suffix.lower(), xbasename(filepath) in _ALL_METADATA_FILENAMES) for filepath in data_files_list for suffix in xbasename(filepath).split(".")[1:] ) @@ -234,7 +234,17 @@ def infer_module_for_data_files_list( def sort_key(ext_count: tuple[tuple[str, bool], int]) -> tuple[int, bool]: """Sort by count and set ".parquet" as the favorite in case of a draw, and ignore metadata files""" (ext, is_metadata), count = ext_count - return (not is_metadata, count, ext == ".parquet", ext == ".jsonl", ext == ".json", ext == ".csv", ext) + return ( + not is_metadata, + count, + ext == ".parquet", + ext == ".lance", + ext == ".arrow", + ext == ".jsonl", + ext == ".json", + ext == ".csv", + ext, + ) for (ext, _), _ in sorted(extensions_counter.items(), key=sort_key, reverse=True): if ext in _EXTENSION_TO_MODULE: diff --git a/src/datasets/packaged_modules/__init__.py b/src/datasets/packaged_modules/__init__.py index 424aa432aff..7ff455cc0da 100644 --- a/src/datasets/packaged_modules/__init__.py +++ b/src/datasets/packaged_modules/__init__.py @@ -128,6 +128,7 @@ def _hash_python_lines(lines: list[str]) -> str: _MODULE_TO_METADATA_FILE_NAMES["meshfolder"] = meshfolder.MeshFolder.METADATA_FILENAMES _MODULE_TO_METADATA_FILE_NAMES["pdffolder"] = imagefolder.ImageFolder.METADATA_FILENAMES _MODULE_TO_METADATA_FILE_NAMES["niftifolder"] = imagefolder.ImageFolder.METADATA_FILENAMES +_MODULE_TO_METADATA_FILE_NAMES["lance"] = lance.Lance.METADATA_FILE_NAMES _MODULE_TO_METADATA_EXTENSIONS: Dict[str, List[str]] = {} for _module in _MODULE_TO_EXTENSIONS: @@ -139,3 +140,6 @@ def _hash_python_lines(lines: list[str]) -> str: _ALL_EXTENSIONS = list(_EXTENSION_TO_MODULE.keys()) + [".zip"] _ALL_METADATA_EXTENSIONS = sorted({_ext for _exts in _MODULE_TO_METADATA_EXTENSIONS.values() for _ext in _exts}) _ALL_ALLOWED_EXTENSIONS = _ALL_EXTENSIONS + _ALL_METADATA_EXTENSIONS +_ALL_METADATA_FILENAMES = sorted( + {file_name for file_names in _MODULE_TO_METADATA_FILE_NAMES.values() for file_name in file_names} +) diff --git a/src/datasets/packaged_modules/lance/lance.py b/src/datasets/packaged_modules/lance/lance.py index 411f9398f82..017458ec94e 100644 --- a/src/datasets/packaged_modules/lance/lance.py +++ b/src/datasets/packaged_modules/lance/lance.py @@ -92,6 +92,7 @@ def _fix_local_version_file(uri: str) -> str: class Lance(datasets.ArrowBasedBuilder, datasets.builder._CountableBuilderMixin): BUILDER_CONFIG_CLASS = LanceConfig METADATA_EXTENSIONS = [".idx", ".txn", ".manifest"] + METADATA_FILE_NAMES = ["latest_version_hint.json"] def _info(self): return datasets.DatasetInfo(features=self.config.features)