huggingface · lhoestq · May 27, 2026 · Mar 8, 2026 · Mar 10, 2026 · Mar 11, 2026
diff --git a/docs/source/about_dataset_features.mdx b/docs/source/about_dataset_features.mdx
@@ -134,6 +134,33 @@ And in this case the numpy arrays are encoded into PNG (or TIFF if the pixels va
 For multi-channels arrays like RGB or RGBA, only uint8 is supported. If you use a larger precision, you get a warning and the array is downcasted to uint8.
 For gray-scale images you can use the integer or float precision you want as long as it is compatible with `Pillow`. A warning is shown if your image integer or float precision is too high, and in this case the array is downcated: an int64 array is downcasted to int32, and a float64 array is downcasted to float32.
 
+## Mesh feature
+
+Mesh datasets have a column with type [`Mesh`], which loads 3D mesh files with `trimesh`.
+
+When you load a mesh dataset and access the mesh column, the [`Mesh`] feature automatically decodes the mesh file:
+
+```py
+>>> from datasets import Dataset, Features, Mesh
+
+>>> dataset = Dataset.from_dict({"mesh": ["path/to/model.glb"]}, features=Features({"mesh": Mesh()}))
+>>> dataset[0]["mesh"]
+<trimesh.scene.scene.Scene object at 0x125506CF8>
+```
+
+Depending on the file content, `trimesh` may return a `trimesh.Trimesh` object or a `trimesh.Scene` object. GLB files commonly decode to scenes, while STL and PLY files commonly decode to meshes.
+
+With `decode=False`, the [`Mesh`] type gives you the path or bytes of the mesh file without decoding it with `trimesh`:
+
+```py
+>>> dataset = dataset.cast_column("mesh", Mesh(decode=False))
+>>> dataset[0]["mesh"]
+{'bytes': None,
+ 'path': 'path/to/model.glb'}
+```
+
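+For embedded bytes, the file extension of the stored `path` (`.glb`, `.ply`, or `.stl`) is used to infer the mesh file type, so `path` must be set even when `bytes` is provided.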
+
 ## Json feature
 
 Datasets are based on Arrow which is a columnar format, and therefore they expect every example to have the same type and subtypes, and dictionaries to have the same keys and values types.

diff --git a/docs/source/package_reference/main_classes.mdx b/docs/source/package_reference/main_classes.mdx
@@ -277,6 +277,10 @@ Dictionary with split names as keys ('train', 'test' for example), and `Iterable
 
 [[autodoc]] datasets.Video
 
+### Mesh
+
+[[autodoc]] datasets.Mesh
+
 ### Json
 
 [[autodoc]] datasets.Json

diff --git a/setup.py b/setup.py
@@ -145,6 +145,10 @@
     "Pillow>=9.4.0",  # When PIL.Image.ExifTags was introduced
 ]
 
+MESH_REQUIRE = [
+    "trimesh>=4.10.0",
+]
+
 BENCHMARKS_REQUIRE = [
     "tensorflow==2.12.0",
     "torch==2.0.1",
@@ -188,6 +192,7 @@
     "Pillow>=9.4.0",  # When PIL.Image.ExifTags was introduced
     "torchcodec>=0.7.0; python_version < '3.14'",  # minium version to get windows support, torchcodec doesn't have wheels for 3.14 yet
     "nibabel>=5.3.1",
+    "trimesh>=4.10.0",
 ]
 
 NUMPY2_INCOMPATIBLE_LIBRARIES = [
@@ -214,6 +219,7 @@
 EXTRAS_REQUIRE = {
     "audio": AUDIO_REQUIRE,
     "vision": VISION_REQUIRE,
+    "mesh": MESH_REQUIRE,
     "tensorflow": [
         "tensorflow>=2.6.0",
     ],

diff --git a/src/datasets/config.py b/src/datasets/config.py
@@ -140,6 +140,7 @@
 TORCHVISION_AVAILABLE = importlib.util.find_spec("torchvision") is not None
 PDFPLUMBER_AVAILABLE = importlib.util.find_spec("pdfplumber") is not None
 NIBABEL_AVAILABLE = importlib.util.find_spec("nibabel") is not None
+TRIMESH_AVAILABLE = importlib.util.find_spec("trimesh") is not None
 
 # Optional compression tools
 RARFILE_AVAILABLE = importlib.util.find_spec("rarfile") is not None

diff --git a/src/datasets/features/__init__.py b/src/datasets/features/__init__.py
@@ -12,6 +12,7 @@
     "Sequence",
     "Value",
     "Image",
+    "Mesh",
     "Translation",
     "TranslationVariableLanguages",
     "Video",
@@ -21,6 +22,7 @@
 from .audio import Audio
 from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, Json, LargeList, List, Sequence, Value
 from .image import Image
+from .mesh import Mesh
 from .nifti import Nifti
 from .pdf import Pdf
 from .translation import Translation, TranslationVariableLanguages

diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py
@@ -43,6 +43,7 @@
 from ..utils.py_utils import asdict, first_non_null_value, zip_dict
 from .audio import Audio
 from .image import Image, encode_pil_image
+from .mesh import Mesh
 from .nifti import Nifti, encode_nibabel_image
 from .pdf import Pdf, encode_pdfplumber_pdf
 from .translation import Translation, TranslationVariableLanguages
@@ -1361,6 +1362,7 @@ def __repr__(self):
     Array5D,
     Audio,
     Image,
+    Mesh,
     Video,
     Pdf,
     Nifti,
@@ -1522,6 +1524,7 @@ def decode_nested_example(schema, obj, token_per_repo_id: Optional[dict[str, Uni
     Array5D.__name__: Array5D,
     Audio.__name__: Audio,
     Image.__name__: Image,
+    Mesh.__name__: Mesh,
     Video.__name__: Video,
     Pdf.__name__: Pdf,
     Nifti.__name__: Nifti,

diff --git a/src/datasets/features/mesh.py b/src/datasets/features/mesh.py
@@ -0,0 +1,271 @@
+import os
+from dataclasses import dataclass, field
+from io import BytesIO
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union
+
+import pyarrow as pa
+
+from .. import config
+from ..download.download_config import DownloadConfig
+from ..table import array_cast
+from ..utils.file_utils import is_local_path, xopen
+from ..utils.py_utils import string_to_dict
+
+
+if TYPE_CHECKING:
+    import trimesh
+
+    from .features import FeatureType
+
+
+@dataclass
+class Mesh:
+    """Mesh [`Feature`] to read 3D mesh data from a file.
+
+    Input: The Mesh feature accepts as input:
+    - A `str` or `pathlib.Path`: Absolute path to the mesh file (i.e. random access is allowed); a `Path` is made absolute automatically.
+    - A `bytes` or `bytearray`: Raw content of the mesh file. No path is stored, so the file type cannot be inferred when decoding.
+    - A `dict` with the keys:
+
+        - `path`: String with the path of the mesh file, relative to the archive file.
+        - `bytes`: Bytes of the mesh file.
+
+      This is useful for parquet or webdataset files which embed mesh files.
+
+    - A `trimesh.Trimesh` or `trimesh.Scene`: 3D mesh or scene object.
+
+    Output: The Mesh feature outputs data as `trimesh.Trimesh` or `trimesh.Scene` objects.
+
+    Args:
+        decode (`bool`, defaults to `True`):
+            Whether to decode the mesh data. If `False`,
+            returns the underlying dictionary in the format `{"path": mesh_path, "bytes": mesh_bytes}`.
+            Mesh decoding uses `trimesh` and supports `.glb`, `.ply`, and `.stl` files.
+    """
+
+    decode: bool = True
+    id: Optional[str] = field(default=None, repr=False)
+    # Automatically constructed
+    dtype: ClassVar[str] = "trimesh.Trimesh"
+    pa_type: ClassVar[Any] = pa.struct({"bytes": pa.binary(), "path": pa.string()})
+    _type: str = field(default="Mesh", init=False, repr=False)
+
+    def __call__(self):
+        return self.pa_type
+
+    def encode_example(self, value: Union[str, bytes, bytearray, dict, "trimesh.Trimesh", "trimesh.Scene"]) -> dict:
+        """Encode example into a format for Arrow.
+
+        Args:
+            value (`str`, `pathlib.Path`, `bytes`, `bytearray`, `dict`, `trimesh.Trimesh`, or `trimesh.Scene`):
+                Data passed as input to Mesh feature.
+
+        Returns:
+            `dict` with "path" and "bytes" fields
+        """
+        if config.TRIMESH_AVAILABLE:
+            import trimesh
+        else:
+            trimesh = None
+
+        if isinstance(value, str):
+            return {"path": value, "bytes": None}
+        elif isinstance(value, Path):
+            return {"path": str(value.absolute()), "bytes": None}
+        elif isinstance(value, (bytes, bytearray)):
+            return {"path": None, "bytes": value}
+        elif trimesh is not None and isinstance(value, (trimesh.Trimesh, trimesh.Scene)):
+            return encode_trimesh_mesh(value)
+        elif isinstance(value, dict) and value.get("path") is not None and os.path.isfile(value["path"]):
+            # we set "bytes": None to not duplicate the data if they're already available locally
+            return {"bytes": None, "path": value.get("path")}
+        elif isinstance(value, dict) and (value.get("bytes") is not None or value.get("path") is not None):
+            # store the mesh bytes, and path is used to infer the mesh format using the file extension
+            return {"bytes": value.get("bytes"), "path": value.get("path")}
+        else:
+            raise ValueError(
+                f"A mesh sample should have one of 'path' or 'bytes' but they are missing or None in {value}."
+            )
+
+    def decode_example(self, value: dict, token_per_repo_id=None) -> Union["trimesh.Trimesh", "trimesh.Scene"]:
+        """Decode example mesh file.
+
+        Args:
+            value (`dict`):
+                A dictionary with keys:
+
+                - `path`: String with absolute or relative mesh file path.
+                - `bytes`: The bytes of the mesh file.
+            token_per_repo_id (`dict`, *optional*):
+                To access and decode
+                mesh files from private repositories on the Hub, you can pass
+                a dictionary repo_id (`str`) -> token (`bool` or `str`).
+
+        Returns:
+            `trimesh.Trimesh` or `trimesh.Scene`
+        """
+        if not self.decode:
+            raise RuntimeError("Decoding is disabled for this feature. Please use Mesh(decode=True) instead.")
+
+        if config.TRIMESH_AVAILABLE:
+            import trimesh
+        else:
+            raise ImportError("To support decoding meshes, please install 'trimesh'.")
+
+        if token_per_repo_id is None:
+            token_per_repo_id = {}
+
+        path, bytes_ = value["path"], value["bytes"]
+        if bytes_ is None:
+            if path is None:
+                raise ValueError(f"A mesh should have one of 'path' or 'bytes' but both are None in {value}.")
+            if is_local_path(path):
+                file_type = _infer_mesh_file_type(path)
+                if file_type is None:
+                    raise ValueError("A mesh path should have a .glb, .ply, or .stl extension.")
+                return trimesh.load(path, file_type=file_type)
+            source_url = path.split("::")[-1]
+            pattern = (
+                config.HUB_DATASETS_URL if source_url.startswith(config.HF_ENDPOINT) else config.HUB_DATASETS_HFFS_URL
+            )
+            source_url_fields = string_to_dict(source_url, pattern)
+            token = token_per_repo_id.get(source_url_fields["repo_id"]) if source_url_fields is not None else None
+            download_config = DownloadConfig(token=token)
+            with xopen(path, "rb", download_config=download_config) as f:
+                bytes_ = f.read()
+
+        file_type = _infer_mesh_file_type(path)
+        if file_type is None:
+            raise ValueError(
+                "Decoding mesh bytes requires a 'path' value with a .glb, .ply, or .stl extension "
+                "to infer the mesh file type."
+            )
+        return trimesh.load(BytesIO(bytes_), file_type=file_type)
+
+    def flatten(self) -> Union["FeatureType", dict[str, "FeatureType"]]:
+        """If in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary."""
+        from .features import Value
+
+        return (
+            self
+            if self.decode
+            else {
+                "bytes": Value("binary"),
+                "path": Value("string"),
+            }
+        )
+
+    def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray]) -> pa.StructArray:
+        """Cast an Arrow array to the Mesh arrow storage type.
+        The Arrow types that can be converted to the Mesh pyarrow storage type are:
+
+        - `pa.string()` - it must contain the "path" data
+        - `pa.large_string()` - it must contain the "path" data (will be cast to string if possible)
+        - `pa.binary()` - it must contain the mesh bytes
+        - `pa.struct({"bytes": pa.binary()})`
+        - `pa.struct({"path": pa.string()})`
+        - `pa.struct({"bytes": pa.binary(), "path": pa.string()})`  - order doesn't matter
+
+        Args:
+            storage (`Union[pa.StringArray, pa.StructArray]`):
+                PyArrow array to cast.
+
+        Returns:
+            `pa.StructArray`: Array in the Mesh arrow storage type, that is
+                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
+        """
+        if pa.types.is_large_string(storage.type):
+            try:
+                storage = storage.cast(pa.string())
+            except pa.ArrowInvalid as e:
+                raise ValueError(
+                    f"Failed to cast large_string to string for Mesh feature. "
+                    f"This can happen if string values exceed 2GB. "
+                    f"Original error: {e}"
+                ) from e
+        if pa.types.is_string(storage.type):
+            bytes_array = pa.array([None] * len(storage), type=pa.binary())
+            storage = pa.StructArray.from_arrays([bytes_array, storage], ["bytes", "path"], mask=storage.is_null())
+        elif pa.types.is_large_binary(storage.type):
+            storage = array_cast(
+                storage, pa.binary()
+            )  # this can fail in case of big meshes, paths should be used instead
+            path_array = pa.array([None] * len(storage), type=pa.string())
+            storage = pa.StructArray.from_arrays([storage, path_array], ["bytes", "path"], mask=storage.is_null())
+        elif pa.types.is_binary(storage.type):
+            path_array = pa.array([None] * len(storage), type=pa.string())
+            storage = pa.StructArray.from_arrays([storage, path_array], ["bytes", "path"], mask=storage.is_null())
+        elif pa.types.is_struct(storage.type):
+            if storage.type.get_field_index("bytes") >= 0:
+                bytes_array = storage.field("bytes")
+            else:
+                bytes_array = pa.array([None] * len(storage), type=pa.binary())
+            if storage.type.get_field_index("path") >= 0:
+                path_array = storage.field("path")
+            else:
+                path_array = pa.array([None] * len(storage), type=pa.string())
+            storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=storage.is_null())
+
+        return array_cast(storage, self.pa_type)
+
+    def embed_storage(self, storage: pa.StructArray, token_per_repo_id=None) -> pa.StructArray:
+        """Embed mesh files into the Arrow array: missing bytes are read from `path`, and only the file name is kept as path.
+
+        Args:
+            storage (`pa.StructArray`):
+                PyArrow array to embed.
+
+        Returns:
+            `pa.StructArray`: Array in the Mesh arrow storage type, that is
+                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
+        """
+        if token_per_repo_id is None:
+            token_per_repo_id = {}
+
+        def path_to_bytes(path):
+            if path is None:
+                return None
+            source_url = path.split("::")[-1]
+            pattern = (
+                config.HUB_DATASETS_URL if source_url.startswith(config.HF_ENDPOINT) else config.HUB_DATASETS_HFFS_URL
+            )
+            source_url_fields = string_to_dict(source_url, pattern)
+            token = token_per_repo_id.get(source_url_fields["repo_id"]) if source_url_fields is not None else None
+            download_config = DownloadConfig(token=token)
+            with xopen(path, "rb", download_config=download_config) as f:
+                return f.read()
+
+        bytes_array = pa.array(
+            [
+                (path_to_bytes(x["path"]) if x["bytes"] is None else x["bytes"]) if x is not None else None
+                for x in storage.to_pylist()
+            ],
+            type=pa.binary(),
+        )
+        path_array = pa.array(
+            [os.path.basename(path) if path is not None else None for path in storage.field("path").to_pylist()],
+            type=pa.string(),
+        )
+        storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=bytes_array.is_null())
+        return array_cast(storage, self.pa_type)
+
+
+def _infer_mesh_file_type(path: Optional[str]) -> Optional[str]:
+    """Return the supported mesh extension ("glb", "ply" or "stl") of `path` in lowercase, else `None`."""
+    if path is None:
+        return None
+    # Only the text before the first "::" and before the first "?" is considered when reading the extension.
+    inner_path = path.partition("::")[0].partition("?")[0]
+    suffix = os.path.splitext(inner_path)[1][1:].lower()
+    return suffix if suffix in ("glb", "ply", "stl") else None
+
+
+def encode_trimesh_mesh(mesh: Union["trimesh.Trimesh", "trimesh.Scene"]) -> dict[str, Optional[Union[bytes, str]]]:
+    """Encode a trimesh mesh or scene as a reference to its existing source file, else as GLB bytes named `*.glb`."""
+    metadata = getattr(mesh, "metadata", None) or {}
+    path = metadata.get("file_path") or metadata.get("file_name") if isinstance(metadata, dict) else None
+    if path is not None and os.path.isfile(path):
+        return {"path": path, "bytes": None}
+    stem = (os.path.splitext(os.path.basename(path))[0] if path else "") or "mesh"  # drop the source extension
+    return {"path": f"{stem}.glb", "bytes": mesh.export(file_type="glb")}  # GLB bytes need a ".glb" name to decode