Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions docs/source/about_dataset_features.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,33 @@ And in this case the numpy arrays are encoded into PNG (or TIFF if the pixels va
For multi-channels arrays like RGB or RGBA, only uint8 is supported. If you use a larger precision, you get a warning and the array is downcasted to uint8.
For gray-scale images you can use the integer or float precision you want as long as it is compatible with `Pillow`. A warning is shown if your image integer or float precision is too high, and in this case the array is downcasted: an int64 array is downcasted to int32, and a float64 array is downcasted to float32.

## Mesh feature

Mesh datasets have a column with type [`Mesh`], which loads 3D mesh files with `trimesh`.

When you load a mesh dataset and call the mesh column, the [`Mesh`] feature automatically decodes the mesh file:

```py
>>> from datasets import Dataset, Features, Mesh

>>> dataset = Dataset.from_dict({"mesh": ["path/to/model.glb"]}, features=Features({"mesh": Mesh()}))
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's update the docs once with a cool mesh dataset on HF, do you have an idea?

Copy link
Copy Markdown
Contributor Author

@Vinay-Umrethe Vinay-Umrethe May 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@lhoestq

I've done a test which you can now find at VINAY-UMRETHE/My-Mesh-Dataset dataset repo which used Mesh() feature

However, while testing I noticed an error with embed_external_files. It is fixed now, but the fix is still pending a merge; it was created at #8224

Before you merge that, we can update the docs in that PR as well, this would finalize the whole Mesh-Support

Commits:

fix: embed_external_files=True

style: Match other test_features

>>> dataset[0]["mesh"]
<trimesh.scene.scene.Scene object at 0x125506CF8>
```

Depending on the file content, `trimesh` may return a `trimesh.Trimesh` object or a `trimesh.Scene` object. GLB files commonly decode to scenes, while STL and PLY files commonly decode to meshes.

With `decode=False`, the [`Mesh`] type gives you the path or bytes of the mesh file without decoding it with `trimesh`:

```py
>>> dataset = dataset.cast_column("mesh", Mesh(decode=False))
>>> dataset[0]["mesh"]
{'bytes': None,
'path': 'path/to/model.glb'}
```

For embedded bytes, the stored `path` is used to infer the mesh file type.

## Json feature

Datasets are based on Arrow which is a columnar format, and therefore they expect every example to have the same type and subtypes, and dictionaries to have the same keys and values types.
Expand Down
4 changes: 4 additions & 0 deletions docs/source/package_reference/main_classes.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,10 @@ Dictionary with split names as keys ('train', 'test' for example), and `Iterable

[[autodoc]] datasets.Video

### Mesh

[[autodoc]] datasets.Mesh

### Json

[[autodoc]] datasets.Json
Expand Down
6 changes: 6 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,10 @@
"Pillow>=9.4.0", # When PIL.Image.ExifTags was introduced
]

MESH_REQUIRE = [
"trimesh>=4.10.0",
]

BENCHMARKS_REQUIRE = [
"tensorflow==2.12.0",
"torch==2.0.1",
Expand Down Expand Up @@ -188,6 +192,7 @@
"Pillow>=9.4.0", # When PIL.Image.ExifTags was introduced
"torchcodec>=0.7.0; python_version < '3.14'", # minium version to get windows support, torchcodec doesn't have wheels for 3.14 yet
"nibabel>=5.3.1",
"trimesh>=4.10.0",
]

NUMPY2_INCOMPATIBLE_LIBRARIES = [
Expand All @@ -214,6 +219,7 @@
EXTRAS_REQUIRE = {
"audio": AUDIO_REQUIRE,
"vision": VISION_REQUIRE,
"mesh": MESH_REQUIRE,
"tensorflow": [
"tensorflow>=2.6.0",
],
Expand Down
1 change: 1 addition & 0 deletions src/datasets/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@
TORCHVISION_AVAILABLE = importlib.util.find_spec("torchvision") is not None
PDFPLUMBER_AVAILABLE = importlib.util.find_spec("pdfplumber") is not None
NIBABEL_AVAILABLE = importlib.util.find_spec("nibabel") is not None
TRIMESH_AVAILABLE = importlib.util.find_spec("trimesh") is not None

# Optional compression tools
RARFILE_AVAILABLE = importlib.util.find_spec("rarfile") is not None
Expand Down
2 changes: 2 additions & 0 deletions src/datasets/features/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"Sequence",
"Value",
"Image",
"Mesh",
"Translation",
"TranslationVariableLanguages",
"Video",
Expand All @@ -21,6 +22,7 @@
from .audio import Audio
from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, Json, LargeList, List, Sequence, Value
from .image import Image
from .mesh import Mesh
from .nifti import Nifti
from .pdf import Pdf
from .translation import Translation, TranslationVariableLanguages
Expand Down
3 changes: 3 additions & 0 deletions src/datasets/features/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from ..utils.py_utils import asdict, first_non_null_value, zip_dict
from .audio import Audio
from .image import Image, encode_pil_image
from .mesh import Mesh
from .nifti import Nifti, encode_nibabel_image
from .pdf import Pdf, encode_pdfplumber_pdf
from .translation import Translation, TranslationVariableLanguages
Expand Down Expand Up @@ -1361,6 +1362,7 @@ def __repr__(self):
Array5D,
Audio,
Image,
Mesh,
Video,
Pdf,
Nifti,
Expand Down Expand Up @@ -1522,6 +1524,7 @@ def decode_nested_example(schema, obj, token_per_repo_id: Optional[dict[str, Uni
Array5D.__name__: Array5D,
Audio.__name__: Audio,
Image.__name__: Image,
Mesh.__name__: Mesh,
Video.__name__: Video,
Pdf.__name__: Pdf,
Nifti.__name__: Nifti,
Expand Down
271 changes: 271 additions & 0 deletions src/datasets/features/mesh.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,271 @@
import os
from dataclasses import dataclass, field
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union

import pyarrow as pa

from .. import config
from ..download.download_config import DownloadConfig
from ..table import array_cast
from ..utils.file_utils import is_local_path, xopen
from ..utils.py_utils import string_to_dict


if TYPE_CHECKING:
import trimesh

from .features import FeatureType


@dataclass
class Mesh:
    """Mesh [`Feature`] to read 3D mesh data from a file.

    Input: The Mesh feature accepts as input:
    - A `str`: Absolute path to the mesh file (i.e. random access is allowed).
    - A `pathlib.Path`: path to the mesh file (i.e. random access is allowed).
      It is stored as an absolute path string.
    - A `bytes` or `bytearray`: raw content of a mesh file. It is stored without a path;
      since decoding infers the file type from the path extension, prefer the `dict`
      form below when the data must be decoded later.
    - A `dict` with the keys:

        - `path`: String with relative path of the mesh file to the archive file.
        - `bytes`: Bytes of the mesh file.

      This is useful for parquet or webdataset files which embed mesh files.

    - A `trimesh.Trimesh` or `trimesh.Scene`: 3D mesh or scene object.

    Output: The Mesh feature outputs data as `trimesh.Trimesh` or `trimesh.Scene` objects.

    Args:
        decode (`bool`, defaults to `True`):
            Whether to decode the mesh data. If `False`,
            returns the underlying dictionary in the format `{"path": mesh_path, "bytes": mesh_bytes}`.
            Mesh decoding uses `trimesh` and supports `.glb`, `.ply`, and `.stl` files.
    """

    decode: bool = True
    id: Optional[str] = field(default=None, repr=False)
    # Automatically constructed
    dtype: ClassVar[str] = "trimesh.Trimesh"
    pa_type: ClassVar[Any] = pa.struct({"bytes": pa.binary(), "path": pa.string()})
    _type: str = field(default="Mesh", init=False, repr=False)

    def __call__(self):
        """Return the Arrow storage type of the feature."""
        return self.pa_type

    def encode_example(
        self, value: Union[str, Path, bytes, bytearray, dict, "trimesh.Trimesh", "trimesh.Scene"]
    ) -> dict:
        """Encode example into a format for Arrow.

        Args:
            value (`str`, `pathlib.Path`, `bytes`, `bytearray`, `dict`, `trimesh.Trimesh`, or `trimesh.Scene`):
                Data passed as input to Mesh feature.

        Returns:
            `dict` with "path" and "bytes" fields

        Raises:
            `ValueError`: If `value` is not a supported type, or is a `dict` whose
                "path" and "bytes" are both missing or `None`.
        """
        # trimesh is optional: without it, trimesh objects simply cannot be passed in.
        if config.TRIMESH_AVAILABLE:
            import trimesh
        else:
            trimesh = None

        if isinstance(value, str):
            return {"path": value, "bytes": None}
        elif isinstance(value, Path):
            return {"path": str(value.absolute()), "bytes": None}
        elif isinstance(value, (bytes, bytearray)):
            return {"path": None, "bytes": value}
        elif trimesh is not None and isinstance(value, (trimesh.Trimesh, trimesh.Scene)):
            return encode_trimesh_mesh(value)
        elif isinstance(value, dict) and value.get("path") is not None and os.path.isfile(value["path"]):
            # we set "bytes": None to not duplicate the data if they're already available locally
            return {"bytes": None, "path": value.get("path")}
        elif isinstance(value, dict) and (value.get("bytes") is not None or value.get("path") is not None):
            # store the mesh bytes, and path is used to infer the mesh format using the file extension
            return {"bytes": value.get("bytes"), "path": value.get("path")}
        else:
            raise ValueError(
                f"A mesh sample should have one of 'path' or 'bytes' but they are missing or None in {value}."
            )

    def decode_example(
        self, value: dict, token_per_repo_id: Optional[dict] = None
    ) -> Union["trimesh.Trimesh", "trimesh.Scene"]:
        """Decode example mesh file.

        Args:
            value (`dict`):
                A dictionary with keys:

                - `path`: String with absolute or relative mesh file path.
                - `bytes`: The bytes of the mesh file.
            token_per_repo_id (`dict`, *optional*):
                To access and decode
                mesh files from private repositories on the Hub, you can pass
                a dictionary repo_id (`str`) -> token (`bool` or `str`).

        Returns:
            `trimesh.Trimesh` or `trimesh.Scene`

        Raises:
            `RuntimeError`: If the feature was created with `decode=False`.
            `ImportError`: If `trimesh` is not installed.
            `ValueError`: If both "path" and "bytes" are `None`, or if the file type
                cannot be inferred from the extension of "path".
        """
        if not self.decode:
            raise RuntimeError("Decoding is disabled for this feature. Please use Mesh(decode=True) instead.")

        if config.TRIMESH_AVAILABLE:
            import trimesh
        else:
            raise ImportError("To support decoding meshes, please install 'trimesh'.")

        path, bytes_ = value["path"], value["bytes"]
        if bytes_ is None:
            if path is None:
                raise ValueError(f"A mesh should have one of 'path' or 'bytes' but both are None in {value}.")
            if is_local_path(path):
                # Local files are handed to trimesh by path instead of being read into memory first.
                file_type = _infer_mesh_file_type(path)
                if file_type is None:
                    raise ValueError("A mesh path should have a .glb, .ply, or .stl extension.")
                return trimesh.load(path, file_type=file_type)
            bytes_ = _read_mesh_bytes(path, token_per_repo_id)

        # A file-like object carries no extension, so the file type has to come from the stored path.
        file_type = _infer_mesh_file_type(path)
        if file_type is None:
            raise ValueError(
                "Decoding mesh bytes requires a 'path' value with a .glb, .ply, or .stl extension "
                "to infer the mesh file type."
            )
        return trimesh.load(BytesIO(bytes_), file_type=file_type)

    def flatten(self) -> Union["FeatureType", dict[str, "FeatureType"]]:
        """If in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary."""
        from .features import Value

        if self.decode:
            return self
        return {
            "bytes": Value("binary"),
            "path": Value("string"),
        }

    def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray]) -> pa.StructArray:
        """Cast an Arrow array to the Mesh arrow storage type.
        The Arrow types that can be converted to the Mesh pyarrow storage type are:

        - `pa.string()` - it must contain the "path" data
        - `pa.large_string()` - it must contain the "path" data (will be cast to string if possible)
        - `pa.binary()` - it must contain the mesh bytes
        - `pa.large_binary()` - it must contain the mesh bytes (will be cast to binary if possible)
        - `pa.struct({"bytes": pa.binary()})`
        - `pa.struct({"path": pa.string()})`
        - `pa.struct({"bytes": pa.binary(), "path": pa.string()})`  - order doesn't matter

        Args:
            storage (`Union[pa.StringArray, pa.StructArray]`):
                PyArrow array to cast.

        Returns:
            `pa.StructArray`: Array in the Mesh arrow storage type, that is
                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.

        Raises:
            `ValueError`: If a `pa.large_string()` array cannot be cast to `pa.string()`.
        """
        if pa.types.is_large_string(storage.type):
            try:
                storage = storage.cast(pa.string())
            except pa.ArrowInvalid as e:
                raise ValueError(
                    f"Failed to cast large_string to string for Mesh feature. "
                    f"This can happen if string values exceed 2GB. "
                    f"Original error: {e}"
                ) from e
        if pa.types.is_string(storage.type):
            # Only paths are known: pair them with an all-null "bytes" column.
            bytes_array = pa.array([None] * len(storage), type=pa.binary())
            storage = pa.StructArray.from_arrays([bytes_array, storage], ["bytes", "path"], mask=storage.is_null())
        elif pa.types.is_large_binary(storage.type):
            storage = array_cast(
                storage, pa.binary()
            )  # this can fail in case of big meshes, paths should be used instead
            path_array = pa.array([None] * len(storage), type=pa.string())
            storage = pa.StructArray.from_arrays([storage, path_array], ["bytes", "path"], mask=storage.is_null())
        elif pa.types.is_binary(storage.type):
            # Only bytes are known: pair them with an all-null "path" column.
            path_array = pa.array([None] * len(storage), type=pa.string())
            storage = pa.StructArray.from_arrays([storage, path_array], ["bytes", "path"], mask=storage.is_null())
        elif pa.types.is_struct(storage.type):
            # Reuse whichever of the two fields is present and fill the missing one with nulls.
            if storage.type.get_field_index("bytes") >= 0:
                bytes_array = storage.field("bytes")
            else:
                bytes_array = pa.array([None] * len(storage), type=pa.binary())
            if storage.type.get_field_index("path") >= 0:
                path_array = storage.field("path")
            else:
                path_array = pa.array([None] * len(storage), type=pa.string())
            storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=storage.is_null())

        return array_cast(storage, self.pa_type)

    def embed_storage(self, storage: pa.StructArray, token_per_repo_id: Optional[dict] = None) -> pa.StructArray:
        """Embed mesh files into the Arrow array.

        Args:
            storage (`pa.StructArray`):
                PyArrow array to embed.
            token_per_repo_id (`dict`, *optional*):
                Dictionary repo_id (`str`) -> token, used to read
                mesh files from private repositories on the Hub.

        Returns:
            `pa.StructArray`: Array in the Mesh arrow storage type, that is
                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
        """

        def path_to_bytes(path):
            if path is None:
                return None
            return _read_mesh_bytes(path, token_per_repo_id)

        bytes_array = pa.array(
            [
                (path_to_bytes(x["path"]) if x["bytes"] is None else x["bytes"]) if x is not None else None
                for x in storage.to_pylist()
            ],
            type=pa.binary(),
        )
        # Only the file name is kept: it is what the file type is inferred from when decoding.
        path_array = pa.array(
            [os.path.basename(path) if path is not None else None for path in storage.field("path").to_pylist()],
            type=pa.string(),
        )
        storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=bytes_array.is_null())
        return array_cast(storage, self.pa_type)


def _read_mesh_bytes(path: str, token_per_repo_id: Optional[dict] = None) -> bytes:
    """Read the whole content of the mesh file at `path` with `xopen`.

    Args:
        path (`str`):
            Path or URL of the mesh file. For chained URLs (`inner::outer`), the last
            `::`-separated segment is the one matched against the Hub URL patterns.
        token_per_repo_id (`dict`, *optional*):
            Dictionary repo_id (`str`) -> token. The token of the repository the
            file belongs to, if any, is passed to `xopen` through a `DownloadConfig`.

    Returns:
        `bytes`: The content of the file.
    """
    if token_per_repo_id is None:
        token_per_repo_id = {}
    source_url = path.split("::")[-1]
    pattern = config.HUB_DATASETS_URL if source_url.startswith(config.HF_ENDPOINT) else config.HUB_DATASETS_HFFS_URL
    source_url_fields = string_to_dict(source_url, pattern)
    # No token is used when the URL does not match a Hub dataset URL pattern.
    token = token_per_repo_id.get(source_url_fields["repo_id"]) if source_url_fields is not None else None
    download_config = DownloadConfig(token=token)
    with xopen(path, "rb", download_config=download_config) as f:
        return f.read()


def _infer_mesh_file_type(path: Optional[str]) -> Optional[str]:
supported_file_types = {"glb", "ply", "stl"}
if path is None:
return None
path_without_archive = path.split("::", 1)[0]
path_without_query = path_without_archive.split("?", 1)[0]
extension = os.path.splitext(path_without_query)[1].lower().lstrip(".")
return extension if extension in supported_file_types else None


def encode_trimesh_mesh(mesh: Union["trimesh.Trimesh", "trimesh.Scene"]) -> dict[str, Optional[Union[bytes, str]]]:
    """Encode a trimesh mesh or scene object into a `{"path", "bytes"}` dictionary.

    If the object's `metadata` dictionary has a `file_path` or `file_name` entry that
    points to an existing local file, only that path is stored. Otherwise the object
    is exported to GLB bytes.

    Args:
        mesh (`trimesh.Trimesh` or `trimesh.Scene`):
            Object to encode.

    Returns:
        `dict`: Either `{"path": <existing file>, "bytes": None}`, or
        `{"path": "<name>.glb", "bytes": <GLB bytes>}` where `<name>` is the stem of
        the file name found in the metadata, or `"mesh"` when there is none.
    """
    metadata = getattr(mesh, "metadata", None)
    path = None
    if isinstance(metadata, dict):
        path = metadata.get("file_path") or metadata.get("file_name")
    if path is not None and os.path.isfile(path):
        return {"path": path, "bytes": None}
    bytes_ = mesh.export(file_type="glb")
    # The exported bytes are always GLB, so the stored name must end in ".glb": keeping the
    # source extension (e.g. ".stl") would make the file type inferred from the path disagree
    # with the actual content of the bytes.
    stem = os.path.splitext(os.path.basename(path))[0] if path else ""
    return {"path": f"{stem or 'mesh'}.glb", "bytes": bytes_}
Loading
Loading