From 006ee0e7e952fd62d1d769f0e08775a4584ae9e5 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Thu, 23 Apr 2026 14:34:22 -0400 Subject: [PATCH 01/10] data/items: add explicit __bool__ to ItemList --- src/lenskit/data/_items.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/lenskit/data/_items.py b/src/lenskit/data/_items.py index e90bea5b1..2aa70b5cb 100644 --- a/src/lenskit/data/_items.py +++ b/src/lenskit/data/_items.py @@ -1140,6 +1140,9 @@ def _take(self, sel: ILIndexer, *, ordered: bool | None = None) -> ItemList: def __len__(self): return self._len + def __bool__(self): + return self._len > 0 + def __getitem__(self, sel: ILIndexer) -> ItemList: """ Subset the item list. From b2c8751db56580fe768c6ee4a0af92a6383f6016 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Thu, 23 Apr 2026 14:34:48 -0400 Subject: [PATCH 02/10] data/vocab: fix passing string arrays to Vocabulary.numbers --- src/lenskit/data/_vocab.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lenskit/data/_vocab.py b/src/lenskit/data/_vocab.py index d8458de4e..94e969ff4 100644 --- a/src/lenskit/data/_vocab.py +++ b/src/lenskit/data/_vocab.py @@ -167,7 +167,7 @@ def numbers( if pa.types.is_null(self._array.type): nums = pa.nulls(len(terms), type=pa.int32()) else: - term_arr = pa.array(terms, type=self._array.type) # type: ignore + term_arr = pa.array(terms) # type: ignore nums = self._index.get_indexes(term_arr) # type: ignore trace(self._log, "resolved %d IDs, %d invalid", len(terms), nums.null_count) From ecb898ae78ed9443c721e4c8a00dcb0840387ff5 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Thu, 23 Apr 2026 14:35:02 -0400 Subject: [PATCH 03/10] data: add list() and value() accessors to entity attributes --- src/lenskit/data/_attributes.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/lenskit/data/_attributes.py b/src/lenskit/data/_attributes.py index 037ad1613..e904b78f3 100644 --- a/src/lenskit/data/_attributes.py +++ b/src/lenskit/data/_attributes.py @@ -190,6 +190,22 @@ def is_sparse(self) -> bool: """ return self._spec.layout == AttrLayout.SPARSE + def value(self) -> Any: + """ + If the attribute is for a single entity, get its value as a Python object. + """ + n = len(self) + if n == 1: + return self.list()[0] + else: + raise ValueError(f"value() not defined for {n} values") + + def list(self) -> list[Any]: + """ + Get the attribute values as a Python list. + """ + return self.arrow().to_pylist() + def pandas( self, *, missing: Literal["null", "omit"] = "null" ) -> pd.Series | pd.DataFrame: # pragma: nocover From d96ec99e2ccb35c1356186a7230186f97d62fa50 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Fri, 24 Apr 2026 11:20:50 -0400 Subject: [PATCH 04/10] data: fix type cast for string indexes --- src/accel/indirect/hashing/content_string.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/accel/indirect/hashing/content_string.rs b/src/accel/indirect/hashing/content_string.rs index 5afbb68e7..b8845c9f4 100644 --- a/src/accel/indirect/hashing/content_string.rs +++ b/src/accel/indirect/hashing/content_string.rs @@ -10,8 +10,10 @@ use arrow::array::{ make_array, Array, ArrayData, AsArray, GenericStringArray, OffsetSizeTrait, StringBuilder, StringViewArray, }; +use arrow::compute::cast; use arrow::pyarrow::PyArrowType; use arrow_schema::DataType; +use log::*; use pyo3::exceptions::PyTypeError; use pyo3::types::PyAnyMethods; use rustc_hash::FxHasher; @@ -71,10 +73,12 @@ impl IndirectHashContent for StringContentArray { DataType::Utf8View => Box::new(arr.as_string_view().clone()), DataType::LargeUtf8 => Box::new(arr.as_string::().clone()), t => { - return Err(PyTypeError::new_err(format!( - "unsupported array type {:?}", - t - ))) + trace!("converting search array from {}", t); + let arr = cast(&arr, &DataType::Utf8) + .map_err(|e| PyTypeError::new_err(format!("error casting arrays: {}", e)))? + .as_string::() + .clone(); + Box::new(arr) } } } else { From d1d7257d04db6c7422363aa7193d835c3d8fd61c Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Fri, 24 Apr 2026 16:22:40 -0400 Subject: [PATCH 05/10] data: add tests for new attribute methods --- tests/data/test_attribute.py | 40 ++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 tests/data/test_attribute.py diff --git a/tests/data/test_attribute.py b/tests/data/test_attribute.py new file mode 100644 index 000000000..d764faac2 --- /dev/null +++ b/tests/data/test_attribute.py @@ -0,0 +1,40 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University. +# Copyright (C) 2023-2026 Drexel University. +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +import pandas as pd + +from pytest import raises + +from lenskit.data import Dataset, DatasetBuilder + + +def test_attribute_single_value(): + items = pd.DataFrame({"item_id": [42], "name": ["HACKEM MUCHE"]}) + dsb = DatasetBuilder() + dsb.add_entities("item", items) + data = dsb.build() + assert data.item_count == 1 + + assert data.entities("item").attribute("name").value() == "HACKEM MUCHE" + assert data.entities("item").attribute("name").list() == ["HACKEM MUCHE"] + + +def test_attribute_pydata(): + items = pd.DataFrame({"item_id": [42, 67], "name": ["HACKEM MUCHE", "READ ME"]}) + dsb = DatasetBuilder() + dsb.add_entities("item", items) + data = dsb.build() + assert data.item_count == 2 + + assert data.entities("item").attribute("name").list() == ["HACKEM MUCHE", "READ ME"] + assert data.entities("item").select(ids=[67]).attribute("name").list() == ["READ ME"] + assert data.entities("item").select(ids=[42]).attribute("name").arrow().to_pylist() == [ + "HACKEM MUCHE" + ] + + with raises(ValueError): + assert data.entities("item").attribute("name").value() + assert data.entities("item").select(ids=[42]).attribute("name").value() == "HACKEM MUCHE" From acae6d3dd587b4e90ebf9cbe21a2f2f6b212e2bb Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Fri, 24 Apr 2026 17:08:02 -0400 Subject: [PATCH 06/10] data: use Arrow to convert ILC to Pandas --- src/lenskit/data/_collection/_base.py | 8 ++------ tests/data/test_collection.py | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/lenskit/data/_collection/_base.py b/src/lenskit/data/_collection/_base.py index bf75517eb..529f9f658 100644 --- a/src/lenskit/data/_collection/_base.py +++ b/src/lenskit/data/_collection/_base.py @@ -213,12 +213,8 @@ def to_df(self) -> pd.DataFrame: DataWarning, stacklevel=2, ) - fields = list(self.key_fields) - return ( - pd.concat({k: il.to_df(numbers=False) for (k, il) in self.items()}, names=fields) - .reset_index(fields) - .reset_index(drop=True) - ) + tbl = self.to_arrow(layout="flat") + return tbl.to_pandas() def to_arrow( self, *, batch_size: int = 5000, layout: Literal["native", "flat"] = "native" diff --git a/tests/data/test_collection.py b/tests/data/test_collection.py index c54da967b..2194b8fc1 100644 --- a/tests/data/test_collection.py +++ b/tests/data/test_collection.py @@ -408,7 +408,7 @@ def test_recs_df_expected_column(demo_recs: DemoRecs): rec_df = demo_recs.recommendations.to_df() print(rec_df) print(demo_recs.recommendations[0]) - assert list(rec_df.columns) == ["user_id", "item_id", "score", "rank"] + assert list(rec_df.columns) == ["user_id", "item_id", "rank", "score"] def test_to_dataset(demo_recs: DemoRecs): From e1e65fc03ab2ccab1f05a6c7a2c4db233dd438e4 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Fri, 24 Apr 2026 17:08:20 -0400 Subject: [PATCH 07/10] data: add JSON conversion to item lists --- src/lenskit/data/_items.py | 13 +++++++++++++ tests/data/test_itemlist.py | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/src/lenskit/data/_items.py b/src/lenskit/data/_items.py index 2aa70b5cb..9a534ba30 100644 --- a/src/lenskit/data/_items.py +++ b/src/lenskit/data/_items.py @@ -831,6 +831,19 @@ def to_df(self, *, ids: bool = True, numbers: bool = True) -> pd.DataFrame: cols.update((k, v.numpy()) for (k, v) in self._fields.items() if k not in ("score", "rank")) return pd.DataFrame(cols) + def to_json_data(self, *, ids: bool = True, numbers: bool = False): + """ + Convert this item list to JSON-compatible data. + + .. note:: + + This is not a full round-trip serialization — the original item list + cannot be reconstructed from JSON. + """ + + tbl = self.to_arrow(ids=ids, numbers=numbers) + return tbl.to_pylist() + @overload def to_arrow( self, diff --git a/tests/data/test_itemlist.py b/tests/data/test_itemlist.py index 16670ee8c..f0f471cef 100644 --- a/tests/data/test_itemlist.py +++ b/tests/data/test_itemlist.py @@ -16,7 +16,7 @@ import hypothesis.extra.numpy as nph import hypothesis.strategies as st from hypothesis import given, settings -from pytest import mark, raises, warns +from pytest import approx, mark, raises, warns from lenskit.data import Dataset, ItemList, Vocabulary from lenskit.diagnostics import DataWarning @@ -1051,3 +1051,34 @@ def test_item_list_update_add_for_some_items(): assert all(il.numbers() == base.numbers()) assert il.vocabulary is VOCAB assert np.array_equal(il.scores(), [np.nan, 100, 200, 300, np.nan], equal_nan=True) + + +def test_to_json(): + il = ItemList(ITEMS, vocabulary=VOCAB) + + data = il.to_json_data() + assert isinstance(data, list) + assert len(data) == len(il) + for i, row in enumerate(data): + assert row == {"item_id": ITEMS[i]} + + +def test_to_json_numbers(): + il = ItemList(ITEMS, vocabulary=VOCAB) + + data = il.to_json_data(ids=False, numbers=True) + assert isinstance(data, list) + assert len(data) == len(il) + for i, row in enumerate(data): + assert row == {"item_num": i} + + +def test_to_json_scores(rng: np.random.Generator): + scores = rng.random(len(ITEMS)) + il = ItemList(ITEMS, vocabulary=VOCAB, scores=scores) + + data = il.to_json_data() + assert isinstance(data, list) + assert len(data) == len(il) + for i, row in enumerate(data): + assert row == {"item_id": ITEMS[i], "score": approx(scores[i])} From 3212611d2c0f83dbb1ba5a2c75a633bdee59b203 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Fri, 24 Apr 2026 17:19:26 -0400 Subject: [PATCH 08/10] data: add JSON export support to item list collection --- src/lenskit/data/_collection/_base.py | 31 +++++++++++++++++++++++++++ tests/data/test_collection.py | 20 +++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/src/lenskit/data/_collection/_base.py b/src/lenskit/data/_collection/_base.py index 529f9f658..5a08e4217 100644 --- a/src/lenskit/data/_collection/_base.py +++ b/src/lenskit/data/_collection/_base.py @@ -27,6 +27,7 @@ import pandas as pd import pyarrow as pa from pyarrow.parquet import ParquetDataset, ParquetWriter +from pydantic import JsonValue from lenskit.diagnostics import DataWarning from lenskit.logging import get_logger @@ -232,6 +233,36 @@ def to_arrow( """ return pa.Table.from_batches(self.record_batches(batch_size=batch_size, layout=layout)) + @overload + def to_json_data(self, *, object: Literal[True]) -> dict[ID, list[dict[str, JsonValue]]]: ... + @overload + def to_json_data(self, *, object: Literal[False] = False) -> list[dict[str, JsonValue]]: ... + @overload + def to_json_data( + self, *, object: bool + ) -> list[dict[str, JsonValue]] | dict[ID, list[dict[str, JsonValue]]]: ... + def to_json_data( + self, *, object: bool = False + ) -> list[dict[str, JsonValue]] | dict[ID, list[dict[str, JsonValue]]]: + """ + Convert this item list collection to JSON-compatible data. + + Args: + object: + If ``True``, construct as a dictionary whose keys are the item + list keys. This mode is only supported when there is a single + key field. + + If ``False``, return a list of objects. + """ + if object: + if len(self.key_fields) > 1: + raise ValueError("cannot convert multi-key collection to object JSON") + return {k[0]: items.to_json_data() for (k, items) in self.items()} + else: + tbl = self.to_arrow() + return tbl.to_pylist() + @overload def to_dataset( self, diff --git a/tests/data/test_collection.py b/tests/data/test_collection.py index 2194b8fc1..41e4b183c 100644 --- a/tests/data/test_collection.py +++ b/tests/data/test_collection.py @@ -431,3 +431,23 @@ def test_to_dataset(demo_recs: DemoRecs): assert np.all(ds_user_counts == src_user_counts) assert np.all(ds_item_counts == src_item_counts) + + +def test_to_json_array(demo_recs: DemoRecs): + arr = demo_recs.recommendations.to_json_data() + assert len(arr) == len(demo_recs.recommendations) + for i, row in enumerate(arr): + k, il = demo_recs.recommendations[i] + assert row["user_id"] == k.user_id + assert len(row["items"]) == len(il) + assert np.all([i["item_id"] for i in row["items"]] == il.ids()) + + +def test_to_json_object(demo_recs: DemoRecs): + arr = demo_recs.recommendations.to_json_data(object=True) + assert len(arr) == len(demo_recs.recommendations) + for uid, items in arr.items(): + il = demo_recs.recommendations.lookup(user_id=uid) + assert il is not None + assert len(items) == len(il) + assert np.all([i["item_id"] for i in items] == il.ids()) From 53e8faf66574529bf437761271d26a5b59c96928 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Fri, 24 Apr 2026 17:45:21 -0400 Subject: [PATCH 09/10] data: debug to_json --- src/lenskit/data/_adapt.py | 16 +++++++++++++++- src/lenskit/data/_collection/_base.py | 11 ++++++++++- tests/data/test_collection.py | 12 ++++++++++++ tests/data/test_convert_util.py | 23 +++++++++++++++++++++++ 4 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 tests/data/test_convert_util.py diff --git a/src/lenskit/data/_adapt.py b/src/lenskit/data/_adapt.py index f64490a70..f71f35bbf 100644 --- a/src/lenskit/data/_adapt.py +++ b/src/lenskit/data/_adapt.py @@ -5,7 +5,7 @@ # SPDX-License-Identifier: MIT """ -Adaptation code for dealing with legacy data layouts. +Adaptation code for dealing with legacy data layouts and adapting data types. """ # pyright: basic @@ -14,6 +14,7 @@ import logging import warnings from typing import ( + Any, Collection, Iterable, Literal, @@ -22,6 +23,7 @@ import numpy as np import pandas as pd +import pyarrow as pa from ._builder import DatasetBuilder from ._dataset import Dataset @@ -39,6 +41,18 @@ ITEM_COMPAT_COLUMN = AliasedColumn(ITEM_COLUMN, ["item"], warn=True) +def py_scalar(x) -> Any: + """ + Utility function to adapt data to a Python scalar. + """ + if isinstance(x, np.generic): + return x.item() + elif isinstance(x, pa.Scalar): + return x.as_py() + else: + return x + + def column_name(col: Column) -> str: match col: case str(name): diff --git a/src/lenskit/data/_collection/_base.py b/src/lenskit/data/_collection/_base.py index 5a08e4217..7eb5d58ba 100644 --- a/src/lenskit/data/_collection/_base.py +++ b/src/lenskit/data/_collection/_base.py @@ -6,6 +6,7 @@ from __future__ import annotations +import json import warnings from abc import ABC, abstractmethod from collections.abc import Sequence @@ -32,6 +33,7 @@ from lenskit.diagnostics import DataWarning from lenskit.logging import get_logger +from .._adapt import py_scalar from .._arrow import explode_column from .._builder import DatasetBuilder from .._container import DataContainer @@ -258,11 +260,18 @@ def to_json_data( if object: if len(self.key_fields) > 1: raise ValueError("cannot convert multi-key collection to object JSON") - return {k[0]: items.to_json_data() for (k, items) in self.items()} + return {py_scalar(k[0]): items.to_json_data() for (k, items) in self.items()} else: tbl = self.to_arrow() return tbl.to_pylist() + def to_json(self, *, object: bool = False) -> str: + """ + Convert this item list collection to JSON. Wrapper around :meth:`to_json_data`. + """ + data = self.to_json_data(object=object) + return json.dumps(data) + @overload def to_dataset( self, diff --git a/tests/data/test_collection.py b/tests/data/test_collection.py index 41e4b183c..9ee1093cd 100644 --- a/tests/data/test_collection.py +++ b/tests/data/test_collection.py @@ -4,6 +4,7 @@ # Licensed under the MIT license, see LICENSE.md for details. # SPDX-License-Identifier: MIT +import json import logging import pickle import warnings @@ -451,3 +452,14 @@ def test_to_json_object(demo_recs: DemoRecs): assert il is not None assert len(items) == len(il) assert np.all([i["item_id"] for i in items] == il.ids()) + + +def test_to_json_object_str(demo_recs: DemoRecs): + data = demo_recs.recommendations.to_json(object=True) + arr = json.loads(data) + assert len(arr) == len(demo_recs.recommendations) + for uid, items in arr.items(): + il = demo_recs.recommendations.lookup(user_id=int(uid)) + assert il is not None + assert len(items) == len(il) + assert np.all([i["item_id"] for i in items] == il.ids()) diff --git a/tests/data/test_convert_util.py b/tests/data/test_convert_util.py new file mode 100644 index 000000000..e7f988473 --- /dev/null +++ b/tests/data/test_convert_util.py @@ -0,0 +1,23 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University. +# Copyright (C) 2023-2026 Drexel University. +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +import numpy as np + +from lenskit.data._adapt import py_scalar + + +def test_py_scalar_pyint(): + assert py_scalar(42) == 42 + + +def test_py_scalar_pystr(): + assert py_scalar("foo") == "foo" + + +def test_py_scalar_npint(): + x = py_scalar(np.int32(100)) + assert x == 100 + assert isinstance(x, int) From dc34caed4b5d980543aa4ad6e1fa9171e7de8544 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Fri, 24 Apr 2026 19:58:11 -0400 Subject: [PATCH 10/10] fix test --- docs/guide/batch.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/guide/batch.rst b/docs/guide/batch.rst index 6c8d68d7a..3177ddb4e 100644 --- a/docs/guide/batch.rst +++ b/docs/guide/batch.rst @@ -45,8 +45,8 @@ Generate recommendations: >>> recs = recommend(pop_pipe, split.test.keys()) >>> recs.to_df() - user_id item_id score rank - 0 ... 1 + user_id item_id rank score + 0 ... 1 ... ... [3000 rows x 4 columns]