Skip to content
4 changes: 2 additions & 2 deletions docs/guide/batch.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ Generate recommendations:

>>> recs = recommend(pop_pipe, split.test.keys())
>>> recs.to_df()
user_id item_id score rank
0 ... 1
user_id item_id rank score
0 ... 1 ...
...
[3000 rows x 4 columns]

Expand Down
12 changes: 8 additions & 4 deletions src/accel/indirect/hashing/content_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,10 @@ use arrow::array::{
make_array, Array, ArrayData, AsArray, GenericStringArray, OffsetSizeTrait, StringBuilder,
StringViewArray,
};
use arrow::compute::cast;
use arrow::pyarrow::PyArrowType;
use arrow_schema::DataType;
use log::*;
use pyo3::exceptions::PyTypeError;
use pyo3::types::PyAnyMethods;
use rustc_hash::FxHasher;
Expand Down Expand Up @@ -71,10 +73,12 @@ impl<OS: OffsetSizeTrait> IndirectHashContent for StringContentArray<OS> {
DataType::Utf8View => Box::new(arr.as_string_view().clone()),
DataType::LargeUtf8 => Box::new(arr.as_string::<i64>().clone()),
t => {
return Err(PyTypeError::new_err(format!(
"unsupported array type {:?}",
t
)))
trace!("converting search array from {}", t);
let arr = cast(&arr, &DataType::Utf8)
.map_err(|e| PyTypeError::new_err(format!("error casting arrays: {}", e)))?
.as_string::<i32>()
.clone();
Box::new(arr)
}
}
} else {
Expand Down
16 changes: 15 additions & 1 deletion src/lenskit/data/_adapt.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# SPDX-License-Identifier: MIT

"""
Adaptation code for dealing with legacy data layouts.
Adaptation code for dealing with legacy data layouts and adapting data types.
"""

# pyright: basic
Expand All @@ -14,6 +14,7 @@
import logging
import warnings
from typing import (
Any,
Collection,
Iterable,
Literal,
Expand All @@ -22,6 +23,7 @@

import numpy as np
import pandas as pd
import pyarrow as pa

from ._builder import DatasetBuilder
from ._dataset import Dataset
Expand All @@ -39,6 +41,18 @@
ITEM_COMPAT_COLUMN = AliasedColumn(ITEM_COLUMN, ["item"], warn=True)


def py_scalar(x) -> Any:
    """
    Coerce a value to a plain Python scalar.

    NumPy scalar types are unwrapped with ``item()`` and PyArrow scalars
    with ``as_py()``; any other value is returned unchanged.
    """
    if isinstance(x, np.generic):
        return x.item()
    if isinstance(x, pa.Scalar):
        return x.as_py()
    return x


def column_name(col: Column) -> str:
match col:
case str(name):
Expand Down
16 changes: 16 additions & 0 deletions src/lenskit/data/_attributes.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,22 @@ def is_sparse(self) -> bool:
"""
return self._spec.layout == AttrLayout.SPARSE

def value(self) -> Any:
    """
    If the attribute is for a single entity, get its value as a Python object.

    Raises:
        ValueError: if the attribute covers more or fewer than one entity.
    """
    n = len(self)
    if n != 1:
        raise ValueError(f"value() not defined for {n} values")
    return self.list()[0]

def list(self) -> list[Any]:
    """
    Get the attribute values as a Python list.
    """
    values = self.arrow()
    return values.to_pylist()

def pandas(
self, *, missing: Literal["null", "omit"] = "null"
) -> pd.Series | pd.DataFrame: # pragma: nocover
Expand Down
48 changes: 42 additions & 6 deletions src/lenskit/data/_collection/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from __future__ import annotations

import json
import warnings
from abc import ABC, abstractmethod
from collections.abc import Sequence
Expand All @@ -27,10 +28,12 @@
import pandas as pd
import pyarrow as pa
from pyarrow.parquet import ParquetDataset, ParquetWriter
from pydantic import JsonValue

from lenskit.diagnostics import DataWarning
from lenskit.logging import get_logger

from .._adapt import py_scalar
from .._arrow import explode_column
from .._builder import DatasetBuilder
from .._container import DataContainer
Expand Down Expand Up @@ -213,12 +216,8 @@ def to_df(self) -> pd.DataFrame:
DataWarning,
stacklevel=2,
)
fields = list(self.key_fields)
return (
pd.concat({k: il.to_df(numbers=False) for (k, il) in self.items()}, names=fields)
.reset_index(fields)
.reset_index(drop=True)
)
tbl = self.to_arrow(layout="flat")
return tbl.to_pandas()

def to_arrow(
self, *, batch_size: int = 5000, layout: Literal["native", "flat"] = "native"
Expand All @@ -236,6 +235,43 @@ def to_arrow(
"""
return pa.Table.from_batches(self.record_batches(batch_size=batch_size, layout=layout))

@overload
def to_json_data(self, *, object: Literal[True]) -> dict[ID, list[dict[str, JsonValue]]]: ...
@overload
def to_json_data(self, *, object: Literal[False] = False) -> list[dict[str, JsonValue]]: ...
@overload
def to_json_data(
    self, *, object: bool
) -> list[dict[str, JsonValue]] | dict[ID, list[dict[str, JsonValue]]]: ...
def to_json_data(
    self, *, object: bool = False
) -> list[dict[str, JsonValue]] | dict[ID, list[dict[str, JsonValue]]]:
    """
    Convert this item list collection to JSON-compatible data.

    Args:
        object:
            If ``True``, construct as a dictionary whose keys are the item
            list keys. This mode is only supported when there is a single
            key field.

            If ``False``, return a list of objects.
    """
    # List mode: flatten the whole collection through Arrow in one shot.
    if not object:
        return self.to_arrow().to_pylist()

    # Object mode: one dictionary entry per item list, keyed by the
    # (single) key field, with the key coerced to a Python scalar so it
    # is JSON-serializable.
    if len(self.key_fields) > 1:
        raise ValueError("cannot convert multi-key collection to object JSON")
    result: dict[ID, list[dict[str, JsonValue]]] = {}
    for key, items in self.items():
        result[py_scalar(key[0])] = items.to_json_data()
    return result

def to_json(self, *, object: bool = False) -> str:
    """
    Convert this item list collection to JSON. Wrapper around :meth:`to_json_data`.
    """
    return json.dumps(self.to_json_data(object=object))

@overload
def to_dataset(
self,
Expand Down
16 changes: 16 additions & 0 deletions src/lenskit/data/_items.py
Original file line number Diff line number Diff line change
Expand Up @@ -831,6 +831,19 @@ def to_df(self, *, ids: bool = True, numbers: bool = True) -> pd.DataFrame:
cols.update((k, v.numpy()) for (k, v) in self._fields.items() if k not in ("score", "rank"))
return pd.DataFrame(cols)

def to_json_data(self, *, ids: bool = True, numbers: bool = False):
    """
    Convert this item list to JSON-compatible data.

    .. note::

        This is not a full round-trip serialization — the original item list
        cannot be reconstructed from JSON.
    """
    # Arrow already knows how to render each row as a plain Python dict.
    return self.to_arrow(ids=ids, numbers=numbers).to_pylist()

@overload
def to_arrow(
self,
Expand Down Expand Up @@ -1140,6 +1153,9 @@ def _take(self, sel: ILIndexer, *, ordered: bool | None = None) -> ItemList:
def __len__(self):
return self._len

def __bool__(self):
    # An item list is truthy iff it contains at least one item.
    return self._len != 0

def __getitem__(self, sel: ILIndexer) -> ItemList:
"""
Subset the item list.
Expand Down
2 changes: 1 addition & 1 deletion src/lenskit/data/_vocab.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def numbers(
if pa.types.is_null(self._array.type):
nums = pa.nulls(len(terms), type=pa.int32())
else:
term_arr = pa.array(terms, type=self._array.type) # type: ignore
term_arr = pa.array(terms) # type: ignore
nums = self._index.get_indexes(term_arr) # type: ignore

trace(self._log, "resolved %d IDs, %d invalid", len(terms), nums.null_count)
Expand Down
40 changes: 40 additions & 0 deletions tests/data/test_attribute.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# This file is part of LensKit.
# Copyright (C) 2018-2023 Boise State University.
# Copyright (C) 2023-2026 Drexel University.
# Licensed under the MIT license, see LICENSE.md for details.
# SPDX-License-Identifier: MIT

import pandas as pd

from pytest import raises

from lenskit.data import Dataset, DatasetBuilder


def test_attribute_single_value():
    # A one-item dataset: value() and list() should both expose the name.
    frame = pd.DataFrame({"item_id": [42], "name": ["HACKEM MUCHE"]})
    builder = DatasetBuilder()
    builder.add_entities("item", frame)
    ds = builder.build()
    assert ds.item_count == 1

    attr = ds.entities("item").attribute("name")
    assert attr.value() == "HACKEM MUCHE"
    assert attr.list() == ["HACKEM MUCHE"]


def test_attribute_pydata():
    # Two-item dataset: list() exposes all values, and selections narrow them.
    items = pd.DataFrame({"item_id": [42, 67], "name": ["HACKEM MUCHE", "READ ME"]})
    dsb = DatasetBuilder()
    dsb.add_entities("item", items)
    data = dsb.build()
    assert data.item_count == 2

    assert data.entities("item").attribute("name").list() == ["HACKEM MUCHE", "READ ME"]
    assert data.entities("item").select(ids=[67]).attribute("name").list() == ["READ ME"]
    assert data.entities("item").select(ids=[42]).attribute("name").arrow().to_pylist() == [
        "HACKEM MUCHE"
    ]

    # value() is only defined for a single entity, so the 2-entity call must
    # raise; the wrapping `assert` in the original was never evaluated and
    # only obscured the intent of the raises block.
    with raises(ValueError):
        data.entities("item").attribute("name").value()
    assert data.entities("item").select(ids=[42]).attribute("name").value() == "HACKEM MUCHE"
34 changes: 33 additions & 1 deletion tests/data/test_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# Licensed under the MIT license, see LICENSE.md for details.
# SPDX-License-Identifier: MIT

import json
import logging
import pickle
import warnings
Expand Down Expand Up @@ -408,7 +409,7 @@ def test_recs_df_expected_column(demo_recs: DemoRecs):
rec_df = demo_recs.recommendations.to_df()
print(rec_df)
print(demo_recs.recommendations[0])
assert list(rec_df.columns) == ["user_id", "item_id", "score", "rank"]
assert list(rec_df.columns) == ["user_id", "item_id", "rank", "score"]


def test_to_dataset(demo_recs: DemoRecs):
Expand All @@ -431,3 +432,34 @@ def test_to_dataset(demo_recs: DemoRecs):

assert np.all(ds_user_counts == src_user_counts)
assert np.all(ds_item_counts == src_item_counts)


def test_to_json_array(demo_recs: DemoRecs):
    # List-mode JSON data: one row per item list, in collection order.
    rows = demo_recs.recommendations.to_json_data()
    assert len(rows) == len(demo_recs.recommendations)
    for idx, row in enumerate(rows):
        key, il = demo_recs.recommendations[idx]
        assert row["user_id"] == key.user_id
        assert len(row["items"]) == len(il)
        assert np.all([entry["item_id"] for entry in row["items"]] == il.ids())


def test_to_json_object(demo_recs: DemoRecs):
    # Object-mode JSON data: keyed by user ID, values are item rows.
    mapping = demo_recs.recommendations.to_json_data(object=True)
    assert len(mapping) == len(demo_recs.recommendations)
    for uid, rows in mapping.items():
        il = demo_recs.recommendations.lookup(user_id=uid)
        assert il is not None
        assert len(rows) == len(il)
        assert np.all([r["item_id"] for r in rows] == il.ids())


def test_to_json_object_str(demo_recs: DemoRecs):
    # Round-trip through a JSON string; object keys come back as strings.
    text = demo_recs.recommendations.to_json(object=True)
    mapping = json.loads(text)
    assert len(mapping) == len(demo_recs.recommendations)
    for uid, rows in mapping.items():
        # JSON object keys are always strings, so convert back to int IDs.
        il = demo_recs.recommendations.lookup(user_id=int(uid))
        assert il is not None
        assert len(rows) == len(il)
        assert np.all([r["item_id"] for r in rows] == il.ids())
23 changes: 23 additions & 0 deletions tests/data/test_convert_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# This file is part of LensKit.
# Copyright (C) 2018-2023 Boise State University.
# Copyright (C) 2023-2026 Drexel University.
# Licensed under the MIT license, see LICENSE.md for details.
# SPDX-License-Identifier: MIT

import numpy as np

from lenskit.data._adapt import py_scalar


def test_py_scalar_pyint():
    # Plain Python ints pass through unchanged.
    assert py_scalar(42) == 42


def test_py_scalar_pystr():
    # Plain Python strings pass through unchanged.
    assert py_scalar("foo") == "foo"


def test_py_scalar_npint():
    # NumPy integer scalars are unwrapped to native Python ints.
    result = py_scalar(np.int32(100))
    assert result == 100
    assert isinstance(result, int)
33 changes: 32 additions & 1 deletion tests/data/test_itemlist.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import hypothesis.extra.numpy as nph
import hypothesis.strategies as st
from hypothesis import given, settings
from pytest import mark, raises, warns
from pytest import approx, mark, raises, warns

from lenskit.data import Dataset, ItemList, Vocabulary
from lenskit.diagnostics import DataWarning
Expand Down Expand Up @@ -1051,3 +1051,34 @@ def test_item_list_update_add_for_some_items():
assert all(il.numbers() == base.numbers())
assert il.vocabulary is VOCAB
assert np.array_equal(il.scores(), [np.nan, 100, 200, 300, np.nan], equal_nan=True)


def test_to_json():
    # Default JSON data: one dict per item, containing only the item ID.
    il = ItemList(ITEMS, vocabulary=VOCAB)

    rows = il.to_json_data()
    assert isinstance(rows, list)
    assert len(rows) == len(il)
    for idx, row in enumerate(rows):
        assert row == {"item_id": ITEMS[idx]}


def test_to_json_numbers():
    # Requesting numbers instead of IDs yields item_num fields only.
    il = ItemList(ITEMS, vocabulary=VOCAB)

    rows = il.to_json_data(ids=False, numbers=True)
    assert isinstance(rows, list)
    assert len(rows) == len(il)
    for idx, row in enumerate(rows):
        assert row == {"item_num": idx}


def test_to_json_scores(rng: np.random.Generator):
    # Scores, when present, are included alongside the item IDs.
    scores = rng.random(len(ITEMS))
    il = ItemList(ITEMS, vocabulary=VOCAB, scores=scores)

    rows = il.to_json_data()
    assert isinstance(rows, list)
    assert len(rows) == len(il)
    for idx, row in enumerate(rows):
        assert row == {"item_id": ITEMS[idx], "score": approx(scores[idx])}
Loading