Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion service/clip_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def encode_images_having_nones(self, images: list[Image]):
return result, none_indices

def encode_texts(self, texts: list[str]):
    """Encode a list of texts into text feature embeddings.

    The texts are tokenized in a single call with padding and truncation
    enabled, and the resulting tensors are passed to
    ``self.model.get_text_features`` with gradient tracking disabled.

    Args:
        texts: The strings to encode.

    Returns:
        Whatever ``self.model.get_text_features`` returns for the
        tokenized batch.
    """
    # Tokenize exactly once. truncation=True clips over-long inputs
    # (presumably to the tokenizer's maximum length -- confirm against the
    # tokenizer configuration) so long texts can be encoded.
    inputs = self.tokenizer(
        texts, padding=True, truncation=True, return_tensors="pt"
    )
    with torch.no_grad():
        return self.model.get_text_features(**inputs)

Expand Down
77 changes: 49 additions & 28 deletions service/dataset_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,40 +3,61 @@

import pandas as pd
from feast import FeatureStore
import importlib
import sys

# Workaround to import the shared models module: the workflow and the backend
# have different directory structures (public/* vs /*), so the regular
# ``from module_models import data_util`` cannot be used here. The module is
# loaded directly from its file path instead.
import importlib.util  # ``import importlib`` alone does not guarantee ``importlib.util`` is loaded

DATA_UTIL_FILE_PATH = Path(__file__).parent.parent / "models" / "data_util.py"
spec = importlib.util.spec_from_file_location("module.name", str(DATA_UTIL_FILE_PATH))
if spec is None or spec.loader is None:
    # Fail with a clear import error instead of an AttributeError on None.
    raise ImportError(f"Cannot create an import spec for {DATA_UTIL_FILE_PATH}")
data_util = importlib.util.module_from_spec(spec)
# Register the module before executing it, as in the standard
# "import a source file directly" recipe.
sys.modules["module.name"] = data_util
spec.loader.exec_module(data_util)

class DatasetProvider:
    """Base provider for the item, user and interaction dataframes.

    The three dataframes are cached as parquet files inside ``data_dir``.
    On construction the cache is read only when all three files exist and
    ``force_load`` is falsy; ``self._loaded`` records whether that happened
    so subclasses can decide to fetch the data from their own source and
    then call ``_save_dfs_to_parquet``.
    """

    def __init__(self, data_dir, force_load):
        """Resolve the cache file paths and read the cache when allowed.

        Args:
            data_dir: Directory holding the parquet cache files.
            force_load: When truthy, the cached files are ignored so the
                subclass reloads the data from its source.
        """
        data_path = Path(data_dir)
        self._item_df_path = data_path / "recommendation_items.parquet"
        self._user_df_path = data_path / "recommendation_users.parquet"
        self._interaction_df_path = (
            data_path / "recommendation_interactions.parquet"
        )
        # Until data is loaded the getters return None rather than raising
        # AttributeError.
        self._item_df = None
        self._user_df = None
        self._interaction_df = None
        self._loaded = False

        # Plain boolean logic on purpose: ``a & b & c & force_load is False``
        # parses as ``(a & b & c & force_load) is False``, which is True
        # whenever a cache file is missing and would make the constructor
        # try to read files that do not exist.
        cache_complete = (
            self._item_df_path.exists()
            and self._user_df_path.exists()
            and self._interaction_df_path.exists()
        )
        if cache_complete and not force_load:
            self._item_df = pd.read_parquet(self._item_df_path)
            self._user_df = pd.read_parquet(self._user_df_path)
            self._interaction_df = pd.read_parquet(self._interaction_df_path)
            self._loaded = True

    def item_df(self):
        """Return the item dataframe, or None if nothing has been loaded."""
        return self._item_df

    def user_df(self):
        """Return the user dataframe, or None if nothing has been loaded."""
        return self._user_df

    def interaction_df(self):
        """Return the interaction dataframe, or None if nothing has been loaded."""
        return self._interaction_df

    def _save_dfs_to_parquet(self):
        """Write the three dataframes to their parquet cache files."""
        self._item_df.to_parquet(self._item_df_path)
        self._user_df.to_parquet(self._user_df_path)
        self._interaction_df.to_parquet(self._interaction_df_path)


class LocalDatasetProvider(DatasetProvider):

def __init__(self, store=None, data_dir="./feature_repo/data"):
    """Load the dataframes from the parquet cache or, failing that, from ``store``.

    The base initializer is called with ``force_load=False``. When it did
    not load the data, ``store`` must be provided: the dataframes are then
    fetched through ``_load_from_store`` and written back to the cache.
    """
    super().__init__(data_dir, False)
    if self._loaded is not False:
        # The base class already populated the dataframes.
        return

    # No usable cache: the store is the only remaining source.
    assert store is not None
    self._load_from_store(store)
    self._save_dfs_to_parquet()

def _load_from_store(self, store: FeatureStore):
# load feature services
Expand Down Expand Up @@ -77,15 +98,15 @@ def _load_from_store(self, store: FeatureStore):
self._interaction_df = store.get_historical_features(
entity_df=item_user_interactions_df, features=interaction_service
).to_df()
self._item_df.to_parquet(self._item_df_path)
self._user_df.to_parquet(self._user_df_path)
self._interaction_df.to_parquet(self._interaction_df_path)

def item_df(self):
    """Return the item dataframe stored on this provider."""
    return self._item_df

def user_df(self):
    """Return the user dataframe stored on this provider."""
    return self._user_df
class RemoteDatasetProvider(DatasetProvider):
    """Dataset provider that reads a CSV from ``url`` when the cache was not loaded."""

    def __init__(self, url: str, data_dir="./feature_repo/data", force_load=False):
        """Load from the parquet cache, or read and clean the CSV at ``url``.

        When the base initializer did not load the data, the CSV is read
        with pandas, split into item, user and interaction dataframes by
        ``data_util.clean_dataset``, and written to the parquet cache.
        """
        super().__init__(data_dir, force_load)
        if self._loaded is not False:
            # The base class already populated the dataframes.
            return

        raw_df = pd.read_csv(url)
        cleaned = data_util.clean_dataset(raw_df)
        self._item_df, self._user_df, self._interaction_df = cleaned
        self._save_dfs_to_parquet()


def interaction_df(self):
    """Return the interaction dataframe stored on this provider."""
    return self._interaction_df
1 change: 1 addition & 0 deletions tests/data/long_text.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
The quick brown fox jumps over the lazy dog. In a world where technology evolves rapidly, artificial intelligence and machine learning are transforming industries at an unprecedented pace. From healthcare to finance, automation is streamlining processes, improving efficiency, and enabling new possibilities. As data becomes more abundant, the ability to extract meaningful insights is crucial for decision-making. Collaboration between humans and intelligent systems is shaping the future, fostering innovation and creativity. Ethical considerations, transparency, and fairness are essential to ensure responsible development and deployment of AI technologies. By embracing change and continuous learning, individuals and organizations can thrive in this dynamic landscape, unlocking new opportunities and driving progress for society as a whole.
70 changes: 41 additions & 29 deletions tests/test_clip_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,46 +78,53 @@ def images_having_nones(more_images: list):
return result


@pytest.fixture
def long_text():
    """Return the contents of ``data/long_text.txt`` located next to this test module."""
    sample_file = Path(__file__).parent / "data" / "long_text.txt"
    return sample_file.read_text(encoding="utf-8")


def test_text_encoding(clip_encoder, more_texts):
    """Batched text encoding must match encoding all texts in one call."""
    expected = clip_encoder.encode_texts(more_texts)
    batched = clip_encoder.encode_texts_batched(more_texts, batch_size=3)
    assert torch.allclose(batched, expected, rtol=1e-05, atol=1e-05)


# def test_image_encoding(clip_encoder, more_images, images_having_nones):
# # non batched
# result_simple = clip_encoder.encode_images(more_images)
# # batched with no nones
# result_batched, none_indices = clip_encoder.encode_images_batched_having_nones(
# more_images, batch_size=3
# )
# assert none_indices == []
# # we expect the same result
# assert torch.allclose(result_batched, result_simple, 1e-05, 1e-05)
def test_image_encoding(clip_encoder, more_images, images_having_nones):
    """Batched image encoding must match plain encoding and report None positions."""
    tolerance = {"rtol": 1e-05, "atol": 1e-05}

    # Reference result: every image encoded in one non-batched call.
    reference = clip_encoder.encode_images(more_images)

    # Batched run over input that contains no None entries.
    batched, missing = clip_encoder.encode_images_batched_having_nones(
        more_images, batch_size=3
    )
    assert missing == []
    # The batched result is expected to equal the reference.
    assert torch.allclose(batched, reference, **tolerance)

    # Batched run over input that contains None entries.
    with_nones, missing = clip_encoder.encode_images_batched_having_nones(
        images_having_nones, batch_size=3
    )
    assert missing == [3, 7, 9]
    # Every position that is not reported as None must match the reference.
    for position, embedding in enumerate(with_nones):
        if position in missing:
            continue
        assert torch.allclose(embedding, reference[position], **tolerance)


# def test_image_and_text_encoding(clip_encoder, more_texts, images_having_nones):
# clip_embeddings = clip_encoder.encode_texts_and_images(
# more_texts, images_having_nones, 4
# )
# assert clip_embeddings is not None
def test_image_and_text_encoding(clip_encoder, more_texts, images_having_nones):
    """Combined text and image encoding must produce a result."""
    batch_size = 4
    combined = clip_encoder.encode_texts_and_images(
        more_texts, images_having_nones, batch_size
    )
    assert combined is not None


# def test_item_df_embedding_generation(clip_encoder, item_df):
# item_clip_features_embed = clip_encoder.clip_embeddings(item_df)
# # produced the object to be used by the workflow:
# # store.push('item_clip_features_embed', item_clip_features_embed, to=PushMode.ONLINE, allow_registry_cache=False)
# assert item_clip_features_embed is not None
def test_item_df_embedding_generation(clip_encoder, item_df):
    """Generating CLIP embeddings for the item dataframe must produce a result."""
    # The object produced here is the one the workflow pushes to the store:
    # store.push('item_clip_features_embed', item_clip_features_embed, to=PushMode.ONLINE, allow_registry_cache=False)
    embeddings_for_items = clip_encoder.clip_embeddings(item_df)
    assert embeddings_for_items is not None


@pytest.fixture
Expand All @@ -131,3 +138,8 @@ def item_wrong_img_df():
def test_wrong_url(clip_encoder, item_wrong_img_df):
    """Creating embeddings from the ``item_wrong_img_df`` fixture must yield a truthy result."""
    result = clip_encoder.create_clip_embeddings(item_wrong_img_df)
    assert result


def test_long_text(clip_encoder, long_text):
    """Encoding a single long text must return a result."""
    single_text_batch = [long_text]
    encoded = clip_encoder.encode_texts(single_text_batch)
    assert encoded is not None
Loading