From 8dd508238bdb0840a4d22f62b0597aef0e80a1b3 Mon Sep 17 00:00:00 2001
From: Fabio Massimo Ercoli
Date: Mon, 14 Jul 2025 12:55:58 +0200
Subject: [PATCH 1/4] APPENG-3315 Restore clip test

This has been fixed by APPENG-3363
---
Review note: this series was re-flowed from a whitespace-collapsed copy.
Leading whitespace on context and removed lines is reconstructed and should
be checked against the tree before applying.

 tests/test_clip_encoder.py | 68 +++++++++++++++++++------------------
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/tests/test_clip_encoder.py b/tests/test_clip_encoder.py
index 57d58bb..905f5f3 100644
--- a/tests/test_clip_encoder.py
+++ b/tests/test_clip_encoder.py
@@ -84,40 +84,40 @@ def test_text_encoding(clip_encoder, more_texts):
     assert torch.allclose(result_batched, result_simple, 1e-05, 1e-05)
 
 
-# def test_image_encoding(clip_encoder, more_images, images_having_nones):
-#     # non batched
-#     result_simple = clip_encoder.encode_images(more_images)
-#     # batched with no nones
-#     result_batched, none_indices = clip_encoder.encode_images_batched_having_nones(
-#         more_images, batch_size=3
-#     )
-#     assert none_indices == []
-#     # we expect the same result
-#     assert torch.allclose(result_batched, result_simple, 1e-05, 1e-05)
-
-#     # batched with nones
-#     embeddings, none_indices = clip_encoder.encode_images_batched_having_nones(
-#         images_having_nones, batch_size=3
-#     )
-#     assert none_indices == [3, 7, 9]
-#     # we expect the same results for non-nones
-#     for i, _ in enumerate(embeddings):
-#         if i not in none_indices:
-#             assert torch.allclose(embeddings[i], result_simple[i], 1e-05, 1e-05)
-
-
-# def test_image_and_text_encoding(clip_encoder, more_texts, images_having_nones):
-#     clip_embeddings = clip_encoder.encode_texts_and_images(
-#         more_texts, images_having_nones, 4
-#     )
-#     assert clip_embeddings is not None
-
-
-# def test_item_df_embedding_generation(clip_encoder, item_df):
-#     item_clip_features_embed = clip_encoder.clip_embeddings(item_df)
-#     # produced the object to be used by the workflow:
-#     # store.push('item_clip_features_embed', item_clip_features_embed, to=PushMode.ONLINE, allow_registry_cache=False)
-#     assert item_clip_features_embed is not None
+def test_image_encoding(clip_encoder, more_images, images_having_nones):
+    # non batched
+    result_simple = clip_encoder.encode_images(more_images)
+    # batched with no nones
+    result_batched, none_indices = clip_encoder.encode_images_batched_having_nones(
+        more_images, batch_size=3
+    )
+    assert none_indices == []
+    # we expect the same result
+    assert torch.allclose(result_batched, result_simple, 1e-05, 1e-05)
+
+    # batched with nones
+    embeddings, none_indices = clip_encoder.encode_images_batched_having_nones(
+        images_having_nones, batch_size=3
+    )
+    assert none_indices == [3, 7, 9]
+    # we expect the same results for non-nones
+    for i, _ in enumerate(embeddings):
+        if i not in none_indices:
+            assert torch.allclose(embeddings[i], result_simple[i], 1e-05, 1e-05)
+
+
+def test_image_and_text_encoding(clip_encoder, more_texts, images_having_nones):
+    clip_embeddings = clip_encoder.encode_texts_and_images(
+        more_texts, images_having_nones, 4
+    )
+    assert clip_embeddings is not None
+
+
+def test_item_df_embedding_generation(clip_encoder, item_df):
+    item_clip_features_embed = clip_encoder.clip_embeddings(item_df)
+    # produced the object to be used by the workflow:
+    # store.push('item_clip_features_embed', item_clip_features_embed, to=PushMode.ONLINE, allow_registry_cache=False)
+    assert item_clip_features_embed is not None
 
 
 @pytest.fixture
From 7a9abc48c36068152ec06f187d9fbce6114dffa8 Mon Sep 17 00:00:00 2001
From: Fabio Massimo Ercoli
Date: Mon, 14 Jul 2025 13:17:33 +0200
Subject: [PATCH 2/4] APPENG-3315 Truncate large text clip encoding

---
 service/clip_encoder.py    |  2 +-
 tests/data/long_text.txt   |  1 +
 tests/test_clip_encoder.py | 12 ++++++++++++
 3 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100644 tests/data/long_text.txt

diff --git a/service/clip_encoder.py b/service/clip_encoder.py
index b1e8075..91076b5 100644
--- a/service/clip_encoder.py
+++ b/service/clip_encoder.py
@@ -110,7 +110,7 @@ def encode_images_having_nones(self, images: list[Image]):
         return result, none_indices
 
     def encode_texts(self, texts: list[str]):
-        inputs = self.tokenizer(texts, padding=True, return_tensors="pt")
+        inputs = self.tokenizer(texts, padding=True, return_tensors="pt", truncation=True)
         with torch.no_grad():
             return self.model.get_text_features(**inputs)
 
diff --git a/tests/data/long_text.txt b/tests/data/long_text.txt
new file mode 100644
index 0000000..2aca1c4
--- /dev/null
+++ b/tests/data/long_text.txt
@@ -0,0 +1 @@
+The quick brown fox jumps over the lazy dog. In a world where technology evolves rapidly, artificial intelligence and machine learning are transforming industries at an unprecedented pace. From healthcare to finance, automation is streamlining processes, improving efficiency, and enabling new possibilities. As data becomes more abundant, the ability to extract meaningful insights is crucial for decision-making. Collaboration between humans and intelligent systems is shaping the future, fostering innovation and creativity. Ethical considerations, transparency, and fairness are essential to ensure responsible development and deployment of AI technologies. By embracing change and continuous learning, individuals and organizations can thrive in this dynamic landscape, unlocking new opportunities and driving progress for society as a whole.
\ No newline at end of file
diff --git a/tests/test_clip_encoder.py b/tests/test_clip_encoder.py
index 905f5f3..aedfb27 100644
--- a/tests/test_clip_encoder.py
+++ b/tests/test_clip_encoder.py
@@ -78,6 +78,13 @@ def images_having_nones(more_images: list):
     return result
 
 
+@pytest.fixture
+def long_text():
+    txt_path = Path(__file__).parent.joinpath("data").joinpath("long_text.txt")
+    with open(txt_path, "r", encoding="utf-8") as f:
+        return f.read()
+
+
 def test_text_encoding(clip_encoder, more_texts):
     result_batched = clip_encoder.encode_texts_batched(more_texts, batch_size=3)
     result_simple = clip_encoder.encode_texts(more_texts)
@@ -131,3 +138,8 @@ def item_wrong_img_df():
 def test_wrong_url(clip_encoder, item_wrong_img_df):
     clip_embeddings = clip_encoder.create_clip_embeddings(item_wrong_img_df)
     assert clip_embeddings
+
+
+def test_long_text(clip_encoder, long_text):
+    embeddings = clip_encoder.encode_texts([long_text])
+    assert embeddings is not None
\ No newline at end of file
From a817c53eb5c6c0ad4396e0ee0c609149d5630e55 Mon Sep 17 00:00:00 2001
From: Fabio Massimo Ercoli
Date: Thu, 10 Jul 2025 12:58:06 +0200
Subject: [PATCH 3/4] APPENG-3315 Introduce remote ds

---
Review note: the cache check in DatasetProvider.__init__ was first written as
"a & b & c & force_load is False". "&" binds tighter than "is", so Python
reads that as "(a & b & c & force_load) is False", which is True whenever
force_load is False, even when the parquet files are missing. pd.read_parquet
then raises and the Feast / CSV fallback is never reached. The condition below
uses "not force_load and ... and ..." instead. The blob ids on the "index"
lines for service/dataset_provider.py predate this change.

 service/dataset_provider.py | 68 ++++++++++++++++++++++---------------
 1 file changed, 40 insertions(+), 28 deletions(-)

diff --git a/service/dataset_provider.py b/service/dataset_provider.py
index 584e2a9..0324346 100644
--- a/service/dataset_provider.py
+++ b/service/dataset_provider.py
@@ -4,39 +4,51 @@
 import pandas as pd
 from feast import FeatureStore
 
+from models import data_util
 
-class DatasetProvider:
-    def item_df(self):
-        return None
-
-    def user_df(self):
-        return None
-
-    def interaction_df(self):
-        return None
 
+class DatasetProvider:
 
-class LocalDatasetProvider(DatasetProvider):
-    def __init__(self, store=None, data_dir="./feature_repo/data"):
+    def __init__(self, data_dir, force_load):
         self._item_df_path = Path(data_dir) / "recommendation_items.parquet"
         self._user_df_path = Path(data_dir) / "recommendation_users.parquet"
         self._interaction_df_path = (
-            Path(data_dir) / "recommendation_interactions.parquet"
+                Path(data_dir) / "recommendation_interactions.parquet"
         )
+        self._loaded = False
 
         if (
-            self._item_df_path.exists()
-            & self._user_df_path.exists()
-            & self._interaction_df_path.exists()
+                not force_load and self._item_df_path.exists()
+                and self._user_df_path.exists() and self._interaction_df_path.exists()
         ):
             self._item_df = pd.read_parquet(self._item_df_path)
             self._user_df = pd.read_parquet(self._user_df_path)
             self._interaction_df = pd.read_parquet(self._interaction_df_path)
-            return
+            self._loaded = True
 
-        # Use Feast item, user and interaction services to create the dataframes
-        assert store is not None
-        self._load_from_store(store)
+    def item_df(self):
+        return self._item_df
+
+    def user_df(self):
+        return self._user_df
+
+    def interaction_df(self):
+        return self._interaction_df
+
+    def _save_dfs_to_parquet(self):
+        self._item_df.to_parquet(self._item_df_path)
+        self._user_df.to_parquet(self._user_df_path)
+        self._interaction_df.to_parquet(self._interaction_df_path)
+
+
+class LocalDatasetProvider(DatasetProvider):
+
+    def __init__(self, store=None, data_dir="./feature_repo/data"):
+        super().__init__(data_dir, False)
+        if self._loaded is False:
+            assert store is not None
+            self._load_from_store(store)
+            self._save_dfs_to_parquet()
 
     def _load_from_store(self, store: FeatureStore):
         # load feature services
@@ -77,15 +89,15 @@ def _load_from_store(self, store: FeatureStore):
         self._interaction_df = store.get_historical_features(
             entity_df=item_user_interactions_df, features=interaction_service
         ).to_df()
-        self._item_df.to_parquet(self._item_df_path)
-        self._user_df.to_parquet(self._user_df_path)
-        self._interaction_df.to_parquet(self._interaction_df_path)
 
-    def item_df(self):
-        return self._item_df
 
-    def user_df(self):
-        return self._user_df
+class RemoteDatasetProvider(DatasetProvider):
+
+    def __init__(self, url: str, data_dir="./feature_repo/data", force_load=False):
+        super().__init__(data_dir, force_load)
+        if self._loaded is False:
+            df = pd.read_csv(url)
+            self._item_df, self._user_df, self._interaction_df = data_util.clean_dataset(df)
+            self._save_dfs_to_parquet()
+
 
-    def interaction_df(self):
-        return self._interaction_df
From f5ec6a31bee64f384fd46bf24e5dc56d31819302 Mon Sep 17 00:00:00 2001
From: Fabio Massimo Ercoli
Date: Wed, 16 Jul 2025 15:13:40 +0200
Subject: [PATCH 4/4] APPENG-3315 Workaround import from backend / workflow

---
Review note: "import importlib" alone does not guarantee that the
importlib.util submodule is bound as an attribute; it only works when some
other import has already loaded it. The submodule is now imported explicitly.

 service/dataset_provider.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/service/dataset_provider.py b/service/dataset_provider.py
index 0324346..9d85a2d 100644
--- a/service/dataset_provider.py
+++ b/service/dataset_provider.py
@@ -3,8 +3,17 @@
 
 import pandas as pd
 from feast import FeatureStore
-
-from models import data_util
+import importlib.util
+import sys
+
+# Workaround to import models from workflow and backend
+# that have different directory structures. (public/* vs /*)
+# from module_models import data_util
+DATA_UTIL_FILE_PATH = Path(__file__).parent.parent / "models" / "data_util.py"
+spec = importlib.util.spec_from_file_location("module.name", str(DATA_UTIL_FILE_PATH))
+data_util = importlib.util.module_from_spec(spec)
+sys.modules["module.name"] = data_util
+spec.loader.exec_module(data_util)
 
 
 class DatasetProvider: