diff --git a/service/clip_encoder.py b/service/clip_encoder.py
index b1e8075..91076b5 100644
--- a/service/clip_encoder.py
+++ b/service/clip_encoder.py
@@ -110,7 +110,7 @@ def encode_images_having_nones(self, images: list[Image]):
         return result, none_indices
 
     def encode_texts(self, texts: list[str]):
-        inputs = self.tokenizer(texts, padding=True, return_tensors="pt")
+        inputs = self.tokenizer(texts, padding=True, return_tensors="pt", truncation=True)
         with torch.no_grad():
             return self.model.get_text_features(**inputs)
 
diff --git a/service/dataset_provider.py b/service/dataset_provider.py
index 584e2a9..9d85a2d 100644
--- a/service/dataset_provider.py
+++ b/service/dataset_provider.py
@@ -3,40 +3,66 @@
 import pandas as pd
 from feast import FeatureStore
 
+import importlib.util
+import sys
+# Workaround to import models from workflow and backend
+# that have different directory structures. (public/* vs /*)
+# from module_models import data_util
+DATA_UTIL_FILE_PATH = Path(__file__).parent.parent / "models" / "data_util.py"
+spec = importlib.util.spec_from_file_location("module.name", str(DATA_UTIL_FILE_PATH))
+data_util = importlib.util.module_from_spec(spec)
+sys.modules["module.name"] = data_util
+spec.loader.exec_module(data_util)
 
-class DatasetProvider:
 
-    def item_df(self):
-        return None
-
-    def user_df(self):
-        return None
-
-    def interaction_df(self):
-        return None
 
 
+class DatasetProvider:
+    """Hold the item, user and interaction dataframes, cached as parquet files in data_dir."""
-class LocalDatasetProvider(DatasetProvider):
 
-    def __init__(self, store=None, data_dir="./feature_repo/data"):
+    def __init__(self, data_dir, force_load):
         self._item_df_path = Path(data_dir) / "recommendation_items.parquet"
         self._user_df_path = Path(data_dir) / "recommendation_users.parquet"
         self._interaction_df_path = (
             Path(data_dir) / "recommendation_interactions.parquet"
         )
+        self._loaded = False
+        # Use the parquet cache only if no reload is forced and all three files exist.
         if (
-            self._item_df_path.exists()
-            & self._user_df_path.exists()
-            & self._interaction_df_path.exists()
+            not force_load
+            and self._item_df_path.exists()
+            and self._user_df_path.exists()
+            and self._interaction_df_path.exists()
         ):
             self._item_df = pd.read_parquet(self._item_df_path)
             self._user_df = pd.read_parquet(self._user_df_path)
             self._interaction_df = pd.read_parquet(self._interaction_df_path)
-            return
+            self._loaded = True
-
-        # Use Feast item, user and interaction services to create the dataframes
-        assert store is not None
-        self._load_from_store(store)
+
+    def item_df(self):
+        return self._item_df
+
+    def user_df(self):
+        return self._user_df
+
+    def interaction_df(self):
+        return self._interaction_df
+
+    def _save_dfs_to_parquet(self):
+        self._item_df.to_parquet(self._item_df_path)
+        self._user_df.to_parquet(self._user_df_path)
+        self._interaction_df.to_parquet(self._interaction_df_path)
+
+
+class LocalDatasetProvider(DatasetProvider):
+    """Provide the dataframes from the parquet cache, or from a Feast store when no cache exists."""
+
+    def __init__(self, store=None, data_dir="./feature_repo/data"):
+        super().__init__(data_dir, False)
+        if self._loaded is False:
+            assert store is not None
+            self._load_from_store(store)
+            self._save_dfs_to_parquet()
 
     def _load_from_store(self, store: FeatureStore):
         # load feature services
@@ -77,15 +103,16 @@ def _load_from_store(self, store: FeatureStore):
         self._interaction_df = store.get_historical_features(
             entity_df=item_user_interactions_df, features=interaction_service
         ).to_df()
-        self._item_df.to_parquet(self._item_df_path)
-        self._user_df.to_parquet(self._user_df_path)
-        self._interaction_df.to_parquet(self._interaction_df_path)
 
-    def item_df(self):
-        return self._item_df
 
-    def user_df(self):
-        return self._user_df
+class RemoteDatasetProvider(DatasetProvider):
+    """Provide the dataframes from the parquet cache, or rebuild them from the CSV at url."""
+
+    def __init__(self, url: str, data_dir="./feature_repo/data", force_load=False):
+        super().__init__(data_dir, force_load)
+        if self._loaded is False:
+            df = pd.read_csv(url)
+            self._item_df, self._user_df, self._interaction_df = data_util.clean_dataset(df)
+            self._save_dfs_to_parquet()
+
 
-    def interaction_df(self):
-        return self._interaction_df
diff --git a/tests/data/long_text.txt b/tests/data/long_text.txt
new file mode 100644
index 0000000..2aca1c4
--- /dev/null
+++ b/tests/data/long_text.txt
@@ -0,0 +1 @@
+The quick brown fox jumps over the lazy dog. In a world where technology evolves rapidly, artificial intelligence and machine learning are transforming industries at an unprecedented pace. From healthcare to finance, automation is streamlining processes, improving efficiency, and enabling new possibilities. As data becomes more abundant, the ability to extract meaningful insights is crucial for decision-making. Collaboration between humans and intelligent systems is shaping the future, fostering innovation and creativity. Ethical considerations, transparency, and fairness are essential to ensure responsible development and deployment of AI technologies. By embracing change and continuous learning, individuals and organizations can thrive in this dynamic landscape, unlocking new opportunities and driving progress for society as a whole.
\ No newline at end of file
diff --git a/tests/test_clip_encoder.py b/tests/test_clip_encoder.py
index 57d58bb..aedfb27 100644
--- a/tests/test_clip_encoder.py
+++ b/tests/test_clip_encoder.py
@@ -78,46 +78,53 @@ def images_having_nones(more_images: list):
     return result
 
 
+@pytest.fixture
+def long_text():
+    txt_path = Path(__file__).parent.joinpath("data").joinpath("long_text.txt")
+    with open(txt_path, "r", encoding="utf-8") as f:
+        return f.read()
+
+
 def test_text_encoding(clip_encoder, more_texts):
     result_batched = clip_encoder.encode_texts_batched(more_texts, batch_size=3)
     result_simple = clip_encoder.encode_texts(more_texts)
     assert torch.allclose(result_batched, result_simple, 1e-05, 1e-05)
 
 
-# def test_image_encoding(clip_encoder, more_images, images_having_nones):
-#     # non batched
-#     result_simple = clip_encoder.encode_images(more_images)
-#     # batched with no nones
-#     result_batched, none_indices = clip_encoder.encode_images_batched_having_nones(
-#         more_images, batch_size=3
-#     )
-#     assert none_indices == []
-#     # we expect the same result
-#     assert torch.allclose(result_batched, result_simple, 1e-05, 1e-05)
+def test_image_encoding(clip_encoder, more_images, images_having_nones):
+    # non batched
+    result_simple = clip_encoder.encode_images(more_images)
+    # batched with no nones
+    result_batched, none_indices = clip_encoder.encode_images_batched_having_nones(
+        more_images, batch_size=3
+    )
+    assert none_indices == []
+    # we expect the same result
+    assert torch.allclose(result_batched, result_simple, 1e-05, 1e-05)
 
-#     # batched with nones
-#     embeddings, none_indices = clip_encoder.encode_images_batched_having_nones(
-#         images_having_nones, batch_size=3
-#     )
-#     assert none_indices == [3, 7, 9]
-#     # we expect the same results for non-nones
-#     for i, _ in enumerate(embeddings):
-#         if i not in none_indices:
-#             assert torch.allclose(embeddings[i], result_simple[i], 1e-05, 1e-05)
+    # batched with nones
+    embeddings, none_indices = clip_encoder.encode_images_batched_having_nones(
+        images_having_nones, batch_size=3
+    )
+    assert none_indices == [3, 7, 9]
+    # we expect the same results for non-nones
+    for i, _ in enumerate(embeddings):
+        if i not in none_indices:
+            assert torch.allclose(embeddings[i], result_simple[i], 1e-05, 1e-05)
 
 
-# def test_image_and_text_encoding(clip_encoder, more_texts, images_having_nones):
-#     clip_embeddings = clip_encoder.encode_texts_and_images(
-#         more_texts, images_having_nones, 4
-#     )
-#     assert clip_embeddings is not None
+def test_image_and_text_encoding(clip_encoder, more_texts, images_having_nones):
+    clip_embeddings = clip_encoder.encode_texts_and_images(
+        more_texts, images_having_nones, 4
+    )
+    assert clip_embeddings is not None
 
 
-# def test_item_df_embedding_generation(clip_encoder, item_df):
-#     item_clip_features_embed = clip_encoder.clip_embeddings(item_df)
-#     # produced the object to be used by the workflow:
-#     # store.push('item_clip_features_embed', item_clip_features_embed, to=PushMode.ONLINE, allow_registry_cache=False)
-#     assert item_clip_features_embed is not None
+def test_item_df_embedding_generation(clip_encoder, item_df):
+    item_clip_features_embed = clip_encoder.clip_embeddings(item_df)
+    # produced the object to be used by the workflow:
+    # store.push('item_clip_features_embed', item_clip_features_embed, to=PushMode.ONLINE, allow_registry_cache=False)
+    assert item_clip_features_embed is not None
 
 
 @pytest.fixture
@@ -131,3 +138,8 @@ def item_wrong_img_df():
 def test_wrong_url(clip_encoder, item_wrong_img_df):
     clip_embeddings = clip_encoder.create_clip_embeddings(item_wrong_img_df)
     assert clip_embeddings
+
+
+def test_long_text(clip_encoder, long_text):
+    embeddings = clip_encoder.encode_texts([long_text])
+    assert embeddings is not None