From b40e25a6c3ca8422de65c91d5e972010b5112747 Mon Sep 17 00:00:00 2001 From: harmandeep2993 Date: Sat, 25 Apr 2026 19:20:45 +0200 Subject: [PATCH 1/3] chore: code cleaning and improve logging --- src/data/loader.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/data/loader.py b/src/data/loader.py index 5e5323a..e9fc2a0 100644 --- a/src/data/loader.py +++ b/src/data/loader.py @@ -1,5 +1,7 @@ # src/data/loader.py +"""Data loading module for MovieLens 1M dataset.""" + import pandas as pd from typing import Dict from pathlib import Path @@ -22,7 +24,7 @@ def load_dataset(folder: Path = FILE_PATH) -> Dict[str, pd.DataFrame]: """ Load all .dat files and assign headers based on filename. - + Args: folder (Path): The folder containing the .dat files. From 8b601cb0f03a2c1f000110e84e06902ba085fb46 Mon Sep 17 00:00:00 2001 From: harmandeep2993 Date: Sat, 25 Apr 2026 19:52:43 +0200 Subject: [PATCH 2/3] chore: improve logging & remove unwanted code blocks & comments --- src/data/loader.py | 2 -- src/data/preprocessor.py | 36 ++++++++++++++++-------------------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/src/data/loader.py b/src/data/loader.py index e9fc2a0..57cde5c 100644 --- a/src/data/loader.py +++ b/src/data/loader.py @@ -24,10 +24,8 @@ def load_dataset(folder: Path = FILE_PATH) -> Dict[str, pd.DataFrame]: """ Load all .dat files and assign headers based on filename. - Args: folder (Path): The folder containing the .dat files. - Returns: Dict[str, pd.DataFrame]: A dictionary mapping dataset names to DataFrames. """ diff --git a/src/data/preprocessor.py b/src/data/preprocessor.py index d372504..1d5ac3d 100644 --- a/src/data/preprocessor.py +++ b/src/data/preprocessor.py @@ -1,12 +1,6 @@ # src/data/preprocessor.py -""" -Preprocessing module for MovieLens 1M dataset. 
- -Responsibilities: -- Filter low activity movies -- Train/test split -""" +"""Preprocessing module for MovieLens 1M dataset.""" import pandas as pd from sklearn.model_selection import train_test_split @@ -26,6 +20,7 @@ def _get_missing_values(dataframe: pd.DataFrame) -> tuple: """ missing_values = dataframe.isnull().sum().sum() missing_percentage = (missing_values / len(dataframe)) * 100 + logger.info(f"Missing values: {missing_values}, Missing percentage: {missing_percentage:.2f}%") return missing_values, missing_percentage @@ -41,6 +36,7 @@ def _remove_duplicates(dataframe: pd.DataFrame) -> pd.DataFrame: dataframe with duplicates removed """ duplicates = dataframe.duplicated().sum() + logger.info(f"Duplicated rows: {duplicates}") return dataframe.drop_duplicates() @@ -49,32 +45,31 @@ def _remove_duplicates(dataframe: pd.DataFrame) -> pd.DataFrame: def filter_movies(ratings: pd.DataFrame, min_ratings: int = 10) -> pd.DataFrame: """ Filter out movies with fewer than min_ratings ratings. 
- + Args: ratings: ratings dataframe min_ratings: minimum number of ratings required Returns: filtered ratings dataframe """ + logger.info(f"Movies before filtering: {ratings['movie_id'].nunique()}") + + # Filter logic: count ratings per movie and keep only those with enough ratings movie_counts = ratings.groupby("movie_id")["rating"].count() valid_movies = movie_counts[movie_counts >= min_ratings].index + + # Log the number of movies removed due to low ratings filtered = ratings[ratings["movie_id"].isin(valid_movies)] - movies_removed = ratings["movie_id"].nunique() - filtered["movie_id"].nunique() + logger.info(f"Movies after filtering: {filtered['movie_id'].nunique()}") - logger.info(f"Filtering movies with less than {min_ratings} ratings...") - logger.info(f"Before filtering: {ratings['movie_id'].nunique()}") - logger.info(f"After filtering: {filtered['movie_id'].nunique()}") + movies_removed = ratings["movie_id"].nunique() - filtered["movie_id"].nunique() logger.info(f"Movies removed: {movies_removed}") return filtered # Train/test split for ratings -def train_test_split_ratings( - ratings: pd.DataFrame, - test_size: float = 0.2, - random_state: int = 42 -) -> tuple: +def train_test_split_ratings(ratings: pd.DataFrame, test_size: float = 0.2, random_state: int = 42) -> tuple: """ Split ratings into train and test sets. 
@@ -89,7 +84,7 @@ def train_test_split_ratings( ratings, test_size=test_size, random_state=random_state, - stratify=ratings["user_id"] + stratify=ratings["user_id"] # stratify by user_id to ensure all users are represented in both sets ) logger.info(f"Train size: {train.shape}") @@ -111,18 +106,19 @@ def preprocess_pipeline(ratings: pd.DataFrame,movies: pd.DataFrame, users: pd.Da tuple: train, test, movies, users """ # check missing values for all - logger.info("Checking for missing values...") + logger.info("Checking Missing Valuses...") _get_missing_values(ratings) _get_missing_values(movies) _get_missing_values(users) # remove duplicates from all - logger.info("Duplicate rows check and removal...") + logger.info("Duplicate Check and Removal...") ratings = _remove_duplicates(ratings) movies = _remove_duplicates(movies) users = _remove_duplicates(users) # filter low activity movies from ratings + logger.info("Filtering low activity movies...") ratings = filter_movies(ratings) # train test split on ratings From 13aaf3ad56b47f773372f577b53926da79e17f9c Mon Sep 17 00:00:00 2001 From: harmandeep2993 Date: Sat, 25 Apr 2026 20:13:29 +0200 Subject: [PATCH 3/3] feat: add normalisation for user item matrix --- src/features/build_features.py | 97 +++++++++++++++++++++++++++++----- 1 file changed, 85 insertions(+), 12 deletions(-) diff --git a/src/features/build_features.py b/src/features/build_features.py index 6ba4cb9..d4eeb08 100644 --- a/src/features/build_features.py +++ b/src/features/build_features.py @@ -9,26 +9,26 @@ - build_interaction_features: Builds interaction features from the ratings DataFrame. """ -from pathlib import Path - import pandas as pd +import numpy as np +from pathlib import Path +from scipy.sparse import csr_matrix from src.utils import get_logger # Initialize logger logger = get_logger(__name__) + def build_user_item_matrix(train: pd.DataFrame) -> tuple: """ Build sparse User-Item Matrix from training data. 
Args: - train: training dataframe - + train: training dataframe (ratings train data) Returns: tuple: sparse matrix, user_map, item_map """ - from scipy.sparse import csr_matrix # create mappings user_ids = train["user_id"].unique() @@ -48,13 +48,55 @@ def build_user_item_matrix(train: pd.DataFrame) -> tuple: shape=(len(user_map), len(item_map)) ) + # calculate sparsity + sparsity = 1 - user_item_matrix.nnz / (user_item_matrix.shape[0] * user_item_matrix.shape[1]) + + # log matrix info logger.info(f"Matrix shape: {user_item_matrix.shape}") - logger.info(f"Sparsity: {1 - user_item_matrix.nnz / (user_item_matrix.shape[0] * user_item_matrix.shape[1]):.4f}") + logger.info(f"Sparsity: {sparsity:.4f}") return user_item_matrix, user_map, item_map -# Note: This function is used by the feature pipeline and can also be reused for future feature engineering steps. +# Matrix normalization function +def normalize_matrix(user_item_matrix: csr_matrix) -> tuple: + """ + Normalize user ratings by subtracting user mean. 
+ + Args: + user_item_matrix: sparse user-item matrix + Returns: + tuple: normalized matrix, user means + """ + # lil_matrix for efficient row operations, then convert back to csr_matrix + from scipy.sparse import lil_matrix + + # initialize user means array + user_means = np.zeros(user_item_matrix.shape[0]) + + # calculate user means + for i in range(user_item_matrix.shape[0]): + user_ratings = user_item_matrix[i].toarray().flatten() + rated = user_ratings[user_ratings > 0] + if len(rated) > 0: + user_means[i] = rated.mean() + + # convert to lil_matrix for efficient row operations + normalized = lil_matrix(user_item_matrix.shape, dtype=float) + + for i in range(user_item_matrix.shape[0]): + row = user_item_matrix[i].toarray().flatten() + nonzero_idx = np.where(row > 0)[0] + if len(nonzero_idx) > 0 and user_means[i] > 0: + normalized[i, nonzero_idx] = row[nonzero_idx] - user_means[i] + + # convert back to csr_matrix for efficient computations later + normalized = normalized.tocsr() + + return normalized, user_means + + +# Functions to save features to disk def save_features(user_item_matrix, user_map: dict, item_map: dict) -> None: """ Save sparse matrix and mappings to disk. @@ -63,8 +105,7 @@ def save_features(user_item_matrix, user_map: dict, item_map: dict) -> None: user_item_matrix: sparse user-item matrix user_map: mapping of user_id to row index item_map: mapping of movie_id to col index - """ - + """ from scipy.sparse import save_npz matrix_path = Path(__file__).parent.parent.parent / "data" / "processed" @@ -87,6 +128,29 @@ def save_features(user_item_matrix, user_map: dict, item_map: dict) -> None: logger.info(f"Saved matrix to {matrix_path}") logger.info(f"Saved mappings to {mappings_path}") + +# Function to save normalized matrix and user means +def save_normalized_matrix(normalized_matrix: csr_matrix, user_means: np.ndarray) -> None: + """ + Save normalized matrix to disk. 
+ + Args: + normalized_matrix: normalized sparse user-item matrix + user_means: array of user means + """ + from scipy.sparse import save_npz + + matrix_path = Path(__file__).parent.parent.parent / "data" / "processed" + matrix_path.mkdir(parents=True, exist_ok=True) + + save_npz(matrix_path / "normalized_matrix.npz", normalized_matrix) + np.save(matrix_path / "user_means.npy", user_means) + + logger.info(f"Saved normalized matrix to {matrix_path}") + logger.info(f"Saved user means to {matrix_path}") + + +# Main pipeline function to run all feature building steps def build_features_pipeline(train: pd.DataFrame) -> tuple: """ Run all feature building steps in order. @@ -94,14 +158,23 @@ def build_features_pipeline(train: pd.DataFrame) -> tuple: Args: train: training dataframe Returns: - tuple: user_item_matrix, user_map, item_map + tuple: user_item_matrix, user_map, item_map, user_means, normalized_matrix """ + logger.info("=== START: FEATURE BUILD PIPELINE ===") + # build matrix + logger.info("Building user-item matrix...") user_item_matrix, user_map, item_map = build_user_item_matrix(train) + # normalize matrix + logger.info("Normalizing matrix...") + normalized_matrix, user_means = normalize_matrix(user_item_matrix) + # save to disk + logger.info("Saving features to disk...") save_features(user_item_matrix, user_map, item_map) + save_normalized_matrix(normalized_matrix, user_means) - logger.info("Feature pipeline completed successfully.") + logger.info("=== END: FEATURE BUILD PIPELINE ===") - return user_item_matrix, user_map, item_map \ No newline at end of file + return user_item_matrix, user_map, item_map, user_means, normalized_matrix \ No newline at end of file