From 62046a0c7526d3f200c1b2a2b597ba07a6977e60 Mon Sep 17 00:00:00 2001 From: harmandeep2993 Date: Sun, 26 Apr 2026 02:27:01 +0200 Subject: [PATCH 1/5] feat: add SVD model training with normalization and MLflow tracking --- src/models/train.py | 195 ++++++++++++++++++++++++++++++++------------ 1 file changed, 145 insertions(+), 50 deletions(-) diff --git a/src/models/train.py b/src/models/train.py index d1757a2..5573f1b 100644 --- a/src/models/train.py +++ b/src/models/train.py @@ -1,7 +1,6 @@ # src/models/train.py -""" Training module for the movie recommendation system. -This module contains functions to train the recommendation model using the training data.""" +"""Training module for the movie recommendation system.""" import joblib import mlflow @@ -16,61 +15,89 @@ logger = get_logger(__name__) -# Define the path to save the trained model MODELS_PATH = Path(__file__).parent.parent.parent / "models" + def train_model(user_item_matrix: csr_matrix, k: int = 50) -> NearestNeighbors: """ - Train ItemKNN model on user-item matrix. - + Train ItemKNN model on normalized user-item matrix. + Args: - user_item_matrix: sparse user-item matrix + user_item_matrix: normalized sparse matrix k: number of neighbors Returns: trained NearestNeighbors model """ logger.info(f"Training ItemKNN with K={k}") - model = NearestNeighbors( n_neighbors=k, metric="cosine", algorithm="brute" ) - model.fit(user_item_matrix.T) - logger.info(f"Model trained successfully!") - return model -def save_model(model: NearestNeighbors, model_name: str = "itemknn") -> None: + +def train_svd_model(normalized_matrix: csr_matrix, n_factors: int = 50) -> tuple: + """ + Train SVD model on normalized user-item matrix. + + Args: + normalized_matrix: normalized sparse matrix + n_factors: number of latent factors + Returns: + tuple: U, sigma, Vt, predicted_ratings + """ + from scipy.sparse.linalg import svds + + logger.info(f"Training SVD with {n_factors} factors") + + U, sigma, Vt = svds(normalized_matrix.astype(float), k=n_factors) + sigma = np.diag(sigma) + + predicted_ratings = np.dot(np.dot(U, sigma), Vt) + + if hasattr(predicted_ratings, "toarray"): + predicted_ratings = predicted_ratings.toarray() + + predicted_ratings = np.asarray(predicted_ratings) + + logger.info(f"SVD trained successfully!") + return U, sigma, Vt, predicted_ratings + + +def save_model(model, model_name: str = "itemknn") -> None: """ Save trained model to models/ folder. - + Args: - model: trained NearestNeighbors model + model: trained model or predicted ratings matrix model_name: name of the model file """ MODELS_PATH.mkdir(parents=True, exist_ok=True) - model_path = MODELS_PATH / f"{model_name}.joblib" joblib.dump(model, model_path) - logger.info(f"Model saved to {model_path} successfully!") -def _evaluate_model( + +def _evaluate_itemknn( model: NearestNeighbors, user_item_matrix: csr_matrix, + normalized_matrix: csr_matrix, + user_means: np.ndarray, test: pd.DataFrame, user_map: dict, item_map: dict ) -> float: """ - Evaluate model using RMSE on test data. - + Evaluate ItemKNN model using RMSE on test sample. + Args: - model: trained model - user_item_matrix: sparse matrix + model: trained ItemKNN model + user_item_matrix: original sparse matrix + normalized_matrix: normalized sparse matrix + user_means: array of user means test: test dataframe user_map: user_id to index mapping item_map: movie_id to index mapping @@ -82,7 +109,9 @@ def _evaluate_model( actuals = [] predictions = [] - for _, row in test.head(1000).iterrows(): + test_sample = test.sample(1000, random_state=42) + + for _, row in test_sample.iterrows(): user_idx = user_map.get(row["user_id"]) item_idx = item_map.get(row["movie_id"]) @@ -90,7 +119,7 @@ def _evaluate_model( continue distances, indices = model.kneighbors( - user_item_matrix.T[item_idx], + normalized_matrix.T[item_idx], n_neighbors=model.n_neighbors ) @@ -100,68 +129,134 @@ def _evaluate_model( if weights.sum() > 0 and similar_ratings.sum() > 0: pred = np.average(similar_ratings, weights=weights) else: - pred = 0 + pred = user_means[user_idx] + + actuals.append(row["rating"]) + predictions.append(pred) + + rmse = root_mean_squared_error(actuals, predictions) + logger.info(f"ItemKNN RMSE: {rmse:.4f}") + return rmse + + +def _evaluate_svd( + predicted_ratings: np.ndarray, + user_means: np.ndarray, + test: pd.DataFrame, + user_map: dict, + item_map: dict +) -> float: + """ + Evaluate SVD model using RMSE on test sample. + + Args: + predicted_ratings: full predicted ratings matrix + user_means: array of user means + test: test dataframe + user_map: user_id to index mapping + item_map: movie_id to index mapping + Returns: + RMSE score + """ + from sklearn.metrics import root_mean_squared_error + + actuals = [] + predictions = [] + + test_sample = test.sample(1000, random_state=42) + + for _, row in test_sample.iterrows(): + user_idx = user_map.get(row["user_id"]) + item_idx = item_map.get(row["movie_id"]) + + if user_idx is None or item_idx is None: + continue + + pred = predicted_ratings[user_idx, item_idx] + user_means[user_idx] actuals.append(row["rating"]) predictions.append(pred) rmse = root_mean_squared_error(actuals, predictions) - logger.info(f"RMSE: {rmse:.4f}") + logger.info(f"SVD RMSE: {rmse:.4f}") return rmse + def train_pipeline( user_item_matrix: csr_matrix, - test, + normalized_matrix: csr_matrix, + user_means: np.ndarray, + test: pd.DataFrame, user_map: dict, item_map: dict, - k_values: list = [10, 20, 50] -) -> NearestNeighbors: + k_values: list = [10, 20, 50], + n_factors_list: list = [50, 100, 200] +) -> tuple: """ - Train multiple ItemKNN models with different K values, - track with MLflow and save the best model. - + Train ItemKNN and SVD models, track with MLflow, save best model. + Args: - user_item_matrix: sparse user-item matrix + user_item_matrix: original sparse matrix + normalized_matrix: normalized sparse matrix + user_means: array of user means test: test dataframe user_map: user_id to index mapping item_map: movie_id to index mapping - k_values: list of K values to try + k_values: list of K values for ItemKNN + n_factors_list: list of factor values for SVD Returns: - best trained model + tuple: best_model, best_model_type, best_predicted_ratings """ mlflow.set_tracking_uri("sqlite:///mlflow.db") mlflow.set_experiment("movie-recommendation-sys") best_model = None - best_k = None + best_model_type = None best_rmse = float("inf") + best_predicted_ratings = None + # train ItemKNN models for k in k_values: with mlflow.start_run(run_name=f"ItemKNN_K{k}"): - - # train model - model = train_model(user_item_matrix, k=k) - - # evaluate model - rmse = _evaluate_model(model, user_item_matrix, test, user_map, item_map) - - # log to mlflow + model = train_model(normalized_matrix, k=k) + rmse = _evaluate_itemknn( + model, user_item_matrix, normalized_matrix, + user_means, test, user_map, item_map + ) + mlflow.log_param("model", "ItemKNN") mlflow.log_param("k", k) mlflow.log_param("metric", "cosine") mlflow.log_param("algorithm", "brute") mlflow.log_metric("rmse", rmse) + logger.info(f"ItemKNN K={k} -> RMSE: {rmse:.4f}") - logger.info(f"K={k} → RMSE: {rmse:.4f}") - - # track best model if rmse < best_rmse: best_rmse = rmse best_model = model - best_k = k + best_model_type = f"itemknn_k{k}" - # save best model - save_model(best_model, model_name=f"itemknn_k{best_k}") + # train SVD models + for n_factors in n_factors_list: + with mlflow.start_run(run_name=f"SVD_{n_factors}factors"): + U, sigma, Vt, predicted_ratings = train_svd_model( + normalized_matrix, n_factors=n_factors + ) + rmse = _evaluate_svd( + predicted_ratings, user_means, test, user_map, item_map + ) + mlflow.log_param("model", "SVD") + mlflow.log_param("n_factors", n_factors) + mlflow.log_metric("rmse", rmse) + logger.info(f"SVD {n_factors} factors -> RMSE: {rmse:.4f}") - logger.info(f"Best model: ItemKNN K={best_k} RMSE={best_rmse:.4f} ✅") + if rmse < best_rmse: + best_rmse = rmse + best_model = predicted_ratings + best_model_type = f"svd_{n_factors}factors" + best_predicted_ratings = predicted_ratings + + # save best model + save_model(best_model, model_name=best_model_type) + logger.info(f"Best model: {best_model_type} RMSE={best_rmse:.4f}") - return best_model \ No newline at end of file + return best_model, best_model_type, best_predicted_ratings \ No newline at end of file From ca7ae03e41a532bba947ba9ffecbd130df999cf2 Mon Sep 17 00:00:00 2001 From: harmandeep2993 Date: Sun, 26 Apr 2026 02:27:26 +0200 Subject: [PATCH 2/5] feat: add SVD evaluation on full test set, RMSE improved to 0.965 --- src/models/evaluate.py | 91 +++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 41 deletions(-) diff --git a/src/models/evaluate.py b/src/models/evaluate.py index 1f1fab5..b38fd3d 100644 --- a/src/models/evaluate.py +++ b/src/models/evaluate.py @@ -1,11 +1,5 @@ """ Evaluation module for movie recommendation system. - -Responsibilities: -- Evaluate best model on test set -- Calculate RMSE -- Save evaluation report -- Log metrics to MLflow """ import json @@ -30,32 +24,23 @@ def save_evaluation_report(report: dict, report_name: str = "evaluation_report.j report_path = REPORTS_PATH / report_name with open(report_path, "w") as f: json.dump(report, f, indent=4) - logger.info(f"Evaluation report saved to {report_path} ✅") + logger.info(f"Evaluation report saved to {report_path}") -def calculate_rmse( +def calculate_rmse_knn( model: NearestNeighbors, user_item_matrix: csr_matrix, + normalized_matrix: csr_matrix, + user_means: np.ndarray, test: pd.DataFrame, user_map: dict, item_map: dict ) -> float: - """ - Calculate RMSE on test sample. - - Args: - model: trained model - user_item_matrix: sparse matrix - test: test dataframe - user_map: user_id to index mapping - item_map: movie_id to index mapping - Returns: - RMSE score - """ + """Calculate RMSE for KNN model on test sample.""" actuals = [] predictions = [] - test_sample = test.sample(5000, random_state=42) + test_sample = test.sample(1000, random_state=42) for _, row in test_sample.iterrows(): user_idx = user_map.get(row["user_id"]) @@ -65,7 +50,7 @@ def calculate_rmse( continue distances, indices = model.kneighbors( - user_item_matrix.T[item_idx], + normalized_matrix.T[item_idx], n_neighbors=model.n_neighbors ) @@ -75,47 +60,71 @@ def calculate_rmse( if weights.sum() > 0 and similar_ratings.sum() > 0: pred = np.average(similar_ratings, weights=weights) else: - pred = 0 + pred = user_means[user_idx] actuals.append(row["rating"]) predictions.append(pred) rmse = root_mean_squared_error(actuals, predictions) - logger.info(f"RMSE on test sample: {rmse:.4f}") + logger.info(f"KNN RMSE on test sample: {rmse:.4f}") + return rmse + + +def calculate_rmse_svd( + predicted_ratings: np.ndarray, + user_means: np.ndarray, + test: pd.DataFrame, + user_map: dict, + item_map: dict +) -> float: + """Calculate RMSE for SVD model on full test set.""" + actuals = [] + predictions = [] + + for _, row in test.iterrows(): + user_idx = user_map.get(row["user_id"]) + item_idx = item_map.get(row["movie_id"]) + + if user_idx is None or item_idx is None: + continue + + pred = predicted_ratings[user_idx, item_idx] + user_means[user_idx] + actuals.append(row["rating"]) + predictions.append(pred) + + rmse = root_mean_squared_error(actuals, predictions) + logger.info(f"SVD RMSE on full test set: {rmse:.4f}") return rmse def evaluate_pipeline( - model: NearestNeighbors, + model, user_item_matrix: csr_matrix, + normalized_matrix: csr_matrix, + user_means: np.ndarray, test: pd.DataFrame, user_map: dict, item_map: dict, + best_model_type: str = "itemknn", k: int = 10 ) -> dict: - """ - Run evaluation pipeline on best model. - - Args: - model: best trained model - user_item_matrix: sparse matrix - test: test dataframe - user_map: user_id to index mapping - item_map: movie_id to index mapping - k: number of recommendations - Returns: - dict: evaluation metrics - """ + """Run evaluation pipeline on best model.""" logger.info("Starting evaluation pipeline...") - rmse = calculate_rmse(model, user_item_matrix, test, user_map, item_map) - report = {"rmse": round(rmse, 4)} + if "svd" in best_model_type: + rmse = calculate_rmse_svd(model, user_means, test, user_map, item_map) + else: + rmse = calculate_rmse_knn( + model, user_item_matrix, normalized_matrix, + user_means, test, user_map, item_map + ) + report = {"rmse": round(rmse, 4), "model": best_model_type} save_evaluation_report(report) mlflow.set_tracking_uri("sqlite:///mlflow.db") with mlflow.start_run(run_name="best_model_evaluation"): - mlflow.log_metrics(report) + mlflow.log_metrics({"rmse": round(rmse, 4)}) logger.info(f"Evaluation report: {report}") From 10a2f3ac4e5ebfa629af03a123ba2e617a148f91 Mon Sep 17 00:00:00 2001 From: harmandeep2993 Date: Sun, 26 Apr 2026 02:27:42 +0200 Subject: [PATCH 3/5] feat: add SVD prediction pipeline with user mean denormalization --- src/models/predict.py | 120 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 107 insertions(+), 13 deletions(-) diff --git a/src/models/predict.py b/src/models/predict.py index 315b5a1..330f940 100644 --- a/src/models/predict.py +++ b/src/models/predict.py @@ -69,6 +69,22 @@ def load_artifacts() -> tuple: return user_item_matrix, user_map, item_map, idx_to_item + +def load_svd_artifacts()-> tuple: + """ + Load SVD predicted ratings matrix and user means. + + Returns. + tuple: predicted ratings matrix, user means + """ + + predicted_ratings = joblib.load(MODELS_PATH / "svd_50factors.joblib") + user_means = np.load(DATA_PATH / "user_means.npy") + + logger.info("SVD artifacts loaded successfully") + return predicted_ratings, user_means + + # Evaluation module for the movie recommendation system. def recommend_movies( user_id: int, @@ -143,6 +159,69 @@ def recommend_movies( logger.info(f"Generated {len(result)} recommendations for User {user_id}") return result + +def recommend_movies_svd( + user_id: int, + predicted_ratings: np.ndarray, + user_means: np.ndarray, + user_map: dict, + idx_to_item: dict, + user_item_matrix, + movies: pd.DataFrame, + n: int = 10 +) -> pd.DataFrame: + """ + Generate recommendations using SVD predicted ratings. + + Args: + user_id: user to recommend for + predicted_ratings: full predicted ratings matrix + user_means: array of user means + user_map: user_id to index mapping + idx_to_item: index to movie_id mapping + user_item_matrix: to find unwatched movies + movies: movies dataframe for titles + n: number of recommendations + Returns: + dataframe with top N recommendations + """ + user_idx = user_map.get(user_id) + if user_idx is None: + logger.error(f"User {user_id} not found!") + return None + + # get user ratings + user_ratings = user_item_matrix[user_idx].toarray().flatten() + + # get predicted ratings for this user + user_predicted = predicted_ratings[user_idx].copy() + + # add user mean back + user_predicted = user_predicted + user_means[user_idx] + + # set watched movies to -1 + user_predicted[user_ratings > 0] = -1 + + # get top N indices + top_indices = np.argsort(user_predicted)[::-1][:n] + + recommendations = [] + for item_idx in top_indices: + movie_id = idx_to_item.get(item_idx) + score = user_predicted[item_idx] + title = movies[movies["movie_id"] == movie_id]["title"].values + if len(title) > 0: + recommendations.append({ + "movie_id": movie_id, + "title": title[0], + "predicted_score": round(float(score), 2) + }) + + result = pd.DataFrame(recommendations) + logger.info(f"Generated {len(result)} SVD recommendations for User {user_id}") + return result + + def predict_pipeline(user_id: int, movies: pd.DataFrame, n: int = 10) -> pd.DataFrame: """ Run full prediction pipeline for a user. @@ -154,20 +233,35 @@ def predict_pipeline(user_id: int, movies: pd.DataFrame, n: int = 10) -> pd.Data Returns: dataframe with top N recommendations """ - # load model and artifacts - model = load_model() + # load artifacts user_item_matrix, user_map, item_map, idx_to_item = load_artifacts() - # generate recommendations - recommendations = recommend_movies( - user_id=user_id, - model=model, - user_item_matrix=user_item_matrix, - user_map=user_map, - item_map=item_map, - idx_to_item=idx_to_item, - movies=movies, - n=n - ) + # try SVD first + svd_path = MODELS_PATH / "svd_50factors.joblib" + if svd_path.exists(): + predicted_ratings, user_means = load_svd_artifacts() + recommendations = recommend_movies_svd( + user_id=user_id, + predicted_ratings=predicted_ratings, + user_means=user_means, + user_map=user_map, + idx_to_item=idx_to_item, + user_item_matrix=user_item_matrix, + movies=movies, + n=n + ) + else: + # fallback to ItemKNN + model = load_model() + recommendations = recommend_movies( + user_id=user_id, + model=model, + user_item_matrix=user_item_matrix, + user_map=user_map, + item_map=item_map, + idx_to_item=idx_to_item, + movies=movies, + n=n + ) return recommendations \ No newline at end of file From 095661869b05e5bcd4cb9ae10d819a696c0c93ab Mon Sep 17 00:00:00 2001 From: harmandeep2993 Date: Sun, 26 Apr 2026 02:30:10 +0200 Subject: [PATCH 4/5] chore: results for the best model --- reports/evaluation_report.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/reports/evaluation_report.json b/reports/evaluation_report.json index 7a76afc..2ee7fef 100644 --- a/reports/evaluation_report.json +++ b/reports/evaluation_report.json @@ -1,3 +1,4 @@ { - "rmse": 2.5369 + "rmse": 0.965, + "model": "svd_50factors" } \ No newline at end of file From 795d071159f47605ec1d9042dddd2ece847723f9 Mon Sep 17 00:00:00 2001 From: harmandeep2993 Date: Sun, 26 Apr 2026 02:33:01 +0200 Subject: [PATCH 5/5] chore: clean up main.py remove debug prints and unused imports --- main.py | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/main.py b/main.py index 041a76c..bca6c7f 100644 --- a/main.py +++ b/main.py @@ -1,10 +1,10 @@ -from src.data import load_dataset -from src.data import preprocess_pipeline +"""Main script for Movie Recommendation System using MovieLens 1M dataset.""" + +from src.data import load_dataset, preprocess_pipeline from src.features import build_features_pipeline from src.models import train_pipeline, evaluate_pipeline, predict_pipeline -from sklearn.neighbors import NearestNeighbors -# step 1 - load +# step 1 - load datasets datasets = load_dataset() # step 2 - preprocess @@ -15,20 +15,37 @@ ) # step 3 - build features -user_item_matrix, user_map, item_map = build_features_pipeline(ratings_train) -print("Data loading, preprocessing, and feature engineering completed.") +user_item_matrix, user_map, item_map, user_means, normalized_matrix = build_features_pipeline(ratings_train) +print("Data loading, preprocessing and feature engineering completed.") -# Step 4 - train model -best_model = train_pipeline(user_item_matrix, test=ratings_test, user_map=user_map, item_map=item_map, k_values=[10, 20, 50]) +# step 4 - train model +best_model, best_model_type, best_predicted_ratings = train_pipeline( + user_item_matrix, + normalized_matrix, + user_means, + test=ratings_test, + user_map=user_map, + item_map=item_map, + k_values=[10, 20, 50], + n_factors_list=[50, 100, 200] +) +print(f"Best model: {best_model_type}") print("Model training completed.") # step 5 - evaluate best model -evaluate_pipeline(best_model, user_item_matrix, ratings_test, user_map, item_map, k=10) +report = evaluate_pipeline( + best_predicted_ratings, + user_item_matrix, + normalized_matrix, + user_means, + ratings_test, + user_map, + item_map, + best_model_type=best_model_type, + k=10 +) print("Evaluation completed.") -# step 6 - generate recommendations for a specific user - -# user_id = 1 -# recommendations = predict_pipeline(user_id, movies, n=10) +# step 6 - generate recommendations for user 1 recommendations = predict_pipeline(user_id=1, movies=movies, n=10) print(f"Top 10 recommendations for User 1:\n{recommendations}") \ No newline at end of file