Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 30 additions & 13 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from src.data import load_dataset
from src.data import preprocess_pipeline
"""Main script for Movie Recommendation System using MovieLens 1M dataset."""

from src.data import load_dataset, preprocess_pipeline
from src.features import build_features_pipeline
from src.models import train_pipeline, evaluate_pipeline, predict_pipeline
from sklearn.neighbors import NearestNeighbors

# step 1 - load
# step 1 - load datasets
datasets = load_dataset()

# step 2 - preprocess
Expand All @@ -15,20 +15,37 @@
)

# step 3 - build features
user_item_matrix, user_map, item_map = build_features_pipeline(ratings_train)
print("Data loading, preprocessing, and feature engineering completed.")
user_item_matrix, user_map, item_map, user_means, normalized_matrix = build_features_pipeline(ratings_train)
print("Data loading, preprocessing and feature engineering completed.")

# Step 4 - train model
best_model = train_pipeline(user_item_matrix, test=ratings_test, user_map=user_map, item_map=item_map, k_values=[10, 20, 50])
# step 4 - train model
best_model, best_model_type, best_predicted_ratings = train_pipeline(
user_item_matrix,
normalized_matrix,
user_means,
test=ratings_test,
user_map=user_map,
item_map=item_map,
k_values=[10, 20, 50],
n_factors_list=[50, 100, 200]
)
print(f"Best model: {best_model_type}")
print("Model training completed.")

# step 5 - evaluate best model
evaluate_pipeline(best_model, user_item_matrix, ratings_test, user_map, item_map, k=10)
report = evaluate_pipeline(
best_predicted_ratings,
user_item_matrix,
normalized_matrix,
user_means,
ratings_test,
user_map,
item_map,
best_model_type=best_model_type,
k=10
)
print("Evaluation completed.")

# step 6 - generate recommendations for a specific user

# user_id = 1
# recommendations = predict_pipeline(user_id, movies, n=10)
# step 6 - generate recommendations for user 1
recommendations = predict_pipeline(user_id=1, movies=movies, n=10)
print(f"Top 10 recommendations for User 1:\n{recommendations}")
3 changes: 2 additions & 1 deletion reports/evaluation_report.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
{
"rmse": 2.5369
"rmse": 0.965,
"model": "svd_50factors"
}
91 changes: 50 additions & 41 deletions src/models/evaluate.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,5 @@
"""
Evaluation module for movie recommendation system.

Responsibilities:
- Evaluate best model on test set
- Calculate RMSE
- Save evaluation report
- Log metrics to MLflow
"""

import json
Expand All @@ -30,32 +24,23 @@ def save_evaluation_report(report: dict, report_name: str = "evaluation_report.j
report_path = REPORTS_PATH / report_name
with open(report_path, "w") as f:
json.dump(report, f, indent=4)
logger.info(f"Evaluation report saved to {report_path}")
logger.info(f"Evaluation report saved to {report_path}")


def calculate_rmse(
def calculate_rmse_knn(
model: NearestNeighbors,
user_item_matrix: csr_matrix,
normalized_matrix: csr_matrix,
user_means: np.ndarray,
test: pd.DataFrame,
user_map: dict,
item_map: dict
) -> float:
"""
Calculate RMSE on test sample.

Args:
model: trained model
user_item_matrix: sparse matrix
test: test dataframe
user_map: user_id to index mapping
item_map: movie_id to index mapping
Returns:
RMSE score
"""
"""Calculate RMSE for KNN model on test sample."""
actuals = []
predictions = []

test_sample = test.sample(5000, random_state=42)
test_sample = test.sample(1000, random_state=42)

for _, row in test_sample.iterrows():
user_idx = user_map.get(row["user_id"])
Expand All @@ -65,7 +50,7 @@ def calculate_rmse(
continue

distances, indices = model.kneighbors(
user_item_matrix.T[item_idx],
normalized_matrix.T[item_idx],
n_neighbors=model.n_neighbors
)

Expand All @@ -75,47 +60,71 @@ def calculate_rmse(
if weights.sum() > 0 and similar_ratings.sum() > 0:
pred = np.average(similar_ratings, weights=weights)
else:
pred = 0
pred = user_means[user_idx]

actuals.append(row["rating"])
predictions.append(pred)

rmse = root_mean_squared_error(actuals, predictions)
logger.info(f"RMSE on test sample: {rmse:.4f}")
logger.info(f"KNN RMSE on test sample: {rmse:.4f}")
return rmse


def calculate_rmse_svd(
predicted_ratings: np.ndarray,
user_means: np.ndarray,
test: pd.DataFrame,
user_map: dict,
item_map: dict
) -> float:
"""Calculate RMSE for SVD model on full test set."""
actuals = []
predictions = []

for _, row in test.iterrows():
user_idx = user_map.get(row["user_id"])
item_idx = item_map.get(row["movie_id"])

if user_idx is None or item_idx is None:
continue

pred = predicted_ratings[user_idx, item_idx] + user_means[user_idx]
actuals.append(row["rating"])
predictions.append(pred)

rmse = root_mean_squared_error(actuals, predictions)
logger.info(f"SVD RMSE on full test set: {rmse:.4f}")
return rmse


def evaluate_pipeline(
model: NearestNeighbors,
model,
user_item_matrix: csr_matrix,
normalized_matrix: csr_matrix,
user_means: np.ndarray,
test: pd.DataFrame,
user_map: dict,
item_map: dict,
best_model_type: str = "itemknn",
k: int = 10
) -> dict:
"""
Run evaluation pipeline on best model.

Args:
model: best trained model
user_item_matrix: sparse matrix
test: test dataframe
user_map: user_id to index mapping
item_map: movie_id to index mapping
k: number of recommendations
Returns:
dict: evaluation metrics
"""
"""Run evaluation pipeline on best model."""
logger.info("Starting evaluation pipeline...")
rmse = calculate_rmse(model, user_item_matrix, test, user_map, item_map)

report = {"rmse": round(rmse, 4)}
if "svd" in best_model_type:
rmse = calculate_rmse_svd(model, user_means, test, user_map, item_map)
else:
rmse = calculate_rmse_knn(
model, user_item_matrix, normalized_matrix,
user_means, test, user_map, item_map
)

report = {"rmse": round(rmse, 4), "model": best_model_type}
save_evaluation_report(report)

mlflow.set_tracking_uri("sqlite:///mlflow.db")
with mlflow.start_run(run_name="best_model_evaluation"):
mlflow.log_metrics(report)
mlflow.log_metrics({"rmse": round(rmse, 4)})

logger.info(f"Evaluation report: {report}")

Expand Down
120 changes: 107 additions & 13 deletions src/models/predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,22 @@ def load_artifacts() -> tuple:

return user_item_matrix, user_map, item_map, idx_to_item


def load_svd_artifacts()-> tuple:
"""
Load SVD predicted ratings matrix and user means.

Returns.
tuple: predicted ratings matrix, user means
"""

predicted_ratings = joblib.load(MODELS_PATH / "svd_50factors.joblib")
user_means = np.load(DATA_PATH / "user_means.npy")

logger.info("SVD artifacts loaded successfully")
return predicted_ratings, user_means


# Evaluation module for the movie recommendation system.
def recommend_movies(
user_id: int,
Expand Down Expand Up @@ -143,6 +159,69 @@ def recommend_movies(
logger.info(f"Generated {len(result)} recommendations for User {user_id}")
return result


def recommend_movies_svd(
user_id: int,
predicted_ratings: np.ndarray,
user_means: np.ndarray,
user_map: dict,
idx_to_item: dict,
user_item_matrix,
movies: pd.DataFrame,
n: int = 10
) -> pd.DataFrame:
"""
Generate recommendations using SVD predicted ratings.

Args:
user_id: user to recommend for
predicted_ratings: full predicted ratings matrix
user_means: array of user means
user_map: user_id to index mapping
idx_to_item: index to movie_id mapping
user_item_matrix: to find unwatched movies
movies: movies dataframe for titles
n: number of recommendations
Returns:
dataframe with top N recommendations
"""
user_idx = user_map.get(user_id)
if user_idx is None:
logger.error(f"User {user_id} not found!")
return None

# get user ratings
user_ratings = user_item_matrix[user_idx].toarray().flatten()

# get predicted ratings for this user
user_predicted = predicted_ratings[user_idx].copy()

# add user mean back
user_predicted = user_predicted + user_means[user_idx]

# set watched movies to -1
user_predicted[user_ratings > 0] = -1

# get top N indices
top_indices = np.argsort(user_predicted)[::-1][:n]

recommendations = []
for item_idx in top_indices:
movie_id = idx_to_item.get(item_idx)
score = user_predicted[item_idx]
title = movies[movies["movie_id"] == movie_id]["title"].values
if len(title) > 0:
recommendations.append({
"movie_id": movie_id,
"title": title[0],
"predicted_score": round(float(score), 2)
})

result = pd.DataFrame(recommendations)
logger.info(f"Generated {len(result)} SVD recommendations for User {user_id}")
return result


def predict_pipeline(user_id: int, movies: pd.DataFrame, n: int = 10) -> pd.DataFrame:
"""
Run full prediction pipeline for a user.
Expand All @@ -154,20 +233,35 @@ def predict_pipeline(user_id: int, movies: pd.DataFrame, n: int = 10) -> pd.Data
Returns:
dataframe with top N recommendations
"""
# load model and artifacts
model = load_model()
# load artifacts
user_item_matrix, user_map, item_map, idx_to_item = load_artifacts()

# generate recommendations
recommendations = recommend_movies(
user_id=user_id,
model=model,
user_item_matrix=user_item_matrix,
user_map=user_map,
item_map=item_map,
idx_to_item=idx_to_item,
movies=movies,
n=n
)
# try SVD first
svd_path = MODELS_PATH / "svd_50factors.joblib"
if svd_path.exists():
predicted_ratings, user_means = load_svd_artifacts()
recommendations = recommend_movies_svd(
user_id=user_id,
predicted_ratings=predicted_ratings,
user_means=user_means,
user_map=user_map,
idx_to_item=idx_to_item,
user_item_matrix=user_item_matrix,
movies=movies,
n=n
)
else:
# fallback to ItemKNN
model = load_model()
recommendations = recommend_movies(
user_id=user_id,
model=model,
user_item_matrix=user_item_matrix,
user_map=user_map,
item_map=item_map,
idx_to_item=idx_to_item,
movies=movies,
n=n
)

return recommendations
Loading
Loading