From 903338fc17b88ae2e446efedfc5e6d1f7d1804b1 Mon Sep 17 00:00:00 2001 From: onkar0127 Date: Tue, 23 Jun 2026 20:45:57 +0530 Subject: [PATCH 1/2] Fix feedback_store.csv paths, retrain.py syntax errors, and adminSeeder.js typos --- backend/retrain.py | 36 ++++++---------------------------- backend/seeders/adminSeeder.js | 4 ++-- backend/spam_insights.py | 2 +- 3 files changed, 9 insertions(+), 33 deletions(-) diff --git a/backend/retrain.py b/backend/retrain.py index d35d69f..371609d 100644 --- a/backend/retrain.py +++ b/backend/retrain.py @@ -27,14 +27,12 @@ import argparse import os - feature import pickle from collections import Counter import shutil import sys from datetime import datetime - main import pandas as pd import joblib @@ -57,31 +55,6 @@ def backup_existing_files(): backup_dir = os.path.join("backups", timestamp) files_to_backup = [MODEL_PATH, VECTORIZER_PATH, LABEL_ENCODER_PATH] existing = [f for f in files_to_backup if os.path.exists(f)] - - feature - label_encoder = LabelEncoder() - y = label_encoder.fit_transform(labels) - - # Hold out a test split when there's enough data per class to stratify; - # otherwise train on everything and skip the report. - counts = Counter(y) - can_split = len(samples) >= 5 and min(counts.values()) >= 2 - if can_split: - X_train, X_test, y_train, y_test = train_test_split( - texts, y, test_size=0.2, random_state=42, stratify=y - ) - else: - X_train, y_train = texts, y - X_test, y_test = [], [] - print("Not enough samples per class for a held-out test split; " - "training on all available data.") - - # Fit TF-IDF vectorizer with max_features=5000 to match model - print("\nFitting TfidfVectorizer (max_features=5000)...") - vectorizer = TfidfVectorizer(max_features=5000) - X_train_vec = vectorizer.fit_transform(X_train) - print(f"Vocabulary size: {len(vectorizer.vocabulary_)}") - if not existing: print("No existing model files found to back up (first-time training).") return @@ -100,7 +73,6 @@ def load_dataset(path): df = pd.read_csv(path) if "text" not in df.columns and "message" in df.columns: df.rename(columns={"message": "text"}, inplace=True) - main if "text" not in df.columns or "label" not in df.columns: print("Dataset CSV must have 'text' (or 'message') and 'label' columns.") @@ -138,15 +110,19 @@ def load_feedback(path): def main(): + script_dir = os.path.dirname(os.path.abspath(__file__)) + default_feedback = os.path.join(script_dir, "output", "feedback_store.csv") + default_dataset = os.path.join(script_dir, "dataset.csv") + parser = argparse.ArgumentParser(description="Retrain spam detection model with feedback data") parser.add_argument( "--dataset", - default=os.environ.get("DATASET_PATH", "dataset.csv"), + default=os.environ.get("DATASET_PATH", default_dataset), help="Path to original training dataset CSV (default: dataset.csv or $DATASET_PATH)", ) parser.add_argument( "--feedback", - default="feedback_store.csv", + default=default_feedback, help="Path to feedback CSV collected from /feedback endpoint", ) parser.add_argument( diff --git a/backend/seeders/adminSeeder.js b/backend/seeders/adminSeeder.js index 88cabd7..9fd935f 100644 --- a/backend/seeders/adminSeeder.js +++ b/backend/seeders/adminSeeder.js @@ -14,8 +14,8 @@ const seedAdminUser = async () => { name: 'Admin' }); - conso;e.log('Admin user created successfully'); - cnsole.log(`Email: ${email}`); + console.log('Admin user created successfully'); + console.log(`Email: ${email}`); console.log(`Password: ${password}`); } }catch(error){ diff --git a/backend/spam_insights.py b/backend/spam_insights.py index b3dc96b..991eb40 100644 --- a/backend/spam_insights.py +++ b/backend/spam_insights.py @@ -95,7 +95,7 @@ def load_data(): # Try feedback_store.csv in backend directory base_dir = os.path.dirname(__file__) - feedback_path = os.path.join(base_dir, "feedback_store.csv") + feedback_path = os.path.join(base_dir, "output", "feedback_store.csv") if os.path.isfile(feedback_path): try: with open(feedback_path, newline="", encoding="utf-8") as f: From 479facfee7ec5c58c6760ba90064fb6aa26099fc Mon Sep 17 00:00:00 2001 From: onkar0127 Date: Tue, 23 Jun 2026 20:52:40 +0530 Subject: [PATCH 2/2] Fix EML decoding crash due to UnicodeDecodeError --- backend/api.py | 6 +++++- backend/tests/test_email_header_analyzer.py | 11 +++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/backend/api.py b/backend/api.py index 88162d9..24fe8a3 100644 --- a/backend/api.py +++ b/backend/api.py @@ -378,7 +378,11 @@ def analyze_email_header(): file = request.files["file"] if file and file.filename != "": try: - headers = file.read().decode("utf-8") + raw_bytes = file.read() + try: + headers = raw_bytes.decode("utf-8") + except UnicodeDecodeError: + headers = raw_bytes.decode("latin-1", errors="replace") except Exception as e: return jsonify({"error": f"Failed to read EML file: {str(e)}"}), 400 else: diff --git a/backend/tests/test_email_header_analyzer.py b/backend/tests/test_email_header_analyzer.py index db76146..1a88bf2 100644 --- a/backend/tests/test_email_header_analyzer.py +++ b/backend/tests/test_email_header_analyzer.py @@ -153,3 +153,14 @@ def test_api_endpoint_multipart_eml_trusted(self, client): assert data["success"] is True assert data["trust_level"] == "Trusted" assert data["risk_score"] == 0 + + def test_api_endpoint_multipart_eml_non_utf8(self, client): + headers_non_utf8 = LEGIT_HEADERS + "X-Custom: ñ\n" + data = { + "file": (io.BytesIO(headers_non_utf8.encode("latin-1")), "latin1.eml") + } + response = client.post("/analyze-email-header", data=data, content_type="multipart/form-data") + assert response.status_code == 200 + data = response.get_json() + assert data["success"] is True + assert data["trust_level"] == "Trusted"