From 903338fc17b88ae2e446efedfc5e6d1f7d1804b1 Mon Sep 17 00:00:00 2001
From: onkar0127 <giteonkar2004@gmail.com>
Date: Tue, 23 Jun 2026 20:45:57 +0530
Subject: [PATCH 1/2] Fix feedback_store.csv paths, retrain.py syntax errors,
 and adminSeeder.js typos

---
 backend/retrain.py             | 36 ++++++----------------------------
 backend/seeders/adminSeeder.js |  4 ++--
 backend/spam_insights.py       |  2 +-
 3 files changed, 9 insertions(+), 33 deletions(-)

diff --git a/backend/retrain.py b/backend/retrain.py
index d35d69f..371609d 100644
--- a/backend/retrain.py
+++ b/backend/retrain.py
@@ -27,14 +27,12 @@
 
 import argparse
 import os
-        feature
 import pickle
 from collections import Counter
 
 import shutil
 import sys
 from datetime import datetime
-        main
 
 import pandas as pd
 import joblib
@@ -57,31 +55,6 @@ def backup_existing_files():
     backup_dir = os.path.join("backups", timestamp)
     files_to_backup = [MODEL_PATH, VECTORIZER_PATH, LABEL_ENCODER_PATH]
     existing = [f for f in files_to_backup if os.path.exists(f)]
-
-        feature
-    label_encoder = LabelEncoder()
-    y = label_encoder.fit_transform(labels)
-
-    # Hold out a test split when there's enough data per class to stratify;
-    # otherwise train on everything and skip the report.
-    counts = Counter(y)
-    can_split = len(samples) >= 5 and min(counts.values()) >= 2
-    if can_split:
-        X_train, X_test, y_train, y_test = train_test_split(
-            texts, y, test_size=0.2, random_state=42, stratify=y
-        )
-    else:
-        X_train, y_train = texts, y
-        X_test, y_test = [], []
-        print("Not enough samples per class for a held-out test split; "
-              "training on all available data.")
-
-    # Fit TF-IDF vectorizer with max_features=5000 to match model
-    print("\nFitting TfidfVectorizer (max_features=5000)...")
-    vectorizer = TfidfVectorizer(max_features=5000)
-    X_train_vec = vectorizer.fit_transform(X_train)
-    print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")
-
     if not existing:
         print("No existing model files found to back up (first-time training).")
         return
@@ -100,7 +73,6 @@ def load_dataset(path):
     df = pd.read_csv(path)
     if "text" not in df.columns and "message" in df.columns:
         df.rename(columns={"message": "text"}, inplace=True)
-        main
 
     if "text" not in df.columns or "label" not in df.columns:
         print("Dataset CSV must have 'text' (or 'message') and 'label' columns.")
@@ -138,15 +110,19 @@ def load_feedback(path):
 
 
 def main():
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    default_feedback = os.path.join(script_dir, "output", "feedback_store.csv")
+    default_dataset = os.path.join(script_dir, "dataset.csv")
+
     parser = argparse.ArgumentParser(description="Retrain spam detection model with feedback data")
     parser.add_argument(
         "--dataset",
-        default=os.environ.get("DATASET_PATH", "dataset.csv"),
+        default=os.environ.get("DATASET_PATH", default_dataset),
         help="Path to original training dataset CSV (default: dataset.csv or $DATASET_PATH)",
     )
     parser.add_argument(
         "--feedback",
-        default="feedback_store.csv",
+        default=default_feedback,
         help="Path to feedback CSV collected from /feedback endpoint",
     )
     parser.add_argument(
diff --git a/backend/seeders/adminSeeder.js b/backend/seeders/adminSeeder.js
index 88cabd7..9fd935f 100644
--- a/backend/seeders/adminSeeder.js
+++ b/backend/seeders/adminSeeder.js
@@ -14,8 +14,8 @@ const seedAdminUser = async () => {
                 name: 'Admin'
             });
 
-            conso;e.log('Admin user created successfully');
-            cnsole.log(`Email: ${email}`);
+            console.log('Admin user created successfully');
+            console.log(`Email: ${email}`);
             console.log(`Password: ${password}`);
         }
     }catch(error){
diff --git a/backend/spam_insights.py b/backend/spam_insights.py
index b3dc96b..991eb40 100644
--- a/backend/spam_insights.py
+++ b/backend/spam_insights.py
@@ -95,7 +95,7 @@ def load_data():
     
-    # Try feedback_store.csv in backend directory
+    # Try feedback_store.csv in the backend/output directory
     base_dir = os.path.dirname(__file__)
-    feedback_path = os.path.join(base_dir, "feedback_store.csv")
+    feedback_path = os.path.join(base_dir, "output", "feedback_store.csv")
     if os.path.isfile(feedback_path):
         try:
             with open(feedback_path, newline="", encoding="utf-8") as f:

From 479facfee7ec5c58c6760ba90064fb6aa26099fc Mon Sep 17 00:00:00 2001
From: onkar0127 <giteonkar2004@gmail.com>
Date: Tue, 23 Jun 2026 20:52:40 +0530
Subject: [PATCH 2/2] Fix EML decoding crash due to UnicodeDecodeError

---
 backend/api.py                              |  6 +++++-
 backend/tests/test_email_header_analyzer.py | 11 +++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/backend/api.py b/backend/api.py
index 88162d9..24fe8a3 100644
--- a/backend/api.py
+++ b/backend/api.py
@@ -378,7 +378,11 @@ def analyze_email_header():
             file = request.files["file"]
             if file and file.filename != "":
                 try:
-                    headers = file.read().decode("utf-8")
+                    raw_bytes = file.read()
+                    try:
+                        headers = raw_bytes.decode("utf-8")
+                    except UnicodeDecodeError:
+                        headers = raw_bytes.decode("latin-1", errors="replace")
                 except Exception as e:
                     return jsonify({"error": f"Failed to read EML file: {str(e)}"}), 400
             else:
diff --git a/backend/tests/test_email_header_analyzer.py b/backend/tests/test_email_header_analyzer.py
index db76146..1a88bf2 100644
--- a/backend/tests/test_email_header_analyzer.py
+++ b/backend/tests/test_email_header_analyzer.py
@@ -153,3 +153,14 @@ def test_api_endpoint_multipart_eml_trusted(self, client):
         assert data["success"] is True
         assert data["trust_level"] == "Trusted"
         assert data["risk_score"] == 0
+
+    def test_api_endpoint_multipart_eml_non_utf8(self, client):
+        headers_non_utf8 = LEGIT_HEADERS + "X-Custom: ñ\n"
+        data = {
+            "file": (io.BytesIO(headers_non_utf8.encode("latin-1")), "latin1.eml")
+        }
+        response = client.post("/analyze-email-header", data=data, content_type="multipart/form-data")
+        assert response.status_code == 200
+        data = response.get_json()
+        assert data["success"] is True
+        assert data["trust_level"] == "Trusted"