From c5926558d6f6e9f1cd1acd4145792258c41b2658 Mon Sep 17 00:00:00 2001
From: Michal Fularz <michal.fularz@gmail.com>
Date: Wed, 15 Apr 2026 19:09:26 +0200
Subject: [PATCH] Copilot based solution for some of the tasks.

---
 machine_learning_course/lab_s01e04.py | 651 +++++++++++++++++++++++++-
 1 file changed, 646 insertions(+), 5 deletions(-)

diff --git a/machine_learning_course/lab_s01e04.py b/machine_learning_course/lab_s01e04.py
index 559e290..22ba738 100644
--- a/machine_learning_course/lab_s01e04.py
+++ b/machine_learning_course/lab_s01e04.py
@@ -14,6 +14,7 @@
 from sklearn import metrics
 from sklearn import pipeline, cluster
 from sklearn import decomposition, manifold
+from sklearn import neighbors
 
 # import gap_statistic
 
@@ -165,12 +166,652 @@ def todo_4():
     print(X_filtered.describe())
 
 
-def main():
-    todo_1()  # WIP
-    todo_2()
-    todo_3()
-    todo_4()
+def todo_5():
+    # Load the Pima Indians Diabetes Database - https://www.openml.org/d/37
+    
+    print_function_name()
+    
+    X, y = datasets.fetch_openml('diabetes', as_frame=True, return_X_y=True)
+    X_train, X_test, y_train, y_test = model_selection.train_test_split(
+        X, y, test_size=0.2, random_state=42
+    )
+    
+    # Extract only plas and mass features
+    X_train_features = X_train[['plas', 'mass']].values
+    X_test_features = X_test[['plas', 'mass']].values
+    
+    # Test different contamination parameters
+    contamination_values = [0.05, 0.1, 0.15]
+    
+    for contamination in contamination_values:
+        print(f"\nIsolationForest with contamination={contamination}")
+        
+        # Train IsolationForest
+        isolation_forest = ensemble.IsolationForest(
+            contamination=contamination, random_state=42
+        )
+        isolation_forest.fit(X_train_features)
+        y_pred_isolation = isolation_forest.predict(X_test_features)
+        
+        # Visualize predictions
+        plt.figure(figsize=(10, 6))
+        plt.scatter(
+            X_test_features[y_pred_isolation == 1, 0],
+            X_test_features[y_pred_isolation == 1, 1],
+            c='blue', label='Inliers', alpha=0.6
+        )
+        plt.scatter(
+            X_test_features[y_pred_isolation == -1, 0],
+            X_test_features[y_pred_isolation == -1, 1],
+            c='red', label='Outliers', alpha=0.6
+        )
+        
+        # Create decision boundary using contour plot
+        h = 0.5  # Step size in the mesh
+        x_min, x_max = X_test_features[:, 0].min() - 1, X_test_features[:, 0].max() + 1
+        y_min, y_max = X_test_features[:, 1].min() - 1, X_test_features[:, 1].max() + 1
+        
+        xx, yy = np.meshgrid(
+            np.arange(x_min, x_max, h),
+            np.arange(y_min, y_max, h)
+        )
+        
+        Z = isolation_forest.predict(np.c_[xx.ravel(), yy.ravel()])
+        Z = Z.reshape(xx.shape)
+        
+        plt.contourf(xx, yy, Z, alpha=0.3, levels=[-1, 0], colors=['red'])
+        plt.contourf(xx, yy, Z, alpha=0.1, levels=[0, 1], colors=['blue'])
+        
+        plt.xlabel('plas')
+        plt.ylabel('mass')
+        plt.title(f'IsolationForest (contamination={contamination})')
+        plt.legend()
+    
+    # Test LocalOutlierFactor method
+    # print("\n\nLocalOutlierFactor with n_neighbors=20")
+    # lof = neighbors.LocalOutlierFactor(n_neighbors=20)
+    # y_pred_lof = lof.fit_predict(X_test_features)
+    #
+    # plt.figure(figsize=(10, 6))
+    # plt.scatter(
+    #     X_test_features[y_pred_lof == 1, 0],
+    #     X_test_features[y_pred_lof == 1, 1],
+    #     c='green', label='Inliers', alpha=0.6
+    # )
+    # plt.scatter(
+    #     X_test_features[y_pred_lof == -1, 0],
+    #     X_test_features[y_pred_lof == -1, 1],
+    #     c='orange', label='Outliers', alpha=0.6
+    # )
+    #
+    # Create decision boundary for LOF using contour
+    # h = 0.5
+    # x_min, x_max = X_test_features[:, 0].min() - 1, X_test_features[:, 0].max() + 1
+    # y_min, y_max = X_test_features[:, 1].min() - 1, X_test_features[:, 1].max() + 1
+    #
+    # xx, yy = np.meshgrid(
+    #     np.arange(x_min, x_max, h),
+    #     np.arange(y_min, y_max, h)
+    # )
+    #
+    # Z_lof = lof.fit_predict(np.c_[xx.ravel(), yy.ravel()])
+    # Z_lof = Z_lof.reshape(xx.shape)
+    #
+    # plt.contourf(xx, yy, Z_lof, alpha=0.3, levels=[-1, 0], colors=['orange'])
+    # plt.contourf(xx, yy, Z_lof, alpha=0.1, levels=[0, 1], colors=['green'])
+    #
+    # plt.xlabel('plas')
+    # plt.ylabel('mass')
+    # plt.title('LocalOutlierFactor (n_neighbors=20)')
+    # plt.legend()
+
+    plt.show()
+
+def todo_6():
+    # Analyze https://scikit-learn.org/stable/auto_examples/miscellaneous/plot_anomaly_comparison.html
+    # For the Pima Indians Diabetes Database example, visualize the area where values are considered as inliers
+    # Comparison of multiple anomaly detection methods
+    
+    print_function_name()
+    
+    from sklearn import covariance
+    
+    X, y = datasets.fetch_openml('diabetes', as_frame=True, return_X_y=True)
+    X_train, X_test, y_train, y_test = model_selection.train_test_split(
+        X, y, test_size=0.2, random_state=42
+    )
+    
+    # Extract only plas and mass features
+    X_train_features = X_train[['plas', 'mass']].values
+    X_test_features = X_test[['plas', 'mass']].values
+    
+    # Define the anomaly detection methods
+    methods = {
+        'IsolationForest': ensemble.IsolationForest(contamination=0.1, random_state=42),
+        'LocalOutlierFactor': neighbors.LocalOutlierFactor(n_neighbors=20),
+        # 'OneClassSVM': svm.OneClassSVM(nu=0.1, kernel='rbf', gamma='auto'),
+        'EllipticEnvelope': covariance.EllipticEnvelope(contamination=0.1, random_state=42)
+    }
+    
+    # Create a mesh to plot decision boundaries
+    h = 0.5  # Step size in the mesh
+    x_min, x_max = X_test_features[:, 0].min() - 1, X_test_features[:, 0].max() + 1
+    y_min, y_max = X_test_features[:, 1].min() - 1, X_test_features[:, 1].max() + 1
+    
+    xx, yy = np.meshgrid(
+        np.arange(x_min, x_max, h),
+        np.arange(y_min, y_max, h)
+    )
+    
+    # Create subplots for each method
+    fig, axes = plt.subplots(2, 2, figsize=(14, 12))
+    axes = axes.ravel()
+    
+    for idx, (method_name, method) in enumerate(methods.items()):
+        print(f"\n{method_name}")
+        
+        # Fit the method on training data
+        if method_name == 'LocalOutlierFactor':
+            method.fit(X_train_features)
+            y_pred_train = method.fit_predict(X_train_features)
+            y_pred_test = method.fit_predict(X_test_features)
+        else:
+            method.fit(X_train_features)
+            y_pred_train = method.predict(X_train_features)
+            y_pred_test = method.predict(X_test_features)
+        
+        # Predict on mesh
+        if method_name == 'LocalOutlierFactor':
+            Z = method.fit_predict(np.c_[xx.ravel(), yy.ravel()])
+        else:
+            Z = method.predict(np.c_[xx.ravel(), yy.ravel()])
+        
+        Z = Z.reshape(xx.shape)
+        
+        # Plot decision boundary
+        ax = axes[idx]
+        ax.contourf(xx, yy, Z, alpha=0.3, levels=[-1, 0], colors=['red'])
+        ax.contourf(xx, yy, Z, alpha=0.1, levels=[0, 1], colors=['blue'])
+        
+        # Plot inliers and outliers
+        ax.scatter(
+            X_test_features[y_pred_test == 1, 0],
+            X_test_features[y_pred_test == 1, 1],
+            c='blue', marker='o', label='Inliers', alpha=0.7, edgecolors='k'
+        )
+        ax.scatter(
+            X_test_features[y_pred_test == -1, 0],
+            X_test_features[y_pred_test == -1, 1],
+            c='red', marker='X', label='Outliers', alpha=0.7, edgecolors='k', s=100
+        )
+        
+        ax.set_xlabel('plas')
+        ax.set_ylabel('mass')
+        ax.set_title(method_name)
+        ax.legend()
+        ax.set_xlim(x_min, x_max)
+        ax.set_ylim(y_min, y_max)
+        
+        # Print statistics
+        n_outliers_test = (y_pred_test == -1).sum()
+        print(f"  Outliers detected in test set: {n_outliers_test} / {len(y_pred_test)}")
+    
+    plt.tight_layout()
+    plt.show()
+
 
+def todo_7():
+    # Reflect on why cross-validation is used in this task
+    # Test GridSearchCV and RandomizedSearchCV for hyperparameter tuning
+    # of Decision Tree and SVM on a small dataset (iris)
+    # Visualize the results
+    # Save the best model to a file
+    
+    print_function_name()
+    
+    import joblib
+    from sklearn import tree
+    
+    # Load iris dataset
+    X, y = datasets.load_iris(return_X_y=True)
+    X_train, X_test, y_train, y_test = model_selection.train_test_split(
+        X, y, test_size=0.3, random_state=42
+    )
+    
+    # ==================== GridSearchCV for SVM ====================
+    print("\n" + "="*60)
+    print("GridSearchCV for SVM")
+    print("="*60)
+    
+    svm_parameters = {
+        'kernel': ('linear', 'rbf', 'poly'),
+        'C': [0.1, 1, 10],
+        'gamma': ['scale', 'auto']
+    }
+    
+    svm_clf = model_selection.GridSearchCV(
+        svm.SVC(),
+        svm_parameters,
+        cv=5,
+        n_jobs=-1,
+        verbose=1
+    )
+    svm_clf.fit(X_train, y_train)
+    
+    print(f"\nBest SVM parameters: {svm_clf.best_params_}")
+    print(f"Best SVM CV score: {svm_clf.best_score_:.4f}")
+    print(f"SVM test score: {svm_clf.score(X_test, y_test):.4f}")
+    
+    # Create pivot table for SVM results (kernel vs C)
+    svm_results_df = pd.DataFrame(svm_clf.cv_results_)
+    svm_pivot = pd.pivot_table(
+        svm_results_df,
+        values='mean_test_score',
+        index='param_kernel',
+        columns='param_C'
+    )
+    
+    plt.figure(figsize=(8, 6))
+    sns.heatmap(svm_pivot, annot=True, fmt='.3f', cmap='viridis', cbar_kws={'label': 'Mean CV Score'})
+    plt.title('GridSearchCV Results: SVM (Kernel vs C)')
+    plt.xlabel('C parameter')
+    plt.ylabel('Kernel')
+    
+    # ==================== GridSearchCV for Decision Tree ====================
+    print("\n" + "="*60)
+    print("GridSearchCV for Decision Tree")
+    print("="*60)
+    
+    dt_parameters = {
+        'max_depth': [2, 3, 4, 5, 6],
+        'min_samples_split': [2, 5, 10],
+        'criterion': ['gini', 'entropy']
+    }
+    
+    dt_clf = model_selection.GridSearchCV(
+        tree.DecisionTreeClassifier(random_state=42),
+        dt_parameters,
+        cv=5,
+        n_jobs=-1,
+        verbose=1
+    )
+    dt_clf.fit(X_train, y_train)
+    
+    print(f"\nBest Decision Tree parameters: {dt_clf.best_params_}")
+    print(f"Best Decision Tree CV score: {dt_clf.best_score_:.4f}")
+    print(f"Decision Tree test score: {dt_clf.score(X_test, y_test):.4f}")
+    
+    # Create pivot table for Decision Tree results (max_depth vs min_samples_split)
+    dt_results_df = pd.DataFrame(dt_clf.cv_results_)
+    dt_pivot = pd.pivot_table(
+        dt_results_df,
+        values='mean_test_score',
+        index='param_max_depth',
+        columns='param_min_samples_split'
+    )
+    
+    plt.figure(figsize=(8, 6))
+    sns.heatmap(dt_pivot, annot=True, fmt='.3f', cmap='viridis', cbar_kws={'label': 'Mean CV Score'})
+    plt.title('GridSearchCV Results: Decision Tree (Max Depth vs Min Samples Split)')
+    plt.xlabel('Min Samples Split')
+    plt.ylabel('Max Depth')
+    
+    # ==================== RandomizedSearchCV for comparison ====================
+    print("\n" + "="*60)
+    print("RandomizedSearchCV for SVM (for comparison)")
+    print("="*60)
+    
+    svm_random_clf = model_selection.RandomizedSearchCV(
+        svm.SVC(),
+        svm_parameters,
+        n_iter=10,
+        cv=5,
+        n_jobs=-1,
+        random_state=42,
+        verbose=1
+    )
+    svm_random_clf.fit(X_train, y_train)
+    
+    print(f"\nBest SVM parameters (RandomizedSearchCV): {svm_random_clf.best_params_}")
+    print(f"Best SVM CV score (RandomizedSearchCV): {svm_random_clf.best_score_:.4f}")
+    print(f"SVM test score (RandomizedSearchCV): {svm_random_clf.score(X_test, y_test):.4f}")
+    
+    # ==================== Save best models ====================
+    print("\n" + "="*60)
+    print("Saving best models")
+    print("="*60)
+    
+    # Determine which model is best overall
+    if svm_clf.best_score_ >= dt_clf.best_score_:
+        best_model = svm_clf
+        best_model_name = "GridSearchCV_SVM_best_model.pkl"
+    else:
+        best_model = dt_clf
+        best_model_name = "GridSearchCV_DecisionTree_best_model.pkl"
+    
+    joblib.dump(best_model, best_model_name)
+    print(f"\nBest model saved as: {best_model_name}")
+    print(f"Best model type: {type(best_model.best_estimator_).__name__}")
+    print(f"Best model CV score: {best_model.best_score_:.4f}")
+    
+    # Save all models for reference
+    joblib.dump(svm_clf, "GridSearchCV_SVM_full_results.pkl")
+    joblib.dump(dt_clf, "GridSearchCV_DecisionTree_full_results.pkl")
+    joblib.dump(svm_random_clf, "RandomizedSearchCV_SVM_full_results.pkl")
+    
+    print("\nAll models saved:")
+    print("  - GridSearchCV_SVM_full_results.pkl")
+    print("  - GridSearchCV_DecisionTree_full_results.pkl")
+    print("  - RandomizedSearchCV_SVM_full_results.pkl")
+    
+    # Create comparison plot
+    plt.figure(figsize=(10, 6))
+    models_scores = {
+        'GridSearchCV SVM': svm_clf.best_score_,
+        'GridSearchCV Decision Tree': dt_clf.best_score_,
+        'RandomizedSearchCV SVM': svm_random_clf.best_score_
+    }
+    
+    bars = plt.bar(models_scores.keys(), models_scores.values(), color=['#1f77b4', '#ff7f0e', '#2ca02c'])
+    plt.ylabel('Mean CV Score')
+    plt.title('Comparison of Hyperparameter Search Methods')
+    plt.ylim([0.9, 1.0])
+    
+    # Add value labels on bars
+    for bar in bars:
+        height = bar.get_height()
+        plt.text(bar.get_x() + bar.get_width()/2., height,
+                f'{height:.4f}',
+                ha='center', va='bottom')
+    
+    plt.tight_layout()
+    plt.show()
+
+
+def todo_10():
+    # Model Ensembling: Voting and Stacking
+    # Combine multiple classifiers to improve predictions
+    # Understand hard voting (class) vs soft voting (probabilities)
+    # Compare with individual base learners
+    
+    print_function_name()
+    
+    import joblib
+    from sklearn import linear_model
+    from sklearn import tree
+    
+    # Load Breast Cancer dataset (binary classification, more realistic medical data)
+    X, y = datasets.load_breast_cancer(return_X_y=True)
+    X_train, X_test, y_train, y_test = model_selection.train_test_split(
+        X, y, test_size=0.3, random_state=42
+    )
+    
+    # Normalize features (important for SVM and KNN)
+    scaler = preprocessing.StandardScaler()
+    X_train = scaler.fit_transform(X_train)
+    X_test = scaler.transform(X_test)
+    
+    # Define diverse base classifiers
+    clf_svm = svm.SVC(kernel='rbf', C=1, probability=True, random_state=42)
+    clf_dt = tree.DecisionTreeClassifier(max_depth=10, random_state=42)
+    clf_knn = neighbors.KNeighborsClassifier(n_neighbors=5)
+    clf_lr = linear_model.LogisticRegression(max_iter=5000, random_state=42)
+    
+    print("\n" + "="*70)
+    print("Training individual base classifiers on Breast Cancer Dataset")
+    print("="*70)
+    print(f"Dataset: {X.shape[0]} samples, {X.shape[1]} features, 2 classes")
+    
+    # Train base classifiers
+    clf_svm.fit(X_train, y_train)
+    clf_dt.fit(X_train, y_train)
+    clf_knn.fit(X_train, y_train)
+    clf_lr.fit(X_train, y_train)
+    
+    # Evaluate individual classifiers
+    print(f"SVM test score: {clf_svm.score(X_test, y_test):.4f}")
+    print(f"Decision Tree test score: {clf_dt.score(X_test, y_test):.4f}")
+    print(f"KNN test score: {clf_knn.score(X_test, y_test):.4f}")
+    print(f"Logistic Regression test score: {clf_lr.score(X_test, y_test):.4f}")
+    
+    # ==================== Display Probabilities for Sample Predictions ====================
+    print("\n" + "="*70)
+    print("Class Probabilities for Sample Test Predictions")
+    print("="*70)
+    
+    n_samples_to_show = 5
+    sample_indices = np.random.choice(len(X_test), n_samples_to_show, replace=False)
+    
+    # Create figure for probability visualizations
+    fig, axes = plt.subplots(n_samples_to_show, 1, figsize=(10, 2*n_samples_to_show))
+    if n_samples_to_show == 1:
+        axes = [axes]
+    
+    for idx, sample_idx in enumerate(sample_indices):
+        X_sample = X_test[sample_idx:sample_idx+1]
+        y_true = y_test[sample_idx]
+        
+        # Get probabilities from classifiers that support predict_proba
+        proba_svm = clf_svm.predict_proba(X_sample)[0]
+        proba_dt = clf_dt.predict_proba(X_sample)[0]
+        proba_knn = clf_knn.predict_proba(X_sample)[0]
+        proba_lr = clf_lr.predict_proba(X_sample)[0]
+        
+        # Make predictions
+        pred_svm = clf_svm.predict(X_sample)[0]
+        pred_dt = clf_dt.predict(X_sample)[0]
+        pred_knn = clf_knn.predict(X_sample)[0]
+        pred_lr = clf_lr.predict(X_sample)[0]
+        
+        # Print textual representation
+        true_label = "Malignant" if y_true == 0 else "Benign"
+        print(f"\nSample {idx+1} (True label: {y_true} - {true_label})")
+        print(f"  SVM prediction: {pred_svm}, probabilities: malignant={proba_svm[0]:.4f}, benign={proba_svm[1]:.4f}")
+        print(f"  Decision Tree prediction: {pred_dt}, probabilities: malignant={proba_dt[0]:.4f}, benign={proba_dt[1]:.4f}")
+        print(f"  KNN prediction: {pred_knn}, probabilities: malignant={proba_knn[0]:.4f}, benign={proba_knn[1]:.4f}")
+        print(f"  Logistic Regression prediction: {pred_lr}, probabilities: malignant={proba_lr[0]:.4f}, benign={proba_lr[1]:.4f}")
+        
+        # Visualize probabilities for both classes
+        classes = ['Malignant', 'Benign']
+        ax = axes[idx]
+        
+        x_pos = np.arange(len(classes))
+        width = 0.2
+        
+        ax.bar(x_pos - 1.5*width, proba_svm, width, label='SVM', alpha=0.8)
+        ax.bar(x_pos - 0.5*width, proba_dt, width, label='Decision Tree', alpha=0.8)
+        ax.bar(x_pos + 0.5*width, proba_knn, width, label='KNN', alpha=0.8)
+        ax.bar(x_pos + 1.5*width, proba_lr, width, label='Logistic Regression', alpha=0.8)
+        
+        ax.set_xlabel('Class')
+        ax.set_ylabel('Probability')
+        ax.set_title(f'Sample {idx+1} - True: {true_label} (SVM:{pred_svm}, DT:{pred_dt}, KNN:{pred_knn}, LR:{pred_lr})')
+        ax.set_xticks(x_pos)
+        ax.set_xticklabels(classes)
+        ax.legend()
+        ax.set_ylim([0, 1])
+    
+    plt.tight_layout()
+    
+    # ==================== VotingClassifier - Hard Voting ====================
+    print("\n" + "="*70)
+    print("VotingClassifier - Hard Voting (class-based)")
+    print("="*70)
+    
+    voting_hard = ensemble.VotingClassifier(
+        estimators=[
+            ('svm', clf_svm),
+            ('dt', clf_dt),
+            ('knn', clf_knn),
+            ('lr', clf_lr)
+        ],
+        voting='hard'
+    )
+    voting_hard.fit(X_train, y_train)
+    
+    hard_score = voting_hard.score(X_test, y_test)
+    print(f"Hard Voting test score: {hard_score:.4f}")
+    
+    # ==================== VotingClassifier - Soft Voting ====================
+    print("\n" + "="*70)
+    print("VotingClassifier - Soft Voting (probability-based)")
+    print("="*70)
+    
+    voting_soft = ensemble.VotingClassifier(
+        estimators=[
+            ('svm', clf_svm),
+            ('dt', clf_dt),
+            ('knn', clf_knn),
+            ('lr', clf_lr)
+        ],
+        voting='soft'
+    )
+    voting_soft.fit(X_train, y_train)
+    
+    soft_score = voting_soft.score(X_test, y_test)
+    print(f"Soft Voting test score: {soft_score:.4f}")
+    
+    # ==================== StackingClassifier ====================
+    print("\n" + "="*70)
+    print("StackingClassifier (meta-learner: Logistic Regression)")
+    print("="*70)
+    
+    stacking_clf = ensemble.StackingClassifier(
+        estimators=[
+            ('svm', svm.SVC(kernel='rbf', C=1, probability=True, random_state=42)),
+            ('dt', tree.DecisionTreeClassifier(max_depth=10, random_state=42)),
+            ('knn', neighbors.KNeighborsClassifier(n_neighbors=5)),
+            ('lr', linear_model.LogisticRegression(max_iter=5000, random_state=42))
+        ],
+        final_estimator=linear_model.LogisticRegression(max_iter=5000, random_state=42),
+        cv=5,
+        stack_method='predict_proba'
+    )
+    stacking_clf.fit(X_train, y_train)
+    
+    stacking_score = stacking_clf.score(X_test, y_test)
+    print(f"Stacking test score: {stacking_score:.4f}")
+    
+    # ==================== Comparison ====================
+    print("\n" + "="*70)
+    print("Model Comparison")
+    print("="*70)
+    
+    results = {
+        'SVM': clf_svm.score(X_test, y_test),
+        'Decision Tree': clf_dt.score(X_test, y_test),
+        'KNN': clf_knn.score(X_test, y_test),
+        'Logistic Regression': clf_lr.score(X_test, y_test),
+        'Voting (Hard)': hard_score,
+        'Voting (Soft)': soft_score,
+        'Stacking': stacking_score
+    }
+    
+    for model_name, score in sorted(results.items(), key=lambda x: x[1], reverse=True):
+        print(f"{model_name:25s}: {score:.4f}")
+    
+    # ==================== Visualization of Results ====================
+    plt.figure(figsize=(12, 6))
+    models = list(results.keys())
+    scores = list(results.values())
+    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2']
+    
+    bars = plt.bar(models, scores, color=colors[:len(models)])
+    plt.ylabel('Test Score (Accuracy)')
+    plt.title('Ensemble Methods Comparison on Breast Cancer Dataset')
+    plt.ylim([0.9, 1.0])
+    plt.xticks(rotation=45, ha='right')
+    
+    # Add value labels on bars
+    for bar in bars:
+        height = bar.get_height()
+        plt.text(bar.get_x() + bar.get_width()/2., height,
+                f'{height:.4f}',
+                ha='center', va='bottom', fontsize=10, fontweight='bold')
+    
+    plt.tight_layout()
+    
+    # ==================== Visualize Voting Results ====================
+    print("\n" + "="*70)
+    print("Analyzing Voting Ensemble Decisions")
+    print("="*70)
+    
+    # For a few test samples, show voting counts
+    n_voting_samples = 5
+    voting_indices = np.random.choice(len(X_test), n_voting_samples, replace=False)
+    
+    print(f"\nVoting decisions for {n_voting_samples} random test samples:")
+    for sample_idx in voting_indices:
+        X_sample = X_test[sample_idx:sample_idx+1]
+        y_true = y_test[sample_idx]
+        
+        # Get predictions from each base classifier
+        pred_svm = clf_svm.predict(X_sample)[0]
+        pred_dt = clf_dt.predict(X_sample)[0]
+        pred_knn = clf_knn.predict(X_sample)[0]
+        pred_lr = clf_lr.predict(X_sample)[0]
+        
+        # Get ensemble predictions
+        pred_hard = voting_hard.predict(X_sample)[0]
+        pred_soft = voting_soft.predict(X_sample)[0]
+        pred_stack = stacking_clf.predict(X_sample)[0]
+        
+        true_label = "Malignant" if y_true == 0 else "Benign"
+        print(f"\nSample (True: {y_true} - {true_label})")
+        print(f"  Base votes: SVM={pred_svm}, DT={pred_dt}, KNN={pred_knn}, LR={pred_lr}")
+        print(f"  Hard Voting: {pred_hard}, Soft Voting: {pred_soft}, Stacking: {pred_stack}")
+    
+    # ==================== Save Best Model ====================
+    print("\n" + "="*70)
+    print("Saving Models")
+    print("="*70)
+    
+    # Find best model
+    best_model_name = max(results, key=results.get)
+    best_score = results[best_model_name]
+    
+    if best_model_name == 'Voting (Soft)':
+        best_model = voting_soft
+    elif best_model_name == 'Voting (Hard)':
+        best_model = voting_hard
+    elif best_model_name == 'Stacking':
+        best_model = stacking_clf
+    elif best_model_name == 'SVM':
+        best_model = clf_svm
+    elif best_model_name == 'Decision Tree':
+        best_model = clf_dt
+    elif best_model_name == 'KNN':
+        best_model = clf_knn
+    else:  # Logistic Regression
+        best_model = clf_lr
+    
+    joblib.dump(best_model, 'best_ensemble_model_breastcancer.pkl')
+    print(f"\nBest model: {best_model_name} (score: {best_score:.4f})")
+    print(f"Saved as: best_ensemble_model_breastcancer.pkl")
+    
+    # Save all ensemble models
+    joblib.dump(voting_hard, 'voting_hard_classifier_breastcancer.pkl')
+    joblib.dump(voting_soft, 'voting_soft_classifier_breastcancer.pkl')
+    joblib.dump(stacking_clf, 'stacking_classifier_breastcancer.pkl')
+    
+    print("\nAll ensemble models saved:")
+    print("  - voting_hard_classifier_breastcancer.pkl")
+    print("  - voting_soft_classifier_breastcancer.pkl")
+    print("  - stacking_classifier_breastcancer.pkl")
+    print("  - best_ensemble_model_breastcancer.pkl")
+    
+    plt.show()
+
+
+def main():
+    # todo_1()  # WIP
+    # todo_2()
+    # todo_3()
+    # todo_4()
+    todo_5()
+    todo_6()
+    todo_7()
+    todo_10()
 
 if __name__ == '__main__':
     main()