From 276f186a45f65ce102ddf059575ac2489ccc2a28 Mon Sep 17 00:00:00 2001 From: Okke van Eck Date: Wed, 9 Jun 2021 12:27:09 +0200 Subject: [PATCH] Fixed chapter7 accuracy bugs --- Python3Code/Chapter7/FeatureSelection.py | 16 ++++--- Python3Code/Chapter7/LearningAlgorithms.py | 3 +- .../crowdsignals_ch7_classification.py | 46 +++++++------------ 3 files changed, 27 insertions(+), 38 deletions(-) diff --git a/Python3Code/Chapter7/FeatureSelection.py b/Python3Code/Chapter7/FeatureSelection.py index 7c2ed07c..03cb74a4 100755 --- a/Python3Code/Chapter7/FeatureSelection.py +++ b/Python3Code/Chapter7/FeatureSelection.py @@ -16,7 +16,6 @@ import copy import numpy as np from operator import itemgetter -import pandas as pd # Specifies feature selection approaches for classification to identify the most important features. class FeatureSelectionClassification: @@ -24,7 +23,7 @@ class FeatureSelectionClassification: # Forward selection for classification which selects a pre-defined number of features (max_features) # that show the best accuracy. We assume a decision tree learning for this purpose, but # this can easily be changed. It return the best features. - def forward_selection(self, max_features, X_train, y_train, gridsearch): + def forward_selection(self, max_features, X_train, X_test, y_train, y_test, gridsearch): # Start with no features. ordered_features = [] ordered_scores = [] @@ -35,9 +34,7 @@ def forward_selection(self, max_features, X_train, y_train, gridsearch): # Select the appropriate number of features. for i in range(0, max_features): - #print(i) - - #Determine the features left to select. + # Determine the features left to select. features_left = list(set(X_train.columns) - set(selected_features)) best_perf = 0 best_attribute = '' @@ -50,19 +47,24 @@ def forward_selection(self, max_features, X_train, y_train, gridsearch): # Determine the accuracy of a decision tree learner if we were to add # the feature. - pred_y_train, pred_y_test, prob_training_y, prob_test_y = ca.decision_tree(X_train[temp_selected_features], y_train, X_train[temp_selected_features], gridsearch=False) - perf = ce.accuracy(y_train, pred_y_train) + pred_y_train, pred_y_test, prob_training_y, prob_test_y = ca.decision_tree(X_train[temp_selected_features], + y_train, + X_test[temp_selected_features], + gridsearch=False) + perf = ce.accuracy(y_test, pred_y_test) # If the performance is better than what we have seen so far (we aim for high accuracy) # we set the current feature to the best feature and the same for the best performance. if perf > best_perf: best_perf = perf best_feature = f + # We select the feature with the best performance. selected_features.append(best_feature) prev_best_perf = best_perf ordered_features.append(best_feature) ordered_scores.append(best_perf) + return selected_features, ordered_features, ordered_scores # Backward selection for classification which selects a pre-defined number of features (max_features) diff --git a/Python3Code/Chapter7/LearningAlgorithms.py b/Python3Code/Chapter7/LearningAlgorithms.py index 0184bff0..30a10708 100755 --- a/Python3Code/Chapter7/LearningAlgorithms.py +++ b/Python3Code/Chapter7/LearningAlgorithms.py @@ -174,10 +174,9 @@ def decision_tree(self, train_X, train_y, test_X, min_samples_leaf=50, criterion 'criterion':['gini', 'entropy']}] dtree = GridSearchCV(DecisionTreeClassifier(), tuned_parameters, cv=5, scoring='accuracy') else: - dtree = DecisionTreeClassifier(criterion=criterion) + dtree = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, criterion=criterion) # Fit the model - dtree.fit(train_X, train_y.values.ravel()) if gridsearch and print_model_details: diff --git a/Python3Code/crowdsignals_ch7_classification.py b/Python3Code/crowdsignals_ch7_classification.py index a3338048..abbb0d98 100755 --- a/Python3Code/crowdsignals_ch7_classification.py +++ b/Python3Code/crowdsignals_ch7_classification.py @@ -7,24 +7,15 @@ # # ############################################################## -import os -import copy -import numpy as np import pandas as pd from pathlib import Path -import matplotlib.pyplot as plt import time start = time.time() -from sklearn.model_selection import train_test_split - from Chapter7.PrepareDatasetForLearning import PrepareDatasetForLearning from Chapter7.LearningAlgorithms import ClassificationAlgorithms -from Chapter7.LearningAlgorithms import RegressionAlgorithms from Chapter7.Evaluation import ClassificationEvaluation -from Chapter7.Evaluation import RegressionEvaluation from Chapter7.FeatureSelection import FeatureSelectionClassification -from Chapter7.FeatureSelection import FeatureSelectionRegression from util import util from util.VisualizeDataset import VisualizeDataset @@ -84,30 +75,31 @@ fs = FeatureSelectionClassification() features, ordered_features, ordered_scores = fs.forward_selection(N_FORWARD_SELECTION, - train_X[features_after_chapter_5], train_y, gridsearch=False) + train_X[features_after_chapter_5], + test_X[features_after_chapter_5], + train_y, + test_y, + gridsearch=False) DataViz.plot_xy(x=[range(1, N_FORWARD_SELECTION+1)], y=[ordered_scores], xlabel='number of features', ylabel='accuracy') - -# based on python2 features, slightly different. +# based on python2 features, slightly different. selected_features = ['acc_phone_y_freq_0.0_Hz_ws_40', 'press_phone_pressure_temp_mean_ws_120', 'gyr_phone_x_temp_std_ws_120', 'mag_watch_y_pse', 'mag_phone_z_max_freq', 'gyr_watch_y_freq_weighted', 'gyr_phone_y_freq_1.0_Hz_ws_40', 'acc_phone_x_freq_1.9_Hz_ws_40', 'mag_watch_z_freq_0.9_Hz_ws_40', 'acc_watch_y_freq_0.5_Hz_ws_40'] # # # Let us first study the impact of regularization and model complexity: does regularization prevent overfitting? - learner = ClassificationAlgorithms() eval = ClassificationEvaluation() start = time.time() - reg_parameters = [0.0001, 0.001, 0.01, 0.1, 1, 10] performance_training = [] performance_test = [] -## Due to runtime constraints we run the experiment 3 times, yet if you want even more robust data one should increase the repetitions. -N_REPEATS_NN = 3 +## Due to runtime constraints we run the experiment 3 times, yet if you want even more robust data one should increase the repetitions. +N_REPEATS_NN = 3 for reg_param in reg_parameters: performance_tr = 0 @@ -130,7 +122,6 @@ #Second, let us consider the influence of certain parameter settings for the tree model. (very related to the #regularization) and study the impact on performance. - leaf_settings = [1,2,5,10] performance_training = [] performance_test = [] @@ -150,7 +141,6 @@ # So yes, it is important :) Therefore we perform grid searches over the most important parameters, and do so by means # of cross validation upon the training set. - possible_feature_sets = [basic_features, features_after_chapter_3, features_after_chapter_4, features_after_chapter_5, selected_features] feature_names = ['initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Selected features'] N_KCV_REPEATS = 5 @@ -165,7 +155,6 @@ selected_test_X = test_X[possible_feature_sets[i]] # First we run our non deterministic classifiers a number of times to average their score. - performance_tr_nn = 0 performance_tr_rf = 0 performance_tr_svm = 0 @@ -181,23 +170,23 @@ print("Training RandomForest run {} / {} ... ".format(repeat, N_KCV_REPEATS, feature_names[i])) performance_tr_nn += eval.accuracy(train_y, class_train_y) performance_te_nn += eval.accuracy(test_y, class_test_y) - + class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest( selected_train_X, train_y, selected_test_X, gridsearch=True ) - + performance_tr_rf += eval.accuracy(train_y, class_train_y) performance_te_rf += eval.accuracy(test_y, class_test_y) print("Training SVM run {} / {}, featureset: {}... ".format(repeat, N_KCV_REPEATS, feature_names[i])) - + class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.support_vector_machine_with_kernel( selected_train_X, train_y, selected_test_X, gridsearch=True ) performance_tr_svm += eval.accuracy(train_y, class_train_y) performance_te_svm += eval.accuracy(test_y, class_test_y) - + overall_performance_tr_nn = performance_tr_nn/N_KCV_REPEATS overall_performance_te_nn = performance_te_nn/N_KCV_REPEATS overall_performance_tr_rf = performance_tr_rf/N_KCV_REPEATS @@ -205,7 +194,7 @@ overall_performance_tr_svm = performance_tr_svm/N_KCV_REPEATS overall_performance_te_svm = performance_te_svm/N_KCV_REPEATS -# #And we run our deterministic classifiers: + # And we run our deterministic classifiers: print("Determenistic Classifiers:") print("Training Nearest Neighbor run 1 / 1, featureset {}:".format(feature_names[i])) @@ -218,14 +207,14 @@ class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree( selected_train_X, train_y, selected_test_X, gridsearch=True ) - + performance_tr_dt = eval.accuracy(train_y, class_train_y) performance_te_dt = eval.accuracy(test_y, class_test_y) print("Training Naive Bayes run 1/1 featureset {}:".format(feature_names[i])) class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.naive_bayes( selected_train_X, train_y, selected_test_X ) - + performance_tr_nb = eval.accuracy(train_y, class_train_y) performance_te_nb = eval.accuracy(test_y, class_test_y) @@ -241,9 +230,8 @@ DataViz.plot_performances_classification(['NN', 'RF','SVM', 'KNN', 'DT', 'NB'], feature_names, scores_over_all_algs) -# # And we study two promising ones in more detail. First, let us consider the decision tree, which works best with the -# # selected features. - +# And we study two promising ones in more detail. First, let us consider the decision tree, which works best with the +# selected features. class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(train_X[selected_features], train_y, test_X[selected_features], gridsearch=True, print_model_details=True, export_tree_path=EXPORT_TREE_PATH)