Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions Python3Code/Chapter7/FeatureSelection.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,14 @@
import copy
import numpy as np
from operator import itemgetter
import pandas as pd

# Specifies feature selection approaches for classification to identify the most important features.
class FeatureSelectionClassification:

# Forward selection for classification, which selects a pre-defined number of features (max_features)
# that show the best accuracy. We assume a decision tree learner for this purpose, but
# this can easily be changed. It returns the best features.
def forward_selection(self, max_features, X_train, y_train, gridsearch):
def forward_selection(self, max_features, X_train, X_test, y_train, y_test, gridsearch):
# Start with no features.
ordered_features = []
ordered_scores = []
Expand All @@ -35,9 +34,7 @@ def forward_selection(self, max_features, X_train, y_train, gridsearch):

# Select the appropriate number of features.
for i in range(0, max_features):
#print(i)

#Determine the features left to select.
# Determine the features left to select.
features_left = list(set(X_train.columns) - set(selected_features))
best_perf = 0
best_attribute = ''
Expand All @@ -50,19 +47,24 @@ def forward_selection(self, max_features, X_train, y_train, gridsearch):

# Determine the accuracy of a decision tree learner if we were to add
# the feature.
pred_y_train, pred_y_test, prob_training_y, prob_test_y = ca.decision_tree(X_train[temp_selected_features], y_train, X_train[temp_selected_features], gridsearch=False)
perf = ce.accuracy(y_train, pred_y_train)
pred_y_train, pred_y_test, prob_training_y, prob_test_y = ca.decision_tree(X_train[temp_selected_features],
y_train,
X_test[temp_selected_features],
gridsearch=False)
perf = ce.accuracy(y_test, pred_y_test)

# If the performance is better than what we have seen so far (we aim for high accuracy),
# we record the current feature as the best feature and its score as the best performance.
if perf > best_perf:
best_perf = perf
best_feature = f

# We select the feature with the best performance.
selected_features.append(best_feature)
prev_best_perf = best_perf
ordered_features.append(best_feature)
ordered_scores.append(best_perf)

return selected_features, ordered_features, ordered_scores

# Backward selection for classification which selects a pre-defined number of features (max_features)
Expand Down
3 changes: 1 addition & 2 deletions Python3Code/Chapter7/LearningAlgorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,10 +174,9 @@ def decision_tree(self, train_X, train_y, test_X, min_samples_leaf=50, criterion
'criterion':['gini', 'entropy']}]
dtree = GridSearchCV(DecisionTreeClassifier(), tuned_parameters, cv=5, scoring='accuracy')
else:
dtree = DecisionTreeClassifier(criterion=criterion)
dtree = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, criterion=criterion)

# Fit the model

dtree.fit(train_X, train_y.values.ravel())

if gridsearch and print_model_details:
Expand Down
46 changes: 17 additions & 29 deletions Python3Code/crowdsignals_ch7_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,24 +7,15 @@
# #
##############################################################

import os
import copy
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import time
start = time.time()

from sklearn.model_selection import train_test_split

from Chapter7.PrepareDatasetForLearning import PrepareDatasetForLearning
from Chapter7.LearningAlgorithms import ClassificationAlgorithms
from Chapter7.LearningAlgorithms import RegressionAlgorithms
from Chapter7.Evaluation import ClassificationEvaluation
from Chapter7.Evaluation import RegressionEvaluation
from Chapter7.FeatureSelection import FeatureSelectionClassification
from Chapter7.FeatureSelection import FeatureSelectionRegression
from util import util
from util.VisualizeDataset import VisualizeDataset

Expand Down Expand Up @@ -84,30 +75,31 @@
fs = FeatureSelectionClassification()

features, ordered_features, ordered_scores = fs.forward_selection(N_FORWARD_SELECTION,
train_X[features_after_chapter_5], train_y, gridsearch=False)
train_X[features_after_chapter_5],
test_X[features_after_chapter_5],
train_y,
test_y,
gridsearch=False)

DataViz.plot_xy(x=[range(1, N_FORWARD_SELECTION+1)], y=[ordered_scores],
xlabel='number of features', ylabel='accuracy')


# based on python2 features, slightly different.
# based on python2 features, slightly different.
selected_features = ['acc_phone_y_freq_0.0_Hz_ws_40', 'press_phone_pressure_temp_mean_ws_120', 'gyr_phone_x_temp_std_ws_120',
'mag_watch_y_pse', 'mag_phone_z_max_freq', 'gyr_watch_y_freq_weighted', 'gyr_phone_y_freq_1.0_Hz_ws_40',
'acc_phone_x_freq_1.9_Hz_ws_40', 'mag_watch_z_freq_0.9_Hz_ws_40', 'acc_watch_y_freq_0.5_Hz_ws_40']

# Let us first study the impact of regularization and model complexity: does regularization prevent overfitting?

learner = ClassificationAlgorithms()
eval = ClassificationEvaluation()
start = time.time()


reg_parameters = [0.0001, 0.001, 0.01, 0.1, 1, 10]
performance_training = []
performance_test = []
## Due to runtime constraints we run the experiment 3 times, yet if you want even more robust data one should increase the repetitions.
N_REPEATS_NN = 3

## Due to runtime constraints we run the experiment 3 times, yet if you want even more robust data one should increase the repetitions.
N_REPEATS_NN = 3

for reg_param in reg_parameters:
performance_tr = 0
Expand All @@ -130,7 +122,6 @@

# Second, let us consider the influence of certain parameter settings for the tree model (closely related to
# regularization) and study the impact on performance.

leaf_settings = [1,2,5,10]
performance_training = []
performance_test = []
Expand All @@ -150,7 +141,6 @@

# So yes, it is important :) Therefore we perform grid searches over the most important parameters, and do so by means
# of cross-validation on the training set.

possible_feature_sets = [basic_features, features_after_chapter_3, features_after_chapter_4, features_after_chapter_5, selected_features]
feature_names = ['initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Selected features']
N_KCV_REPEATS = 5
Expand All @@ -165,7 +155,6 @@
selected_test_X = test_X[possible_feature_sets[i]]

# First we run our non-deterministic classifiers a number of times to average their scores.

performance_tr_nn = 0
performance_tr_rf = 0
performance_tr_svm = 0
Expand All @@ -181,31 +170,31 @@
print("Training RandomForest run {} / {} ... ".format(repeat, N_KCV_REPEATS, feature_names[i]))
performance_tr_nn += eval.accuracy(train_y, class_train_y)
performance_te_nn += eval.accuracy(test_y, class_test_y)

class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
selected_train_X, train_y, selected_test_X, gridsearch=True
)

performance_tr_rf += eval.accuracy(train_y, class_train_y)
performance_te_rf += eval.accuracy(test_y, class_test_y)

print("Training SVM run {} / {}, featureset: {}... ".format(repeat, N_KCV_REPEATS, feature_names[i]))

class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.support_vector_machine_with_kernel(
selected_train_X, train_y, selected_test_X, gridsearch=True
)
performance_tr_svm += eval.accuracy(train_y, class_train_y)
performance_te_svm += eval.accuracy(test_y, class_test_y)


overall_performance_tr_nn = performance_tr_nn/N_KCV_REPEATS
overall_performance_te_nn = performance_te_nn/N_KCV_REPEATS
overall_performance_tr_rf = performance_tr_rf/N_KCV_REPEATS
overall_performance_te_rf = performance_te_rf/N_KCV_REPEATS
overall_performance_tr_svm = performance_tr_svm/N_KCV_REPEATS
overall_performance_te_svm = performance_te_svm/N_KCV_REPEATS

# #And we run our deterministic classifiers:
# And we run our deterministic classifiers:
print("Determenistic Classifiers:")

print("Training Nearest Neighbor run 1 / 1, featureset {}:".format(feature_names[i]))
Expand All @@ -218,14 +207,14 @@
class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
selected_train_X, train_y, selected_test_X, gridsearch=True
)

performance_tr_dt = eval.accuracy(train_y, class_train_y)
performance_te_dt = eval.accuracy(test_y, class_test_y)
print("Training Naive Bayes run 1/1 featureset {}:".format(feature_names[i]))
class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.naive_bayes(
selected_train_X, train_y, selected_test_X
)

performance_tr_nb = eval.accuracy(train_y, class_train_y)
performance_te_nb = eval.accuracy(test_y, class_test_y)

Expand All @@ -241,9 +230,8 @@

DataViz.plot_performances_classification(['NN', 'RF','SVM', 'KNN', 'DT', 'NB'], feature_names, scores_over_all_algs)

# # And we study two promising ones in more detail. First, let us consider the decision tree, which works best with the
# # selected features.

# And we study two promising ones in more detail. First, let us consider the decision tree, which works best with the
# selected features.
class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(train_X[selected_features], train_y, test_X[selected_features],
gridsearch=True,
print_model_details=True, export_tree_path=EXPORT_TREE_PATH)
Expand Down