-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path05_cross_validation.py
More file actions
114 lines (82 loc) · 3.55 KB
/
05_cross_validation.py
File metadata and controls
114 lines (82 loc) · 3.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 20 14:56:01 2016
@author: anooptp
"""
from sklearn.datasets import load_iris
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; the same helpers are provided by sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import KFold

# Load the iris dataset: X holds the feature values, y the target labels.
iris = load_iris()
X = iris.data
y = iris.target

# Hold out part of the data for testing; random_state pins the split so the
# printed accuracy is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

# check classification accuracy of KNN with K=5
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
# accuracy_score is documented as (y_true, y_pred); pass them in that order.
print("train_test_split method: ", metrics.accuracy_score(y_test, y_pred))
# Optional illustration (left disabled): how 5-fold cross-validation splits
# 25 observations, written against the sklearn.model_selection API.
#from sklearn.model_selection import KFold
#kf = KFold(n_splits=5, shuffle=False).split(range(25))
# print the contents of each training and testing set
#print('{} {:^61} {}'.format('Iteration', 'Training set observations', 'Testing set observations'))
#for iteration, data in enumerate(kf, start=1):
#    print('{:^9} {} {:^25}'.format(iteration, data[0], data[1]))
# Cross-validation example: parameter tuning
# cross_val_score is provided by sklearn.model_selection; the older
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation with K=5 for KNN (the n_neighbors parameter)
# n_neighbors is spelled out so the code matches the comment rather than
# relying on the estimator's default value.
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')

# Show the per-fold accuracies first, then their mean as a single summary.
print("cross_val_score method: ", scores)
print("cross_val_score method: ", scores.mean())
# search for an optimal value of K for KNN
k_range = list(range(1, 31))

# For every candidate K, run 10-fold cross-validation and keep the mean
# accuracy. k_scores[i] corresponds to k_range[i].
k_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k), X, y, cv=10, scoring='accuracy'
    ).mean()
    for k in k_range
]

# Printed once, after the whole search has finished.
print("k_scores: ", k_scores)
import matplotlib.pyplot as plt

# Plot the cross-validated accuracy (y-axis) against the K that produced it
# (x-axis), drawing on the current axes and then displaying the figure.
axes = plt.gca()
axes.plot(k_range, k_scores)
axes.set_xlabel('Value of K for KNN')
axes.set_ylabel('Cross-Validated Accuracy')
plt.show()
# Cross-validation example: model selection
# Goal: Compare the best KNN model with logistic regression on the iris dataset

# 10-fold cross-validation with the best KNN model
knn = KNeighborsClassifier(n_neighbors=20)
knn_fold_accuracy = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
print("KNeighborsClassifier(n_neighbors = 20): ", knn_fold_accuracy.mean())

# 10-fold cross-validation with logistic regression
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg_fold_accuracy = cross_val_score(logreg, X, y, cv=10, scoring='accuracy')
print("LogisticRegression: ", logreg_fold_accuracy.mean())
# Cross-validation example: feature selection
# Goal: Select whether the Newspaper feature should be included in the linear regression model on the advertising dataset
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# read in the advertising dataset
# The first CSV column is used as the row index (index_col=0).
# NOTE(review): the file is downloaded on every run -- confirm this URL still
# resolves, or point it at a local copy of Advertising.csv.
data = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)
print("------- Advertising data -------")
print(data.head())

# Predict Sales from all three advertising channels.
feature_cols = ['TV', 'Radio', 'Newspaper']
X = data[feature_cols]
y = data.Sales

# 10-fold cross-validation with all three features
# The scorer is named 'neg_mean_squared_error'; the old 'mean_squared_error'
# name was removed in scikit-learn 0.20. It yields the *negated* MSE so that,
# like every scorer, larger values mean a better model.
linreg = LinearRegression()
scores = cross_val_score(linreg, X, y, cv=10, scoring='neg_mean_squared_error')
print("LinearRegression: ", scores)

# fix the sign of MSE scores
mse_scores = -scores
print("LinearRegression (mse): ", mse_scores)

# convert from MSE to RMSE
rmse_scores = np.sqrt(mse_scores)
print("LinearRegression (rmse): ", rmse_scores)

# calculate the average RMSE
print("LinearRegression (rmse): ", rmse_scores.mean())
# 10-fold cross-validation with two features (excluding Newspaper)
feature_cols = ['TV', 'Radio']
X = data[feature_cols]

# 'neg_mean_squared_error' replaces the removed 'mean_squared_error' scorer
# name. The scores are negated MSE, so flip the sign before taking the square
# root, then average the per-fold RMSE values.
neg_mse_scores = cross_val_score(linreg, X, y, cv=10, scoring='neg_mean_squared_error')
print("LinearRegression (excluding Newspaper): ", np.sqrt(-neg_mse_scores).mean())