-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path04_linear_regression.py
More file actions
87 lines (61 loc) · 2.24 KB
/
04_linear_regression.py
File metadata and controls
87 lines (61 loc) · 2.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 14 00:01:24 2016
@author: anooptp
"""
import pandas as pd
# Load the Advertising dataset straight from its URL; the first CSV column
# holds the row labels, so it is used as the DataFrame index.
data = pd.read_csv(
    'http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv',
    index_col=0,
)
# Quick look at what was loaded: the first 5 rows, then the shape.
print(data.head())
print(data.shape)

# seaborn is only needed by the (currently disabled) scatterplot call below.
import seaborn as sns
# Disabled: scatterplots of each feature against the response.
#sns.pairplot(data, x_vars=['TV','Radio','Newspaper'], y_vars='Sales', size=5, aspect=0.7, kind='reg')

# Names of the columns used as model inputs.
feature_cols = ['TV', 'Radio', 'Newspaper']
# Feature matrix: select the columns through the list defined above ...
X = data[feature_cols]
# ... or, equivalently, with the column names written out inline.
X = data[['TV', 'Radio', 'Newspaper']]
# Response vector: the Sales column.
y = data.Sales

# For the features and then the response: preview the first rows and
# report the type and shape of the object.
for subset in (X, y):
    print(subset.head())
    print(type(subset))
    print(subset.shape)
# Splitting X and y into training and testing sets.
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20, so it is only tried as a fallback for legacy installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# No test_size is passed, so scikit-learn's default split proportion applies;
# the fixed random_state makes the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Linear regression in scikit-learn: fit the model on the training portion.
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(X_train, y_train)

# Print the intercept and coefficients learned by the model.
print(linreg.intercept_)
print(linreg.coef_)
# Pair the feature names with the coefficients. The pairing is printed
# explicitly: as a bare expression its value is discarded when this file
# runs as a script (it only displays in an interactive session).
print(list(zip(feature_cols, linreg.coef_)))

# Make predictions on the testing set.
y_pred = linreg.predict(X_test)

# Compute the RMSE for our Sales predictions: the square root of the mean
# squared error between the held-out responses and the predictions.
from sklearn import metrics
import numpy as np
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# Newspaper doesn't improve the quality of our predictions, so rebuild the
# feature matrix from the two remaining columns and repeat the evaluation.
feature_cols = ['TV', 'Radio']
X, y = data[feature_cols], data['Sales']

# Split into training and testing sets (fixed random_state keeps the split
# reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Re-fit the existing estimator on the training data (learning new
# coefficients) and predict on the testing set; fit() returns the estimator
# itself, which allows the two calls to be chained.
y_pred = linreg.fit(X_train, y_train).predict(X_test)

# RMSE of the predictions: square root of the mean squared error.
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print(rmse)