-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathNumpyNN.py
More file actions
181 lines (142 loc) · 7.43 KB
/
NumpyNN.py
File metadata and controls
181 lines (142 loc) · 7.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import numpy as np
"""
This design is adapted from the Coursera "Introduction to Deep Learning" course.
"""
class Layer:
    """
    Base class for every layer in the network.

    A layer supports two operations:
    - forward pass:  output = layer.forward(input)
    - backward pass: grad_input = layer.backward(input, grad_output)

    Layers with learnable parameters are expected to update them inside
    their own backward(). This base class has no parameters and acts as an
    identity layer.
    """

    def __init__(self):
        """Set up layer parameters and auxiliary state (none for the dummy layer)."""
        # The identity layer has nothing to initialise.

    def forward(self, input):
        """
        Map input data of shape [batch, numFeatures] to output data
        [batch, numClasses].

        The dummy layer hands its input back unchanged (same object).
        """
        return input

    def backward(self, input, grad_output):
        """
        Back-propagate a gradient through the layer for the given input.

        Chain rule:
            d loss / d x = (d loss / d layer) * (d layer / d x)

        grad_output already holds d loss / d layer, so it only has to be
        multiplied by d layer / d x. Layers that own parameters (e.g. a
        dense layer) also update them here using d loss / d layer.
        """
        # For an identity layer d layer / d x is the identity matrix, sized by
        # the feature dimension of the input. It is spelled out explicitly
        # instead of returning grad_output directly.
        identity_jacobian = np.identity(input.shape[1])
        return np.dot(grad_output, identity_jacobian)  # chain rule
class ReLU(Layer):
    """Parameter-free activation layer computing max(x, 0) element by element."""

    def __init__(self):
        """ReLU layer simply applies elementwise rectified linear unit to all inputs"""
        # No parameters or state to set up.

    def forward(self, input):
        """Return the elementwise ReLU of a [batch, numFeatures] matrix."""
        rectified = np.maximum(input, 0)
        return rectified

    def backward(self, input, grad_output):
        """
        Return the gradient of the loss w.r.t. the ReLU input.

        The local derivative is 1 where the input was strictly positive and
        0 elsewhere, so the incoming gradient is masked accordingly.
        """
        passes_gradient = input > 0
        return grad_output * passes_gradient
class Dense(Layer):
    """
    Regular fully connected (dense) layer: the linear part only, without an
    activation function.
    """

    def __init__(self, numFeatures, output_units, learning_rate=0.1, init='Default'):
        """
        A dense layer performs a learned affine transformation:
            f(x) = x . W + b

        numFeatures:   size of each input row.
        output_units:  size of each output row.
        learning_rate: step size of the gradient step taken in backward().
        init:          'Xavier' scales the normal-distributed weights by
                       sqrt(2 / (numFeatures + output_units)); any other
                       value uses a fixed scale of 0.05.
        """
        self.learning_rate = learning_rate
        # Initialise weights with small random numbers drawn from a normal
        # distribution. The shape is [numFeatures, output_units]; the original
        # code referred to an undefined name here and could never run.
        if init == 'Xavier':
            scale = np.sqrt(2.0 / (numFeatures + output_units))
        else:
            scale = 0.05
        self.weights = np.random.randn(numFeatures, output_units) * scale
        self.biases = np.zeros(output_units)

    def forward(self, input):
        """
        Perform the affine transformation f(x) = x . W + b.

        W has shape [numFeatures, output_units]; b has shape [output_units]
        and is broadcast over the batch dimension.
        input shape:  [batchSize, numFeatures]
        output shape: [batchSize, output_units]
        """
        return np.dot(input, self.weights) + self.biases

    def backward(self, input, grad_output):
        """
        Back-propagate through the layer and take one gradient-descent step.

        input:       the batch that was fed to forward(), [batchSize, numFeatures]
        grad_output: d loss / d output, [batchSize, output_units]
        Returns d loss / d input, with the same shape as input.
        Side effect: self.weights and self.biases are updated.
        """
        # d dense / d x = W^T, since dense = x . W + b. This has to be computed
        # with the weights as they were during the forward pass, i.e. before
        # the update below.
        grad_input = np.dot(grad_output, self.weights.T)

        # d dense / d W = x^T and d dense / d b = 1. Both parameter gradients
        # are averaged over the batch (axis 0).
        batch_size = input.shape[0]
        grad_weights = np.dot(input.T, grad_output) / batch_size
        grad_biases = grad_output.mean(axis=0)
        assert grad_weights.shape == self.weights.shape and grad_biases.shape == self.biases.shape

        # Plain stochastic gradient descent step.
        self.weights = self.weights - self.learning_rate * grad_weights
        self.biases = self.biases - self.learning_rate * grad_biases
        return grad_input
class LogicTrainer:
    """
    Loss functions and the training / inference loop for a network given as
    a sequence of layer objects.

    Each layer must provide forward(input) and backward(input, grad_output).
    All routines are static methods: they keep no state on the class and
    are called as LogicTrainer.train(network, X, y) and so on.
    """

    @staticmethod
    def softmax_crossentropy_with_logits(logits, reference_answers):
        """
        Compute the per-sample cross-entropy from logits [batch, numClasses]
        and the ids of the correct answers [batch].
        """
        # Subtracting the row maximum leaves the result mathematically
        # unchanged but keeps np.exp from overflowing on large logits.
        shifted = logits - np.max(logits, axis=-1, keepdims=True)
        logits_for_answers = shifted[np.arange(len(logits)), reference_answers]
        return -logits_for_answers + np.log(np.sum(np.exp(shifted), axis=-1))

    @staticmethod
    def grad_softmax_crossentropy_with_logits(logits, reference_answers):
        """
        Compute the cross-entropy gradient w.r.t. the logits, from logits
        [batch, numClasses] and the ids of the correct answers [batch].
        """
        ones_for_answers = np.zeros_like(logits)
        ones_for_answers[np.arange(len(logits)), reference_answers] = 1
        # Same max-shift as in the loss, for numerical stability.
        exp_shifted = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
        softmax = exp_shifted / exp_shifted.sum(axis=-1, keepdims=True)
        return -ones_for_answers + softmax

    @staticmethod
    def forward(network, X):
        """
        Compute the activations of all layers by applying them sequentially.

        Returns a list with one activation per layer; the last entry holds
        the network logits.
        """
        activations = []
        current = X
        for layer in network:
            # The output of one layer is the input of the next.
            current = layer.forward(current)
            activations.append(current)
        return activations

    @staticmethod
    def predict(network, X):
        """Return the index of the largest logit for every row of X."""
        logits = LogicTrainer.forward(network, X)[-1]
        return logits.argmax(axis=-1)

    @staticmethod
    def train(network, X, y):
        """
        Train the network on one batch of X and y and return the mean loss.

        Runs the forward pass to collect all layer activations, then calls
        backward() on every layer from last to first. Layers that update
        their parameters in backward() have taken one gradient step once
        this returns.
        """
        layer_activations = LogicTrainer.forward(network, X)
        # layer_inputs[i] is the input that network[i] saw in the forward pass.
        layer_inputs = [X] + layer_activations
        logits = layer_activations[-1]

        loss = LogicTrainer.softmax_crossentropy_with_logits(logits, y)
        grad = LogicTrainer.grad_softmax_crossentropy_with_logits(logits, y)

        # zip() stops after len(network) pairs, so the logits (the last entry
        # of layer_inputs) are never used as a layer input. The gradient that
        # layer l returns is the grad_output of layer l-1.
        for layer, layer_input in reversed(list(zip(network, layer_inputs))):
            grad = layer.backward(layer_input, grad)

        return np.mean(loss)


# Also expose the routines under their bare names, which is how the original
# body referred to them (e.g. `forward(network, X)`).
softmax_crossentropy_with_logits = LogicTrainer.softmax_crossentropy_with_logits
grad_softmax_crossentropy_with_logits = LogicTrainer.grad_softmax_crossentropy_with_logits
forward = LogicTrainer.forward
predict = LogicTrainer.predict
train = LogicTrainer.train