My First Neural Network
May 2, 2020

Preamble

I do not have much experience with Python or its development best practices. I hope to learn more and have been reading up on Python development.

Aside

Coming from a JavaScript background, I find Python the language itself amazing, but its package management system, compared to npm, very poor. Pip and virtual environments feel quite convoluted.

Using Matrices for More Efficient Stochastic Gradient Descent

Michael Nielsen intentionally wrote his code to be slow in order to show the power of matrices and NumPy later on. The way he wrote it does not take advantage of NumPy's matrix operations, which are written to be very fast (to my knowledge, NumPy is largely a binding to C code). The performance gains from the matrix-based approach were very noticeable for me, and could be improved much more by trying CuPy, a NumPy-like interface that uses NVIDIA's CUDA tools to offload matrix operations to the GPU.
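
I have not actually tried CuPy yet, so the following is only a rough sketch of what the swap might look like, assuming cupy is installed and a CUDA-capable GPU is available (the array shapes are made up for illustration). Because CuPy mirrors the NumPy API, the matrix math itself reads the same:

import cupy as cp  # assumption: cupy installed, CUDA GPU available
import numpy as np

# made-up layer size (784 -> 30) and a mini-batch of 10,000 images
w = np.random.randn(30, 784)
b = np.random.randn(30, 1)
a = np.random.randn(784, 10000)

# copy the arrays onto the GPU
w_gpu, a_gpu, b_gpu = cp.asarray(w), cp.asarray(a), cp.asarray(b)

# same expression as with numpy, but the multiply runs on the GPU
z_gpu = cp.dot(w_gpu, a_gpu) + b_gpu

# copy the result back to the CPU as a numpy array
z = cp.asnumpy(z_gpu)
print(z.shape)  # (30, 10000)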

Here is an example image of a subset of nodes in the network:

Given a mini-batch with $m$ images:

$$
\begin{gathered}
w = \begin{bmatrix}
  w_{\textcolor{#a61e4d}{1}\textcolor{#5f3dc4}{1}} & w_{\textcolor{#a61e4d}{1}\textcolor{#2b8a3e}{2}} & w_{\textcolor{#a61e4d}{1}\textcolor{#e67700}{3}} \\
  w_{\textcolor{#1864ab}{2}\textcolor{#5f3dc4}{1}} & w_{\textcolor{#1864ab}{2}\textcolor{#2b8a3e}{2}} & w_{\textcolor{#1864ab}{2}\textcolor{#e67700}{3}}
\end{bmatrix}\\
a = \begin{bmatrix}
  \text{Image One} & \text{Image Two} & \dots & \text{Image $m$} \\
  a_{\textcolor{#5f3dc4}{1}1} & a_{\textcolor{#5f3dc4}{1}2} && a_{\textcolor{#5f3dc4}{1}m}\\
  a_{\textcolor{#2b8a3e}{2}1} & a_{\textcolor{#2b8a3e}{2}2} && a_{\textcolor{#2b8a3e}{2}m}\\
  a_{\textcolor{#e67700}{3}1} & a_{\textcolor{#e67700}{3}2} && a_{\textcolor{#e67700}{3}m}
\end{bmatrix}\\
b = \begin{bmatrix}
  \textcolor{#a61e4d}{b_1} & \textcolor{#a61e4d}{b_1} & \underset{m}{\dots} & \textcolor{#a61e4d}{b_1}\\
  \textcolor{#1864ab}{b_2} & \textcolor{#1864ab}{b_2} && \textcolor{#1864ab}{b_2}
\end{bmatrix}\\
z_\text{current layer} = w \cdot a + b =\\
\begin{bmatrix}
  w_{\textcolor{#a61e4d}{1}\textcolor{#5f3dc4}{1}}a_{\textcolor{#5f3dc4}{1}1} + w_{\textcolor{#a61e4d}{1}\textcolor{#2b8a3e}{2}}a_{\textcolor{#2b8a3e}{2}1} + w_{\textcolor{#a61e4d}{1}\textcolor{#e67700}{3}}a_{\textcolor{#e67700}{3}1} + \textcolor{#a61e4d}{b_1} & \underset{m}{\dots} & w_{\textcolor{#a61e4d}{1}\textcolor{#5f3dc4}{1}}a_{\textcolor{#5f3dc4}{1}m} + w_{\textcolor{#a61e4d}{1}\textcolor{#2b8a3e}{2}}a_{\textcolor{#2b8a3e}{2}m} + w_{\textcolor{#a61e4d}{1}\textcolor{#e67700}{3}}a_{\textcolor{#e67700}{3}m} + \textcolor{#a61e4d}{b_1}\\
  w_{\textcolor{#1864ab}{2}\textcolor{#5f3dc4}{1}}a_{\textcolor{#5f3dc4}{1}1} + w_{\textcolor{#1864ab}{2}\textcolor{#2b8a3e}{2}}a_{\textcolor{#2b8a3e}{2}1} + w_{\textcolor{#1864ab}{2}\textcolor{#e67700}{3}}a_{\textcolor{#e67700}{3}1} + \textcolor{#1864ab}{b_2} && w_{\textcolor{#1864ab}{2}\textcolor{#5f3dc4}{1}}a_{\textcolor{#5f3dc4}{1}m} + w_{\textcolor{#1864ab}{2}\textcolor{#2b8a3e}{2}}a_{\textcolor{#2b8a3e}{2}m} + w_{\textcolor{#1864ab}{2}\textcolor{#e67700}{3}}a_{\textcolor{#e67700}{3}m} + \textcolor{#1864ab}{b_2}
\end{bmatrix}
\end{gathered}
$$

By organizing your matrices in the pattern above, you can simultaneously compute all the $z$'s for every image in the stochastic mini-batch, instead of iterating over each image and computing $w \cdot a + b$ individually.
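
To make the pattern concrete, here is a small sketch of my own (not from the tutorial) using the same shapes as the matrices above: a 2x3 weight matrix, a 3x$m$ activation matrix with one column per image, and a bias column that NumPy broadcasts across all $m$ columns:

import numpy as np

m = 4  # made-up mini-batch size
w = np.random.randn(2, 3)   # 2 nodes in this layer, 3 in the previous layer
a = np.random.randn(3, m)   # one column of activations per image
b = np.random.randn(2, 1)   # one bias per node in this layer

# all the z's at once: broadcasting adds b to every column of w . a
z_batch = np.dot(w, a) + b

# the same computation, one image (column) at a time
z_loop = np.hstack([np.dot(w, a[:, i:i+1]) + b for i in range(m)])

print(z_batch.shape)                 # (2, 4): one z column per image
print(np.allclose(z_batch, z_loop))  # True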

The Code

The code was based on Michael Nielsen's from his tutorial. I tried to write it on my own, but sometimes had to look at his for reference. I had trouble implementing evaluate and realized that I was not using argmax.
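
For context, here is my own toy illustration of that fix: feedforward returns a column of ten activations, one per digit, so evaluate needs np.argmax to turn that column into a single predicted digit that can be compared against the test label.

import numpy as np

# made-up output activations for one test image (digits 0 through 9)
output = np.array([[0.01], [0.02], [0.05], [0.90], [0.01],
                   [0.03], [0.02], [0.01], [0.04], [0.02]])

prediction = np.argmax(output)  # index of the strongest activation
print(prediction)               # 3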

Here's my code on GitHub.

Please note that the comments are specific to training on the MNIST dataset for recognizing handwritten digits, but the Network class could be used elsewhere.

# Standard library
import random

# Third-party libraries
import numpy as np


class Network(object):

    def __init__(self, sizes):
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[0:-1], sizes[1:])]
        # create biases (x by 1) for layer 1 to last layer
        self.biases = [np.random.randn(x, 1) for x in sizes[1:]]

    # a = input vector
    def feedforward(self, a):
        # for every layer
        for w, b in zip(self.weights, self.biases):
            a = sigmoid(np.dot(w, a) + b)
        return a

    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        if test_data:
            n_test = len(test_data)
        n = len(training_data)
        for j in range(epochs):
            random.shuffle(training_data)
            mini_batches = [
                training_data[k:k+mini_batch_size]
                for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
            if test_data:
                print("Epoch {0}: {1} / {2}".format(
                    j, self.evaluate(test_data), n_test))
            else:
                print("Epoch {0} complete".format(j))

    def update_mini_batch(self, mini_batch, eta):
        mini_batch_size = len(mini_batch)

        # (number of images, input layer activation values)
        xs = np.array([x for x, y in mini_batch]).transpose().reshape(
            self.sizes[0], mini_batch_size)
        # (number of images, expected output layer values)
        ys = np.array([y for x, y in mini_batch]).transpose().reshape(
            self.sizes[-1], mini_batch_size)

        nabla_weight, nabla_bias = self.backprop(xs, ys, mini_batch_size)

        # nabla_bias was a matrix with the biases as rows and mini_batch_size
        # number of columns. We must flatten them
        for layer in range(0, len(nabla_bias)):
            # sum along the rows
            biases = nabla_bias[layer].sum(axis=1)
            bias_count = biases.shape[0]
            # restructure back to node count x 1
            nabla_bias[layer] = biases.reshape((bias_count, 1))

        # move in the opposite direction (down the hill) of the gradient of
        # the cost; there might be a better way to handle this with numpy
        self.weights = [w - (eta / len(mini_batch)) * dnw for dnw,
                        w in zip(nabla_weight, self.weights)]
        self.biases = [b - (eta / len(mini_batch)) * dnb for dnb,
                       b in zip(nabla_bias, self.biases)]

    def backprop(self, xs, ys, mini_batch_size):
        # feed forward
        activation = xs
        activations = [xs]
        zs = []

        for w, b in zip(self.weights, self.biases):
            # bs = [b, b, b, ... len(mini_batch)] create a column of biases
            # for every image in mini_batch
            bs = np.tile(b, (1, mini_batch_size))
            z = np.dot(w, activation) + bs
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)

        # calculate the error for the last layer
        nabla_bias = [np.zeros(b.shape) for b in self.biases]
        nabla_weight = [np.zeros(w.shape) for w in self.weights]

        delta = self.cost_derivative(
            activations[-1], ys) * sigmoid_prime(zs[-1])
        nabla_bias[-1] = delta
        nabla_weight[-1] = np.dot(delta, activations[-2].transpose())

        # back propagate the error
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp
            nabla_bias[-l] = delta
            nabla_weight[-l] = np.dot(delta, activations[-l - 1].transpose())

        return (nabla_weight, nabla_bias)

    def evaluate(self, test_data):
        test_results = [(np.argmax(self.feedforward(x)), y)
                        for (x, y) in test_data]
        return sum(int(x == y) for (x, y) in test_results)

    def cost_derivative(self, output_activations, y):
        return output_activations - y


# Miscellaneous functions
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))


def sigmoid_prime(z):
    return sigmoid(z) * (1 - sigmoid(z))
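
For completeness, here is roughly how the class gets used for MNIST. The mnist_loader.load_data_wrapper() call is the helper from Nielsen's tutorial that returns (input, expected output) pairs; my repository may organize the loading differently, so treat it as a placeholder. The data also needs to be in lists so that len() and shuffling work.

# hypothetical loader: training pairs are (784x1 input, 10x1 one-hot label),
# test pairs are (784x1 input, integer label)
import mnist_loader

training_data, validation_data, test_data = mnist_loader.load_data_wrapper()

# 784 input pixels, one hidden layer of 30 nodes, 10 output digits
net = Network([784, 30, 10])

# 30 epochs, mini-batches of 10 images, learning rate of 3.0
net.SGD(training_data, 30, 10, 3.0, test_data=test_data)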