Preamble
I do not have much experience with Python or its development best practices. I hope to learn more and have been reading up on Python development.
Aside
Coming from a JavaScript background, I find Python the language amazing, but its package management, compared to npm, very poor. Pip and virtual environments feel quite convoluted.
Using Matrices for More Efficient Stochastic Gradient Descent
Michael Nielsen intentionally wrote his code to be slow to show the power of matrices and NumPy. The way he wrote it does not exploit NumPy's fast matrix operations (to my knowledge, NumPy is largely a binding to optimized C code). The performance gains were very noticeable for me and could be improved much more by trying CuPy, a NumPy-like interface that uses NVIDIA's CUDA tools to offload matrix operations to the GPU.
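For example, here is a hedged sketch (not something I have benchmarked for this post) of how the same matrix math could be pointed at the GPU, relying only on the fact that CuPy mirrors much of the NumPy API:

import numpy as np

try:
    import cupy as cp   # requires an NVIDIA GPU plus the CUDA toolkit
    xp = cp
except ImportError:
    xp = np             # fall back to plain NumPy on machines without CUDA

w = xp.random.randn(30, 784)   # hypothetical weight matrix for a 784 -> 30 layer
a = xp.random.randn(784, 10)   # ten input images stacked as columns
z = xp.dot(w, a)               # the same call runs on the GPU when xp is cupy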
Here is an example image of a subset of nodes in the network:
Given a mini-batch with m images:
By organizing your matrices in the pattern above you can simultaneously compute the z's for every image in the mini-batch, instead of iterating over each image and computing w⋅a+b individually, as in the sketch below.
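A small sketch of the idea, with layer sizes chosen only for illustration: stack the m input vectors as columns of one matrix, and a single matrix product computes z for every image at once. (This sketch uses NumPy broadcasting to add the bias to every column; the full code below uses np.tile for the same effect.)

import numpy as np

m = 10                                # mini-batch size
w = np.random.randn(30, 784)          # weights for a hypothetical 784 -> 30 layer
b = np.random.randn(30, 1)            # one bias per node in that layer

# one image at a time: m separate matrix-vector products
images = [np.random.randn(784, 1) for _ in range(m)]
zs_loop = [np.dot(w, a) + b for a in images]

# all images at once: the columns of A are the images
A = np.hstack(images)                 # shape (784, m)
Z = np.dot(w, A) + b                  # shape (30, m); column i matches zs_loop[i]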
The Code
The code was based on Michael Nielsen's in his tutorial. I tried to do it on my own but sometimes had to look to his for reference. I had trouble implementing evaluate and realised that I was not using argmax.
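To illustrate the argmax point: feedforward returns a 10x1 column of activations, and np.argmax reduces it to the predicted digit (the numbers here are made up):

import numpy as np

# made-up 10x1 output activations for one test image
output = np.array([[0.01], [0.02], [0.05], [0.90], [0.01],
                   [0.00], [0.00], [0.01], [0.00], [0.00]])
predicted_digit = np.argmax(output)   # index of the largest activation, here 3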
Here's my code on GitHub.
Please note that the comments are specific to training on the MNIST digit-recognition dataset, but the Network class could be used elsewhere.
# Standard library
import random

# Third-party libraries
import numpy as np


class Network(object):

    def __init__(self, sizes):
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[0:-1], sizes[1:])]
        # create biases (x by 1) for layer 1 to last layer
        self.biases = [np.random.randn(x, 1) for x in sizes[1:]]

    # a = input vector
    def feedforward(self, a):
        # for every layer
        for w, b in zip(self.weights, self.biases):
            a = sigmoid(np.dot(w, a) + b)
        return a

    def SGD(self, training_data, epochs, mini_batch_size, eta,
            test_data=None):
        if test_data:
            n_test = len(test_data)
        n = len(training_data)
        for j in range(epochs):
            random.shuffle(training_data)
            mini_batches = [
                training_data[k:k+mini_batch_size]
                for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
            if test_data:
                print("Epoch {0}: {1} / {2}".format(
                    j, self.evaluate(test_data), n_test))
            else:
                print("Epoch {0} complete".format(j))

    def update_mini_batch(self, mini_batch, eta):
        mini_batch_size = len(mini_batch)
        # (number of images, input layer activation values)
        xs = np.array([x for x, y in mini_batch]).transpose().reshape(
            self.sizes[0], mini_batch_size)
        # (number of images, expected output layer values)
        ys = np.array([y for x, y in mini_batch]).transpose().reshape(
            self.sizes[-1], mini_batch_size)

        nabla_weight, nabla_bias = self.backprop(xs, ys, mini_batch_size)

        # nabla_bias was a matrix with the biases as rows and mini_batch_size
        # number of columns. We must flatten them
        for layer in range(0, len(nabla_bias)):
            # sum along the rows
            biases = nabla_bias[layer].sum(axis=1)
            bias_count = biases.shape[0]
            # restructure back to node count x 1
            nabla_bias[layer] = biases.reshape((bias_count, 1))
        # there might be a better way to handle this with numpy

        # move in the opposite direction of the gradient of the cost
        # (down the hill)
        self.weights = [w - (eta / len(mini_batch)) * dnw
                        for dnw, w in zip(nabla_weight, self.weights)]
        self.biases = [b - (eta / len(mini_batch)) * dnb
                       for dnb, b in zip(nabla_bias, self.biases)]

    def backprop(self, xs, ys, mini_batch_size):
        # feed forward
        activation = xs
        activations = [xs]
        zs = []
        for w, b in zip(self.weights, self.biases):
            # bs = [b, b, b, ... len(mini_batch)] create a column of biases
            # for every image in the mini_batch
            bs = np.tile(b, (1, mini_batch_size))
            z = np.dot(w, activation) + bs
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)

        # calculate the error for the last layer
        nabla_bias = [np.zeros(b.shape) for b in self.biases]
        nabla_weight = [np.zeros(w.shape) for w in self.weights]
        delta = self.cost_derivative(
            activations[-1], ys) * sigmoid_prime(zs[-1])
        nabla_bias[-1] = delta
        nabla_weight[-1] = np.dot(delta, activations[-2].transpose())

        # back propagate the error
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp
            nabla_bias[-l] = delta
            nabla_weight[-l] = np.dot(delta, activations[-l - 1].transpose())
        return (nabla_weight, nabla_bias)

    def evaluate(self, test_data):
        test_results = [(np.argmax(self.feedforward(x)), y)
                        for (x, y) in test_data]
        return sum(int(x == y) for (x, y) in test_results)

    def cost_derivative(self, output_activations, y):
        return output_activations - y


# Miscellaneous functions
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))


def sigmoid_prime(z):
    return sigmoid(z) * (1 - sigmoid(z))
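For reference, here is a minimal usage sketch. It assumes a loader like the mnist_loader module from Michael Nielsen's repository, or anything that yields the same shapes: training pairs of (784x1 input, 10x1 one-hot label) and test pairs of (784x1 input, integer label).

import mnist_loader

training_data, validation_data, test_data = mnist_loader.load_data_wrapper()
training_data = list(training_data)   # SGD shuffles and slices, so it needs a list
test_data = list(test_data)

net = Network([784, 30, 10])          # 784 inputs, one hidden layer of 30 nodes, 10 outputs
net.SGD(training_data, epochs=30, mini_batch_size=10, eta=3.0, test_data=test_data)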