Neural Networks¶
This page implements a basic feedforward neural network from scratch, using only Python and NumPy. The network has a fixed single hidden layer and a variable number of neurons in the input/hidden/output layers.
Drawing largely from Machine Learning From Scratch: https://dafriedman97.github.io/mlbook/content/c7/concept.html and 3Blue1Brown's excellent series on Neural Networks: https://www.youtube.com/playlist?list=PLZHQObOWTQDNU6R1_67000Dx_ZCJB-3pi
Here are some notes on the derivations for the gradients used to fit the network.
Deriving the Gradients¶
The network computes
\begin{align}
h_1 &= W_1 x + c_1 \\
z_1 &= \sigma(h_1) \\
h_2 &= W_2 z_1 + c_2 \\
\hat{y} &= \sigma(h_2)
\end{align}
and the loss is the residual sum of squares, $L = \sum (y - \hat{y})^2$. We want to update $W_1$, $W_2$, $c_1$ and $c_2$, so we need $\frac{\partial L}{\partial W_1}$, $\frac{\partial L}{\partial W_2}$, $\frac{\partial L}{\partial c_1}$ and $\frac{\partial L}{\partial c_2}$. These are built from the following partial derivatives via the chain rule:
\begin{align}
\frac{\partial L}{\partial \hat{y}} &= -2(y - \hat{y}) && \text{(RSS loss)} \\
\frac{\partial \hat{y}}{\partial h_2} &= \sigma(h_2)(1 - \sigma(h_2)) && \text{(sigmoid)} \\
\frac{\partial h_2}{\partial W_2} &= z_1 \\
\frac{\partial h_2}{\partial c_2} &= 1 \\
\frac{\partial h_2}{\partial z_1} &= W_2 \\
\frac{\partial z_1}{\partial h_1} &= f_1'(h_1) = \sigma(h_1)(1 - \sigma(h_1)) \\
\frac{\partial h_1}{\partial W_1} &= x \\
\frac{\partial h_1}{\partial c_1} &= 1
\end{align}
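Chaining these together (per observation, with $\circ$ denoting element-wise multiplication) gives the gradients that the code below accumulates over observations:
\begin{align}
\delta_2 &= \frac{\partial L}{\partial h_2} = -2(y - \hat{y}) \circ \sigma(h_2)(1 - \sigma(h_2)) \\
\frac{\partial L}{\partial W_2} &= \delta_2 z_1^\top \qquad \frac{\partial L}{\partial c_2} = \delta_2 \\
\delta_1 &= \frac{\partial L}{\partial h_1} = (W_2^\top \delta_2) \circ \sigma(h_1)(1 - \sigma(h_1)) \\
\frac{\partial L}{\partial W_1} &= \delta_1 x^\top \qquad \frac{\partial L}{\partial c_1} = \delta_1
\end{align}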
Example Neural Network From Scratch¶
"""Trevor Gordon. 2021
Implementation of a feed forward neural network from scratch.
Drawing largely from https://dafriedman97.github.io/mlbook/content/c7/concept.html
and https://www.youtube.com/playlist?list=PLZHQObOWTQDNU6R1_67000Dx_ZCJB-3pi
"""
import numpy as np
import tqdm
class FeedForwardNeuralNetwork():
"""Feed Forward Neural Network
This FFNN is fixed to have one hidden layer. It is defined by weights self.W1 and self.W2 and
biases self.c1 and self.c2
"""
def __init__(self) -> None:
"""Init function
Class variables:
self.n_hidden (int): Number of neurons in single hidden layer
self.f1 (func): Activation function to apply to hidden layer output before next layer
Only supporting sigmoid right now
"""
        self.n_hidden = 1  # placeholder; the actual hidden-layer size is passed to fit() as len_hidden
self.f1 = self.sigmoid
self.f1_prime = self.sigmoid_prime
def sigmoid(self, x):
"""Sigmoid function to bound output between 0 and 1"""
return 1 / (1 + np.exp(-1*x))
def sigmoid_prime(self, x):
"""Derivative of the sigmoid function simplifies to this."""
return self.sigmoid(x) * (1 - self.sigmoid(x))
def initialize_params(self, len_input, len_hidden, len_output):
"""Initialize weights and biases randomly.
Args:
len_input (int): Number of neurons in X input layer
len_hidden (int): Number of neurons in hidden layer
len_output (int): Number of neurons in y output layer
Returns:
weights_1 (np.array): Rows for neurons in hidden layer and columns for
neurons in X input layer.
weights_2 (np.array): Rows for neurons in y output layer and columns for neurons in
hidden layer
biases_1 (np.array): Rows for neurons in hidden layer and 1 column
biases_2 (np.array): Rows for neurons in output layer and 1 column
"""
weights_1 = np.random.randn(len_hidden, len_input)/5
biases_1 = np.random.randn(len_hidden, 1)/5
weights_2 = np.random.randn(len_output, len_hidden)/5
biases_2 = np.random.randn(len_output, 1)/5
return weights_1, biases_1, weights_2, biases_2
def update_network_states(self):
"""Using weights, biases and activation function, calculate hidden layer and output
Returns:
h1 (np.array): Hidden layer with shape = (len_hidden, num_observations)
z1 (np.array): Activated hidden layer with shape = (len_hidden, num_observations)
h2 (np.array): Hidden layer 2 with shape = (len_output, num_observations)
yhat (np.array): Output layer with shape = (len_output, num_observations)
"""
h1 = np.dot(self.W1, self.X.T) + self.c1
z1 = self.f1(h1)
h2 = np.dot(self.W2, z1) + self.c2
yhat = self.f1(h2)
return h1, z1, h2, yhat
def get_loss_gradient_one_observations(self,
y_one_obsv,
yhat_one_obsv,
h2_one_obsv,
h1_one_obsv,
z1_one_obsv,
x_input_one_obsv,
len_hidden):
"""Given a single observation find the loss gradient.
Derivations for all of these are shown in the images folder.
Args:
y_one_obsv (np.array): Single output. Shape = (len_output, 1)
yhat_one_obsv (np.array): Single predicted output given current weights/biases
h2_one_obsv (np.array): Current h2 hidden layer for single input x and weights/biases
h1_one_obsv (np.array): Current h1 hidden layer for single input x and weights/biases
z1_one_obsv (np.array): Current z1 value for single input x and weights/biases
x_input_one_obsv (np.array): Single x input
len_hidden (int): Number of neurons in hidden layer
Returns:
dL_dW1_one_obvs (np.array): Partial grad showing direction to move. Same shape as W1
dL_dc1_one_obvs (np.array): Partial grad showing direction to move. Same shape as c1
dL_dW2_one_obvs (np.array): Partial grad showing direction to move. Same shape as W2
dL_dc2_one_obvs (np.array): Partial grad showing direction to move. Same shape as c2
"""
len_input = len(x_input_one_obsv)
len_output = len(y_one_obsv)
        # Start at the output by calculating the error
        dL_dyhat = -2*(y_one_obsv - yhat_one_obsv).T  # (len_output,)
## LAYER 2 ##
# dyhat_dh2.shape = (len_output, len_output)
dyhat_dh2 = np.diag(self.f1_prime(h2_one_obsv))
# dh2_dc2.shape = (len_output, len_output)
# dh2_dc2 = 1 because bias is simply added
dh2_dc2 = np.eye(len_output)
# dh2_dW2.shape = (len_output, (len_output, len_hidden))
        dh2_dW2 = np.zeros((len_output, len_output, len_hidden))
        for i in range(len_output):
            # Only h2[i] depends on row i of W2, so fill the diagonal slice
            dh2_dW2[i, i] = z1_one_obsv
# dh2_dz1.shape = (len_output, len_hidden)
dh2_dz1 = self.W2
## LAYER 1 ##
# dz1_dh1.shape = (len_hidden, len_hidden)
        dz1_dh1 = np.diag(self.f1_prime(h1_one_obsv))
# dh1_dc1.shape = (len_hidden, len_hidden)
dh1_dc1 = np.eye(len_hidden)
        # dh1_dW1.shape = (len_hidden, (len_hidden, len_input))
        dh1_dW1 = np.zeros((len_hidden, len_hidden, len_input))
        for i in range(len_hidden):
            # Only h1[i] depends on row i of W1, so fill the diagonal slice
            dh1_dW1[i, i] = x_input_one_obsv
## DERIVATIVES W.R.T. LOSS ##
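        # NumPy note: dL_dyhat (and hence dL_dh2, dL_dh1) is 1-D, so `@` with a 3-D
        # tensor broadcasts over the tensor's first axis and contracts over its middle
        # axis, leaving gradients with the same shapes as the weight matrices.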
dL_dh2 = dL_dyhat @ dyhat_dh2
dL_dW2_one_obvs = dL_dh2 @ dh2_dW2
dL_dc2_one_obvs = dL_dh2 @ dh2_dc2
dL_dh1 = dL_dh2 @ dh2_dz1 @ dz1_dh1
dL_dW1_one_obvs = dL_dh1 @ dh1_dW1
dL_dc1_one_obvs = dL_dh1 @ dh1_dc1
return dL_dW1_one_obvs, dL_dc1_one_obvs, dL_dW2_one_obvs, dL_dc2_one_obvs
def get_loss_gradient_iter_over_observations(self, len_hidden):
"""Get the cumulative gradient while iterating over each observation at a time
Args:
len_hidden (int): Number of neurons in hidden layer
Returns:
dL_dW1 (np.array): Partial grad showing direction to move. Same shape as W1
dL_dc1 (np.array): Partial grad showing direction to move. Same shape as c1
dL_dW2 (np.array): Partial grad showing direction to move. Same shape as W2
dL_dc2 (np.array): Partial grad showing direction to move. Same shape as c2
"""
dL_dW2 = 0
dL_dc2 = 0
dL_dW1 = 0
dL_dc1 = 0
num_observations = len(self.X)
for n in range(num_observations):
            # Slice y, yhat and the stored network states by observation number
            y_one_obsv = self.y[n]  # (len_output,)
            yhat_one_obsv = self.yhat[:, n]  # (len_output,)
h2_one_obsv = self.h2[:, n]
h1_one_obsv = self.h1[:, n]
z1_one_obsv = self.z1[:, n]
x_input_one_obsv = self.X[n]
(dL_dW1_one_obvs, dL_dc1_one_obvs, dL_dW2_one_obvs,
dL_dc2_one_obvs) = self.get_loss_gradient_one_observations(
y_one_obsv, yhat_one_obsv, h2_one_obsv, h1_one_obsv,
z1_one_obsv, x_input_one_obsv, len_hidden)
dL_dW2 += dL_dW2_one_obvs
dL_dc2 += dL_dc2_one_obvs
dL_dW1 += dL_dW1_one_obvs
dL_dc1 += dL_dc1_one_obvs
return dL_dW1, dL_dc1, dL_dW2, dL_dc2
def get_loss_gradient(self, len_hidden):
"""Get the loss gradient.
        This defaults to get_loss_gradient_iter_over_observations for now. A future version
        could iterate over mini-batches instead to improve speed.
Args:
len_hidden (int): Number of neurons in hidden layer
Returns:
dL_dW1 (np.array): Partial grad showing direction to move. Same shape as W1
dL_dc1 (np.array): Partial grad showing direction to move. Same shape as c1
dL_dW2 (np.array): Partial grad showing direction to move. Same shape as W2
dL_dc2 (np.array): Partial grad showing direction to move. Same shape as c2
"""
(dL_dW1,
dL_dc1,
dL_dW2,
dL_dc2) = self.get_loss_gradient_iter_over_observations(len_hidden)
return dL_dW1, dL_dc1, dL_dW2, dL_dc2
def fit(self, X, y, len_hidden, grad_step=1e-5, n_iter=1000, seed=None):
"""Fit a feedforward neural network to the test set given.
Args:
X (np.array): Input data with columns for elements of a single input
and rows for number of observations. X.shape = (num_observations, data_per_observation)
y (np.array): Correct prediction for the input samples. y.shape = (num_observations, len_output)
            len_hidden (int): Number of neurons in hidden layer.
            grad_step (float, optional): Step for updating weights with gradient vectors. Defaults to 1e-5.
            n_iter (int, optional): Number of iterations. Defaults to 1000.
            seed (int, optional): Seed for the random weight initialization. Defaults to None.
        """
        if seed is not None:
            np.random.seed(seed)
self.X = X
        # Reshaping with -1 makes sure there's a second dim of at least 1
self.y = y.reshape(len(y), -1)
len_input = X.shape[1]
len_output = self.y.shape[1]
self.W1, self.c1, self.W2, self.c2 = self.initialize_params(
len_input, len_hidden, len_output)
self.h1, self.z1, self.h2, self.yhat = self.update_network_states()
print(f"Starting to fit for {n_iter} iterations")
for i in tqdm.tqdm(range(n_iter)):
# Adjust weights and biases in the direction of the negative gradient of the loss function
dL_dW1, dL_dc1, dL_dW2, dL_dc2 = self.get_loss_gradient(len_hidden)
self.W1 -= grad_step * dL_dW1
self.W2 -= grad_step * dL_dW2
self.c1 -= grad_step * dL_dc1.reshape(-1, 1)
self.c2 -= grad_step * dL_dc2.reshape(-1, 1)
self.h1, self.z1, self.h2, self.yhat = self.update_network_states()
return True
def predict(self, X_predict):
"""Predict output for given input after model has been fit.
Args:
X_predict (np.array): Input data with columns for elements of a single input.
and rows for each prediction. X.shape = (num_to_predict, data_per_observation)
Returns:
            np.array: Predictions. yhat.shape = (len_output, num_to_predict)
"""
self.X = X_predict
h1, z1, h2, yhat = self.update_network_states()
return yhat
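As a quick sanity check, here is a minimal usage sketch. The toy dataset, hidden-layer size and hyperparameters are illustrative assumptions, not values from the original (the target is shifted into (0, 1) because the output neuron is a sigmoid), and the finite-difference comparison at the end spot-checks the analytic gradient.
# Minimal usage sketch (toy data, layer size and hyperparameters are assumptions)
np.random.seed(0)
X = np.random.uniform(-3, 3, size=(200, 1))  # 200 observations, 1 feature each
y = (np.sin(X[:, 0]) + 1) / 2  # target in (0, 1) to match the sigmoid output
nn = FeedForwardNeuralNetwork()
nn.fit(X, y, len_hidden=8, grad_step=1e-4, n_iter=2000, seed=0)
yhat = nn.predict(X)  # shape = (len_output, num_to_predict)
print("MSE:", np.mean((y - yhat.flatten()) ** 2))

# Finite-difference spot check of one analytic gradient entry
eps = 1e-6
dL_dW1, _, _, _ = nn.get_loss_gradient(len_hidden=8)
nn.W1[0, 0] += eps
loss_plus = np.sum((nn.y.T - nn.update_network_states()[3]) ** 2)
nn.W1[0, 0] -= 2 * eps
loss_minus = np.sum((nn.y.T - nn.update_network_states()[3]) ** 2)
nn.W1[0, 0] += eps  # restore the weight
print("analytic:", dL_dW1[0, 0], "numeric:", (loss_plus - loss_minus) / (2 * eps))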