Backpropagation Implementations
import numpy as np
Single Update Example
Helper Function
def print_array(array):
    print('Shape: ' + str(array.shape))
    print('\r')
    print(array)
Activation Function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
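Backpropagation also needs the derivative of the activation. For the sigmoid this has the convenient form $\sigma'(x) = \sigma(x)(1 - \sigma(x))$, which is why the error terms below multiply an output by one minus itself. A small helper makes this explicit (a sketch of mine, not part of the original notes; the name sigmoid_prime is arbitrary):
def sigmoid_prime(x):
    # Derivative of the sigmoid, written in terms of sigmoid(x) itself
    s = sigmoid(x)
    return s * (1 - s)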
Inputs, Weights, Target Initialization
The network has 3 input units, 2 hidden units, and 1 output unit.
x = np.array([0.5, 0.1, -0.2])
target = 0.6
learnrate = 0.5
weights_input_hidden = np.array([[0.5, -0.6],
                                 [0.1, -0.2],
                                 [0.1, 0.7]])
print_array(weights_input_hidden)
Shape: (3, 2)
[[ 0.5 -0.6]
[ 0.1 -0.2]
[ 0.1 0.7]]
weights_hidden_output = np.array([0.1, -0.3])
print_array(weights_hidden_output)
Shape: (2,)
[ 0.1 -0.3]
Feed Forward Calculations
Hidden Layer Output
hidden_layer_input = np.dot(x, weights_input_hidden)
hidden_layer_output = sigmoid(hidden_layer_input)
print_array(hidden_layer_output)
Shape: (2,)
[0.55971365 0.38698582]
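Checking this step by hand (my arithmetic, not part of the original notes):
$$h_1 = 0.5(0.5) + 0.1(0.1) + (-0.2)(0.1) = 0.24, \qquad h_2 = 0.5(-0.6) + 0.1(-0.2) + (-0.2)(0.7) = -0.46$$
so the hidden activations are $\sigma(0.24) \approx 0.560$ and $\sigma(-0.46) \approx 0.387$, matching the printed array.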
Output Layer Output
output_layer_in = np.dot(hidden_layer_output, weights_hidden_output)
output = sigmoid(output_layer_in)
print_array(output)
Shape: ()
0.48497343084992534
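Similarly for the output:
$$z = 0.1(0.560) + (-0.3)(0.387) \approx -0.060, \qquad \sigma(-0.060) \approx 0.485$$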
Backpropagation
Output Error
error = target - output
error
0.11502656915007464
Output Error Term
output_error_term = error * output * (1 - output)
output_error_term
0.028730669543515018
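Plugging in the numbers, with $\delta^O = (y - \hat{y})\,\hat{y}\,(1 - \hat{y})$:
$$\delta^O \approx 0.1150 \times 0.4850 \times 0.5150 \approx 0.0287$$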
Hidden Error Term
hidden_error_term = np.dot(output_error_term, weights_hidden_output) * \
                    hidden_layer_output * (1 - hidden_layer_output)
print_array(hidden_error_term)
Shape: (2,)
[ 0.00070802 -0.00204471]
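And for the two hidden units, with $\delta^h_j = \delta^O W_j\, a_j (1 - a_j)$:
$$\delta^h_1 \approx 0.0287 \times 0.1 \times 0.560 \times 0.440 \approx 7.1 \times 10^{-4}, \qquad \delta^h_2 \approx 0.0287 \times (-0.3) \times 0.387 \times 0.613 \approx -2.0 \times 10^{-3}$$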
Change in Weights from Hidden Layer to Output Layer
delta_w_h_o = learnrate * output_error_term * hidden_layer_output
print_array(delta_w_h_o)
Shape: (2,)
[0.00804047 0.00555918]
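Working the numbers for this step (again my arithmetic), $\Delta W = \eta\, \delta^O\, a$:
$$\Delta W \approx 0.5 \times 0.0287 \times [\,0.560,\ 0.387\,] \approx [\,0.0080,\ 0.0056\,]$$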
Change in Weights from Input Layer to Hidden Layer
delta_w_i_h = learnrate * hidden_error_term * x[:, None]
print_array(delta_w_i_h)
Shape: (3, 2)
[[ 1.77005547e-04 -5.11178506e-04]
[ 3.54011093e-05 -1.02235701e-04]
[-7.08022187e-05 2.04471402e-04]]
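The single-update example stops at the weight steps. To actually apply them, one would add the deltas to the current weights; the learning rate is already folded into delta_w_h_o and delta_w_i_h. A sketch of that final step (not in the original notes):
# Apply the weight steps computed above (learnrate is already included in the deltas)
weights_input_hidden = weights_input_hidden + delta_w_i_h
weights_hidden_output = weights_hidden_output + delta_w_h_o
print_array(weights_input_hidden)
print_array(weights_hidden_output)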
Graduate School Admissions Example
From this point, this page is very similar to the Neural Network Admissions Example. It relies on the same base data and contains much of the same code.
import pandas as pd
admissions = pd.read_csv('gradient-descent-implementations/data/data.csv')
print('raw admissions: ' + str(admissions.shape[0]) + ' rows')
admissions.head()
raw admissions: 400 rows
| | admit | gre | gpa | rank |
|---|---|---|---|---|
| 0 | 0 | 380 | 3.61 | 3 |
| 1 | 1 | 660 | 3.67 | 3 |
| 2 | 1 | 800 | 4.00 | 1 |
| 3 | 1 | 640 | 3.19 | 4 |
| 4 | 0 | 520 | 2.93 | 4 |
# Make dummy variables for rank
data = pd.concat([admissions, pd.get_dummies(admissions['rank'], prefix='rank')],
                 axis=1)
data = data.drop(columns=['rank'])

# Standardize features
for field in ['gre', 'gpa']:
    mean, std = data[field].mean(), data[field].std()
    data.loc[:, field] = (data[field] - mean) / std
print('data: ' + str(data.shape[0]) + ' rows')
data.head()
data: 400 rows
| | admit | gre | gpa | rank_1 | rank_2 | rank_3 | rank_4 |
|---|---|---|---|---|---|---|---|
| 0 | 0 | -1.798011 | 0.578348 | 0 | 0 | 1 | 0 |
| 1 | 1 | 0.625884 | 0.736008 | 0 | 0 | 1 | 0 |
| 2 | 1 | 1.837832 | 1.603135 | 1 | 0 | 0 | 0 |
| 3 | 1 | 0.452749 | -0.525269 | 0 | 0 | 0 | 1 |
| 4 | 0 | -0.586063 | -1.208461 | 0 | 0 | 0 | 1 |
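As a sanity check on the standardization: the raw GRE scores have mean ≈ 587.7 and standard deviation ≈ 115.5 (my computation from the same dataset, not quoted from the original notes), so for the first row
$$\text{gre}_0 = \frac{380 - 587.7}{115.5} \approx -1.80$$
which matches the standardized table above.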
# Split off random 10% of the data for testing
np.random.seed(21)
sample = np.random.choice(data.index, size=int(len(data)*0.9), replace=False)
data, test_data = data.iloc[sample], data.drop(sample)
# Split into features and targets
features, targets = data.drop('admit', axis=1), data['admit']
features_test, targets_test = test_data.drop('admit', axis=1), test_data['admit']
print(' features: ' + str(features.shape[0]) + ' rows')
print('features_test: ' + str(features_test.shape[0]) + ' rows')
features.head()
features: 360 rows
features_test: 40 rows
| | gre | gpa | rank_1 | rank_2 | rank_3 | rank_4 |
|---|---|---|---|---|---|---|
| 106 | 0.972155 | 0.446965 | 1 | 0 | 0 | 0 |
| 9 | 0.972155 | 1.392922 | 0 | 1 | 0 | 0 |
| 61 | -0.239793 | -0.183673 | 0 | 0 | 0 | 1 |
| 224 | 1.837832 | -1.287291 | 0 | 1 | 0 | 0 |
| 37 | -0.586063 | -1.287291 | 0 | 0 | 1 | 0 |
General Algorithm for Implementing Backpropagation
The error term for the output layer:
$$\delta_k = (y_k - \hat{y}_k) f'(a_k)$$
The error term for the hidden layer:
$$\delta_j = \sum_k w_{jk} \delta_k \, f'(h_j)$$
The following example considers a simple network with one hidden layer and one output unit. Here’s the algorithm:
- Set the weight steps for each layer to zero
  - Input to hidden weights: $\Delta w_{ij}=0$
  - Hidden to output weights: $\Delta W_j=0$
- For each record in the training data:
  - Make a forward pass through the network, calculating the output $\hat{y}$
  - Calculate the error gradient in the output unit: $\delta^O = (y - \hat{y}) f'(z)$ where $z = \sum_j W_j a_j$, the input to the output unit
  - Propagate the errors to the hidden layer: $\delta_j^h = \delta^O W_j f'(h_j)$
  - Update the weight steps:
    - $\Delta W_j = \Delta W_j + \delta^O a_j$
    - $\Delta w_{ij} = \Delta w_{ij} + \delta_j^h a_i$
- Update the weights, where $\eta$ is the learning rate and $m$ is the number of records:
  - $W_j = W_j + \eta \Delta W_j / m$
  - $w_{ij} = w_{ij} + \eta \Delta w_{ij} / m$
- Repeat for $e$ epochs
Setup
np.random.seed(21)
def sigmoid(x):
    """
    Calculate sigmoid
    """
    return 1 / (1 + np.exp(-x))
# Hyperparameters
n_hidden = 2 # number of hidden units
epochs = 900
learnrate = 0.005
n_records, n_features = features.shape
last_loss = None
# Initialize weights
weights_input_hidden = np.random.normal(scale=1 / n_features ** .5,
                                        size=(n_features, n_hidden))
weights_hidden_output = np.random.normal(scale=1 / n_features ** .5,
                                         size=n_hidden)
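Scaling the initial weights by $1/\sqrt{n}$ keeps the starting hidden-layer inputs small, so the sigmoids begin in their responsive rather than saturated range. A quick check one could run (hypothetical, not in the original notes):
# Standard deviation of the initial hidden-layer pre-activations; with the
# 1/sqrt(n_features) scale these should stay well inside the sigmoid's
# responsive range rather than saturating it.
initial_hidden_in = np.dot(features.values, weights_input_hidden)
print(initial_hidden_in.std(axis=0))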
Backpropagation
for e in range(epochs):
    del_w_input_hidden = np.zeros(weights_input_hidden.shape)
    del_w_hidden_output = np.zeros(weights_hidden_output.shape)
    for x, y in zip(features.values, targets):
        ## Forward pass ##
        # Calculate the output
        hidden_input = np.dot(x, weights_input_hidden)
        hidden_output = sigmoid(hidden_input)
        output = sigmoid(np.dot(hidden_output,
                                weights_hidden_output))

        ## Backward pass ##
        # Calculate the network's prediction error
        error = y - output

        # Calculate error term for the output unit
        output_error_term = error * output * (1 - output)

        ## Propagate errors to hidden layer
        # Calculate the hidden layer's contribution to the error
        hidden_error = np.dot(output_error_term, weights_hidden_output)

        # Calculate the error term for the hidden layer
        hidden_error_term = hidden_error * hidden_output * (1 - hidden_output)

        # Update the change in weights
        del_w_hidden_output += output_error_term * hidden_output
        del_w_input_hidden += hidden_error_term * x[:, None]

    # Update weights
    weights_input_hidden += learnrate * del_w_input_hidden / n_records
    weights_hidden_output += learnrate * del_w_hidden_output / n_records

    # Printing out the mean square error on the training set
    if e % (int(epochs / 10)) == 0:
        hidden_output = sigmoid(np.dot(x, weights_input_hidden))
        out = sigmoid(np.dot(hidden_output,
                             weights_hidden_output))
        loss = np.mean((out - targets) ** 2)
        if last_loss and last_loss < loss:
            print("Epoch {:4} - Train loss: {:1.5f}  WARNING - Loss Increasing"
                  .format(e, loss))
        else:
            print("Epoch {:4} - Train loss: {:1.5f}"
                  .format(e, loss))
        last_loss = loss
Epoch 0 - Train loss: 0.25136
Epoch 90 - Train loss: 0.24997
Epoch 180 - Train loss: 0.24862
Epoch 270 - Train loss: 0.24732
Epoch 360 - Train loss: 0.24606
Epoch 450 - Train loss: 0.24485
Epoch 540 - Train loss: 0.24368
Epoch 630 - Train loss: 0.24255
Epoch 720 - Train loss: 0.24145
Epoch 810 - Train loss: 0.24040
hidden = sigmoid(np.dot(features_test, weights_input_hidden))
out = sigmoid(np.dot(hidden, weights_hidden_output))
predictions = out > 0.5
accuracy = np.mean(predictions == targets_test)
print("Prediction accuracy: {:.3f}".format(accuracy))
Prediction accuracy: 0.725
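For context, it may help to compare the 0.725 against the accuracy of always predicting "not admitted" on the same test split; a quick baseline check (not in the original notes):
# Baseline: accuracy of always predicting 0 (no admit) on the test set
baseline = np.mean(targets_test == 0)
print("Baseline accuracy: {:.3f}".format(baseline))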
This content is taken from notes I took while pursuing the Intro to Machine Learning with PyTorch Nanodegree certification.