Day1 06 Simple NN Python
Contents
A simple two-layer neural network
Creating a neural network
Initialize parameters
Forward propagation
► Activation function
► Loss function
Loss function and backward propagation
Backward propagation
Running an example
All together
Dealing with Python errors
Considerations
A full version
Standalone training code
Standalone inference code
Backpropagation: single-neuron case
Backpropagation: two-neurons case
Sequence of layers
[Figure: input layer x → hidden layer h → output layer y, with weight matrix W1 between input and hidden layer and W2 between hidden and output layer]

import numpy as np

class NeuralNetwork:
    def __init__(self, n_x, n_h, n_y):
        """n_x: number of input nodes
        n_h: number of hidden nodes
        n_y: number of output nodes"""
        self.W1 = np.random.rand(n_x, n_h)
        self.W2 = np.random.rand(n_h, n_y)
        self.hidden = np.zeros((1, n_h))
        self.output = np.zeros((n_y, 1))
        self.activation = sigmoid

nn = NeuralNetwork(3, 4, 1)
""" n_x: number of input nodes
n_h: number of hidden nodes 0.9
0.8 0.1 0.3 0.0
4
0.7
self.hidden = np.zeros((1, n_h))
hidden
self.output = np.zeros((n_y, 1)) 0.5 W2
self.activation = sigmoid 0.0 0.0 0.0 0.0 1
1
output 2-rank array with shape [4, 1]
nn = NeuralNetwork(3,4,1)
2-rank array with shape [1, 4]
2-rank array with shape [1, 1]
h
x
y
input
n
W1 W2 data
output layer 3
input layer 2-rank array with shape [n, 3]
hidden layer
3
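A quick way to confirm these shapes (a minimal sketch; it assumes the sigmoid function defined on the activation slides below is already available so that the constructor can run):

nn = NeuralNetwork(3, 4, 1)
print(nn.W1.shape, nn.W2.shape)          # (3, 4) (4, 1)
print(nn.hidden.shape, nn.output.shape)  # (1, 4) (1, 1)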
Forward propagation
It calculates the predicted output.

► numpy.dot(a, b): dot product of two arrays (i.e., matrix multiplication)

class NeuralNetwork:
    ...
    def feedforward(self, In):
        self.hidden = self.activation(np.dot(In, self.W1))
        self.output = self.activation(np.dot(self.hidden, self.W2))
        return self.output

$h = f(W_1 \times x + b_1)$
$z = f(W_2 \times h + b_2) = f(W_2 \times f(W_1 \times x + b_1) + b_2)$

(Note that the code above uses only the weights; the bias terms $b_1$ and $b_2$ are omitted for simplicity.)

[Figure: input layer x → hidden layer h → output layer y, with weights W1 and W2; each node applies the activation f to its weighted input]
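As a quick illustration of numpy.dot acting as matrix multiplication (a small sketch with made-up values, not from the slides):

A = np.array([[1, 2, 3]])   # shape (1, 3), like one input sample
B = np.ones((3, 4))         # shape (3, 4), like W1
print(np.dot(A, B).shape)   # (1, 4)
print(np.dot(A, B))         # [[6. 6. 6. 6.]]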
Activation function
Use one of many activation functions.

$$\mathrm{sigmoid}(z) = s(z) = \frac{1}{1+e^{-z}}$$

$$\frac{ds(z)}{dz} = \frac{d}{dz}\left(\frac{1}{1+e^{-z}}\right) = \frac{d}{dz}\left(1+e^{-z}\right)^{-1} = -\left(1+e^{-z}\right)^{-2} \times \left(-e^{-z}\right) = s(z)\times\left(1-s(z)\right)$$

import numpy as np
from matplotlib import pyplot as plt

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def dsigmoid(z):
    return sigmoid(z) * (1 - sigmoid(z))

if __name__ == "__main__":
    z = np.linspace(-10, 10, 200)
    plt.grid()
    plt.plot(z, sigmoid(z))
    plt.plot(z, dsigmoid(z))
    plt.show()
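A quick numerical sanity check (not on the slides) that dsigmoid above matches a central finite difference of sigmoid:

z = np.linspace(-5, 5, 11)
eps = 1e-6
numeric = (sigmoid(z + eps) - sigmoid(z - eps)) / (2 * eps)
print(np.allclose(numeric, dsigmoid(z)))   # True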
Activation function
Use one of many activation functions.

$$\tanh(z) = \frac{\sinh(z)}{\cosh(z)} = \frac{e^{z}-e^{-z}}{e^{z}+e^{-z}}$$

$$\frac{\partial\tanh(z)}{\partial z} = \frac{\left(e^{z}+e^{-z}\right)\left(e^{z}+e^{-z}\right)-\left(e^{z}-e^{-z}\right)\left(e^{z}-e^{-z}\right)}{\left(e^{z}+e^{-z}\right)^{2}} = 1-\frac{\left(e^{z}-e^{-z}\right)^{2}}{\left(e^{z}+e^{-z}\right)^{2}} = 1-\tanh(z)^{2}$$

import numpy as np
from matplotlib import pyplot as plt

def tanh(z):
    return np.tanh(z)

def dtanh(z):
    return 1.0 - np.tanh(z)**2

if __name__ == "__main__":
    z = np.linspace(-6, 6, 100)
    plt.grid()
    plt.plot(z, tanh(z))
    plt.plot(z, dtanh(z))
    plt.show()
Loss function
Select one of many loss functions.
► A way to evaluate the "goodness" of our predictions (i.e., how far off our predictions are).
► ➔ Loss: measures the error of the prediction.

Our goal in training is to find the best set of weights and biases that minimizes the loss function. To do that, we calculate the derivative of the loss function with respect to the weights and biases.

$$\mathrm{SSE} = \sum_{i=1}^{n}\left(z-y\right)^{2}$$
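The training loop later calls lossFunction(z, Y), which these slides do not show; a minimal sketch consistent with the SSE formula above (the definition used in the full version may differ, e.g. a mean instead of a sum):

def lossFunction(z, y):
    # sum of squared errors between prediction z and target y
    return np.sum((z - y) ** 2)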
https://youtu.be/tIeHLnjs5U8
Backward propagation
class NeuralNetwork:
    ...
    def backprop(self, In, Out, Desired):
        # chain rule: x * 2 * (z - y) * df(x)/dx
        # self.activation(v, True) is assumed to return the derivative,
        # evaluated on the already-activated value v
        diff = Out - Desired
        d_W2 = np.dot(self.hidden.T, (2 * diff * self.activation(Out, True)))
        d_W1 = np.dot(In.T,
                      np.dot(2 * diff * self.activation(Out, True), self.W2.T)
                      * self.activation(self.hidden, True))
        self.W1 -= d_W1
        self.W2 -= d_W2

$$\frac{dL}{dW^{1}} = a^{l-1} \times \delta^{l+1} \times W^{l+1} \times \frac{dg(z)}{dz}$$
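The second argument passed to self.activation implies an activation helper with a derivative flag; the full version later sets self.activation = sigmoidFunction, but that helper is not shown on these slides. A minimal sketch consistent with the calling convention above (an assumption, not the author's exact code):

def sigmoidFunction(x, derivative=False):
    # with derivative=True, x is taken to be an already-activated output,
    # so the sigmoid derivative is simply x * (1 - x)
    if derivative:
        return x * (1 - x)
    return 1 / (1 + np.exp(-x))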
Update weights
* A negative slope causes the weights to increase.
* A positive slope causes the weights to decrease.

► Learning rate: the ratio by which the parameters are updated.
► $W = W - \eta \cdot \Delta W$

[Figure: loss plotted against a weight w; the sign of the slope determines the direction of the update]
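The backprop above applies the raw gradient directly (an effective learning rate of 1); a sketch of the same update with an explicit learning rate (the lr parameter is hypothetical, not in the original code):

def backprop(self, In, Out, Desired, lr=0.1):
    diff = Out - Desired
    d_W2 = np.dot(self.hidden.T, 2 * diff * self.activation(Out, True))
    d_W1 = np.dot(In.T,
                  np.dot(2 * diff * self.activation(Out, True), self.W2.T)
                  * self.activation(self.hidden, True))
    # scale the step by the learning rate
    self.W1 -= lr * d_W1
    self.W2 -= lr * d_W2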
Running an example
class NeuralNetwork:
    ...

if __name__ == "__main__":
    # input data, shape (4, 3)
    X = np.array([[0, 0, 1],
                  [0, 1, 1],
                  [1, 0, 1],
                  [1, 1, 1]])
    # desired output, shape (4, 1)
    Y = np.array([[0],
                  [1],
                  [1],
                  [0]])
# building a 3:4:1 network
nn = NeuralNetwork(X.shape[1], 4, Y.shape[1])
loss_values = []
for i in range(1000):
    z = nn.feedforward(X)
    nn.backprop(X, z, Y)
    loss = lossFunction(z, Y)
    loss_values.append(loss)
print(nn.output)

plt.plot(loss_values)
plt.xlabel("Iterations"); plt.xlim(-10, len(loss_values))
plt.ylabel("Loss")
plt.show()
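After training, the same feedforward method can be used to query the network; a small sketch (the exact output depends on the random initialization, but after successful training it should be close to the desired value):

pred = nn.feedforward(np.array([[1, 0, 1]]))   # one of the training patterns
print(pred)                                    # expected to approach 1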
Considerations
Initial value issues
► Determine whether training ends up in a local or the global minimum.

How to save and load trained results, i.e., weights.
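Since the weights are initialized with np.random.rand, each run can end up in a different minimum; one common way to make runs reproducible (an assumption, not from the slides) is to fix NumPy's random seed before constructing the network:

np.random.seed(42)           # any fixed integer; 42 is only an example
nn = NeuralNetwork(3, 4, 1)  # W1 and W2 are now identical on every run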
class NeuralNetwork:
    def __init__(self, n_x, n_h, n_y, init=True):
        """n_x: number of input nodes
        n_h: number of hidden nodes
        n_y: number of output nodes"""
        if init:
            self.W1 = np.random.rand(n_x, n_h)
            self.W2 = np.random.rand(n_h, n_y)
        else:
            self.W1 = np.zeros((n_x, n_h))
            self.W2 = np.zeros((n_h, n_y))
        self.hidden = np.zeros((1, n_h))
        self.output = np.zeros((n_y, 1))
        self.activation = sigmoidFunction
        self.inference = self.feedforward
nn = NeuralNetwork(X.shape[1], 4, Y.shape[1])
loss_values = []
for i in range(1000):
    z = nn.feedforward(X)
    nn.backprop(X, z, Y)
    loss = lossFunction(z, Y)
    loss_values.append(loss)
nn.save('weight.txt')
print(nn.output)

#plt.figure()
plt.plot(loss_values)
plt.xlabel("Iterations")
plt.xlim(-10, len(loss_values))
plt.ylabel("Loss")
plt.show()
# Get the file name to read trained weights from
if len(sys.argv) == 1:
    wfile = 'weight.txt'
else:
    wfile = sys.argv[1]

net.load(wfile)
X = np.array([[0, 1, 0]])
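The standalone code calls nn.save('weight.txt') and net.load(wfile), but those methods are not shown on these slides. A minimal sketch of a possible implementation using np.savetxt / np.loadtxt (the file layout is an assumption; the init=False branch of the constructor is what lets an inference network be built with correctly shaped zero weights before loading):

class NeuralNetwork:
    ...
    def save(self, fname):
        # store both weight matrices flattened, one value per line
        np.savetxt(fname, np.concatenate([self.W1.ravel(), self.W2.ravel()]))

    def load(self, fname):
        # read the flat list back and restore the original shapes
        data = np.loadtxt(fname)
        n1 = self.W1.size
        self.W1 = data[:n1].reshape(self.W1.shape)
        self.W2 = data[n1:].reshape(self.W2.shape)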
Backpropagation: single-neuron case

y: output value
e: expected value
½: included only to make the derivative simpler

A variation of Wi causes z to vary, a variation of z causes g(z) to vary, and a variation of g(z) causes the loss L(y, e) to vary.
► $\dfrac{\partial g(z)}{\partial z}$: derivative of the activation function $g(z)$

► $\dfrac{\partial\left(\sum_j x_j \cdot W_j + b\right)}{\partial W_i} = \dfrac{d\left(x_1 \cdot W_1 + x_2 \cdot W_2 + \cdots + x_n \cdot W_n + b\right)}{dW_i} = \dfrac{d\left(x_i \cdot W_i\right)}{dW_i} = x_i$

► Gradient of L against Wi (applying the chain rule):

$$\frac{\partial L(y,e)}{\partial W_i} = \frac{\partial L(y,e)}{\partial y} \times \frac{\partial g(z)}{\partial z} \times \frac{\partial\left(\sum_j x_j \cdot W_j + b\right)}{\partial W_i}$$

With
L(y,e) = 1/2*(y-e)**2
y = g(z)
z = sum(xj*Wj + b)

this becomes

$$\frac{\partial L(y,e)}{\partial W_i} = \left(y-e\right)\cdot\frac{\partial g(z)}{\partial z}\cdot x_i$$
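A small numerical check of this formula (not from the slides), using the sigmoid as g and a central finite difference on one weight; all values are made up:

import numpy as np

def g(z):                          # sigmoid used as the activation g
    return 1 / (1 + np.exp(-z))

x = np.array([0.2, 0.5, 0.9])      # example inputs
W = np.array([0.1, 0.4, 0.3])      # example weights
b, e = 0.05, 1.0                   # bias and expected value

z = np.dot(x, W) + b
y = g(z)
analytic = (y - e) * g(z) * (1 - g(z)) * x[0]    # (y - e) * g'(z) * x_i for i = 0

L = lambda W: 0.5 * (g(np.dot(x, W) + b) - e) ** 2
eps = 1e-6
Wp, Wm = W.copy(), W.copy()
Wp[0] += eps; Wm[0] -= eps
numeric = (L(Wp) - L(Wm)) / (2 * eps)
print(np.isclose(analytic, numeric))             # True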
► $\dfrac{\partial g(z)}{\partial z}$: derivative of the activation function $g(z)$

► $\dfrac{\partial\left(\sum_j a_j \cdot W_j + b\right)}{\partial v} = \dfrac{d\left(a_1 \cdot W_1 + a_2 \cdot W_2 + \cdots + a_n \cdot W_n + b\right)}{dv} = a$

► Gradient of L against v (applying the chain rule):

$$\frac{\partial L(y,e)}{\partial v} = \frac{\partial L(y,e)}{\partial y} \times \frac{\partial g(z)}{\partial z} \times \frac{\partial\left(\sum_j a_j \cdot W_j + b\right)}{\partial v}$$

With
L(y,e) = 1/2*(y-e)**2
y = g(z)
z = sum(aj*Wj + b)

this becomes

$$\frac{\partial L(y,e)}{\partial v} = \left(y-e\right)\cdot\frac{\partial g(z)}{\partial z}\cdot a$$

► where a is the output of the previous layer (the value multiplied by v)
► where v is one of the weights in W2
► $\dfrac{\partial\mathcal{L}}{\partial w_i} = \delta^{3} \cdot v \cdot g^{2\prime}(z^{2}) \cdot x_i$ for layer 2, where $\delta^{3} = (a^{3} - y) \cdot g^{3\prime}(z^{3})$

► variation of $z^{2}$ affects $g^{2}(z^{2})$
► variation of $g^{2}(z^{2})$ affects $z^{3}$ (note that at this point v is considered fixed)
► variation of $z^{3}$ affects $g^{3}(z^{3})$
► variation of $g^{3}(z^{3})$ affects $\mathcal{L}(y, \hat{y})$
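A small numerical sketch of this two-neuron chain (one hidden neuron fed by inputs xᵢ with weights wᵢ, one output neuron with weight v), comparing the formula above with a finite difference; all values are made up and the ½ loss from the single-neuron case is reused:

import numpy as np

def g(z):                          # sigmoid for both layers
    return 1 / (1 + np.exp(-z))

def dg(z):
    return g(z) * (1 - g(z))

x = np.array([0.2, 0.7])           # inputs
w = np.array([0.3, -0.1])          # layer-2 weights w_i
v = 0.8                            # layer-3 weight
y = 1.0                            # target

def forward(w):
    z2 = np.dot(x, w)              # layer-2 pre-activation
    a2 = g(z2)
    z3 = v * a2                    # layer-3 pre-activation
    a3 = g(z3)
    return z2, z3, a3

z2, z3, a3 = forward(w)
delta3 = (a3 - y) * dg(z3)                    # δ³ = (a³ - y) · g³'(z³)
analytic = delta3 * v * dg(z2) * x[0]         # ∂L/∂w_0

L = lambda w: 0.5 * (forward(w)[2] - y) ** 2  # ½ (a³ - y)²
eps = 1e-6
wp, wm = w.copy(), w.copy()
wp[0] += eps; wm[0] -= eps
numeric = (L(wp) - L(wm)) / (2 * eps)
print(np.isclose(analytic, numeric))          # True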
Sequence of layers