How to Build a Simple Neural Network from Scratch with NumPy
This article walks through implementing a basic multi‑layer neural network using only NumPy. It covers terminology, network architecture, forward and backward propagation, activation functions, loss calculation, and parameter updates with SGD, and closes by comparing the custom model with an equivalent Keras implementation.
Terminology
X = input matrix, y = label vector.
W = weight matrix, b = bias vector.
Z = X·Wᵀ + b (the pre‑activation), A = activation applied to Z.
k = number of classes.
Lower‑case symbols denote vectors; upper‑case symbols denote matrices.
Architecture
Forward propagation computes Z = X·Wᵀ + b for each layer, applies ReLU in hidden layers and Softmax in the output layer to obtain class probabilities.
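As a rough sketch of a single layer's forward step (the shapes here are illustrative and assume the Iris setup used later in the article: 150 samples, 4 features, and a 6‑neuron first layer):

import numpy as np

X = np.random.rand(150, 4)             # hypothetical input matrix
W = np.random.uniform(-1, 1, (6, 4))   # weights stored as (output_dim, input_dim)
b = np.zeros((1, 6))                   # bias row vector, broadcast over all samples
Z = X @ W.T + b                        # pre-activation, shape (150, 6)
A = np.maximum(0, Z)                   # ReLU keeps positive values, zeroes the rest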
Activation Functions
The implementation uses ReLU for hidden layers because of its simplicity and Softmax for the final layer to produce a probability distribution.
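A quick numeric check (with made‑up logits) illustrates why Softmax suits the final layer: it maps raw scores to positive values that sum to 1.

import numpy as np

scores = np.array([[2.0, 1.0, 0.1]])        # hypothetical raw scores for one sample
exp_scores = np.exp(scores - scores.max())  # shift by the max for numerical stability
probs = exp_scores / exp_scores.sum(axis=1, keepdims=True)
print(probs)        # ~[[0.659 0.242 0.099]]
print(probs.sum())  # 1.0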
Loss Function
Cross‑entropy loss is computed directly from the softmax output by indexing it with the integer class labels, so the targets never need to be one‑hot encoded during training. For example, for a single sample:
y = [1, 0, 0]
ŷ = [3.01929735e-07, 7.83961013e-09, 9.99999690e-01]
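Here the one‑hot vector y encodes class 0, so the sample's loss is simply the negative log of the probability the model assigned to class 0. A minimal sketch of that calculation:

import numpy as np

y_true = 0  # integer label; the one-hot vector [1, 0, 0] encodes class 0
y_hat = np.array([3.01929735e-07, 7.83961013e-09, 9.99999690e-01])
loss = -np.log(y_hat[y_true])  # index directly with the integer label
print(loss)  # ~15.01: a confidently wrong prediction is penalized heavily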
Data Loading
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

def get_data(path):
    data = pd.read_csv(path, index_col=0)
    cols = list(data.columns)
    target = cols.pop()  # last column holds the class label
    X = data[cols].copy()
    y = data[target].copy()
    y = LabelEncoder().fit_transform(y)  # map string labels to integers
    return np.array(X), np.array(y)

X, y = get_data("iris.csv")
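Assuming the standard Iris CSV (150 samples, 4 features, 3 classes), a quick sanity check of the loaded arrays:

print(X.shape)       # (150, 4)
print(np.unique(y))  # [0 1 2] -- LabelEncoder mapped the species names to integers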
Layer Definition
import numpy as np

class DenseLayer:
    def __init__(self, neurons):
        self.neurons = neurons

    def relu(self, inputs):
        """ReLU Activation Function"""
        return np.maximum(0, inputs)

    def softmax(self, inputs):
        """Softmax Activation Function"""
        # Shift by the row-wise max before exponentiating to avoid overflow
        exp_scores = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
        return probs

    def relu_derivative(self, dA, Z):
        """ReLU Derivative Function"""
        dZ = np.array(dA, copy=True)
        dZ[Z <= 0] = 0  # gradient is zero wherever the pre-activation was non-positive
        return dZ

    def forward(self, inputs, weights, bias, activation):
        """Single Layer Forward Propagation"""
        Z_curr = np.dot(inputs, weights.T) + bias
        if activation == 'relu':
            A_curr = self.relu(Z_curr)
        elif activation == 'softmax':
            A_curr = self.softmax(Z_curr)
        return A_curr, Z_curr

    def backward(self, dA_curr, W_curr, Z_curr, A_prev, activation):
        """Single Layer Backward Propagation"""
        if activation == 'softmax':
            # For softmax + cross-entropy, dA_curr is already dZ (probs minus one-hot)
            dW = np.dot(A_prev.T, dA_curr)
            db = np.sum(dA_curr, axis=0, keepdims=True)
            dA = np.dot(dA_curr, W_curr)
        else:
            dZ = self.relu_derivative(dA_curr, Z_curr)
            dW = np.dot(A_prev.T, dZ)
            db = np.sum(dZ, axis=0, keepdims=True)
            dA = np.dot(dZ, W_curr)
        return dA, dW, db
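To see a single layer in isolation, here is a small hypothetical usage of DenseLayer.forward with randomly drawn parameters (the weight shape follows the (output_dim, input_dim) convention used in the network below):

layer = DenseLayer(6)
W = np.random.uniform(-1, 1, (6, X.shape[1]))  # (output_dim, input_dim)
b = np.zeros((1, 6))
A, Z = layer.forward(X, W, b, activation='relu')
print(A.shape)  # (150, 6); ReLU has zeroed any negative pre-activations in Z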
Network Definition
class Network:
    def __init__(self):
        self.network = []       # layers
        self.architecture = []  # list of dicts: input_dim, output_dim, activation
        self.params = []        # weights and biases
        self.memory = []        # stores inputs and Z per layer for backprop
        self.gradients = []

    def add(self, layer):
        self.network.append(layer)

    def _compile(self, data):
        for idx, layer in enumerate(self.network):
            if idx == 0:
                self.architecture.append({
                    'input_dim': data.shape[1],
                    'output_dim': layer.neurons,
                    'activation': 'relu'
                })
            elif idx < len(self.network) - 1:
                self.architecture.append({
                    'input_dim': self.network[idx - 1].neurons,
                    'output_dim': layer.neurons,
                    'activation': 'relu'
                })
            else:
                self.architecture.append({
                    'input_dim': self.network[idx - 1].neurons,
                    'output_dim': layer.neurons,
                    'activation': 'softmax'
                })
        return self

    def _init_weights(self, data):
        self._compile(data)
        np.random.seed(99)
        for i in range(len(self.architecture)):
            self.params.append({
                'W': np.random.uniform(-1, 1,
                                       size=(self.architecture[i]['output_dim'],
                                             self.architecture[i]['input_dim'])),
                'b': np.zeros((1, self.architecture[i]['output_dim']))
            })
        return self

    def _forwardprop(self, data):
        self.memory = []  # reset so backprop reads this epoch's activations
        A_curr = data
        for i in range(len(self.params)):
            A_prev = A_curr
            A_curr, Z_curr = self.network[i].forward(
                inputs=A_prev,
                weights=self.params[i]['W'],
                bias=self.params[i]['b'],
                activation=self.architecture[i]['activation']
            )
            self.memory.append({'inputs': A_prev, 'Z': Z_curr})
        return A_curr

    def _backprop(self, predicted, actual):
        self.gradients = []  # reset each epoch
        num_samples = len(actual)
        # Combined softmax + cross-entropy gradient: probs minus one-hot(actual)
        dscores = predicted.copy()
        dscores[range(num_samples), actual] -= 1
        dscores /= num_samples
        dA_prev = dscores
        for idx, layer in reversed(list(enumerate(self.network))):
            A_prev = self.memory[idx]['inputs']
            Z_curr = self.memory[idx]['Z']
            W_curr = self.params[idx]['W']
            activation = self.architecture[idx]['activation']
            dA_prev, dW_curr, db_curr = layer.backward(
                dA_prev, W_curr, Z_curr, A_prev, activation)
            self.gradients.append({'dW': dW_curr, 'db': db_curr})

    def _update(self, lr=0.01):
        # Gradients were appended output-layer-first; reverse once into layer order
        grads = list(reversed(self.gradients))
        for idx, layer in enumerate(self.network):
            self.params[idx]['W'] -= lr * grads[idx]['dW'].T
            self.params[idx]['b'] -= lr * grads[idx]['db']

    def _get_accuracy(self, predicted, actual):
        return np.mean(np.argmax(predicted, axis=1) == actual)

    def _calculate_loss(self, predicted, actual):
        samples = len(actual)
        # Cross-entropy: index the predicted probability of each true class directly
        correct_logprobs = -np.log(predicted[range(samples), actual])
        return np.sum(correct_logprobs) / samples

    def train(self, X_train, y_train, epochs):
        self.loss = []
        self.accuracy = []
        self._init_weights(X_train)
        for i in range(epochs):
            yhat = self._forwardprop(X_train)
            self.accuracy.append(self._get_accuracy(yhat, y_train))
            self.loss.append(self._calculate_loss(yhat, y_train))
            self._backprop(yhat, y_train)
            self._update()
            if i % 20 == 0:
                print(f"EPOCH: {i}, ACCURACY: {self.accuracy[-1]}, LOSS: {self.loss[-1]}")
Training Results
Running model.train(X, y, 200) prints epoch‑wise accuracy and loss, for example:
EPOCH: 0, ACCURACY: 0.3333333333333333, LOSS: 8.40744716505373
EPOCH: 20, ACCURACY: 0.4, LOSS: 0.9217739285797661
EPOCH: 40, ACCURACY: 0.43333333333333335, LOSS: 0.7513140371257646
...
EPOCH: 180, ACCURACY: 0.82, LOSS: 0.49134888468425214
Experimenting with Architecture
Commenting out the hidden layer with 10 neurons (# model.add(DenseLayer(10))) makes the network shallower; retraining the reduced model yields lower accuracy and noticeably different loss dynamics (plots omitted).
Keras Comparison
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD

# Keras expects one-hot targets for categorical cross-entropy
y_onehot = tf.keras.utils.to_categorical(y, num_classes=3)

model2 = Sequential()
model2.add(Dense(6, activation='relu'))
model2.add(Dense(10, activation='relu'))
model2.add(Dense(8, activation='relu'))
model2.add(Dense(3, activation='softmax'))
model2.compile(SGD(learning_rate=0.01), loss='categorical_crossentropy', metrics=['accuracy'])
model2.fit(x=X, y=y_onehot, epochs=30)

Keras does not set a NumPy random seed; disabling the seed in the NumPy implementation yields comparable stochastic behavior.
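Alternatively, to make both runs reproducible one could seed each framework explicitly; a minimal sketch using TensorFlow's global seed API:

import numpy as np
import tensorflow as tf

np.random.seed(99)      # matches the seed used in the custom implementation
tf.random.set_seed(99)  # seeds TensorFlow's global RNG, which Keras init draws from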