Vuk Rosić

Code a Neural Network from Scratch in NumPy

Part of my Zero to AI Researcher / Engineer Course

Part 1: Getting Started - Your First Neural Network

```python
import numpy as np
import matplotlib.pyplot as plt

# Create simple dataset - XOR problem
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])

print(f"Input shape: {X.shape}")   # (4, 2)
print(f"Output shape: {y.shape}")  # (4, 1)
print(f"Dataset:")
for i in range(len(X)):
    print(f"  {X[i]} -> {y[i][0]}")  # Print each input pair and its expected output
```

What happened: We created the XOR dataset, a classic problem that is not linearly separable: no single straight line can split the 0 outputs from the 1 outputs, so a plain linear model can't solve it and a hidden layer is needed. The quick side experiment below demonstrates this.
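To make that concrete, here is a small side experiment (my addition, not part of the original code) you can run right after the cell above. It trains a single sigmoid unit directly on the inputs, with no hidden layer; the variable names `w`, `b`, `probs`, and `grad` are illustrative, and it reuses `X`, `y`, and `np` from above. Because no straight line separates the two classes, its best answer is to predict roughly 0.5 for every point.

```python
# Sketch: a purely linear model (one sigmoid unit, no hidden layer) cannot fit XOR.
# Reuses X, y and numpy (np) from the cell above.
np.random.seed(0)
w = np.random.randn(2, 1) * 0.5   # one weight per input feature
b = np.zeros((1, 1))

for _ in range(5000):
    probs = 1 / (1 + np.exp(-(X @ w + b)))   # sigmoid of a single linear layer
    grad = probs - y                         # gradient of cross-entropy w.r.t. the logits
    w -= 0.5 * (X.T @ grad) / len(X)         # gradient descent, learning rate 0.5
    b -= 0.5 * grad.mean(axis=0, keepdims=True)

print(np.round(probs, 2).flatten())  # stays near 0.5 for all four points
```

The hidden layer we build in the next sections is what lets the network bend that single line into a boundary that can actually separate the corners.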

Part 2: Understanding the Math - Forward Pass

Basic Network Architecture

```python
# Network architecture: 2 -> 4 -> 1 (input -> hidden -> output)
input_size = 2    # Number of input features (x and y coordinates)
hidden_size = 4   # Number of neurons in hidden layer
output_size = 1   # Number of output values (0 or 1)

# Initialize weights and biases
np.random.seed(42)
W1 = np.random.randn(input_size, hidden_size) * 0.5   # Weights: connection strengths between layers
b1 = np.zeros((1, hidden_size))                       # Biases: adjustable offsets for each neuron
W2 = np.random.randn(hidden_size, output_size) * 0.5  # Weights for output layer
b2 = np.zeros((1, output_size))                       # Bias for output layer

print(f"W1 shape: {W1.shape}")  # (2, 4)
print(f"b1 shape: {b1.shape}")  # (1, 4)
print(f"W2 shape: {W2.shape}")  # (4, 1)
print(f"b2 shape: {b2.shape}")  # (1, 1)
```

OPTIONAL: Understanding Matrix Shapes in Neural Networks

```python
# Let's trace through the shapes step by step
print("Shape flow through network:")
print(f"Input X: {X.shape}")        # (4, 2)
print(f"Weights W1: {W1.shape}")    # (2, 4)
print(f"X @ W1: {(X @ W1).shape}")  # (4, 4)
print(f"Bias b1: {b1.shape}")       # (1, 4)

# Broadcasting explanation
sample_mult = X @ W1
print(f"\nBroadcasting bias:")
print(f"(X @ W1) shape: {sample_mult.shape}")        # (4, 4)
print(f"b1 shape: {b1.shape}")                       # (1, 4)
print(f"Result shape: {(sample_mult + b1).shape}")   # (4, 4)
```

Forward Pass Implementation

```python
def forward_pass(X, W1, b1, W2, b2):
    """Complete forward pass through the network"""
    # Hidden layer
    z1 = X @ W1 + b1   # Linear transformation
    a1 = sigmoid(z1)   # Activation (sigmoid is defined in the next section - run that cell first)

    # Output layer
    z2 = a1 @ W2 + b2  # Linear transformation
    a2 = sigmoid(z2)   # Activation

    return z1, a1, z2, a2

# Test forward pass
z1, a1, z2, predictions = forward_pass(X, W1, b1, W2, b2)
print(f"Predictions shape: {predictions.shape}")
print(f"Predictions:\n{predictions.flatten()}")
print(f"Actual labels:\n{y.flatten()}")
```

OPTIONAL: Understanding the Sigmoid Function

```python
def sigmoid(x):
    """Sigmoid activation function"""
    return 1 / (1 + np.exp(-np.clip(x, -500, 500)))  # Clip to prevent overflow

# Test sigmoid on different inputs
test_values = np.array([-10, -1, 0, 1, 10])
sigmoid_values = sigmoid(test_values)

print("Sigmoid function behavior:")
for i, val in enumerate(test_values):
    print(f"  sigmoid({val:3.0f}) = {sigmoid_values[i]:.3f}")

# Visualize sigmoid
x_range = np.linspace(-10, 10, 100)
y_sigmoid = sigmoid(x_range)

plt.figure(figsize=(8, 4))
plt.plot(x_range, y_sigmoid, 'b-', linewidth=2)
plt.title('Sigmoid Activation Function')
plt.xlabel('x')
plt.ylabel('sigmoid(x)')
plt.grid(True, alpha=0.3)
plt.show()
```

OPTIONAL: Breaking Down the Sigmoid Formula

```python
# Let's understand sigmoid step by step: 1 / (1 + e^(-x))
x = 2.0
print(f"Input: x = {x}")
print(f"Step 1: -x = {-x}")
print(f"Step 2: e^(-x) = np.exp(-x) = {np.exp(-x):.3f}")
print(f"Step 3: 1 + e^(-x) = {1 + np.exp(-x):.3f}")
print(f"Step 4: 1 / (1 + e^(-x)) = {1 / (1 + np.exp(-x)):.3f}")
print(f"Sigmoid result: {sigmoid(x):.3f}")

# Why clipping? Overflow happens for large *negative* inputs,
# where -x is a huge positive number and np.exp(-x) blows up.
print("\nWhy we clip extreme values:")
large_negative_x = -1000
print(f"Without clipping: sigmoid({large_negative_x}) needs np.exp(1000), which overflows float64")
print(f"With clipping to [-500, 500]: np.exp(500) = {np.exp(500):.2e} (huge but still finite)")
```

Key insight: The forward pass transforms input through weighted connections and activations to produce predictions.
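To see those two transformations with concrete numbers, you can push a single input row through the same steps by hand. This is a quick check I've added (not part of the original post); it reuses `X`, `y`, `W1`, `b1`, `W2`, `b2`, and `sigmoid` from the cells above, and the `*_sample` names are mine.

```python
# Trace one sample, [1, 0], through the untrained network by hand.
sample = X[2:3]                  # shape (1, 2): keep it a 2-D row vector

z1_sample = sample @ W1 + b1     # (1, 2) @ (2, 4) + (1, 4) -> (1, 4)
a1_sample = sigmoid(z1_sample)   # squash each hidden pre-activation into (0, 1)
z2_sample = a1_sample @ W2 + b2  # (1, 4) @ (4, 1) + (1, 1) -> (1, 1)
a2_sample = sigmoid(z2_sample)   # final prediction between 0 and 1

print(f"Input:           {sample}")
print(f"Hidden pre-acts: {np.round(z1_sample, 3)}")
print(f"Hidden acts:     {np.round(a1_sample, 3)}")
print(f"Output:          {a2_sample[0, 0]:.3f}  (target: {y[2, 0]})")
```

With the untrained weights the output is nowhere near the target yet; training in Part 4 is what closes that gap.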

Part 3: Computing Loss and Gradients

Loss Function

```python
def compute_loss(predictions, targets):
    """Mean squared error loss"""
    return np.mean((predictions - targets) ** 2)  # Square the difference to penalize large errors
```

OPTIONAL: Why We Square the Difference

```python
# Let's see why we use (predictions - targets) ** 2
pred = np.array([0.8, 0.2, 0.9, 0.1])
target = np.array([1.0, 0.0, 1.0, 0.0])

differences = pred - target
squared_differences = differences ** 2

print("Understanding squared error:")
print(f"Predictions: {pred}")
print(f"Targets: {target}")
print(f"Differences: {differences}")
print(f"Squared: {squared_differences}")
print(f"Mean squared error: {np.mean(squared_differences):.4f}")

# Why not just absolute difference?
abs_differences = np.abs(differences)
print(f"\nComparison:")
print(f"Absolute differences: {abs_differences}")
print(f"Squared differences: {squared_differences}")
print("Squared errors penalize large mistakes more heavily!")
```

Calculate initial loss

```python
initial_loss = compute_loss(predictions, y)
print(f"Initial loss: {initial_loss:.4f}")
```

OPTIONAL: Understanding Derivative of Sigmoid

```python
def sigmoid_derivative(x):
    """Derivative of sigmoid function"""
    s = sigmoid(x)
    return s * (1 - s)  # Derivative formula: sigmoid(x) * (1 - sigmoid(x))

# Test derivative
test_vals = np.array([-2, -1, 0, 1, 2])
sigmoid_vals = sigmoid(test_vals)
derivative_vals = sigmoid_derivative(test_vals)

print("Sigmoid and its derivative:")
for i, val in enumerate(test_vals):
    print(f"  x={val:2.0f}: sigmoid={sigmoid_vals[i]:.3f}, derivative={derivative_vals[i]:.3f}")

# Visualize both functions
x_range = np.linspace(-6, 6, 100)
y_sigmoid = sigmoid(x_range)
y_derivative = sigmoid_derivative(x_range)

plt.figure(figsize=(10, 4))
plt.plot(x_range, y_sigmoid, 'b-', label='sigmoid(x)', linewidth=2)
plt.plot(x_range, y_derivative, 'r--', label="sigmoid'(x)", linewidth=2)
plt.title('Sigmoid Function and Its Derivative')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
```

OPTIONAL: Chain Rule in Backpropagation

```python
# The chain rule: if y = f(g(x)), then dy/dx = f'(g(x)) * g'(x)
#
# For our network (using the same row-vector convention as the code):
#   Loss = MSE(sigmoid(sigmoid(X @ W1 + b1) @ W2 + b2), y)
# We need: dLoss/dW2, dLoss/db2, dLoss/dW1, dLoss/db1

print("Chain rule breakdown:")
print("dLoss/dW2 = dLoss/da2 * da2/dz2 * dz2/dW2")
print("  where:")
print("    dLoss/da2 = 2 * (predictions - targets)  # MSE derivative")
print("    da2/dz2   = sigmoid'(z2)                 # sigmoid derivative")
print("    dz2/dW2   = a1                           # linear layer derivative")
```

Backpropagation Implementation

```python
def backward_pass(X, y, z1, a1, z2, a2, W1, b1, W2, b2):
    """Compute gradients using backpropagation"""
    m = X.shape[0]  # Number of samples

    # Output layer gradients
    dz2 = 2 * (a2 - y) * sigmoid_derivative(z2)  # (4, 1)
    dW2 = a1.T @ dz2 / m                         # (4, 1)
    db2 = np.mean(dz2, axis=0, keepdims=True)    # (1, 1)

    # Hidden layer gradients
    dz1 = (dz2 @ W2.T) * sigmoid_derivative(z1)  # (4, 4)
    dW1 = X.T @ dz1 / m                          # (2, 4)
    db1 = np.mean(dz1, axis=0, keepdims=True)    # (1, 4)

    return dW1, db1, dW2, db2

# Test backpropagation
dW1, db1, dW2, db2 = backward_pass(X, y, z1, a1, z2, predictions, W1, b1, W2, b2)
print(f"Gradient shapes:")
print(f"  dW1: {dW1.shape}, dW2: {dW2.shape}")
print(f"  db1: {db1.shape}, db2: {db2.shape}")
```

OPTIONAL: Understanding Output Layer Gradients

```python
# Let's break down the output layer gradient calculation
print("Output layer gradient breakdown:")
print("dz2 = 2 * (a2 - y) * sigmoid_derivative(z2)")

# Step by step
error = a2 - y  # How far off our predictions are
print(f"Error (a2 - y) shape: {error.shape}")
print(f"Error values:\n{error.flatten()}")

mse_gradient = 2 * error  # Derivative of MSE
print(f"\nMSE gradient (2 * error) shape: {mse_gradient.shape}")

sigmoid_grad = sigmoid_derivative(z2)  # Derivative of sigmoid
print(f"Sigmoid gradient shape: {sigmoid_grad.shape}")

dz2_step = mse_gradient * sigmoid_grad  # Chain rule
print(f"Combined gradient (dz2) shape: {dz2_step.shape}")
```

OPTIONAL: Understanding Hidden Layer Gradients

```python
# Hidden layer gradients are more complex due to chain rule
print("Hidden layer gradient breakdown:")
print("dz1 = (dz2 @ W2.T) * sigmoid_derivative(z1)")

# Step by step
error_propagated = dz2 @ W2.T  # Propagate error backwards
print(f"Error propagated shape: {error_propagated.shape}")
print(f"This spreads output error to each hidden neuron")

hidden_sigmoid_grad = sigmoid_derivative(z1)  # Local gradient
print(f"Hidden sigmoid gradient shape: {hidden_sigmoid_grad.shape}")

dz1_step = error_propagated * hidden_sigmoid_grad  # Final gradient
print(f"Combined hidden gradient (dz1) shape: {dz1_step.shape}")
```

OPTIONAL: Understanding Weight Gradients

```python
# Weight gradients show how to adjust connections
print("Weight gradient calculation:")
print("dW2 = a1.T @ dz2 / m")
print(f"a1.T shape: {a1.T.shape}")         # Transposed hidden activations
print(f"dz2 shape: {dz2.shape}")           # Output gradients
print(f"dW2 shape: {(a1.T @ dz2).shape}")  # Weight gradients

# This gives us the gradient for each weight connection
print(f"\nWeight gradients tell us:")
print(f"- Positive gradient: decrease this weight")
print(f"- Negative gradient: increase this weight")
print(f"- Large gradient: this weight has big impact on error")
```

Critical concept: Backpropagation uses the chain rule to compute how much each weight contributes to the total error.
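A good way to convince yourself the chain-rule gradients are correct is a numerical gradient check. This is my own addition, not part of the original post: nudge one weight by a tiny amount, measure how the loss changes, and compare with what `backward_pass` reports. The sketch below checks a single entry of `W2` using the untrained weights and functions defined above; the `_c`, `_plus`, and `_minus` names are mine.

```python
# Numerical gradient check for one entry of W2 (a sketch, reusing
# forward_pass, backward_pass and compute_loss from above).
eps = 1e-5
i, j = 0, 0  # which entry of W2 to check

# Analytic gradient from backpropagation
z1_c, a1_c, z2_c, a2_c = forward_pass(X, W1, b1, W2, b2)
dW1_c, db1_c, dW2_c, db2_c = backward_pass(X, y, z1_c, a1_c, z2_c, a2_c, W1, b1, W2, b2)

# Numerical gradient: (loss(w + eps) - loss(w - eps)) / (2 * eps)
W2_plus, W2_minus = W2.copy(), W2.copy()
W2_plus[i, j] += eps
W2_minus[i, j] -= eps
loss_plus = compute_loss(forward_pass(X, W1, b1, W2_plus, b2)[3], y)
loss_minus = compute_loss(forward_pass(X, W1, b1, W2_minus, b2)[3], y)
numerical = (loss_plus - loss_minus) / (2 * eps)

print(f"Analytic  dW2[{i},{j}]: {dW2_c[i, j]:.8f}")
print(f"Numerical dW2[{i},{j}]: {numerical:.8f}")  # the two should agree to several decimals
```

If the two numbers agree to several decimal places, the backpropagation formulas are doing exactly what the chain rule says they should.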

Part 4: Training the Network

Training Loop

```python
def train_network(X, y, epochs=1000, learning_rate=1.0):
    """Train the neural network"""
    # Initialize weights
    np.random.seed(42)
    W1 = np.random.randn(2, 4) * 0.5
    b1 = np.zeros((1, 4))
    W2 = np.random.randn(4, 1) * 0.5
    b2 = np.zeros((1, 1))

    losses = []

    for epoch in range(epochs):
        # Forward pass
        z1, a1, z2, predictions = forward_pass(X, W1, b1, W2, b2)

        # Compute loss
        loss = compute_loss(predictions, y)
        losses.append(loss)

        # Backward pass
        dW1, db1, dW2, db2 = backward_pass(X, y, z1, a1, z2, predictions, W1, b1, W2, b2)

        # Update weights
        W1 -= learning_rate * dW1
        b1 -= learning_rate * db1
        W2 -= learning_rate * dW2
        b2 -= learning_rate * db2

        # Print progress
        if epoch % 100 == 0:
            print(f"Epoch {epoch:4d}: Loss = {loss:.6f}")

    return W1, b1, W2, b2, losses

# Train the network
W1_trained, b1_trained, W2_trained, b2_trained, loss_history = train_network(X, y)
```

OPTIONAL: Understanding Learning Rate

```python
# Learning rate controls how big steps we take during optimization
# Too small: slow convergence, too large: might overshoot minimum
learning_rates = [0.1, 1.0, 10.0]

plt.figure(figsize=(12, 4))
for i, lr in enumerate(learning_rates):
    plt.subplot(1, 3, i + 1)

    # Train with this learning rate
    _, _, _, _, losses = train_network(X, y, epochs=500, learning_rate=lr)

    plt.plot(losses)
    plt.title(f'Learning Rate = {lr}')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.yscale('log')
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
```

OPTIONAL: Visualizing Training Progress

```python
# Plot loss curve
plt.figure(figsize=(10, 4))

plt.subplot(1, 2, 1)
plt.plot(loss_history)
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.yscale('log')
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
plt.plot(loss_history[-100:])  # Last 100 epochs
plt.title('Training Loss (Last 100 Epochs)')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Final loss: {loss_history[-1]:.6f}")
```

Part 5: Testing the Trained Network

Final Predictions

```python
# Test the trained network
z1_final, a1_final, z2_final, final_predictions = forward_pass(X, W1_trained, b1_trained, W2_trained, b2_trained)

print("Final Results:")
print("Input -> Target | Prediction | Rounded")
print("-" * 40)
for i in range(len(X)):
    pred = final_predictions[i, 0]
    rounded = round(pred)
    target = y[i, 0]
    print(f"{X[i]} -> {target} | {pred:.4f} | {rounded}")

# Calculate accuracy
rounded_predictions = np.round(final_predictions)
accuracy = np.mean(rounded_predictions == y)
print(f"\nAccuracy: {accuracy:.1%}")
```

OPTIONAL: Visualizing Decision Boundary

```python
# Create a grid of points to visualize the decision boundary
def plot_decision_boundary(W1, b1, W2, b2):
    """Plot the decision boundary learned by the network"""
    # Create a grid
    xx, yy = np.meshgrid(np.linspace(-0.5, 1.5, 100),
                         np.linspace(-0.5, 1.5, 100))

    # Flatten the grid for prediction
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    # Make predictions on the grid
    _, _, _, grid_predictions = forward_pass(grid_points, W1, b1, W2, b2)
    grid_predictions = grid_predictions.reshape(xx.shape)

    # Plot
    plt.figure(figsize=(8, 6))
    plt.contourf(xx, yy, grid_predictions, levels=50, alpha=0.8, cmap='RdYlBu')
    plt.colorbar(label='Network Output')

    # Plot data points
    colors = ['red' if label == 0 else 'blue' for label in y.flatten()]
    plt.scatter(X[:, 0], X[:, 1], c=colors, s=100, edgecolors='black', linewidth=2)

    # Add labels
    for i, (x, y_val) in enumerate(zip(X, y.flatten())):
        plt.annotate(f'({x[0]},{x[1]})→{y_val}', (x[0], x[1]),
                     xytext=(5, 5), textcoords='offset points')

    plt.title('Neural Network Decision Boundary')
    plt.xlabel('Input 1')
    plt.ylabel('Input 2')
    plt.grid(True, alpha=0.3)
    plt.show()

# Visualize the decision boundary
plot_decision_boundary(W1_trained, b1_trained, W2_trained, b2_trained)
```

Part 6: Understanding What We Built

Complete Neural Network Class

```python
class SimpleNeuralNetwork:
    """A simple 2-layer neural network implementation"""

    def __init__(self, input_size=2, hidden_size=4, output_size=1):
        # Initialize weights
        self.W1 = np.random.randn(input_size, hidden_size) * 0.5
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * 0.5
        self.b2 = np.zeros((1, output_size))

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def forward(self, X):
        self.z1 = X @ self.W1 + self.b1
        self.a1 = self.sigmoid(self.z1)
        self.z2 = self.a1 @ self.W2 + self.b2
        self.a2 = self.sigmoid(self.z2)
        return self.a2

    def backward(self, X, y):
        m = X.shape[0]

        # Output layer gradients
        dz2 = 2 * (self.a2 - y) * self.sigmoid(self.z2) * (1 - self.sigmoid(self.z2))
        dW2 = self.a1.T @ dz2 / m
        db2 = np.mean(dz2, axis=0, keepdims=True)

        # Hidden layer gradients
        dz1 = (dz2 @ self.W2.T) * self.sigmoid(self.z1) * (1 - self.sigmoid(self.z1))
        dW1 = X.T @ dz1 / m
        db1 = np.mean(dz1, axis=0, keepdims=True)

        return dW1, db1, dW2, db2

    def train(self, X, y, epochs=1000, learning_rate=1.0):
        losses = []
        for epoch in range(epochs):
            # Forward pass
            predictions = self.forward(X)

            # Compute loss
            loss = np.mean((predictions - y) ** 2)
            losses.append(loss)

            # Backward pass
            dW1, db1, dW2, db2 = self.backward(X, y)

            # Update weights
            self.W1 -= learning_rate * dW1
            self.b1 -= learning_rate * db1
            self.W2 -= learning_rate * dW2
            self.b2 -= learning_rate * db2

            if epoch % 100 == 0:
                print(f"Epoch {epoch:4d}: Loss = {loss:.6f}")

        return losses

    def predict(self, X):
        return self.forward(X)

# Test the class
nn = SimpleNeuralNetwork()
losses = nn.train(X, y, epochs=1000, learning_rate=1.0)
predictions = nn.predict(X)

print("\nClass-based Neural Network Results:")
for i in range(len(X)):
    pred = predictions[i, 0]
    target = y[i, 0]
    print(f"{X[i]} -> {target} | Prediction: {pred:.4f} | Rounded: {round(pred)}")
```

OPTIONAL: Comparing with Different Architectures

```python
# Test different hidden layer sizes
hidden_sizes = [2, 4, 8, 16]
results = {}

for hidden_size in hidden_sizes:
    print(f"\nTesting hidden size: {hidden_size}")
    nn = SimpleNeuralNetwork(input_size=2, hidden_size=hidden_size, output_size=1)
    losses = nn.train(X, y, epochs=1000, learning_rate=1.0)
    predictions = nn.predict(X)

    # Calculate accuracy
    accuracy = np.mean(np.round(predictions) == y)
    results[hidden_size] = {
        'final_loss': losses[-1],
        'accuracy': accuracy,
        'predictions': predictions
    }
    print(f"  Final loss: {losses[-1]:.6f}")
    print(f"  Accuracy: {accuracy:.1%}")

# Plot comparison
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
hidden_sizes_list = list(results.keys())
final_losses = [results[hs]['final_loss'] for hs in hidden_sizes_list]
plt.bar(hidden_sizes_list, final_losses)
plt.title('Final Loss vs Hidden Size')
plt.xlabel('Hidden Layer Size')
plt.ylabel('Final Loss')
plt.yscale('log')

plt.subplot(1, 2, 2)
accuracies = [results[hs]['accuracy'] for hs in hidden_sizes_list]
plt.bar(hidden_sizes_list, accuracies)
plt.title('Accuracy vs Hidden Size')
plt.xlabel('Hidden Layer Size')
plt.ylabel('Accuracy')
plt.ylim(0, 1)

plt.tight_layout()
plt.show()
```

Key takeaway: You've built a complete neural network from scratch! The network learns to solve the XOR problem by discovering the right weights and biases through gradient descent.


Summary

You've successfully implemented:

  1. Forward propagation: Computing predictions from inputs
  2. Loss computation: Measuring how wrong the predictions are
  3. Backpropagation: Computing gradients using the chain rule
  4. Weight updates: Using gradient descent to improve the network
  5. Complete training loop: Putting it all together

The neural network learns by repeatedly adjusting its weights based on the errors it makes, eventually discovering the complex decision boundary needed to solve the XOR problem.
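If you want to watch one of those adjustments happen in slow motion, here is a single manual gradient-descent step (my own addition, reusing `forward_pass`, `backward_pass`, and `compute_loss` from earlier; the `_demo` and `_d` names are mine) showing the loss before and after one update.

```python
# One manual gradient-descent step on freshly initialized weights.
np.random.seed(42)
W1_demo = np.random.randn(2, 4) * 0.5
b1_demo = np.zeros((1, 4))
W2_demo = np.random.randn(4, 1) * 0.5
b2_demo = np.zeros((1, 1))

z1_d, a1_d, z2_d, pred_d = forward_pass(X, W1_demo, b1_demo, W2_demo, b2_demo)
print(f"Loss before step: {compute_loss(pred_d, y):.6f}")

dW1_d, db1_d, dW2_d, db2_d = backward_pass(X, y, z1_d, a1_d, z2_d, pred_d,
                                           W1_demo, b1_demo, W2_demo, b2_demo)
W1_demo -= 1.0 * dW1_d   # same learning rate the training loop uses
b1_demo -= 1.0 * db1_d
W2_demo -= 1.0 * dW2_d
b2_demo -= 1.0 * db2_d

_, _, _, pred_d = forward_pass(X, W1_demo, b1_demo, W2_demo, b2_demo)
print(f"Loss after step:  {compute_loss(pred_d, y):.6f}")  # typically slightly lower
```

The full training loop is nothing more than this step repeated a thousand times.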
