MNIST
Reference Material:
Libraries
import pandas as pd
import matplotlib.pyplot as plt
import itertools
from sklearn.model_selection import train_test_split
from keras.utils import np_utils
from keras.models import Sequential # Model type to be used
from keras.layers.core import Dense, Activation
from keras.optimizers import gradient_descent_v2

# Set random seed for keras
from numpy.random import seed
seed(42)
Data
data = pd.read_csv('mnist_data', sep=',', header=None)\
    .rename({0: 'label'}, axis=1)
Code
def digit_as_matrix(data, digit):
    digit_data = data[data['label'] == digit]\
        .drop('label', axis=1)
    return digit_data.values[0].reshape(28, 28)

# Source: https://colab.research.google.com/github/AviatorMoser/keras-mnist-tutorial/blob/master/MNIST%20in%20Keras.ipynb
def pretty_print_matrix(mat, fmt="g"):
    col_maxes = [max([len(("{:"+fmt+"}").format(x)) for x in col]) for col in mat.T]
    for x in mat:
        for i, y in enumerate(x):
            print(("{:"+str(col_maxes[i])+fmt+"}").format(y), end=" ")
        print("")
As Figures
Code
plt.rcParams['figure.figsize'] = (5,10) # Make the figures a bit bigger

for i in range(10):
    plt.subplot(5,2,i+1)
    plt.imshow(digit_as_matrix(data, i), cmap='gray', interpolation='none')
    plt.title(f'Digit {i}')
plt.tight_layout()
As a Matrix
Code
pretty_print_matrix(digit_as_matrix(data, 7))
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 84 185 159 151 60 36 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 222 254 254 254 254 241 198 198 198 198 198 198 198 198 170 52 0 0 0 0 0 0
0 0 0 0 0 0 67 114 72 114 163 227 254 225 254 254 254 250 229 254 254 140 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 17 66 14 67 67 67 59 21 236 254 106 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 83 253 209 18 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 22 233 255 83 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 129 254 238 44 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 59 249 254 62 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 133 254 187 5 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9 205 248 58 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 126 254 182 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 75 251 240 57 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 19 221 254 166 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 3 203 254 219 35 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 38 254 254 77 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 31 224 254 115 1 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 133 254 254 52 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 61 242 254 254 52 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 121 254 254 219 40 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 121 254 207 18 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Preprocessing
First, we separate the data into predictor variables (independent variables) and the response variable (dependent variable). The predictor variables (pixels) are the features or attributes used to make predictions, while the response variable (digit label) is the variable we want to predict or explain based on the predictor variables.
X, y = data.drop('label', axis=1), data[['label']]
Next, we prepare the data for the model. In this step, we scale the pixel values to the range 0 to 1. While not mandatory, this practice is highly recommended, as it tends to make training converge faster and more stably.
X = X.to_numpy().astype('float32') / 255
Since the categorical cross-entropy loss used below expects one-hot targets, we encode our labels accordingly.
Y = np_utils.to_categorical(y, 10)
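As a quick illustration of what this encoding produces (the single example label below is our own choice, not part of the original pipeline), to_categorical maps an integer class to a length-10 indicator vector:
# Illustration only: one-hot encode a single hypothetical label.
example = np_utils.to_categorical([7], 10)
print(example)  # expected: [[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]]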
Model
After preprocessing the data, it is essential to choose the model architecture. As per the exercise requirements, we need to consider the following variations:
Units in the hidden layer: 25, 50, 100.
Batch Size for Gradient Calculation:
Stochastic Gradient Descent: Batch Size = 1.
Mini-Batch: Batch Size = 10, 50.
Gradient Descent: Batch Size = 5000.
Learning Rate: 0.5, 1, 10.
In order to facilitate the exploration of multiple variations, we will create flexible functions that enable effortless parameter customization.
Before we proceed with the actual model, it is important to note that the chosen values for the learning rate are unconventional. Typically, the learning rate is set to a small number, such as 0.001, 0.01, or 0.1, in practical scenarios. However, we will use these values to illustrate the impact of the learning rate on the model’s performance.
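As a small sanity check on the size of this search space, the grid can be enumerated with itertools (already imported above); the sketch below simply counts the configurations implied by the lists in the exercise:
# Sketch: enumerate the hyperparameter grid (3 unit counts x 4 batch sizes x 3 learning rates).
units_grid = [25, 50, 100]
batch_grid = [1, 10, 50, 5000]
lr_grid = [0.5, 1, 10]
configs = list(itertools.product(units_grid, batch_grid, lr_grid))
print(len(configs))  # 36 configurations in total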
Build
We start our model architecture by creating a function that builds the model. The model is a sequential model, which means that it is a linear stack of layers. In this case, we have two layers: a hidden layer and an output layer. The hidden layer has a sigmoid activation function, while the output layer has a softmax activation function.
def _build_model(n_units):
    model = Sequential()

    model.add(Dense(n_units, input_shape=(784,)))
    model.add(Activation('sigmoid'))

    model.add(Dense(10))
    model.add(Activation('softmax'))

    return model
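To double-check the architecture, the layer shapes and parameter counts can be inspected with Keras' built-in summary; the counts in the comments follow from the layer sizes (784 inputs, n_units hidden units, 10 outputs) rather than from any output shown in the original:
# Sketch: inspect the smallest architecture considered in the exercise.
model_25 = _build_model(25)
model_25.summary()
# Expected trainable parameters:
#   hidden Dense: 784 * 25 + 25 = 19,625
#   output Dense: 25 * 10 + 10  = 260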
Compile
We employ gradient descent as our optimizer and utilize categorical cross-entropy as the loss function. The choice of optimization technique — stochastic gradient descent, mini-batch gradient descent, or gradient descent — depends on the batch size being considered.
def _compile_model(model, learning_rate):
    opt = gradient_descent_v2.SGD(learning_rate=learning_rate)

    model.compile(
        loss='categorical_crossentropy',
        optimizer=opt,
        metrics=['accuracy']
    )
    return model
Train
As observed, we train the model on the entire dataset, without separate validation and test sets. While this approach is not ideal, it is adopted to simplify the exploration of the various configurations; a sketch of how a validation split could be added is shown after the training function below.
def _train_model(model, X, Y, batch_size):
    model.fit(
        X, Y, verbose=0,
        batch_size=batch_size,
        epochs=100
    )
    return model
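If one did want a held-out set, the train_test_split import at the top could be used as sketched below; this variant is our own assumption (including the 20% split) and is not used in the experiments that follow:
# Sketch (assumption): hold out 20% of the data and monitor validation loss during training.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=42)

def _train_model_with_validation(model, X_train, Y_train, X_val, Y_val, batch_size):
    model.fit(
        X_train, Y_train, verbose=0,
        validation_data=(X_val, Y_val),
        batch_size=batch_size,
        epochs=100
    )
    return model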
Wrapper Function
def build_train_wrapper(X, Y, n_units, batch_size, learning_rate):
    model = _build_model(n_units)

    model = _compile_model(model, learning_rate)

    model = _train_model(model, X, Y, batch_size)

    return model
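As a usage example, a single configuration can be fitted and its training curve read back from the Keras history object; the particular values below are just an illustrative choice:
# Sketch: fit one configuration and retrieve the per-epoch training loss.
fitted = build_train_wrapper(X, Y, n_units=50, batch_size=50, learning_rate=0.5)
train_loss = fitted.history.history['loss']  # one entry per epoch
print(train_loss[-1])  # final training loss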
Empirical Errors
Based on the provided graphs, we can begin analyzing the impact of the batch size on the model’s performance. As anticipated, the stochastic gradient descent (batch size = 1) demonstrates the overall highest loss and lowest accuracy, aligning with expectations. Additionally, it is evident that SGD exhibits a significant variance in the error term.
Conversely, we expected the gradient descent (batch size = 5000) to yield the lowest loss and highest accuracy. However, contrary to our expectations, GD underperformed when compared to the mini-batch approach. Surprisingly, the mini-batch gradient descent (batch size = 10, 50) displayed remarkable results, consistently outperforming the other approaches by a substantial margin.
Shifting our focus to the learning rate, while keeping in mind the concerns mentioned earlier, it is clear that both GD and SGD struggled to converge toward zero error in the majority of cases. Moreover, judging only by the scale of the graphs, we can see that larger learning rates were closely associated with higher error. The mini-batch approach, on the other hand, exhibited more consistent behavior.
Regarding the variation in the number of hidden layer units, the results were somewhat mixed. It appears that the errors were more influenced by the learning rate and batch size rather than the number of hidden layer units. However, upon closer examination of the lines with less variability, it is evident that increasing the number of hidden layer units led to a reduction in error. This relationship aligns with our expectations, as a higher number of units in the hidden layer enables the model to learn more intricate patterns.
Code
# Line colors
colors = {
    1: 'red',
    10: 'blue',
    50: 'green',
    5000: 'orange'
}

plt.rcParams['figure.figsize'] = (12,12)

for batch_size in [1, 10, 50, 5000]: # For each batch size
    cont = 0
    for unit in [25, 50, 100]: # For each hidden layer unit
        for learning_rate in [0.5, 1, 10]: # For each learning rate
            # Build, train and evaluate model
            fitted_model = build_train_wrapper(
                X, Y, unit,
                batch_size,
                learning_rate
            )
            train_loss = fitted_model.history.history['loss']
            epochs = range(1, len(train_loss) + 1)

            # Draw plot
            plt.subplot(3,3,cont+1)
            plt.plot(epochs, train_loss, colors[batch_size], label=batch_size)
            plt.title(f'Hidden Layers Units: {unit} | Learning Rate: {learning_rate}')
            plt.xlabel('Epochs')
            plt.ylabel('Error')
            plt.legend(title='Batch Size', loc='upper right')

            cont += 1
plt.tight_layout()