In [11]:
###########################################################################################
# digit code source:                                                                      #
# “Handwritten Digit Recognition with Scikit-Learn.”                                      #
# The Data Frog, thedatafrog.com/en/articles/handwritten-digit-recognition-scikit-learn/. #
###########################################################################################

from sklearn import datasets
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np

def show_digit(img):
    """Display a single digit image (e.g. one 8x8 array from digits.images)."""
    plt.figure(figsize=(4, 4))
    plt.imshow(img, cmap='binary')
    plt.axis('off')
    plt.show()
    
digits = datasets.load_digits()
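
# Quick sanity check (an illustrative addition, not from the cited source):
# render the first image in the dataset with the helper above.
show_digit(digits.images[0])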

#####################################################################################################
# conv and pool functions code source:                                                              #
# https://towardsdatascience.com/a-guide-to-convolutional-neural-networks-from-scratch-f1e3bfc3e2de #
#####################################################################################################

def relu(Z):
    """Elementwise rectified linear unit: max(0, z)."""
    return np.maximum(0, Z)

def conv(x, filt, bias, stride, pad):
    """
    - N: number of images
    - H: height (number of rows)
    - W: width (number of columns)
    - F: number of filters
    - bias: array of biases to apply to each filter
    - stride: stride value to apply to each image when convolving
    - pad: number of rows and columns to zero-pad around image
    """
    N, H, W = x.shape
    F, HH, WW = filt.shape
    # get output dimensions
    H_out = 1 + (H + 2 * pad - HH) // stride
    W_out = 1 + (W + 2 * pad - WW) // stride
    out = np.zeros((N, F, H_out, W_out))
    # pad only the spatial axes (rows and columns), not the batch axis
    pad_widths = ((0, 0), (pad, pad), (pad, pad))
    xpad = np.pad(x, pad_widths, 'constant')
    Npad, Hpad, Wpad = xpad.shape
    
    # perform convolution
    for n in range(N):
        for f in range(F):          
            for i in range(0, Hpad-(HH-1), stride):
                for j in range(0, Wpad-(WW-1), stride):
                    prod = np.sum(filt[f] * xpad[n, i:i+HH, j:j+WW])
                    out[n, f, i // stride, j // stride] = prod + bias[f]

    cache = (x, filt, bias, stride, pad)
    return out, cache
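
# Illustrative shape check (an added sketch, not from the cited sources): two
# 8x8 images convolved with three 3x3 filters at stride 1 and pad 1 keep their
# spatial size, since 1 + (8 + 2*1 - 3) // 1 = 8.
_demo_out, _ = conv(np.zeros((2, 8, 8)), np.zeros((3, 3, 3)), np.zeros(3), 1, 1)
assert _demo_out.shape == (2, 3, 8, 8)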


def pool(x, Hp, Wp, stride):
    """
    - N: number of images
    - F: number of filters applied to each image
    - H: height (number of rows)
    - W: width (number of columns)
    - Hp: height to pool by
    - Wp: width to pool by
    - stride: stride value to apply to each image when pooling
    """
    # currently does max pooling
    # get output dimensions
    N, F, H, W = x.shape
    H_out = (H - Hp) // stride + 1
    W_out = (W - Wp) // stride + 1
    out = np.zeros((N, F, H_out, W_out))
    
    # pool
    for n in range(N):
        for f in range(F):
            for i in range(H_out):
                for j in range(W_out):
                    out[n, f, i, j] = np.max(x[n, f, i*stride:i*stride+Hp, j*stride:j*stride+Wp])
    
    cache = (x, Hp, Wp, stride)
    return out, cache
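
# Illustrative example (an added sketch, not from the cited sources):
# max-pooling a single 4x4 feature map with a 2x2 window and stride 2 halves
# each spatial dimension; the maxima here are [[5, 7], [13, 15]].
_demo_pooled, _ = pool(np.arange(16, dtype=float).reshape(1, 1, 4, 4), 2, 2, 2)
assert _demo_pooled.shape == (1, 1, 2, 2)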

    
def CNN(filts_size, numFilts, convStride, poolStride, activ, hidden1, hidden2):
    """
    - filts_size: specifies the size of filters to use. Can be 2 (2x2), 3 (3x3), or 23 (both)
    - numFilts: specifies the number of filters to use. If unused, just set to large number.
    - convStride: stride of the convolutional layer
    - poolStride: stride of the pooling layer
    - activ: the activation function to use in the fully-connected network. Can be 'identity', 'logistic', 'relu', 'softmax', 'tanh'
    - hidden1: The number of nodes to use in the first hidden layer
    - hidden2: The number of nodes to use in the second hidden layer
    """

    ### Preprocessing input
    # Flatten images

    # load UCI ML hand-written digits
    y = digits.target
    x = digits.images.reshape((len(digits.images), -1))
    dim = 8

    # Split data into training and testing
    x_train = x[:1000]
    y_train = y[:1000]
    x_test = x[1000:]
    y_test = y[1000:]
    
    # some example filters to apply at the convolutional layer
    rand_filt2 = np.random.rand(2, 2)
    identity2 = np.array([[1, 0],[0,0]])
    top_edge_filt2 = np.array([[-1, -1],[1, 1]])
    bot_edge_filt2 = np.array([[1, 1],[-1, -1]])
    rgt_edge_filt2 = np.array([[-1, 1],[-1, 1]])
    lft_edge_filt2 = np.array([[1, -1],[1, -1]])
    diag_filt2 = np.array([[1, -1],[-1, 1]])
    back_diag_filt2 = np.array([[-1, 1],[1, -1]])

    rand_filt3 = np.random.rand(3, 3)
    identity3 = np.array([[0,0,0],[0,1,0],[0,0,0]])
    top_edge_filt3 = np.array([[-1, -1,-1],[0,0,0],[1,1,1]])
    bot_edge_filt3 = np.array([[1,1,1],[0,0,0],[-1,-1,-1]])
    rgt_edge_filt3 = np.array([[-1,0,1],[-1,0,1],[-1,0,1]])
    lft_edge_filt3 = np.array([[1,0,-1],[1,0,-1],[1,0,-1]])
    diag_filt3 = np.array([[1,0,-1],[0,0,0],[-1,0,1]])
    back_diag_filt3 = np.array([[-1,0,1],[0,0,0],[1,0,-1]])

    top_edge_filt23 = np.array([[-1, -1,0],[1, 1,0],[0,0,0]])
    bot_edge_filt23 = np.array([[1, 1,0],[-1, -1,0],[0,0,0]])
    rgt_edge_filt23 = np.array([[-1, 1,0],[-1, 1,0],[0,0,0]])
    lft_edge_filt23 = np.array([[1, -1,0],[1, -1,0],[0,0,0]])
    diag_filt23 = np.array([[1, -1,0],[-1, 1,0],[0,0,0]])
    back_diag_filt23 = np.array([[-1, 1,0],[1, -1,0],[0,0,0]])

    x_train = x_train.reshape(x_train.shape[0],dim,dim)

    if filts_size == 2:
        filts = np.array([rand_filt2, identity2, top_edge_filt2, bot_edge_filt2,
                          rgt_edge_filt2, lft_edge_filt2, diag_filt2, back_diag_filt2])
    elif filts_size == 3:
        filts = np.array([rand_filt3, identity3, top_edge_filt3, bot_edge_filt3,
                          rgt_edge_filt3, lft_edge_filt3, diag_filt3, back_diag_filt3])
    elif filts_size == 23:
        filts = np.array([rand_filt3, identity3, top_edge_filt3, bot_edge_filt3,
                          rgt_edge_filt3, lft_edge_filt3, diag_filt3, back_diag_filt3,
                          top_edge_filt23, bot_edge_filt23, rgt_edge_filt23,
                          lft_edge_filt23, diag_filt23, back_diag_filt23])
    else:
        raise ValueError("filts_size must be 2, 3, or 23")

    # compare against the number of filters (filts.size would count every element)
    if numFilts < len(filts):
        filts = filts[:numFilts]

    bias = np.zeros(len(filts))  # 1-D so that bias[f] is a scalar
    
    ### Set up the CNN for training

    # Layer 1: convolutional
    y0, c0 = conv(x_train,filts,bias,convStride,1)

    # Layer 2: ReLU activation (np.maximum is elementwise, so no loops are needed)
    y0 = relu(y0)

    # Layer 3: Pooling
    p0, c1 = pool(y0, 2, 2, poolStride)

    # Flatten the pooled maps into one feature vector per image
    p0 = p0.reshape(p0.shape[0], -1)

    # Train sklearn's neural net module using the output of our pooling layer
    clf = MLPClassifier(activation=activ, solver='sgd', alpha=1e-5,hidden_layer_sizes=(hidden1, hidden2), max_iter=1000, random_state=1)
    clf.fit(p0, y_train)
    y_hat_train = clf.predict(p0)
    print("Training Error Rate: ", np.count_nonzero(y_hat_train - y_train)/y_train.shape[0])

    ### Test the CNN
    # Feed the testing data forward through conv and pooling layers

    # Layer 1: convolutional
    x_test = x_test.reshape(x_test.shape[0],dim,dim)
    y0, c0 = conv(x_test,filts,bias,convStride,1)

    # Layer 2: ReLU activation
    y0 = relu(y0)

    # Layer 3: Pooling
    p0, c1 = pool(y0, 2, 2, poolStride)
    # Flatten the pooled maps into one feature vector per image
    p0 = p0.reshape(p0.shape[0], -1)

    # Layer 4: predict with sklearn neural network
    y_hat = clf.predict(p0)
    print("Testing Error Rate: ", np.count_nonzero(y_hat - y_test)/y_test.shape[0])
In [26]:
# for convenience, the CNN parameters are reproduced below:

# def CNN(filts_size, numFilts, convStride, poolStride, activ, hidden1, hidden2):
"""
    - filts_size: size of the filters to use. Can be 2 (2x2), 3 (3x3), or 23 (both sets)
    - numFilts: number of filters to use; to keep every filter in the chosen set, pass a number larger than the set size
    - convStride: stride of the convolutional layer
    - poolStride: stride of the pooling layer
    - activ: activation function for the fully-connected network. Can be 'identity', 'logistic', 'tanh', or 'relu'
    - hidden1: number of nodes in the first hidden layer
    - hidden2: number of nodes in the second hidden layer
"""

# Example network (stride of 2 with 3x3 filters):
CNN(3, 99, 2, 2,'relu', 100, 30)
Training Error Rate:  0.127
Testing Error Rate:  0.24090338770388958
In [27]:
# Student: run the network with a convolutional stride of 1 using 3x3 filters
CNN(3, 99, 1, 2,'relu', 100, 30)
Training Error Rate:  0.001
Testing Error Rate:  0.12170639899623588
In [28]:
# Student: using the same stride as above, run the network using 2x2 filters:
# how does this affect performance? (remember that our test images are 8x8)
CNN(2, 99, 1, 2,'relu', 100, 30)
Training Error Rate:  0.0
Testing Error Rate:  0.11041405269761606
In [29]:
# Student: using the same stride as above, run the network using both sets of filters:
CNN(23, 99, 1, 2,'relu', 100, 30)
Training Error Rate:  0.0
Testing Error Rate:  0.08531994981179424
In [8]:
# Student: experiment with other values
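
# Some starting points one might try (illustrative suggestions, outputs not shown):
# CNN(23, 99, 1, 1, 'relu', 100, 30)   # denser pooling (stride 1)
# CNN(23, 99, 1, 2, 'tanh', 100, 30)   # a different activation function
# CNN(23, 4, 1, 2, 'relu', 100, 30)    # only the first four filters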