Notebook 2 - CNNs in PyTorch¶
In this assignment, you'll implement some Convolutional Neural Networks (CNNs) in PyTorch.
Setting up¶
We'll start by importing the following:
- torch - the core PyTorch library.
- torch.nn - a module containing building blocks for NNs, such as linear layers, convolutional layers, and so on.
- torch.nn.functional - a module containing activation functions, loss functions, and so on.
- torch.optim - a module containing optimizers that update the parameters of a NN.
- DataLoader in torch.utils.data - batches data together, iterates over those batches, shuffles data, and parallelizes loading to speed up training.
- MNIST in torchvision.datasets - the MNIST dataset, a collection of images of handwritten digits.
- ToTensor in torchvision.transforms - converts PIL images or NumPy arrays to PyTorch tensors.
# imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision.transforms.v2 import ToTensor
from torchvision import datasets, transforms
# seeding ensures that any "randomness" is reproducible
torch.manual_seed(42)
<torch._C.Generator at 0x7c3719e7a0d0>
Data¶
Let's define a transformation for the MNIST dataset.
We'll first cast the images to PyTorch tensors using transforms.ToTensor(). These tensors are automatically scaled so that their values lie between 0 and 1.
Then, we'll re-normalize the pixel values with transforms.Normalize(), subtracting the dataset's (approximate) per-channel mean and dividing by its standard deviation so that the pixels roughly follow a standard normal distribution. Squashing inputs into a small range centered near zero is standard practice: it's the regime in which neural networks tend to converge best during optimization.
# MNIST transform - single channel, so only 1 mean and 1 SD
mnist_transform = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1308,), (0.3016,))
])
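Where do the numbers 0.1308 and 0.3016 come from? They are (approximately) the mean and standard deviation of the MNIST training pixels. As a rough sketch (it loads every training image into memory, which is fine for a dataset as small as MNIST), you can estimate them yourself; the values you get should land near, though not exactly on, the ones hard-coded above.
# sketch: estimating the normalization statistics from the training split
raw_train = MNIST(root='./data', train=True, download=True, transform=transforms.ToTensor())
all_pixels = torch.stack([img for img, _ in raw_train])   # shape (60000, 1, 28, 28)
print(all_pixels.mean().item(), all_pixels.std().item())  # roughly 0.131 and 0.308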
Let's load up the MNIST dataset. You can specify the split you want using train=True|False, and root is the directory where the dataset will be saved. You can also apply the transform from the previous cell directly by passing it as transform.
# MNIST data
train_dataset = MNIST(
root='./data',
train=True,
download=True,
transform=mnist_transform
)
# split the given 'test' set into val/test
non_train_dataset = MNIST(
root='./data',
train=False,
download=True,
transform=mnist_transform
)
val_dataset, test_dataset = torch.utils.data.random_split(
non_train_dataset,
[5000, 5000]
)
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz Failed to download (trying next): HTTP Error 404: Not Found Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz
100%|██████████| 9.91M/9.91M [00:00<00:00, 11.5MB/s]
Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz Failed to download (trying next): HTTP Error 404: Not Found Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz
100%|██████████| 28.9k/28.9k [00:00<00:00, 346kB/s]
Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz Failed to download (trying next): HTTP Error 404: Not Found Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz
100%|██████████| 1.65M/1.65M [00:00<00:00, 3.20MB/s]
Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz Failed to download (trying next): HTTP Error 404: Not Found Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz
100%|██████████| 4.54k/4.54k [00:00<00:00, 4.11MB/s]
Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw
len(train_dataset), len(val_dataset), len(test_dataset)
(60000, 5000, 5000)
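Indexing into a torchvision dataset returns an (image, label) pair, with the transform already applied. As a quick sanity check (purely illustrative), you can inspect one example:
# sanity check: one transformed example - a 1x28x28 float tensor and an integer label
image, label = train_dataset[0]
print(image.shape, image.dtype, label)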
Let's define DataLoader objects for the MNIST data now.
We'll use a (mini-)batch size of 32. Powers of 2 are conventional in deep learning because hardware tends to handle them more efficiently.
We'll define separate DataLoader objects for our training, validation, and test splits to avoid data leakage (training on the test set or testing on the training set).
We'll also have the DataLoader objects shuffle our data whenever we iterate over them (shuffle=True). Shuffling at each epoch prevents the model from being optimized in a way that depends on a particular ordering of the data.
Finally, we'll parallelize data loading with 4 CPU worker processes (num_workers=4).
# dataloaders
batch_size = 32
train_loader = DataLoader(
train_dataset,
batch_size=batch_size,
shuffle=True,
num_workers=4
)
val_loader = DataLoader(
val_dataset,
batch_size=batch_size,
shuffle=True,
num_workers=4
)
test_loader = DataLoader(
test_dataset,
batch_size=batch_size,
shuffle=True,
num_workers=4
)
/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py:617: UserWarning: This DataLoader will create 4 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary. warnings.warn(
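To confirm the loaders yield what we expect, here's a quick peek at a single batch (this just draws one batch and discards it):
# peek at one batch: 32 images of shape 1x28x28 and 32 integer labels
images, labels = next(iter(train_loader))
print(images.shape, labels.shape)  # torch.Size([32, 1, 28, 28]) torch.Size([32])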
Defining and training CNNs¶
We'll define criterion to be nn.CrossEntropyLoss, a common loss function for training classification models.
We'll also define an optimizer (we can look at both optim.SGD and optim.Adam), which will update the parameters of net. We can set some defaults for our hyperparameters: learning rate (lr=0.0001) and momentum (momentum=0.1); note that momentum is only used by the SGD optimizer.
# Loss function and optimizer
def get_crit_and_opt(net, kind='SGD', lr=0.0001, momentum=0.1):
criterion = nn.CrossEntropyLoss()
if kind == 'SGD':
optimizer = optim.SGD(net.parameters(), lr=lr, momentum=momentum)
elif kind == 'Adam':
optimizer = optim.Adam(net.parameters(), lr=lr)
else:
raise ValueError('Invalid optimizer type')
return criterion, optimizer
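As a small illustration with made-up tensors (shapes only, not real data): nn.CrossEntropyLoss expects raw, unnormalized logits of shape (batch, num_classes) and integer class labels of shape (batch,), which is exactly what our models and DataLoaders will produce.
# illustrative only: CrossEntropyLoss takes raw logits and integer class indices
dummy_logits = torch.randn(4, 10)           # batch of 4, 10 classes
dummy_labels = torch.tensor([3, 7, 0, 1])   # one class index per example
print(nn.CrossEntropyLoss()(dummy_logits, dummy_labels))  # a single scalar loss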
Let's see how LeNet-5 (LeCun et al., 1998) is implemented. The architecture looks something like this:
LeNet-5 is compatible with the MNIST dataset. Let's see how to implement the architecture in PyTorch:
class LeNet(nn.Module):
def __init__(self, debug=False):
        super(LeNet, self).__init__()
        self.debug = debug
# 1 input image channel, 6 output channels, 5x5 square convolution
self.conv1 = nn.Conv2d(1, 6, 5, padding=2) # pad the image: width 2
# 6 input channels to 16 output channels with square 5x5 convolution
self.conv2 = nn.Conv2d(6, 16, 5)
# affine operations: y = Wx + b
# 16 channels each of size 5x5 to 1x120 vector
self.fc1 = nn.Linear(16 * 5 * 5, 120)
self.fc2 = nn.Linear(120, 84) # 1x120 vector to 1x84 vector
self.fc3 = nn.Linear(84, 10) # 1x84 vector to 1x10 vector
def forward(self, x):
# 2d convolution
conv1_out = self.conv1(x)
# sigmoid activation
sig1_out = F.sigmoid(conv1_out)
# average pooling over a 2x2 window
pool1_out = F.avg_pool2d(sig1_out, (2, 2))
# second conv
conv2_out = self.conv2(pool1_out)
# another sigmoid
sig2_out = F.sigmoid(conv2_out)
# If the size is a square, you can specify with a single number: just 2
pool2_out = F.avg_pool2d(sig2_out, 2)
# flatten all dimensions except the batch dimension
# 1 means "from dimension 1 onward"
flat = torch.flatten(pool2_out, 1)
# first fully connected layer
fc1_out = self.fc1(flat)
# sigmoid again
sig3_out = F.sigmoid(fc1_out)
# second fully connected layer
fc2_out = self.fc2(sig3_out)
# final sigmoid
sig4_out = F.sigmoid(fc2_out)
# final fully connected layer
output = self.fc3(sig4_out)
if self.debug:
print(' input:', x.shape)
print(' after conv1:', conv1_out.shape)
print(' after sigmoid1:', sig1_out.shape)
print(' after avgpool1:', pool1_out.shape)
print(' after conv2:', conv2_out.shape)
print(' after sigmoid2:', sig2_out.shape)
print(' after avgpool2:', pool2_out.shape)
print(' flattened:', flat.shape)
print(' after fc1:', fc1_out.shape)
print(' after sigmoid3:', sig3_out.shape)
print(' after fc2:', fc2_out.shape)
            print(' after sigmoid4:', sig4_out.shape)
print('after fc3 (output):', output.shape)
return output
Let's test a forward pass:
debug_net = LeNet(debug=True)
for images, labels in train_loader:
with torch.no_grad():
output = debug_net(images)
break
input: torch.Size([32, 1, 28, 28])
after conv1: torch.Size([32, 6, 28, 28])
after sigmoid1: torch.Size([32, 6, 28, 28])
after avgpool1: torch.Size([32, 6, 14, 14])
after conv2: torch.Size([32, 16, 10, 10])
after sigmoid2: torch.Size([32, 16, 10, 10])
after avgpool2: torch.Size([32, 16, 5, 5])
flattened: torch.Size([32, 400])
after fc1: torch.Size([32, 120])
after sigmoid3: torch.Size([32, 120])
after fc2: torch.Size([32, 84])
after sigmoid4: torch.Size([32, 84])
after fc3 (output): torch.Size([32, 10])
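Optionally, you can also count the trainable parameters with a standard PyTorch one-liner (nothing specific to this notebook):
# optional: count trainable parameters (61,706 for this LeNet variant)
print(sum(p.numel() for p in debug_net.parameters() if p.requires_grad))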
In general, a PyTorch neural network definition must:
- subclass nn.Module
- call super().__init__() in the constructor (__init__()) method
- define the trainable parameters/layers (convolutions, linears, poolings, etc.) in the constructor
- define what should happen to the inputs in the forward() method (a minimal sketch follows below)
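For illustration only, here is that recipe in its smallest form: a deliberately tiny fully connected model, not a CNN, and not a solution to the exercise below.
# minimal illustration of the recipe above - not a CNN, not a solution to the exercise
class TinyMLP(nn.Module):
    def __init__(self):
        super().__init__()                    # call the parent constructor first
        self.fc = nn.Linear(28 * 28, 10)      # define trainable layers in the constructor
    def forward(self, x):
        return self.fc(torch.flatten(x, 1))   # define the forward computation here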
# Create your own model for the MNIST data here:
# Note! Please do not copy LeNet exactly!
class CNN_Network(nn.Module):
    pass
Below is a useful helper object for tracking losses/metrics during training and validation.
class AverageMeter(object):
"""Computes and stores an average and current value."""
def __init__(self):
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
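For example (with made-up numbers), updating a meter with two batches of different sizes keeps a size-weighted running average:
# illustrative: AverageMeter keeps a size-weighted running average
meter = AverageMeter()
meter.update(0.5, n=32)      # a batch of 32 with average loss 0.5
meter.update(1.0, n=16)      # a batch of 16 with average loss 1.0
print(meter.val, meter.avg)  # 1.0 and (0.5*32 + 1.0*16) / 48 ≈ 0.667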
We'll define a metric that flexibly computes top-k error rates (100% minus the top-k accuracy).
def error_rate(output, target, topk=(1,)):
"""Computes the top-k error rate for the specified values of k."""
maxk = max(topk) # largest k we'll need to work with
batch_size = target.size(0) # determine batch size
# get maxk best predictions for each item in the batch, both values and indices
_, pred = output.topk(maxk, 1, True, True)
# reshape predictions and targets and compare them element-wise
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk: # for each top-k accuracy we want
# num correct
correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
# num incorrect
wrong_k = batch_size - correct_k
# as a percentage
res.append(wrong_k.mul_(100.0 / batch_size))
return res
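A tiny worked example (made-up logits for a batch of two, three classes): the first item's top prediction is correct and the second's is not, so the top-1 error is 50%, while both true labels appear within the top 2, so the top-2 error is 0%.
# made-up example: top-1 error 50%, top-2 error 0%
toy_output = torch.tensor([[0.1, 0.7, 0.2],   # predicts class 1 (correct)
                           [0.6, 0.3, 0.1]])  # predicts class 0 (true class 1 is ranked 2nd)
toy_target = torch.tensor([1, 1])
print(error_rate(toy_output, toy_target, topk=(1, 2)))  # [tensor([50.]), tensor([0.])]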
If you connect to a runtime with a GPU (such as a T4) available, this line selects it as the device; the model and data are then moved there with .to(device).
device = 'cuda' if torch.cuda.is_available() else 'cpu'
The training function below takes the training set's DataLoader, the model we are training, the loss function we are using, and the optimizer for this model.
It then trains the model on the data for one epoch.
# training function - 1 epoch
def train(
train_loader,
model,
criterion,
optimizer,
epoch,
epochs,
print_freq = 100,
verbose = True
):
    # track the running average and most recent loss
losses = AverageMeter()
# set training mode
model.train()
# iterate over data - automatically shuffled
for i, (images, labels) in enumerate(train_loader):
# put batch of image tensors on GPU
images = images.to(device)
# put batch of label tensors on GPU
labels = labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
# model output
outputs = model(images)
# loss computation
# print(outputs.shape, labels.shape)
loss = criterion(outputs, labels)
# back propagation
loss.backward()
# update model parameters
optimizer.step()
# update meter with the value of the loss once for each item in the batch
losses.update(loss.item(), images.size(0))
# logging during epoch
if i % print_freq == 0 and verbose == True:
print(
f'Epoch: [{epoch+1}/{epochs}][{i:4}/{len(train_loader)}]\t'
f'Loss: {losses.val:.4f} ({losses.avg:.4f} on avg)'
)
# log again at end of epoch
print(f'\n* Epoch: [{epoch+1}/{epochs}]\tTrain loss: {losses.avg:.3f}\n')
return losses.avg
# val function
def validate(
val_loader,
model,
criterion,
epoch,
epochs,
print_freq = 100,
verbose = True
):
    # track the running average and most recent loss, plus batch-wise top-1 and top-5 error rates
losses = AverageMeter()
top1 = AverageMeter()
top5 = AverageMeter()
# set evaluation mode
model.eval()
# iterate over data - automatically shuffled
for i, (images, labels) in enumerate(val_loader):
# put batch of image tensors on GPU
images = images.to(device)
# put batch of label tensors on GPU
labels = labels.to(device)
# model output
output = model(images)
# loss computation
loss = criterion(output, labels)
        # top-1 and top-5 error rates on this batch
        err1, err5 = error_rate(output.data, labels, topk=(1, 5))
# update meters with the value of the loss once
# for each item in the batch
losses.update(loss.item(), images.size(0))
        # update meters with this batch's top-1 and top-5 error rates,
        # weighted by the number of items in the batch
top1.update(err1.item(), images.size(0))
top5.update(err5.item(), images.size(0))
# logging during epoch
if i % print_freq == 0 and verbose == True:
print(
f'Test (on val set): [{epoch+1}/{epochs}][{i:4}/{len(val_loader)}]\t'
f'Loss: {losses.val:.4f} ({losses.avg:.4f} on avg)\t'
f'Top-1 err: {top1.val:.4f} ({top1.avg:.4f} on avg)\t'
f'Top-5 err: {top5.val:.4f} ({top5.avg:.4f} on avg)'
)
# logging for end of epoch
print(
f'\n* Epoch: [{epoch+1}/{epochs}]\t'
f'Test loss: {losses.avg:.3f}\t'
f'Top-1 err: {top1.avg:.3f}\t'
f'Top-5 err: {top5.avg:.3f}\n'
)
    # return the batch-size-weighted average top-1 and top-5 error rates and the average loss
return top1.avg, top5.avg, losses.avg
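One optional refinement (not required for the notebook to work): evaluation doesn't need gradients, so running it under torch.no_grad() saves memory and time. A light-touch way to get that effect without editing the function body is a thin wrapper:
# optional: the same validation logic with gradient tracking disabled
@torch.no_grad()
def validate_no_grad(*args, **kwargs):
    return validate(*args, **kwargs)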
# best error rates so far
best_err1 = 100
best_err5 = 100
# Run the training and validation loop.
if __name__ == '__main__':
# select a model to train here
model = LeNet()
# move to GPU
model.to(device)
# select number of epochs
epochs = 3
lr = 0.001
momentum = 0.1
# kind = 'SGD'
kind = 'Adam'
# get criterion and optimizer
criterion, optimizer = get_crit_and_opt(model, kind, lr, momentum)
# epoch loop
for epoch in range(0, epochs):
# train for one epoch
train_loss = train(
train_loader,
model,
criterion,
optimizer,
epoch,
epochs
)
# evaluate on validation set
err1, err5, val_loss = validate(
val_loader,
model,
criterion,
epoch,
epochs
)
        # remember the best top-1 error rate seen so far
is_best = err1 <= best_err1
best_err1 = min(err1, best_err1)
if is_best:
best_err5 = err5
print(
'Current best error rate (top-1 and top-5 error):',
best_err1, best_err5, '\n'
)
print('Best error rate (top-1 and top-5 error):', best_err1, best_err5)
Epoch: [1/3][ 0/1875] Loss: 2.3165 (2.3165 on avg) Epoch: [1/3][ 100/1875] Loss: 2.3313 (2.3115 on avg) Epoch: [1/3][ 200/1875] Loss: 1.9670 (2.2720 on avg) Epoch: [1/3][ 300/1875] Loss: 1.2362 (2.0371 on avg) Epoch: [1/3][ 400/1875] Loss: 0.6606 (1.7785 on avg) Epoch: [1/3][ 500/1875] Loss: 0.7309 (1.5782 on avg) Epoch: [1/3][ 600/1875] Loss: 0.4914 (1.4168 on avg) Epoch: [1/3][ 700/1875] Loss: 0.3318 (1.2875 on avg) Epoch: [1/3][ 800/1875] Loss: 0.3037 (1.1836 on avg) Epoch: [1/3][ 900/1875] Loss: 0.2392 (1.0962 on avg) Epoch: [1/3][1000/1875] Loss: 0.3149 (1.0205 on avg) Epoch: [1/3][1100/1875] Loss: 0.4641 (0.9585 on avg) Epoch: [1/3][1200/1875] Loss: 0.3922 (0.9038 on avg) Epoch: [1/3][1300/1875] Loss: 0.3548 (0.8566 on avg) Epoch: [1/3][1400/1875] Loss: 0.3410 (0.8149 on avg) Epoch: [1/3][1500/1875] Loss: 0.1588 (0.7770 on avg) Epoch: [1/3][1600/1875] Loss: 0.1589 (0.7456 on avg) Epoch: [1/3][1700/1875] Loss: 0.3468 (0.7165 on avg) Epoch: [1/3][1800/1875] Loss: 0.1898 (0.6902 on avg) * Epoch: [1/3] Train loss: 0.671 Test (on val set): [1/3][ 0/157] Loss: 0.1954 (0.1954 on avg) Top-1 err: 6.2500 (6.2500 on avg) Top-5 err: 0.0000 (0.0000 on avg) Test (on val set): [1/3][ 100/157] Loss: 0.1912 (0.1892 on avg) Top-1 err: 3.1250 (5.1980 on avg) Top-5 err: 0.0000 (0.1238 on avg) * Epoch: [1/3] Test loss: 0.183 Top-1 err: 5.020 Top-5 err: 0.100 Current best error rate (top-1 and top-5 error): 5.02 0.1 Epoch: [2/3][ 0/1875] Loss: 0.3430 (0.3430 on avg) Epoch: [2/3][ 100/1875] Loss: 0.4540 (0.2100 on avg) Epoch: [2/3][ 200/1875] Loss: 0.1063 (0.2081 on avg) Epoch: [2/3][ 300/1875] Loss: 0.3764 (0.2053 on avg) Epoch: [2/3][ 400/1875] Loss: 0.1718 (0.2070 on avg) Epoch: [2/3][ 500/1875] Loss: 0.1153 (0.2002 on avg) Epoch: [2/3][ 600/1875] Loss: 0.1558 (0.1959 on avg) Epoch: [2/3][ 700/1875] Loss: 0.1203 (0.1911 on avg) Epoch: [2/3][ 800/1875] Loss: 0.0403 (0.1873 on avg) Epoch: [2/3][ 900/1875] Loss: 0.1710 (0.1844 on avg) Epoch: [2/3][1000/1875] Loss: 0.1645 (0.1807 on avg) Epoch: [2/3][1100/1875] Loss: 0.1248 (0.1799 on avg) Epoch: [2/3][1200/1875] Loss: 0.0585 (0.1787 on avg) Epoch: [2/3][1300/1875] Loss: 0.1293 (0.1764 on avg) Epoch: [2/3][1400/1875] Loss: 0.1911 (0.1747 on avg) Epoch: [2/3][1500/1875] Loss: 0.1302 (0.1718 on avg) Epoch: [2/3][1600/1875] Loss: 0.0563 (0.1692 on avg) Epoch: [2/3][1700/1875] Loss: 0.1004 (0.1668 on avg) Epoch: [2/3][1800/1875] Loss: 0.1159 (0.1651 on avg) * Epoch: [2/3] Train loss: 0.163 Test (on val set): [2/3][ 0/157] Loss: 0.0431 (0.0431 on avg) Top-1 err: 0.0000 (0.0000 on avg) Top-5 err: 0.0000 (0.0000 on avg) Test (on val set): [2/3][ 100/157] Loss: 0.1286 (0.1092 on avg) Top-1 err: 3.1250 (3.4344 on avg) Top-5 err: 0.0000 (0.0619 on avg) * Epoch: [2/3] Test loss: 0.106 Top-1 err: 3.200 Top-5 err: 0.080 Current best error rate (top-1 and top-5 error): 3.2 0.08 Epoch: [3/3][ 0/1875] Loss: 0.0763 (0.0763 on avg) Epoch: [3/3][ 100/1875] Loss: 0.1534 (0.1299 on avg) Epoch: [3/3][ 200/1875] Loss: 0.4391 (0.1233 on avg) Epoch: [3/3][ 300/1875] Loss: 0.0429 (0.1276 on avg) Epoch: [3/3][ 400/1875] Loss: 0.1163 (0.1320 on avg) Epoch: [3/3][ 500/1875] Loss: 0.2248 (0.1272 on avg) Epoch: [3/3][ 600/1875] Loss: 0.0648 (0.1230 on avg) Epoch: [3/3][ 700/1875] Loss: 0.0794 (0.1224 on avg) Epoch: [3/3][ 800/1875] Loss: 0.0140 (0.1190 on avg) Epoch: [3/3][ 900/1875] Loss: 0.0759 (0.1180 on avg) Epoch: [3/3][1000/1875] Loss: 0.2431 (0.1167 on avg) Epoch: [3/3][1100/1875] Loss: 0.1245 (0.1148 on avg) Epoch: [3/3][1200/1875] Loss: 0.1608 (0.1139 on avg) Epoch: 
[3/3][1300/1875] Loss: 0.0874 (0.1123 on avg) Epoch: [3/3][1400/1875] Loss: 0.1046 (0.1103 on avg) Epoch: [3/3][1500/1875] Loss: 0.1034 (0.1098 on avg) Epoch: [3/3][1600/1875] Loss: 0.2035 (0.1098 on avg) Epoch: [3/3][1700/1875] Loss: 0.3565 (0.1084 on avg) Epoch: [3/3][1800/1875] Loss: 0.1284 (0.1076 on avg) * Epoch: [3/3] Train loss: 0.107 Test (on val set): [3/3][ 0/157] Loss: 0.1195 (0.1195 on avg) Top-1 err: 6.2500 (6.2500 on avg) Top-5 err: 0.0000 (0.0000 on avg) Test (on val set): [3/3][ 100/157] Loss: 0.0410 (0.0805 on avg) Top-1 err: 0.0000 (2.3824 on avg) Top-5 err: 0.0000 (0.0000 on avg) * Epoch: [3/3] Test loss: 0.085 Top-1 err: 2.400 Top-5 err: 0.020 Current best error rate (top-1 and top-5 error): 2.4 0.02 Best error rate (top-1 and top-5 error): 2.4 0.02
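The loop above tracks the best error rates but never writes anything to disk. If you want to keep the trained weights, a minimal sketch is below (the filename is an arbitrary choice); to keep only the best epoch's weights, you would instead call torch.save inside the loop whenever is_best is true.
# sketch: persist the trained weights (the filename is an arbitrary choice)
torch.save(model.state_dict(), 'lenet_mnist.pt')
# to restore later: model.load_state_dict(torch.load('lenet_mnist.pt'))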
A scikit-learn classification report shows overall accuracy, class-wise precision, recall, and F1, as well as these metrics averaged over the classes.
# Create a classification report for one model
from sklearn.metrics import classification_report
# get the true classes and model predictions for the test set for one model
# y_true = the true numerical classes
# y_pred = predicted numerical classes
# target_names = string names of the classes
y_true = []
y_pred = []
target_names = [str(x) for x in range(10)]
with torch.no_grad():
model.eval()
for images, labels in test_loader:
images = images.to(device)
labels = labels.to(device)
output = model(images)
y_true.extend(labels.tolist())
y_pred.extend(output.argmax(dim=1).tolist())
print(classification_report(y_true, y_pred, target_names=target_names))
/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py:617: UserWarning: This DataLoader will create 4 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary. warnings.warn(
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       451
           1       0.97      0.99      0.98       553
           2       0.99      0.96      0.98       495
           3       0.97      0.97      0.97       530
           4       0.98      0.96      0.97       501
           5       0.94      0.98      0.96       449
           6       0.98      0.97      0.97       460
           7       0.98      0.98      0.98       531
           8       0.96      0.98      0.97       508
           9       0.99      0.93      0.95       522

    accuracy                           0.97      5000
   macro avg       0.97      0.97      0.97      5000
weighted avg       0.97      0.97      0.97      5000
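If you also want to see which digits get confused with which, scikit-learn's confusion_matrix works on the same y_true / y_pred lists (an optional extra, not part of the report above):
# optional: per-class confusion counts from the same predictions
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_true, y_pred))  # rows are true digits, columns are predicted digits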