PyTorch Lightning: Validation and model evaluation

  • We seed everything in this example for reproducibility.

  • We also compute the validation loss, so we need to add a validation_step to our LitModel class.

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, random_split
from torchvision.datasets import MNIST
from torchvision import transforms
import lightning as L
from lightning.pytorch.callbacks import ModelCheckpoint
print("Lightning version:", L.__version__)
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU available")
print("Torch version:", torch.__version__)
print("CUDA is available:", torch.cuda.is_available())

L.seed_everything(123456) # reproducibility
Seed set to 123456
Lightning version: 2.5.1
GPU name: NVIDIA RTX A5000
Torch version: 2.6.0+cu124
CUDA is available: True
123456
class LitModel(L.LightningModule): # a drop-in replacement for nn.Module
    def __init__(self):
      super().__init__() # call the parent __init__ to set up the LightningModule internals
      self.model = nn.Sequential(
         nn.Flatten(),
         nn.Linear(28*28, 128),
         nn.ReLU(),
         nn.Linear(128, 10)
      )
   
    def forward(self, x):
      return self.model(x)
   
    def training_step(self, batch, batch_idx):
      x, y = batch
      logits = self(x)
      loss = F.cross_entropy(logits, y)
      self.log("train_loss", loss)
      return {"loss": loss}  # equivalent to `return loss`

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        self.log("val_loss", loss, prog_bar=True) # prog_bar=False will not show the val loss in the training progress bar.

    def configure_optimizers(self):
      return torch.optim.Adam(self.parameters(), lr=1e-3) # self.parameters() already includes every submodule, so self.model.parameters() is not needed
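As a side note (not part of the original notebook), here is a quick sanity check that self.parameters() already covers the weights inside self.model:

model_check = LitModel()
print(sum(p.numel() for p in model_check.parameters()))        # parameters registered on the LightningModule
print(sum(p.numel() for p in model_check.model.parameters()))  # parameters of the wrapped nn.Sequential
# both print 101770 = (28*28*128 + 128) + (128*10 + 10)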
# Data
transform = transforms.ToTensor()
dataset = MNIST(root="./MNIST", download=True, train=True, transform=transform)
train_ds, val_ds = random_split(dataset, [55000, 5000])  # 55,000 training / 5,000 validation samples
train_loader = DataLoader(train_ds, batch_size=64)
val_loader = DataLoader(val_ds, batch_size=64)
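A common variant, which this example does not use: shuffle the training batches each epoch (the validation order can stay fixed). The num_workers value below is only an illustrative choice.

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, num_workers=2)  # reshuffle training data every epoch
val_loader = DataLoader(val_ds, batch_size=64, num_workers=2)                    # keep validation deterministic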
# checkpoint based on val loss
checkpoint_cb = ModelCheckpoint(monitor="val_loss", mode="min")
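If you want more control over where checkpoints are written, ModelCheckpoint accepts a few optional arguments; the directory and filename pattern below are only illustrative:

checkpoint_cb = ModelCheckpoint(
    monitor="val_loss",
    mode="min",
    save_top_k=1,                                # keep only the single best checkpoint
    dirpath="checkpoints/",                      # illustrative output directory
    filename="mnist-{epoch:02d}-{val_loss:.3f}"  # logged metrics are substituted into the filename
)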
# trainer
model = LitModel()
trainer = L.Trainer(max_epochs=3,
                    accelerator="auto",        # "auto" picks a GPU if one is available, otherwise CPU
                    callbacks=[checkpoint_cb])
trainer.fit(model, train_loader, val_loader)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 101 K  | train
---------------------------------------------
101 K     Trainable params
0         Non-trainable params
101 K     Total params
0.407     Total estimated model params size (MB)
5         Modules in train mode
0         Modules in eval mode
Epoch 2: 100%|██████████| 860/860 [00:14<00:00, 59.91it/s, v_num=2, val_loss=0.120]
`Trainer.fit` stopped: `max_epochs=3` reached.
Epoch 2: 100%|██████████| 860/860 [00:14<00:00, 59.87it/s, v_num=2, val_loss=0.120]
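With training finished, we can evaluate the model. A minimal sketch: grab the best checkpoint tracked by our ModelCheckpoint callback, restore it, and run the validation loop once with trainer.validate.

# path of the checkpoint with the lowest val_loss
best_path = checkpoint_cb.best_model_path
print("Best checkpoint:", best_path)

# restore the weights and evaluate on the validation set
best_model = LitModel.load_from_checkpoint(best_path)
trainer.validate(best_model, dataloaders=val_loader)

trainer.validate returns a list of dictionaries with the logged metrics (here val_loss), so you can confirm that the restored checkpoint reproduces the value shown in the progress bar above.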