PyTorch Lightning: Validation and model evaluation
We seed everything in this example for reproducibility.
We also compute the validation loss, so we need to add a `validation_step` method to our
`LitModel` class.
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, random_split
from torchvision.datasets import MNIST
from torchvision import transforms
import lightning as L
from lightning.pytorch.callbacks import ModelCheckpoint
# Report the library versions and GPU status, then fix the random seeds.
cuda_ok = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_ok else "No GPU available"

print("Lightning version:", L.__version__)
print("GPU name:", gpu_name)
print("Torch version:", torch.__version__)
print("CUDA is available:", cuda_ok)

# Kept as the final expression so the notebook still displays the seed value.
L.seed_everything(123456)  # reproducibility
Seed set to 123456
Lightning version: 2.5.1
GPU name: NVIDIA RTX A5000
Torch version: 2.6.0+cu124
CUDA is available: True
123456
class LitModel(L.LightningModule):  # a replacement for nn.Module
    """Small fully connected classifier trained with cross-entropy.

    The network flattens its input, maps 28*28 features to 128 hidden units
    through a ReLU, and emits 10 logits. The training and validation steps
    log their losses as "train_loss" and "val_loss" respectively.
    """

    def __init__(self):
        # Initialise the LightningModule machinery before registering layers.
        super().__init__()
        layers = [
            nn.Flatten(),
            nn.Linear(28 * 28, 128),
            nn.ReLU(),
            nn.Linear(128, 10),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits the network produces for ``x``."""
        return self.model(x)

    def _batch_loss(self, batch):
        """Unpack an ``(inputs, targets)`` batch and return its cross-entropy loss."""
        inputs, targets = batch
        predictions = self(inputs)
        return F.cross_entropy(predictions, targets)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch.

        Returns:
            A dict holding the loss under the key "loss".
        """
        loss = self._batch_loss(batch)
        self.log("train_loss", loss)
        # Returning the bare ``loss`` tensor would be equivalent.
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss = self._batch_loss(batch)
        # prog_bar=True shows val_loss in the training progress bar;
        # with prog_bar=False it would not be displayed there.
        self.log("val_loss", loss, prog_bar=True)

    def configure_optimizers(self):
        """Return an Adam optimizer over this module's parameters (lr=1e-3)."""
        # self.parameters() already covers the layers registered under
        # self.model, so self.model.parameters() is not needed.
        return torch.optim.Adam(params=self.parameters(), lr=1e-3)
# Data: the MNIST training set as tensors, split into 55000 train / 5000 val samples.
transform = transforms.ToTensor()
dataset = MNIST(
    root="./MNIST",
    train=True,
    download=True,
    transform=transform,
)
train_ds, val_ds = random_split(dataset, lengths=[55000, 5000])

# NOTE(review): the train loader does not pass shuffle=True, so the sample
# order produced by random_split is reused every epoch - confirm this is intended.
train_loader = DataLoader(dataset=train_ds, batch_size=64)
val_loader = DataLoader(dataset=val_ds, batch_size=64)

# Checkpointing: monitor "val_loss" and treat lower values as better.
checkpoint_cb = ModelCheckpoint(mode="min", monitor="val_loss")

# Trainer: build the model first, then fit for three epochs with validation.
model = LitModel()
trainer = L.Trainer(
    accelerator="auto",  # auto will select gpu if available
    callbacks=[checkpoint_cb],
    max_epochs=3,
)
trainer.fit(
    model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
| Name | Type | Params | Mode
---------------------------------------------
0 | model | Sequential | 101 K | train
---------------------------------------------
101 K Trainable params
0 Non-trainable params
101 K Total params
0.407 Total estimated model params size (MB)
5 Modules in train mode
0 Modules in eval mode
Epoch 2: 100%|██████████| 860/860 [00:14<00:00, 59.91it/s, v_num=2, val_loss=0.120]
`Trainer.fit` stopped: `max_epochs=3` reached.
Epoch 2: 100%|██████████| 860/860 [00:14<00:00, 59.87it/s, v_num=2, val_loss=0.120]