Object detection is the second type of problem we tackle on this blog, this time with Lightning and Pascal VOC. The task comes up everywhere because it lets us locate a variable number of objects in an image.

Since it is used in real-time settings such as autonomous driving and live object counting, object detection architectures are usually grouped into two categories. On the real-time side, the reference is [YOLO](https://docs.ultralytics.com/fr/models/yolo11/), at version 11 as I write this article. Among the more recent networks that can compete with YOLO on real-time object detection (RTOD) is the [D-FINE architecture](https://arxiv.org/abs/2410.13842), an improved DETR.

We implement it in this article and train it with Lightning, since the training loop can then be reused with other models and datasets. I don't know what the best training paradigm is, but I want one that stays as framework-agnostic as possible; for me, Lightning allows strong customization and stays close to plain PyTorch, while making multi-device training, metric logging and the rest easy.
Data
We will work with the Pascal VOC images in this project because the dataset is easily accessible through torchvision.
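As a minimal sketch of that first download (the `./data` root matches what is used later in this post, the year is just the torchvision default), `VOCDetection` fetches and extracts the archive on the first run and returns PIL images with nested XML-style annotation dicts:

```python
from torchvision.datasets import VOCDetection

# First run: download and extract the VOC 2012 trainval archive under ./data
# (a couple of GB); later runs can set download=False and reuse the files.
voc_train = VOCDetection(root="./data", year="2012", image_set="train", download=True)

image, annotation = voc_train[0]
# `image` is a PIL image, `annotation` a nested dict mirroring the VOC XML file
print(annotation["annotation"]["object"][0]["name"])  # name of the first object
```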
```python
from torchvision.datasets import VOCDetection
from torchvision.transforms import v2
import matplotlib.pyplot as plt
import numpy as np
from torch.utils.data import DataLoader
import torch
from omegaconf import OmegaConf
from torchvision.ops import box_convert
from torchvision.utils import draw_bounding_boxes
import albumentations as A
from transformers import DFineForObjectDetection, AutoImageProcessor
from albumentations.pytorch import ToTensorV2
from torchvision.ops import batched_nms
import torchmetrics
import lightning as L
from torchvision.models.detection import fasterrcnn_mobilenet_v3_large_fpn
from typing import Dict, List, Optional
```
D-FINE expects bounding boxes in COCO format (x, y, width, height), so we convert them inside `__getitem__`.
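As a small illustration (the coordinates below are made up), torchvision's `box_convert` performs exactly that change of format, from `(x_min, y_min, x_max, y_max)` to `(x_min, y_min, width, height)`:

```python
import torch
from torchvision.ops import box_convert

# One Pascal-VOC-style box: (x_min, y_min, x_max, y_max)
voc_box = torch.tensor([[48., 240., 195., 371.]])

# COCO-style box: (x_min, y_min, width, height)
coco_box = box_convert(voc_box, in_fmt="xyxy", out_fmt="xywh")
print(coco_box)  # tensor([[ 48., 240., 147., 131.]])
```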
```python
categories = ['pottedplant', 'bottle', 'chair', 'diningtable', 'person', 'car',
              'train', 'bus', 'bicycle', 'cat', 'aeroplane', 'tvmonitor', 'sofa',
              'sheep', 'dog', 'bird', 'motorbike', 'horse', 'boat', 'cow', "bg"]


def unpack_box(box_dict: Dict):
    """Unpack the box dictionary into a list of coordinates"""
    return torch.tensor(np.array([
        box_dict["xmin"],
        box_dict["ymin"],
        box_dict["xmax"],
        box_dict["ymax"]
    ], dtype=float))


def annotation_to_torch(target: Dict):
    rep = {}
    detections = target["annotation"]["object"]
    rep["labels"] = np.array([categories.index(i["name"]) for i in detections])
    # xmin ymin xmax ymax
    rep["boxes"] = torch.stack([unpack_box(i["bndbox"]) for i in detections])
    return rep


image_processor = AutoImageProcessor.from_pretrained("ustc-community/dfine_x_coco")


class MyVoc(VOCDetection):
    def __getitem__(self, index):
        image, target = super().__getitem__(index)
        # Apply your transformations here
        target = annotation_to_torch(target)
        transform = A.Compose([
            A.PadIfNeeded(500, 500),
            A.HorizontalFlip(),
            A.RandomCrop(400, 400),
            A.Resize(224, 224),
            A.Normalize(normalization="min_max"),
            ToTensorV2()
        ], bbox_params=A.BboxParams(format='pascal_voc',  # Specify input format
                                    label_fields=['class_labels'],
                                    filter_invalid_bboxes=True))
        transformed = transform(
            image=np.array(image),
            bboxes=target["boxes"],
            class_labels=target["labels"]
        )
        transformed["labels"] = transformed["class_labels"]
        transformed["boxes"] = transformed["bboxes"]
        transformed.pop("bboxes")
        image = transformed.pop("image")
        transformed["boxes"] = box_convert(
            torch.from_numpy(transformed["boxes"]),
            "xyxy",
            "xywh",
        ).float()
        transformed["labels"] = torch.from_numpy(transformed["labels"])
        transformed["class_labels"] = torch.from_numpy(transformed["class_labels"])
        return image.float(), transformed

    def draw_item(self, index: Optional[int] = None, n=5):
        if index is None:
            index = np.random.randint(0, len(self))
        image, labels = self[index]
        with_boxes = draw_bounding_boxes(
            image=image,
            boxes=box_convert(labels["boxes"], "xywh", "xyxy"),
            labels=[categories[i] for i in labels["labels"]],
            colors="red"
        )
        plt.figure(figsize=(10, 10))
        plt.imshow(with_boxes.permute(1, 2, 0).numpy())
        plt.axis('off')
        plt.show()
        return
```
Nothing special here: we instantiate the train and validation splits and draw a random sample to see what the data looks like.
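```python
ds = MyVoc(root="./data", download=False, image_set="train")
val_ds = MyVoc(root="./data", download=False, image_set="val")

ds.draw_item()
```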
Modeling

The modeling part is shown here, with the Lightning module.
```python
config = OmegaConf.create({
    "lr": 1e-4,
    "batch_size": 2,
    "epochs": 3,
    "world_size": 1
})


def apply_nms(preds: Dict, iou_thr: float = .5):
    nms_indices = batched_nms(
        preds["boxes"],
        scores=preds["scores"],
        idxs=preds["labels"],
        iou_threshold=iou_thr
    )
    preds_nms = {}
    preds_nms["boxes"] = preds["boxes"][nms_indices, :]
    preds_nms["scores"] = preds["scores"][nms_indices]
    preds_nms["labels"] = preds["labels"][nms_indices]
    # high_scores_indices = preds_nms["scores"] > .3
    # preds_nms["boxes"] = preds_nms["boxes"][high_scores_indices]
    # preds_nms["scores"] = preds_nms["scores"][high_scores_indices]
    # preds_nms["labels"] = preds_nms["labels"][high_scores_indices]
    return preds_nms


class odModule(L.LightningModule):
    def __init__(self, config, categories: List[str], nms_thr: float = .5):
        super().__init__()
        # if config.checkpoint is not None:
        #     print(f"checkpoint from {config.checkpoint}")
        num_classes = len(categories)
        self.categories = categories
        self.nms_thr = nms_thr
        self.config = config
        # self.model = fasterrcnn_mobilenet_v3_large_fpn(pretrained=False, num_classes=num_classes)
        self.model = DFineForObjectDetection.from_pretrained(
            "ustc-community/dfine_x_coco",
            id2label={i: cat for i, cat in enumerate(categories)},
            label2id={cat: i for i, cat in enumerate(categories)},
            ignore_mismatched_sizes=True,
        )
        metrics = torchmetrics.MetricCollection(
            [
                torchmetrics.detection.mean_ap.MeanAveragePrecision(
                    # extended_summary=True,
                    iou_thresholds=np.linspace(0, 1, 20).tolist(),
                    class_metrics=True,
                    iou_type="bbox",
                ),
            ]
        )
        self.val_metrics = metrics.clone(prefix="Validation/")
        self.test_metrics = metrics.clone(prefix="Test/")
        self.save_hyperparameters(ignore=["train_ds"])

    @staticmethod
    def prepare_batch(batch):
        images, targets = batch
        return images, targets

    def forward(self, x, y=None):
        if y is not None:
            return self.model(pixel_values=x, labels=y)
        else:
            preds = self.model(x)
            n, c, h, w = x.shape
            preds = image_processor.post_process_object_detection(
                preds, target_sizes=[(h, w) for _ in range(n)], threshold=0.5
            )
            return preds

    def predict(self, x):
        """Forward the model then run NMS (for evaluation)."""
        preds: List = self(x)
        # preds_nms = [apply_nms(i, self.nms_thr) for i in preds]
        return preds

    def training_step(self, batch, batch_idx):
        img_b, target_b = self.prepare_batch(batch)
        bs = len(img_b)
        dfine_output = self(img_b, target_b)
        self.log_dict(
            dfine_output.loss_dict,
            on_step=False,
            on_epoch=True,
            sync_dist=True,
            batch_size=bs,
            prog_bar=True,
        )
        return {"loss": dfine_output.loss}

    def validation_step(self, batch, batch_idx):
        img_b, target_b = self.prepare_batch(batch)
        output_nms = self.predict(img_b)
        self.val_metrics(output_nms, target_b)
        return

    def on_validation_epoch_end(self):
        m = self.val_metrics.compute()
        m_single = {i: j for i, j in m.items() if j.nelement() == 1}
        self.log_dict(m_single, on_epoch=True, sync_dist=False)
        for i, class_id in enumerate(m["Validation/classes"]):
            self.log(f"Validation/MAP {self.categories[class_id]}",
                     m["Validation/map_per_class"][i])
        self.val_metrics.reset()
        return

    def test_step(self, batch, batch_idx):
        img_b, target_b = self.prepare_batch(batch)
        output_nms = self.predict(img_b)
        self.test_metrics(output_nms, target_b)
        return

    def on_test_epoch_end(self):
        m = self.test_metrics.compute()
        m_single = {i: j for i, j in m.items() if j.nelement() == 1}
        self.log_dict(m_single, on_epoch=True, sync_dist=False)
        for i, class_id in enumerate(m["Test/classes"]):
            self.log(f"Test/MAP {self.categories[class_id]}",
                     m["Test/map_per_class"][i])
        self.test_metrics.reset()
        return

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(
            self.parameters(),
            lr=self.config.lr,
            weight_decay=1e-4 * self.config.batch_size / 16,
        )
        # scheduler1 = torch.optim.lr_scheduler.CosineAnnealingLR(
        #     optimizer, T_max=scheduler_nsteps, eta_min=self.config.lr / 10
        # )
        # sched_config1 = {"scheduler": scheduler1, "interval": "epoch"}
        return [optimizer]  # , [sched_config1]


model = odModule(config, categories)
```
Loading the COCO checkpoint with `ignore_mismatched_sizes=True` makes `transformers` warn that the classification heads (`model.decoder.class_embed.*`, `model.denoising_class_embed` and `model.enc_score_head`) are newly initialized: their shapes change from the 80 COCO classes to our 21 Pascal VOC classes. As the warning says, the model should be fine-tuned on the downstream task before being used for predictions.
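As a quick smoke test (this is not part of the training script, and the detections are meaningless since the heads are freshly initialized), we can push a random batch through `predict` to check the shape of its output: a list with one dictionary of scores, labels and boxes per image.

```python
model.eval()
with torch.no_grad():
    dummy = torch.rand(2, 3, 224, 224)   # a fake batch of two 224x224 images
    preds = model.predict(dummy)

print(len(preds))        # 2 entries, one per image
print(preds[0].keys())   # scores, labels and boxes of the kept detections
```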
```python
def collate_fn(batch):
    # Separate the images and targets
    images = []
    targets = []
    for image, target in batch:
        images.append(image)
        targets.append(target)
    # Stack images into a single tensor; targets stay a list of dicts
    # because each image has a variable number of boxes and labels
    images = torch.stack(images, dim=0)
    return images, targets


train_loader = DataLoader(ds, batch_size=2, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=2, shuffle=False, collate_fn=collate_fn)

im, tar = next(iter(train_loader))
tar
```
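Training

The training setup is shown below. The full run has not been launched yet, but no bug showed up in the first batches.

```python
model.train()

trainer = L.Trainer(
    max_epochs=3,
    precision="16-mixed",
    enable_checkpointing=True,
    num_sanity_val_steps=2,
    log_every_n_steps=50,
    check_val_every_n_epoch=1,
)
# trainer.fit(model, train_loader, val_loader)
```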
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs