Object Detection Tutorial#

In this tutorial, you will learn how to validate your object detection model using deepchecks test suites. You can read more about the different checks and suites for computer vision use cases at the examples section

If you just want to see the output of this tutorial, jump to the observing the results section.

An object detection tasks usually consist of two parts:

  • Object Localization, where the model predicts the location of an object in the image,

  • Object Classification, where the model predicts the class of the detected object.

The common output of an object detection model is a list of bounding boxes around the objects, and their classes.

# Before we start, if you don't have deepchecks vision package installed yet, run:
import sys
!{sys.executable} -m pip install "deepchecks[vision]" --quiet --upgrade # --user

# or install using pip from your python environment

Defining the data and model#

# Importing the required packages
import os
import urllib.request
import xml.etree.ElementTree as ET
import zipfile
from functools import partial

import albumentations as A
import matplotlib.pyplot as plt
import numpy as np
import torch
import torchvision
from albumentations.pytorch import ToTensorV2
from PIL import Image
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision.models.detection import _utils as det_utils
from torchvision.models.detection.ssdlite import SSDLiteClassificationHead

from deepchecks.vision.detection_data import DetectionData

Load Data#

The model in this tutorial is used to detect tomatoes in images. The model is trained on a dataset consisted of 895 images of tomatoes, with bounding box annotations provided in PASCAL VOC format. All annotations belong to a single class: tomato.

Note

The dataset is available at the following link: https://www.kaggle.com/andrewmvd/tomato-detection

We thank the authors of the dataset for providing the dataset.

url = 'https://figshare.com/ndownloader/files/34488599'
urllib.request.urlretrieve(url, 'tomato-detection.zip')

with zipfile.ZipFile('tomato-detection.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

class TomatoDataset(Dataset):
    def __init__(self, root, transforms):
        self.root = root
        self.transforms = transforms

        self.images = list(sorted(os.listdir(os.path.join(root, 'images'))))
        self.annotations = list(sorted(os.listdir(os.path.join(root, 'annotations'))))

    def __getitem__(self, idx):
        img_path = os.path.join(self.root, "images", self.images[idx])
        ann_path = os.path.join(self.root, "annotations", self.annotations[idx])
        img = Image.open(img_path).convert("RGB")
        bboxes = []
        labels = []
        with open(ann_path, 'r') as f:
            tree = ET.parse(f)
            root = tree.getroot()
            size = root.find('size')
            w = int(size.find('width').text)
            h = int(size.find('height').text)

            for obj in root.iter('object'):
                difficult = obj.find('difficult').text
                if int(difficult) == 1:
                    continue
                cls_id = 1
                xmlbox = obj.find('bndbox')
                b = [float(xmlbox.find('xmin').text), float(xmlbox.find('ymin').text), float(xmlbox.find('xmax').text),
                        float(xmlbox.find('ymax').text)]
                bboxes.append(b)
                labels.append(cls_id)

        bboxes = torch.as_tensor(np.array(bboxes), dtype=torch.float32)
        labels = torch.as_tensor(np.array(labels), dtype=torch.int64)

        if self.transforms is not None:
            res = self.transforms(image=np.array(img), bboxes=bboxes, class_labels=labels)

        target = {
            'boxes': [torch.Tensor(x) for x in res['bboxes']],
            'labels': res['class_labels']
        }

        img = res['image']

        return img, target

    def __len__(self):
        return len(self.images)

data_transforms = A.Compose([
    A.Resize(height=256, width=256),
    A.CenterCrop(height=224, width=224),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2(),
], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['class_labels']))

dataset = TomatoDataset(root=os.path.join(os.path.curdir, 'tomato-detection/data'),
                        transforms=data_transforms)
train_set, test_set = torch.utils.data.random_split(dataset,
                                                    [int(len(dataset)*0.9), len(dataset)-int(len(dataset)*0.9)],
                                                    generator=torch.Generator().manual_seed(42))
test_set.transforms = A.Compose([ToTensorV2()])
train_loader = DataLoader(train_set, batch_size=64, collate_fn=(lambda batch: tuple(zip(*batch))))
test_loader = DataLoader(test_set, batch_size=64, collate_fn=(lambda batch: tuple(zip(*batch))))

Visualize a Few Images#

Let’s visualize a few training images so as to understand the data augmentation.

def prepare(inp):
    """Imshow for Tensor."""
    inp = inp.numpy().transpose((1, 2, 0))
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    inp = std * inp + mean
    inp = np.clip(inp, 0, 1) * 255
    inp = inp.transpose((2,0,1))
    return torch.tensor(inp, dtype=torch.uint8)

import torchvision.transforms.functional as F


def show(imgs):
    if not isinstance(imgs, list):
        imgs = [imgs]
    fig, axs = plt.subplots(ncols=len(imgs), squeeze=False, figsize=(20,20))
    for i, img in enumerate(imgs):
        img = img.detach()
        img = F.to_pil_image(img)
        axs[0, i].imshow(np.asarray(img))
        axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])

from torchvision.utils import draw_bounding_boxes

data = next(iter(train_loader))
inp, targets = data[0][:4], data[1][:4]


result = [draw_bounding_boxes(prepare(inp[i]), torch.stack(targets[i]['boxes']),
                              colors=['yellow'] * torch.stack(targets[i]['boxes']).shape[0], width=5)
          for i in range(len(targets))]
show(result)
plot detection tutorialTomatoes with bbox

Downloading a Pre-trained Model#

In this tutorial, we will download a pre-trained SSDlite model and a MobileNetV3 Large backbone from the official PyTorch repository. For more details, please refer to the official documentation.

After downloading the model, we will fine-tune it for our particular classes. We will do it by replacing the pre-trained head with a new one that matches our needs.

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = torchvision.models.detection.ssdlite320_mobilenet_v3_large(pretrained=True)

in_channels = det_utils.retrieve_out_channels(model.backbone, (320, 320))
num_anchors = model.anchor_generator.num_anchors_per_location()
norm_layer = partial(nn.BatchNorm2d, eps=0.001, momentum=0.03)

model.head.classification_head = SSDLiteClassificationHead(in_channels, num_anchors, 2, norm_layer)
model.to(device)
Downloading: "https://download.pytorch.org/models/ssdlite320_mobilenet_v3_large_coco-a79551df.pth" to /home/runner/.cache/torch/hub/checkpoints/ssdlite320_mobilenet_v3_large_coco-a79551df.pth

  0%|          | 0.00/13.4M [00:00<?, ?B/s]
 10%|#         | 1.41M/13.4M [00:00<00:00, 13.2MB/s]
 24%|##3       | 3.17M/13.4M [00:00<00:00, 15.3MB/s]
 41%|####      | 5.44M/13.4M [00:00<00:00, 17.9MB/s]
 62%|######1   | 8.31M/13.4M [00:00<00:00, 21.8MB/s]
 87%|########6 | 11.6M/13.4M [00:00<00:00, 26.2MB/s]
100%|##########| 13.4M/13.4M [00:00<00:00, 23.8MB/s]

SSD(
  (backbone): SSDLiteFeatureExtractorMobileNet(
    (features): Sequential(
      (0): Sequential(
        (0): ConvNormActivation(
          (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
          (1): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (2): Hardswish()
        )
        (1): InvertedResidual(
          (block): Sequential(
            (0): ConvNormActivation(
              (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=16, bias=False)
              (1): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): ReLU(inplace=True)
            )
            (1): ConvNormActivation(
              (0): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            )
          )
        )
        (2): InvertedResidual(
          (block): Sequential(
            (0): ConvNormActivation(
              (0): Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): ReLU(inplace=True)
            )
            (1): ConvNormActivation(
              (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=64, bias=False)
              (1): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): ReLU(inplace=True)
            )
            (2): ConvNormActivation(
              (0): Conv2d(64, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(24, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            )
          )
        )
        (3): InvertedResidual(
          (block): Sequential(
            (0): ConvNormActivation(
              (0): Conv2d(24, 72, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(72, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): ReLU(inplace=True)
            )
            (1): ConvNormActivation(
              (0): Conv2d(72, 72, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=72, bias=False)
              (1): BatchNorm2d(72, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): ReLU(inplace=True)
            )
            (2): ConvNormActivation(
              (0): Conv2d(72, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(24, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            )
          )
        )
        (4): InvertedResidual(
          (block): Sequential(
            (0): ConvNormActivation(
              (0): Conv2d(24, 72, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(72, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): ReLU(inplace=True)
            )
            (1): ConvNormActivation(
              (0): Conv2d(72, 72, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2), groups=72, bias=False)
              (1): BatchNorm2d(72, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): ReLU(inplace=True)
            )
            (2): SqueezeExcitation(
              (avgpool): AdaptiveAvgPool2d(output_size=1)
              (fc1): Conv2d(72, 24, kernel_size=(1, 1), stride=(1, 1))
              (fc2): Conv2d(24, 72, kernel_size=(1, 1), stride=(1, 1))
              (activation): ReLU()
              (scale_activation): Hardsigmoid()
            )
            (3): ConvNormActivation(
              (0): Conv2d(72, 40, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(40, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            )
          )
        )
        (5): InvertedResidual(
          (block): Sequential(
            (0): ConvNormActivation(
              (0): Conv2d(40, 120, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(120, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): ReLU(inplace=True)
            )
            (1): ConvNormActivation(
              (0): Conv2d(120, 120, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), groups=120, bias=False)
              (1): BatchNorm2d(120, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): ReLU(inplace=True)
            )
            (2): SqueezeExcitation(
              (avgpool): AdaptiveAvgPool2d(output_size=1)
              (fc1): Conv2d(120, 32, kernel_size=(1, 1), stride=(1, 1))
              (fc2): Conv2d(32, 120, kernel_size=(1, 1), stride=(1, 1))
              (activation): ReLU()
              (scale_activation): Hardsigmoid()
            )
            (3): ConvNormActivation(
              (0): Conv2d(120, 40, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(40, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            )
          )
        )
        (6): InvertedResidual(
          (block): Sequential(
            (0): ConvNormActivation(
              (0): Conv2d(40, 120, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(120, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): ReLU(inplace=True)
            )
            (1): ConvNormActivation(
              (0): Conv2d(120, 120, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), groups=120, bias=False)
              (1): BatchNorm2d(120, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): ReLU(inplace=True)
            )
            (2): SqueezeExcitation(
              (avgpool): AdaptiveAvgPool2d(output_size=1)
              (fc1): Conv2d(120, 32, kernel_size=(1, 1), stride=(1, 1))
              (fc2): Conv2d(32, 120, kernel_size=(1, 1), stride=(1, 1))
              (activation): ReLU()
              (scale_activation): Hardsigmoid()
            )
            (3): ConvNormActivation(
              (0): Conv2d(120, 40, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(40, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            )
          )
        )
        (7): InvertedResidual(
          (block): Sequential(
            (0): ConvNormActivation(
              (0): Conv2d(40, 240, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(240, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): Hardswish()
            )
            (1): ConvNormActivation(
              (0): Conv2d(240, 240, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=240, bias=False)
              (1): BatchNorm2d(240, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): Hardswish()
            )
            (2): ConvNormActivation(
              (0): Conv2d(240, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(80, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            )
          )
        )
        (8): InvertedResidual(
          (block): Sequential(
            (0): ConvNormActivation(
              (0): Conv2d(80, 200, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(200, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): Hardswish()
            )
            (1): ConvNormActivation(
              (0): Conv2d(200, 200, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=200, bias=False)
              (1): BatchNorm2d(200, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): Hardswish()
            )
            (2): ConvNormActivation(
              (0): Conv2d(200, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(80, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            )
          )
        )
        (9): InvertedResidual(
          (block): Sequential(
            (0): ConvNormActivation(
              (0): Conv2d(80, 184, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(184, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): Hardswish()
            )
            (1): ConvNormActivation(
              (0): Conv2d(184, 184, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=184, bias=False)
              (1): BatchNorm2d(184, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): Hardswish()
            )
            (2): ConvNormActivation(
              (0): Conv2d(184, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(80, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            )
          )
        )
        (10): InvertedResidual(
          (block): Sequential(
            (0): ConvNormActivation(
              (0): Conv2d(80, 184, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(184, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): Hardswish()
            )
            (1): ConvNormActivation(
              (0): Conv2d(184, 184, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=184, bias=False)
              (1): BatchNorm2d(184, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): Hardswish()
            )
            (2): ConvNormActivation(
              (0): Conv2d(184, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(80, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            )
          )
        )
        (11): InvertedResidual(
          (block): Sequential(
            (0): ConvNormActivation(
              (0): Conv2d(80, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(480, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): Hardswish()
            )
            (1): ConvNormActivation(
              (0): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
              (1): BatchNorm2d(480, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): Hardswish()
            )
            (2): SqueezeExcitation(
              (avgpool): AdaptiveAvgPool2d(output_size=1)
              (fc1): Conv2d(480, 120, kernel_size=(1, 1), stride=(1, 1))
              (fc2): Conv2d(120, 480, kernel_size=(1, 1), stride=(1, 1))
              (activation): ReLU()
              (scale_activation): Hardsigmoid()
            )
            (3): ConvNormActivation(
              (0): Conv2d(480, 112, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(112, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            )
          )
        )
        (12): InvertedResidual(
          (block): Sequential(
            (0): ConvNormActivation(
              (0): Conv2d(112, 672, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(672, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): Hardswish()
            )
            (1): ConvNormActivation(
              (0): Conv2d(672, 672, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=672, bias=False)
              (1): BatchNorm2d(672, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): Hardswish()
            )
            (2): SqueezeExcitation(
              (avgpool): AdaptiveAvgPool2d(output_size=1)
              (fc1): Conv2d(672, 168, kernel_size=(1, 1), stride=(1, 1))
              (fc2): Conv2d(168, 672, kernel_size=(1, 1), stride=(1, 1))
              (activation): ReLU()
              (scale_activation): Hardsigmoid()
            )
            (3): ConvNormActivation(
              (0): Conv2d(672, 112, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(112, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            )
          )
        )
        (13): ConvNormActivation(
          (0): Conv2d(112, 672, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(672, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (2): Hardswish()
        )
      )
      (1): Sequential(
        (0): Sequential(
          (1): ConvNormActivation(
            (0): Conv2d(672, 672, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2), groups=672, bias=False)
            (1): BatchNorm2d(672, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            (2): Hardswish()
          )
          (2): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(672, 168, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(168, 672, kernel_size=(1, 1), stride=(1, 1))
            (activation): ReLU()
            (scale_activation): Hardsigmoid()
          )
          (3): ConvNormActivation(
            (0): Conv2d(672, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(80, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          )
        )
        (1): InvertedResidual(
          (block): Sequential(
            (0): ConvNormActivation(
              (0): Conv2d(80, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(480, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): Hardswish()
            )
            (1): ConvNormActivation(
              (0): Conv2d(480, 480, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), groups=480, bias=False)
              (1): BatchNorm2d(480, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): Hardswish()
            )
            (2): SqueezeExcitation(
              (avgpool): AdaptiveAvgPool2d(output_size=1)
              (fc1): Conv2d(480, 120, kernel_size=(1, 1), stride=(1, 1))
              (fc2): Conv2d(120, 480, kernel_size=(1, 1), stride=(1, 1))
              (activation): ReLU()
              (scale_activation): Hardsigmoid()
            )
            (3): ConvNormActivation(
              (0): Conv2d(480, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(80, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            )
          )
        )
        (2): InvertedResidual(
          (block): Sequential(
            (0): ConvNormActivation(
              (0): Conv2d(80, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(480, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): Hardswish()
            )
            (1): ConvNormActivation(
              (0): Conv2d(480, 480, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), groups=480, bias=False)
              (1): BatchNorm2d(480, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): Hardswish()
            )
            (2): SqueezeExcitation(
              (avgpool): AdaptiveAvgPool2d(output_size=1)
              (fc1): Conv2d(480, 120, kernel_size=(1, 1), stride=(1, 1))
              (fc2): Conv2d(120, 480, kernel_size=(1, 1), stride=(1, 1))
              (activation): ReLU()
              (scale_activation): Hardsigmoid()
            )
            (3): ConvNormActivation(
              (0): Conv2d(480, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(80, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            )
          )
        )
        (3): ConvNormActivation(
          (0): Conv2d(80, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(480, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (2): Hardswish()
        )
      )
    )
    (extra): ModuleList(
      (0): Sequential(
        (0): ConvNormActivation(
          (0): Conv2d(480, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(256, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): ConvNormActivation(
          (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=256, bias=False)
          (1): BatchNorm2d(256, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (2): ConvNormActivation(
          (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(512, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
      )
      (1): Sequential(
        (0): ConvNormActivation(
          (0): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): ConvNormActivation(
          (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=128, bias=False)
          (1): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (2): ConvNormActivation(
          (0): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(256, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
      )
      (2): Sequential(
        (0): ConvNormActivation(
          (0): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): ConvNormActivation(
          (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=128, bias=False)
          (1): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (2): ConvNormActivation(
          (0): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(256, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
      )
      (3): Sequential(
        (0): ConvNormActivation(
          (0): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): ConvNormActivation(
          (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=64, bias=False)
          (1): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (2): ConvNormActivation(
          (0): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
      )
    )
  )
  (anchor_generator): DefaultBoxGenerator(aspect_ratios=[[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]], clip=True, scales=[0.2, 0.35, 0.5, 0.65, 0.8, 0.95, 1.0], steps=None)
  (head): SSDLiteHead(
    (classification_head): SSDLiteClassificationHead(
      (module_list): ModuleList(
        (0): Sequential(
          (0): ConvNormActivation(
            (0): Conv2d(672, 672, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=672, bias=False)
            (1): BatchNorm2d(672, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            (2): ReLU6(inplace=True)
          )
          (1): Conv2d(672, 12, kernel_size=(1, 1), stride=(1, 1))
        )
        (1): Sequential(
          (0): ConvNormActivation(
            (0): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
            (1): BatchNorm2d(480, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            (2): ReLU6(inplace=True)
          )
          (1): Conv2d(480, 12, kernel_size=(1, 1), stride=(1, 1))
        )
        (2): Sequential(
          (0): ConvNormActivation(
            (0): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=512, bias=False)
            (1): BatchNorm2d(512, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            (2): ReLU6(inplace=True)
          )
          (1): Conv2d(512, 12, kernel_size=(1, 1), stride=(1, 1))
        )
        (3): Sequential(
          (0): ConvNormActivation(
            (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
            (1): BatchNorm2d(256, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            (2): ReLU6(inplace=True)
          )
          (1): Conv2d(256, 12, kernel_size=(1, 1), stride=(1, 1))
        )
        (4): Sequential(
          (0): ConvNormActivation(
            (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
            (1): BatchNorm2d(256, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            (2): ReLU6(inplace=True)
          )
          (1): Conv2d(256, 12, kernel_size=(1, 1), stride=(1, 1))
        )
        (5): Sequential(
          (0): ConvNormActivation(
            (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=128, bias=False)
            (1): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            (2): ReLU6(inplace=True)
          )
          (1): Conv2d(128, 12, kernel_size=(1, 1), stride=(1, 1))
        )
      )
    )
    (regression_head): SSDLiteRegressionHead(
      (module_list): ModuleList(
        (0): Sequential(
          (0): ConvNormActivation(
            (0): Conv2d(672, 672, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=672, bias=False)
            (1): BatchNorm2d(672, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            (2): ReLU6(inplace=True)
          )
          (1): Conv2d(672, 24, kernel_size=(1, 1), stride=(1, 1))
        )
        (1): Sequential(
          (0): ConvNormActivation(
            (0): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
            (1): BatchNorm2d(480, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            (2): ReLU6(inplace=True)
          )
          (1): Conv2d(480, 24, kernel_size=(1, 1), stride=(1, 1))
        )
        (2): Sequential(
          (0): ConvNormActivation(
            (0): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=512, bias=False)
            (1): BatchNorm2d(512, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            (2): ReLU6(inplace=True)
          )
          (1): Conv2d(512, 24, kernel_size=(1, 1), stride=(1, 1))
        )
        (3): Sequential(
          (0): ConvNormActivation(
            (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
            (1): BatchNorm2d(256, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            (2): ReLU6(inplace=True)
          )
          (1): Conv2d(256, 24, kernel_size=(1, 1), stride=(1, 1))
        )
        (4): Sequential(
          (0): ConvNormActivation(
            (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
            (1): BatchNorm2d(256, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            (2): ReLU6(inplace=True)
          )
          (1): Conv2d(256, 24, kernel_size=(1, 1), stride=(1, 1))
        )
        (5): Sequential(
          (0): ConvNormActivation(
            (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=128, bias=False)
            (1): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            (2): ReLU6(inplace=True)
          )
          (1): Conv2d(128, 24, kernel_size=(1, 1), stride=(1, 1))
        )
      )
    )
  )
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
      Resize(min_size=(320,), max_size=320, mode='bilinear')
  )
)

Loading Pre-trained Weights#

For this tutorial we will not include the training code itself, but will download and load pre-trained weights.

model.load_state_dict(torch.load('tomato-detection/ssd_model.pth'))
_ = model.eval()

Validating the Model With Deepchecks#

Now, after we have the training data, test data and the model, we can validate the model with deepchecks test suites.

Visualize the Data Loader and the Model Outputs#

First we’ll make sure we are familiar with the data loader and the model outputs.

batch = next(iter(train_loader))

print("Batch type is: ", type(batch))
print("First element is: ", type(batch[0]), "with len of ", len(batch[0]))
print("Example output of an image shape from the dataloader ", batch[0][0].shape)
print("Image values", batch[0][0])
print("-"*80)

print("Second element is: ", type(batch[1]), "with len of ", len(batch[1]))
print("Example output of a label from the dataloader ", batch[1][0])
Batch type is:  <class 'tuple'>
First element is:  <class 'tuple'> with len of  64
Example output of an image shape from the dataloader  torch.Size([3, 224, 224])
Image values tensor([[[-1.79253, -1.82678, -1.82678,  ...,  1.39267,  1.34130,  1.32417],
         [-1.72403, -1.79253, -1.80966,  ...,  1.35842,  1.32417,  1.34130],
         [-1.75828, -1.74116, -1.70691,  ...,  1.32417,  1.34130,  1.35842],
         ...,
         [-1.84391, -1.82678, -1.75828,  ...,  0.62206,  0.19394, -0.35405],
         [-1.80966, -1.79253, -1.72403,  ...,  0.81043,  0.72481,  0.34806],
         [-1.79253, -1.84391, -1.75828,  ...,  0.81043,  0.82755,  0.69056]],

        [[-1.38796, -1.45798, -1.45798,  ...,  1.51821,  1.46569,  1.46569],
         [-1.38796, -1.47549, -1.52801,  ...,  1.50070,  1.46569,  1.48319],
         [-1.42297, -1.47549, -1.49300,  ...,  1.46569,  1.50070,  1.50070],
         ...,
         [-1.70308, -1.68557, -1.61555,  ...,  0.67787,  0.22269, -0.33753],
         [-1.68557, -1.66807, -1.58053,  ...,  0.87045,  0.74790,  0.38025],
         [-1.68557, -1.70308, -1.61555,  ...,  0.87045,  0.85294,  0.71289]],

        [[-1.57786, -1.61272, -1.61272,  ...,  1.66397,  1.61168,  1.59425],
         [-1.54301, -1.59529, -1.64758,  ...,  1.62911,  1.59425,  1.59425],
         [-1.59529, -1.59529, -1.61272,  ...,  1.59425,  1.61168,  1.62911],
         ...,
         [-1.63015, -1.59529, -1.52558,  ...,  0.46135,  0.02562, -0.53211],
         [-1.59529, -1.54301, -1.47329,  ...,  0.72279,  0.61821,  0.21734],
         [-1.59529, -1.59529, -1.50815,  ...,  0.72279,  0.72279,  0.56593]]])
--------------------------------------------------------------------------------
Second element is:  <class 'tuple'> with len of  64
Example output of a label from the dataloader  {'boxes': [tensor([  0.00000,  75.13600,  39.68000, 165.75999]), tensor([ 0.00000,  0.00000, 94.08000, 93.56800])], 'labels': [tensor(1), tensor(1)]}

Implementing the DetectionData class#

The checks in the package validate the model & data by calculating various quantities over the data, labels and predictions. In order to do that, those must be in a pre-defined format, according to the task type. The first step is to implement a class that enables deepchecks to interact with your model and data and transform them to this pre-defined format, which is set for each task type. In this tutorial, we will implement the object detection task type by implementing a class that inherits from the deepchecks.vision.detection_data.DetectionData class.

The DetectionData class contains additional data and general methods intended for easy access to relevant metadata for object detection ML models validation. To learn more about the expected format please visit the API reference for the deepchecks.vision.detection_data.DetectionData class.

from deepchecks.vision.detection_data import DetectionData


class TomatoData(DetectionData):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def batch_to_images(self, batch):
        """
        Convert a batch of data to images in the expected format. The expected format is an iterable of cv2 images,
        where each image is a numpy array of shape (height, width, channels). The numbers in the array should be in the
        range [0, 255] in a uint8 format.
        """
        inp = torch.stack(list(batch[0])).cpu().detach().numpy().transpose((0, 2, 3, 1))
        mean = [0.485, 0.456, 0.406]
        std = [0.229, 0.224, 0.225]
        # Un-normalize the images
        inp = std * inp + mean
        inp = np.clip(inp, 0, 1)
        return inp * 255

    def batch_to_labels(self, batch):
        """
        Convert a batch of data to labels in the expected format. The expected format is a list of tensors of length N,
        where N is the number of samples. Each tensor element is in a shape of [B, 5], where B is the number of bboxes
        in the image, and each bounding box is in the structure of [class_id, x, y, w, h].
        """
        tensor_annotations = batch[1]
        label = []
        for annotation in tensor_annotations:
            if len(annotation["boxes"]):
                bbox = torch.stack(annotation["boxes"])
                # Convert the Pascal VOC xyxy format to xywh format
                bbox[:, 2:] = bbox[:, 2:] - bbox[:, :2]
                # The label shape is [class_id, x, y, w, h]
                label.append(
                    torch.concat([torch.stack(annotation["labels"]).reshape((-1, 1)), bbox], dim=1)
                )
            else:
                # If it's an empty image, we need to add an empty label
                label.append(torch.tensor([]))
        return label

    def infer_on_batch(self, batch, model, device):
        """
        Returns the predictions for a batch of data. The expected format is a list of tensors of shape length N, where N
        is the number of samples. Each tensor element is in a shape of [B, 6], where B is the number of bboxes in the
        predictions, and each bounding box is in the structure of [x, y, w, h, score, class_id].
        """
        nm_thrs = 0.2
        score_thrs = 0.7
        imgs = list(img.to(device) for img in batch[0])
        # Getting the predictions of the model on the batch
        with torch.no_grad():
            preds = model(imgs)
        processed_pred = []
        for pred in preds:
            # Performoing non-maximum suppression on the detections
            keep_boxes = torchvision.ops.nms(pred['boxes'], pred['scores'], nm_thrs)
            score_filter = pred['scores'][keep_boxes] > score_thrs

            # get the filtered result
            test_boxes = pred['boxes'][keep_boxes][score_filter].reshape((-1, 4))
            test_boxes[:, 2:] = test_boxes[:, 2:] - test_boxes[:, :2]  # xyxy to xywh
            test_labels = pred['labels'][keep_boxes][score_filter]
            test_scores = pred['scores'][keep_boxes][score_filter]

            processed_pred.append(
                torch.concat([test_boxes, test_scores.reshape((-1, 1)), test_labels.reshape((-1, 1))], dim=1))
        return processed_pred

After defining the task class, we can validate it by running the following code:

# We have a single label here, which is the tomato class
# The label_map is a dictionary that maps the class id to the class name, for display purposes.
LABEL_MAP = {
    1: 'Tomato'
}
training_data = TomatoData(data_loader=train_loader, label_map=LABEL_MAP)
test_data = TomatoData(data_loader=test_loader, label_map=LABEL_MAP)

training_data.validate_format(model, device=device)
test_data.validate_format(model, device=device)

# And observe the output:
Deepchecks will try to validate the extractors given...
torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at  ../aten/src/ATen/native/TensorShape.cpp:2157.)
Structure validation
--------------------
Label formatter: Pass!
Prediction formatter: Pass!
Image formatter: Pass!

Content validation
------------------
For validating the content within the structure you have to manually observe the classes, image, label and prediction.
Examples of classes observed in the batch's labels: [[1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1], [1]]
Visual images & label & prediction: should open in a new window
*******************************************************************************
This machine does not support GUI
The formatted image was saved in:
/home/runner/work/deepchecks/deepchecks/docs/source/user-guide/vision/quickstarts/deepchecks_formatted_image (4).jpg
Visual examples of an image with prediction and label data. Label is red, prediction is blue, and deepchecks loves you.
validate_extractors can be set to skip the image saving or change the save path
*******************************************************************************
Deepchecks will try to validate the extractors given...
Structure validation
--------------------
Label formatter: Pass!
Prediction formatter: Pass!
Image formatter: Pass!

Content validation
------------------
For validating the content within the structure you have to manually observe the classes, image, label and prediction.
Examples of classes observed in the batch's labels: [[1, 1, 1, 1], [1], [1, 1, 1, 1, 1, 1, 1], [1], [1]]
Visual images & label & prediction: should open in a new window
*******************************************************************************
This machine does not support GUI
The formatted image was saved in:
/home/runner/work/deepchecks/deepchecks/docs/source/user-guide/vision/quickstarts/deepchecks_formatted_image (5).jpg
Visual examples of an image with prediction and label data. Label is red, prediction is blue, and deepchecks loves you.
validate_extractors can be set to skip the image saving or change the save path
*******************************************************************************

Running Deepchecks’ full suite on our data and model!#

Now that we have defined the task class, we can validate the model with the full suite of deepchecks. This can be done with this simple few lines of code:

from deepchecks.vision.suites import full_suite

suite = full_suite()
result = suite.run(training_data, test_data, model, device=device)
Validating Input:
|     | 0/1 [Time: 00:00]
Validating Input:
|#####| 1/1 [Time: 00:05]
Validating Input:
|#####| 1/1 [Time: 00:05]Properties that have class_id as output_type will be skipped.


Ingesting Batches - Train Dataset:
|             | 0/13 [Time: 00:00]

Ingesting Batches - Train Dataset:
|#            | 1/13 [Time: 00:03]

Ingesting Batches - Train Dataset:
|##           | 2/13 [Time: 00:06]

Ingesting Batches - Train Dataset:
|###          | 3/13 [Time: 00:10]

Ingesting Batches - Train Dataset:
|####         | 4/13 [Time: 00:13]

Ingesting Batches - Train Dataset:
|#####        | 5/13 [Time: 00:17]

Ingesting Batches - Train Dataset:
|######       | 6/13 [Time: 00:20]

Ingesting Batches - Train Dataset:
|#######      | 7/13 [Time: 00:24]

Ingesting Batches - Train Dataset:
|########     | 8/13 [Time: 00:27]

Ingesting Batches - Train Dataset:
|#########    | 9/13 [Time: 00:31]

Ingesting Batches - Train Dataset:
|##########   | 10/13 [Time: 00:34]

Ingesting Batches - Train Dataset:
|###########  | 11/13 [Time: 00:37]

Ingesting Batches - Train Dataset:
|############ | 12/13 [Time: 00:41]

Ingesting Batches - Train Dataset:
|#############| 13/13 [Time: 00:43]

Ingesting Batches - Train Dataset:
|#############| 13/13 [Time: 00:43]


Computing Single Dataset Checks - Train Dataset:
|       | 0/7 [Time: 00:00]


Computing Single Dataset Checks - Train Dataset:
|#      | 1/7 [Time: 00:00, Check=Mean Average Precision Report]


Computing Single Dataset Checks - Train Dataset:
|##     | 2/7 [Time: 00:00, Check=Mean Average Recall Report]


Computing Single Dataset Checks - Train Dataset:
|####   | 4/7 [Time: 00:05, Check=Image Segment Performance]


Computing Single Dataset Checks - Train Dataset:
|#####  | 5/7 [Time: 00:05, Check=Image Property Outliers]


Computing Single Dataset Checks - Train Dataset:
|###### | 6/7 [Time: 00:05, Check=Label Property Outliers]


Computing Single Dataset Checks - Train Dataset:
|#######| 7/7 [Time: 00:05, Check=Property Label Correlation]



Ingesting Batches - Test Dataset:
|     | 0/2 [Time: 00:00]



Ingesting Batches - Test Dataset:
|##5  | 1/2 [Time: 00:03]



Ingesting Batches - Test Dataset:
|#####| 2/2 [Time: 00:05]



Ingesting Batches - Test Dataset:
|#####| 2/2 [Time: 00:05]




Computing Single Dataset Checks - Test Dataset:
|       | 0/7 [Time: 00:00]




Computing Single Dataset Checks - Test Dataset:
|##     | 2/7 [Time: 00:00, Check=Mean Average Recall Report]




Computing Single Dataset Checks - Test Dataset:
|####   | 4/7 [Time: 00:00, Check=Image Segment Performance]




Computing Single Dataset Checks - Test Dataset:
|###### | 6/7 [Time: 00:01, Check=Label Property Outliers]




Computing Single Dataset Checks - Test Dataset:
|#######| 7/7 [Time: 00:01, Check=Property Label Correlation]





Computing Checks:
|          | 0/10 [Time: 00:00]





Computing Checks:
|          | 0/10 [Time: 00:00, Check=Class Performance]





Computing Checks:
|#         | 1/10 [Time: 00:00, Check=Class Performance]





Computing Checks:
|#         | 1/10 [Time: 00:00, Check=Train Test Prediction Drift]





Computing Checks:
|##        | 2/10 [Time: 00:00, Check=Train Test Prediction Drift]





Computing Checks:
|##        | 2/10 [Time: 00:00, Check=Simple Model Comparison]





Computing Checks:
|##        | 2/10 [Time: 00:00, Check=Model Error Analysis]   Default parameter min_samples_leaf will change in version 2.6.See https://github.com/scikit-learn-contrib/category_encoders/issues/327
Default parameter smoothing will change in version 2.6.See https://github.com/scikit-learn-contrib/category_encoders/issues/327






Computing Checks:
|####      | 4/10 [Time: 00:00, Check=Model Error Analysis]





Computing Checks:
|####      | 4/10 [Time: 00:00, Check=New Labels]





Computing Checks:
|####      | 4/10 [Time: 00:00, Check=Heatmap Comparison]





Computing Checks:
|####      | 4/10 [Time: 00:00, Check=Train Test Label Drift]





Computing Checks:
|#######   | 7/10 [Time: 00:00, Check=Train Test Label Drift]





Computing Checks:
|#######   | 7/10 [Time: 00:00, Check=Image Property Drift]





Computing Checks:
|#######   | 7/10 [Time: 00:00, Check=Image Dataset Drift]





Computing Checks:
|######### | 9/10 [Time: 00:01, Check=Image Dataset Drift]





Computing Checks:
|######### | 9/10 [Time: 00:01, Check=Property Label Correlation Change]





Computing Checks:
|##########| 10/10 [Time: 00:01, Check=Property Label Correlation Change]

Observing the results:#

The results can be saved as a html file with the following code:

result.save_as_html('output.html')
'output (3).html'

Or, if working inside a notebook, the output can be displayed directly by simply printing the result object:

result
Full Suite


Total running time of the script: ( 1 minutes 29.734 seconds)

Gallery generated by Sphinx-Gallery