
Commit 7af54d6

Add the assignments for lecture 6.
1 parent a34a2eb commit 7af54d6

File tree

5 files changed, +331 -0 lines changed


labs/06/bboxes_utils.py

Lines changed: 179 additions & 0 deletions
@@ -0,0 +1,179 @@
#!/usr/bin/env python3
import argparse
from math import log
from typing import Callable
import unittest

import torch

# Bounding boxes and anchors are expected to be PyTorch tensors,
# where the last dimension has size 4.

# For bounding boxes in pixel coordinates, the 4 values correspond to:
TOP: int = 0
LEFT: int = 1
BOTTOM: int = 2
RIGHT: int = 3


def bboxes_area(bboxes: torch.Tensor) -> torch.Tensor:
    """Compute area of given set of bboxes.

    Each bbox is parametrized as a four-tuple (top, left, bottom, right).

    If the bboxes.shape is [..., 4], the output shape is bboxes.shape[:-1].
    """
    return torch.relu(bboxes[..., BOTTOM] - bboxes[..., TOP]) \
        * torch.relu(bboxes[..., RIGHT] - bboxes[..., LEFT])


def bboxes_iou(xs: torch.Tensor, ys: torch.Tensor) -> torch.Tensor:
    """Compute IoU of corresponding pairs from two sets of bboxes `xs` and `ys`.

    Each bbox is parametrized as a four-tuple (top, left, bottom, right).

    Note that broadcasting is supported, so passing inputs with
    `xs.shape=[num_xs, 1, 4]` and `ys.shape=[1, num_ys, 4]` produces an output with
    shape `[num_xs, num_ys]`, computing IoU for all pairs of bboxes from `xs` and `ys`.
    Formally, the output shape is `torch.broadcast_shapes(xs.shape, ys.shape)[:-1]`.
    """
    intersections = torch.stack([
        torch.maximum(xs[..., TOP], ys[..., TOP]),
        torch.maximum(xs[..., LEFT], ys[..., LEFT]),
        torch.minimum(xs[..., BOTTOM], ys[..., BOTTOM]),
        torch.minimum(xs[..., RIGHT], ys[..., RIGHT]),
    ], dim=-1)

    xs_area, ys_area, intersections_area = bboxes_area(xs), bboxes_area(ys), bboxes_area(intersections)

    return intersections_area / (xs_area + ys_area - intersections_area)


def bboxes_to_rcnn(anchors: torch.Tensor, bboxes: torch.Tensor) -> torch.Tensor:
    """Convert `bboxes` to an R-CNN-like representation relative to `anchors`.

    The `anchors` and `bboxes` are arrays of four-tuples (top, left, bottom, right);
    you can use the TOP, LEFT, BOTTOM, RIGHT constants as indices of the
    respective coordinates.

    The resulting representation of a single bbox is a four-tuple with:
    - (bbox_y_center - anchor_y_center) / anchor_height
    - (bbox_x_center - anchor_x_center) / anchor_width
    - log(bbox_height / anchor_height)
    - log(bbox_width / anchor_width)

    If the `anchors.shape` is `[anchors_len, 4]` and `bboxes.shape` is `[anchors_len, 4]`,
    the output shape is `[anchors_len, 4]`.
    """
    # TODO: Implement according to the docstring.
    raise NotImplementedError()


def bboxes_from_rcnn(anchors: torch.Tensor, rcnns: torch.Tensor) -> torch.Tensor:
    """Convert R-CNN-like representations relative to `anchors` back to bboxes.

    If the `anchors.shape` is `[anchors_len, 4]` and `rcnns.shape` is `[anchors_len, 4]`,
    the output shape is `[anchors_len, 4]`.
    """
    # TODO: Implement according to the docstring.
    raise NotImplementedError()


def bboxes_training(
    anchors: torch.Tensor, gold_classes: torch.Tensor, gold_bboxes: torch.Tensor, iou_threshold: float,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Compute training data for object detection.

    Arguments:
    - `anchors` is an array of four-tuples (top, left, bottom, right)
    - `gold_classes` is an array of zero-based classes of the gold objects
    - `gold_bboxes` is an array of four-tuples (top, left, bottom, right)
      of the gold objects
    - `iou_threshold` is a given threshold

    Returns:
    - `anchor_classes` contains for every anchor either 0 for background
      (if no gold object is assigned) or `1 + gold_class` if a gold object
      with `gold_class` is assigned to it
    - `anchor_bboxes` contains for every anchor a four-tuple
      `(center_y, center_x, height, width)` representing the gold bbox of
      a chosen object using parametrization of R-CNN; zeros if no gold object
      was assigned to the anchor
    If the `anchors` shape is `[anchors_len, 4]`, the `anchor_classes` shape
    is `[anchors_len]` and the `anchor_bboxes` shape is `[anchors_len, 4]`.

    Algorithm:
    - First, for each gold object, assign it to an anchor with the largest IoU
      (the anchor with smaller index if there are several). In case several gold
      objects are assigned to a single anchor, use the gold object with smaller
      index.
    - For each unused anchor, find the gold object with the largest IoU
      (again the gold object with smaller index if there are several), and if
      the IoU is >= iou_threshold, assign the object to the anchor.
    """
    # TODO: First, for each gold object, assign it to an anchor with the
    # largest IoU (the anchor with smaller index if there are several). In case
    # several gold objects are assigned to a single anchor, use the gold object
    # with smaller index.

    # TODO: For each unused anchor, find the gold object with the largest IoU
    # (again the gold object with smaller index if there are several), and if
    # the IoU is >= threshold, assign the object to the anchor.

    anchor_classes, anchor_bboxes = ..., ...

    return anchor_classes, anchor_bboxes


def main(args: argparse.Namespace) -> tuple[Callable, Callable, Callable]:
    return bboxes_to_rcnn, bboxes_from_rcnn, bboxes_training


class Tests(unittest.TestCase):
    def test_bboxes_to_from_rcnn(self):
        data = [
            [[0, 0, 10, 10], [0, 0, 10, 10], [0, 0, 0, 0]],
            [[0, 0, 10, 10], [5, 0, 15, 10], [.5, 0, 0, 0]],
            [[0, 0, 10, 10], [0, 5, 10, 15], [0, .5, 0, 0]],
            [[0, 0, 10, 10], [0, 0, 20, 30], [.5, 1, log(2), log(3)]],
            [[0, 9, 10, 19], [2, 10, 5, 16], [-0.15, -0.1, -1.20397, -0.51083]],
            [[5, 3, 15, 13], [7, 7, 10, 9], [-0.15, 0, -1.20397, -1.60944]],
            [[7, 6, 17, 16], [9, 10, 12, 13], [-0.15, 0.05, -1.20397, -1.20397]],
            [[5, 6, 15, 16], [7, 7, 10, 10], [-0.15, -0.25, -1.20397, -1.20397]],
            [[6, 3, 16, 13], [8, 5, 12, 8], [-0.1, -0.15, -0.91629, -1.20397]],
            [[5, 2, 15, 12], [9, 6, 12, 8], [0.05, 0, -1.20397, -1.60944]],
            [[2, 10, 12, 20], [6, 11, 8, 17], [0, -0.1, -1.60944, -0.51083]],
            [[10, 9, 20, 19], [12, 13, 17, 16], [-0.05, 0.05, -0.69315, -1.20397]],
            [[6, 7, 16, 17], [10, 11, 12, 14], [0, 0.05, -1.60944, -1.20397]],
            [[2, 2, 12, 12], [3, 5, 8, 8], [-0.15, -0.05, -0.69315, -1.20397]],
        ]
        # First run on individual anchors, and then on all together
        for anchors, bboxes, rcnns in [map(lambda x: [x], row) for row in data] + [zip(*data)]:
            anchors, bboxes, rcnns = [torch.tensor(data, dtype=torch.float32) for data in [anchors, bboxes, rcnns]]
            torch.testing.assert_close(bboxes_to_rcnn(anchors, bboxes), rcnns, atol=1e-3, rtol=1e-3)
            torch.testing.assert_close(bboxes_from_rcnn(anchors, rcnns), bboxes, atol=1e-3, rtol=1e-3)

    def test_bboxes_training(self):
        anchors = torch.tensor([[0, 0, 10, 10], [0, 10, 10, 20], [10, 0, 20, 10], [10, 10, 20, 20]])
        for gold_classes, gold_bboxes, anchor_classes, anchor_bboxes, iou in [
            [[1], [[14, 14, 16, 16]], [0, 0, 0, 2], [[0, 0, 0, 0]] * 3 + [[0, 0, log(.2), log(.2)]], 0.5],
            [[2], [[0, 0, 20, 20]], [3, 0, 0, 0], [[.5, .5, log(2), log(2)]] + [[0, 0, 0, 0]] * 3, 0.26],
            [[2], [[0, 0, 20, 20]], [3, 3, 3, 3],
             [[y, x, log(2), log(2)] for y in [.5, -.5] for x in [.5, -.5]], 0.24],
            [[0, 1], [[3, 3, 20, 18], [10, 1, 18, 21]], [0, 0, 0, 1],
             [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [-0.35, -0.45, 0.53062, 0.40546]], 0.5],
            [[0, 1], [[3, 3, 20, 18], [10, 1, 18, 21]], [0, 0, 2, 1],
             [[0, 0, 0, 0], [0, 0, 0, 0], [-0.1, 0.6, -0.22314, 0.69314], [-0.35, -0.45, 0.53062, 0.40546]], 0.3],
            [[0, 1], [[3, 3, 20, 18], [10, 1, 18, 21]], [0, 1, 2, 1],
             [[0, 0, 0, 0], [0.65, -0.45, 0.53062, 0.40546], [-0.1, 0.6, -0.22314, 0.69314],
              [-0.35, -0.45, 0.53062, 0.40546]], 0.17],
        ]:
            gold_classes, anchor_classes = torch.tensor(gold_classes), torch.tensor(anchor_classes)
            gold_bboxes, anchor_bboxes = torch.tensor(gold_bboxes), torch.tensor(anchor_bboxes)
            computed_classes, computed_bboxes = bboxes_training(anchors, gold_classes, gold_bboxes, iou)
            torch.testing.assert_close(computed_classes, anchor_classes, atol=1e-3, rtol=1e-3)
            torch.testing.assert_close(computed_bboxes, anchor_bboxes, atol=1e-3, rtol=1e-3)


if __name__ == '__main__':
    unittest.main()
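
Both conversions follow directly from the formulas in the docstrings. Below is a minimal vectorized sketch of one possible way to fill in the two TODOs (not the reference solution; the `_sketch` suffix only avoids clashing with the template names, and TOP/LEFT/BOTTOM/RIGHT are the constants defined in the module):

```python
import torch

from bboxes_utils import TOP, LEFT, BOTTOM, RIGHT


def bboxes_to_rcnn_sketch(anchors: torch.Tensor, bboxes: torch.Tensor) -> torch.Tensor:
    # Heights, widths and centers of the anchors and of the bboxes.
    anchor_height = anchors[..., BOTTOM] - anchors[..., TOP]
    anchor_width = anchors[..., RIGHT] - anchors[..., LEFT]
    anchor_y = anchors[..., TOP] + anchor_height / 2
    anchor_x = anchors[..., LEFT] + anchor_width / 2
    bbox_height = bboxes[..., BOTTOM] - bboxes[..., TOP]
    bbox_width = bboxes[..., RIGHT] - bboxes[..., LEFT]
    bbox_y = bboxes[..., TOP] + bbox_height / 2
    bbox_x = bboxes[..., LEFT] + bbox_width / 2
    return torch.stack([
        (bbox_y - anchor_y) / anchor_height,
        (bbox_x - anchor_x) / anchor_width,
        torch.log(bbox_height / anchor_height),
        torch.log(bbox_width / anchor_width),
    ], dim=-1)


def bboxes_from_rcnn_sketch(anchors: torch.Tensor, rcnns: torch.Tensor) -> torch.Tensor:
    # Invert the parametrization above: recover centers and sizes, then corners.
    anchor_height = anchors[..., BOTTOM] - anchors[..., TOP]
    anchor_width = anchors[..., RIGHT] - anchors[..., LEFT]
    anchor_y = anchors[..., TOP] + anchor_height / 2
    anchor_x = anchors[..., LEFT] + anchor_width / 2
    bbox_y = rcnns[..., 0] * anchor_height + anchor_y
    bbox_x = rcnns[..., 1] * anchor_width + anchor_x
    bbox_height = torch.exp(rcnns[..., 2]) * anchor_height
    bbox_width = torch.exp(rcnns[..., 3]) * anchor_width
    return torch.stack([
        bbox_y - bbox_height / 2,
        bbox_x - bbox_width / 2,
        bbox_y + bbox_height / 2,
        bbox_x + bbox_width / 2,
    ], dim=-1)
```

For instance, the anchor `[0, 0, 10, 10]` and bbox `[0, 0, 20, 30]` map to `[.5, 1, log(2), log(3)]`, matching the corresponding row of the unit-test data above.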

labs/06/svhn_competition.py

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
#!/usr/bin/env python3
import argparse
import datetime
import os
import re

import numpy as np
import timm
import torch
import torchvision.transforms.v2 as v2

import bboxes_utils
import npfl138
npfl138.require_version("2425.6")
from npfl138.datasets.svhn import SVHN

# TODO: Define reasonable defaults and optionally more parameters.
# Also, you can set the number of threads to 0 to use all your CPU cores.
parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", default=..., type=int, help="Batch size.")
parser.add_argument("--epochs", default=..., type=int, help="Number of epochs.")
parser.add_argument("--seed", default=42, type=int, help="Random seed.")
parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.")


def main(args: argparse.Namespace) -> None:
    # Set the random seed and the number of threads.
    npfl138.startup(args.seed, args.threads)
    npfl138.global_keras_initializers()

    # Create logdir name.
    args.logdir = os.path.join("logs", "{}-{}-{}".format(
        os.path.basename(globals().get("__file__", "notebook")),
        datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"),
        ",".join(("{}={}".format(re.sub("(.)[^_]*_?", r"\1", k), v) for k, v in sorted(vars(args).items())))
    ))

    # Load the data. The individual examples are dictionaries with the keys:
    # - "image", a `[3, SIZE, SIZE]` tensor of `torch.uint8` values in [0-255] range,
    # - "classes", a `[num_digits]` PyTorch vector with classes of image digits,
    # - "bboxes", a `[num_digits, 4]` PyTorch vector with bounding boxes of image digits.
    # The `decode_on_demand` argument can be set to `True` to save memory and decode
    # each image only when accessed, but it will most likely slow down training.
    svhn = SVHN(decode_on_demand=False)

    # Load the EfficientNetV2-B0 model without the classification layer.
    # Apart from calling the model as in the classification task, you can call it using
    #   output, features = efficientnetv2_b0.forward_intermediates(batch_of_images)
    # obtaining (assuming the input images have 224x224 resolution):
    # - `output` is a `[N, 1280, 7, 7]` tensor with the final features before global average pooling,
    # - `features` is a list of intermediate features with resolution 112x112, 56x56, 28x28, 14x14, 7x7.
    efficientnetv2_b0 = timm.create_model("tf_efficientnetv2_b0.in1k", pretrained=True, num_classes=0)

    # Create a simple preprocessing performing necessary normalization.
    preprocessing = v2.Compose([
        v2.ToDtype(torch.float32, scale=True),  # The `scale=True` also rescales the image to [0, 1].
        v2.Normalize(mean=efficientnetv2_b0.pretrained_cfg["mean"], std=efficientnetv2_b0.pretrained_cfg["std"]),
    ])

    # TODO: Create the model and train it.
    model = ...

    # Generate test set annotations, but in `args.logdir` to allow parallel execution.
    os.makedirs(args.logdir, exist_ok=True)
    with open(os.path.join(args.logdir, "svhn_competition.txt"), "w", encoding="utf-8") as predictions_file:
        # TODO: Predict the digits and their bounding boxes on the test set.
        # Assume that for a single test image we get
        # - `predicted_classes`: a 1D array with the predicted digits,
        # - `predicted_bboxes`: a [len(predicted_classes), 4] array with bboxes;
        for predicted_classes, predicted_bboxes in ...:
            output = []
            for label, bbox in zip(predicted_classes, predicted_bboxes):
                output += [int(label)] + list(map(float, bbox))
            print(*output, file=predictions_file)


if __name__ == "__main__":
    main_args = parser.parse_args([] if "__file__" not in globals() else None)
    main(main_args)
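
For the `model = ...` TODO, one possible shape of the RetinaNet-like baseline mentioned in `tasks/svhn_competition.md` is sketched below: a single-level head with one single-scale, single-aspect anchor per feature-map cell. The class and function names, the layer choices, and the 1280-channel input (the `forward_intermediates` output documented in the template comments) are illustrative assumptions, not part of the template:

```python
import torch


class RetinaLikeHead(torch.nn.Module):
    """A single-level detection head: one anchor per spatial cell of the feature map."""
    def __init__(self, in_channels: int = 1280, num_classes: int = 10):
        super().__init__()
        self.classification = torch.nn.Conv2d(in_channels, num_classes, kernel_size=3, padding=1)
        self.regression = torch.nn.Conv2d(in_channels, 4, kernel_size=3, padding=1)

    def forward(self, features: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        # [N, C, H, W] -> [N, H*W, num_classes] and [N, H*W, 4], one row per anchor.
        classes = self.classification(features).flatten(2).permute(0, 2, 1)
        bboxes = self.regression(features).flatten(2).permute(0, 2, 1)
        return classes, bboxes


def make_anchors(image_size: int, grid_size: int, anchor_size: float) -> torch.Tensor:
    """One square anchor per grid cell, as (TOP, LEFT, BOTTOM, RIGHT) in pixels."""
    stride = image_size / grid_size
    centers = (torch.arange(grid_size) + 0.5) * stride
    ys, xs = torch.meshgrid(centers, centers, indexing="ij")
    half = anchor_size / 2
    return torch.stack([ys - half, xs - half, ys + half, xs + half], dim=-1).reshape(-1, 4)
```

Per-image training targets for such a head can be built with `bboxes_utils.bboxes_training(anchors, example["classes"], example["bboxes"], iou_threshold)`, and the classification branch can be trained with `torchvision.ops.sigmoid_focal_loss`, as suggested in the task description.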

lectures/lecture06.md

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,8 @@
 #### Video: https://lectures.ms.mff.cuni.cz/video/rec/npfl138/2425/npfl138-2425-06-czech.mp4, CZ Lecture
 #### Video: https://lectures.ms.mff.cuni.cz/video/rec/npfl138/2425/npfl138-2425-06-english.mp4, EN Lecture
 #### Questions: #lecture_6_questions
+#### Lecture assignment: bboxes_utils
+#### Lecture assignment: svhn_competition

 - R-CNN [[R-CNN](https://arxiv.org/abs/1311.2524)]
 - Fast R-CNN [[Fast R-CNN](https://arxiv.org/abs/1504.08083)]

tasks/bboxes_utils.md

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
### Assignment: bboxes_utils
#### Date: Deadline: Apr 09, 22:00
#### Points: 2 points

This is a preparatory assignment for `svhn_competition`. The goal is to
implement several bounding box manipulation routines in the
[bboxes_utils.py](https://github.com/ufal/npfl138/tree/master/labs/06/bboxes_utils.py)
module. Notably, you need to implement the following methods:
- `bboxes_to_rcnn`: convert given bounding boxes to an R-CNN-like
  representation relative to the given anchors;
- `bboxes_from_rcnn`: convert R-CNN-like representations relative to
  given anchors back to bounding boxes;
- `bboxes_training`: given a list of anchors and gold objects, assign gold
  objects to anchors and generate suitable training data (the exact algorithm
  is described in the template; a possible implementation sketch follows below).

The [bboxes_utils.py](https://github.com/ufal/npfl138/tree/master/labs/06/bboxes_utils.py)
module contains simple unit tests, which are evaluated when the module is
executed and which you can use to check the validity of your implementation.
Note that the type annotations in the template do not describe the tensor
shapes, because the Python typing system is not flexible enough to capture the
shape changes.

When submitting to ReCodEx, the method `main` is executed, returning the
implemented `bboxes_to_rcnn`, `bboxes_from_rcnn` and `bboxes_training`
methods. These methods are then executed and compared to the reference
implementation.
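
For the two-stage assignment algorithm of `bboxes_training` (described in detail in the template docstring), a hedged loop-based sketch is shown below. It assumes `bboxes_iou` and an already implemented `bboxes_to_rcnn` from `bboxes_utils`, and its tie-breaking relies on `torch.argmax` returning the first maximal index; a fully vectorized solution is of course also possible.

```python
import torch

from bboxes_utils import bboxes_iou, bboxes_to_rcnn


def bboxes_training_sketch(anchors, gold_classes, gold_bboxes, iou_threshold):
    anchors, gold_bboxes = anchors.float(), gold_bboxes.float()
    anchor_classes = torch.zeros(len(anchors), dtype=torch.int64)
    anchor_bboxes = torch.zeros(len(anchors), 4)

    # IoU of every anchor with every gold object, shape [num_anchors, num_golds].
    iou = bboxes_iou(anchors.unsqueeze(1), gold_bboxes.unsqueeze(0))

    # Stage 1: every gold object claims the anchor with the largest IoU. Iterating
    # from the last gold object to the first means that when several gold objects
    # claim the same anchor, the one with the smallest index wins.
    for gold in reversed(range(len(gold_bboxes))):
        anchor = int(torch.argmax(iou[:, gold]))
        anchor_classes[anchor] = 1 + gold_classes[gold]
        anchor_bboxes[anchor] = bboxes_to_rcnn(anchors[anchor], gold_bboxes[gold])

    # Stage 2: every still-unassigned anchor takes its best gold object,
    # but only if the IoU reaches the threshold.
    for anchor in range(len(anchors)):
        if anchor_classes[anchor] == 0:
            gold = int(torch.argmax(iou[anchor]))
            if iou[anchor, gold] >= iou_threshold:
                anchor_classes[anchor] = 1 + gold_classes[gold]
                anchor_bboxes[anchor] = bboxes_to_rcnn(anchors[anchor], gold_bboxes[gold])

    return anchor_classes, anchor_bboxes
```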

tasks/svhn_competition.md

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
### Assignment: svhn_competition
#### Date: Deadline: Apr 09, 22:00
#### Points: 5 points+5 bonus

The goal of this assignment is to implement a system performing object
recognition, optionally utilizing the pretrained EfficientNetV2-B0 backbone
(or any other model from the [timm](https://huggingface.co/docs/timm) library).

The [Street View House Numbers (SVHN) dataset](https://ufal.mff.cuni.cz/~straka/courses/npfl138/2425/demos/svhn_train.html)
annotates for every photo all digits appearing on it, including their bounding
boxes. The dataset can be loaded using the [npfl138.datasets.svhn](https://github.com/ufal/npfl138/blob/master/labs/npfl138/datasets/svhn.py)
module. Similarly to the `CAGS` dataset, the `train/dev/test` splits are PyTorch
`torch.utils.data.Dataset`s, and every element is a dictionary with the following keys:
- `"image"`: a square 3-channel image stored as a `torch.Tensor` of type `torch.uint8`,
- `"classes"`: a 1D `torch.Tensor` with all digit labels appearing in the image,
- `"bboxes"`: a `[num_digits, 4]` 2D `torch.Tensor` with bounding boxes of every
  digit in the image, each represented as `[TOP, LEFT, BOTTOM, RIGHT]`.
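
A minimal sketch of loading the data and inspecting one training element (the field names and shapes are those documented above; everything else is illustrative):

```python
import torch
import torchvision.transforms.v2 as v2
from npfl138.datasets.svhn import SVHN

svhn = SVHN(decode_on_demand=False)
example = svhn.train[0]        # a dict with "image", "classes" and "bboxes"
print(example["image"].shape)  # [3, SIZE, SIZE], dtype torch.uint8
print(example["classes"])      # 1D tensor of digit labels
print(example["bboxes"])       # [num_digits, 4] tensor of [TOP, LEFT, BOTTOM, RIGHT]

# The uint8 image is typically converted to float in [0, 1] before normalization.
image = v2.ToDtype(torch.float32, scale=True)(example["image"])
```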

Each test set image annotation consists of a sequence of space-separated
five-tuples _label top left bottom right_, and the annotation is considered
correct if exactly the gold digits are predicted, each with IoU at least 0.5.
The whole test set score is then the prediction accuracy over individual images.
You can again evaluate your predictions using the
[npfl138.datasets.svhn](https://github.com/ufal/npfl138/blob/master/labs/npfl138/datasets/svhn.py)
module, either by running `python3 -m npfl138.datasets.svhn --evaluate=path --dataset=dev/test`
or by using the `svhn.evaluate` method. Furthermore, you can visualize your
predictions using `python3 -m npfl138.datasets.svhn --visualize=path --dataset=dev/test`.

The task is a [_competition_](https://ufal.mff.cuni.cz/courses/npfl138/2425-summer#competitions).
Everyone who submits a solution achieving at least _20%_ test set accuracy gets
5 points; the remaining 5 bonus points are distributed depending on the relative ordering
of your solutions. Note that I usually need at least _35%_ development set
accuracy to achieve the required test set performance.

You should start with the
[svhn_competition.py](https://github.com/ufal/npfl138/tree/master/labs/06/svhn_competition.py)
template, which generates the test set annotation in the required format.

_A baseline solution can use a RetinaNet-like single-stage detector,
using only a single level of convolutional features (no FPN)
with single-scale and single-aspect anchors. Focal loss is available as
[torchvision.ops.sigmoid_focal_loss](https://pytorch.org/vision/main/generated/torchvision.ops.sigmoid_focal_loss.html)
and non-maximum suppression as
[torchvision.ops.nms](https://pytorch.org/vision/main/generated/torchvision.ops.nms.html) or
[torchvision.ops.batched_nms](https://pytorch.org/vision/main/generated/torchvision.ops.batched_nms.html)._
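
As a hedged sketch of how these pieces can be combined at inference time for a single image (the function, its arguments, and the thresholds are assumptions; only `bboxes_from_rcnn` and `torchvision.ops.nms` come from the module and the links above):

```python
import torch
import torchvision.ops

import bboxes_utils


def decode_predictions(class_logits: torch.Tensor, bbox_rcnns: torch.Tensor, anchors: torch.Tensor,
                       score_threshold: float = 0.5, nms_iou: float = 0.5):
    """Turn raw per-image head outputs into (predicted_classes, predicted_bboxes)."""
    # Assumes `class_logits` of shape [num_anchors, num_classes] trained with a sigmoid
    # (focal) loss, and `bbox_rcnns` of shape [num_anchors, 4] in the R-CNN parametrization.
    scores, labels = torch.sigmoid(class_logits).max(dim=-1)
    boxes = bboxes_utils.bboxes_from_rcnn(anchors, bbox_rcnns)  # back to (top, left, bottom, right)

    keep = scores > score_threshold
    boxes, scores, labels = boxes[keep], scores[keep], labels[keep]

    # torchvision.ops.nms expects (x1, y1, x2, y2), but it only compares IoUs, so passing
    # (top, left, bottom, right) consistently for all boxes yields the same kept indices.
    keep = torchvision.ops.nms(boxes, scores, iou_threshold=nms_iou)
    return labels[keep], boxes[keep]
```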
