From 89458b917ed6468b6219507c044de41e3fe3f1b7 Mon Sep 17 00:00:00 2001
From: KeepTryingTo <196637235@qq.com>
Date: Sat, 18 Jan 2025 13:37:27 +0800
Subject: [PATCH 1/3] train.py & dataset.py & tool/utils.py changes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 dataset.py    |  6 +++---
 tool/utils.py |  9 ++++++---
 train.py      | 27 ++++++++++++++++++++++-----
 3 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/dataset.py b/dataset.py
index b46259a4..0e57e0ea 100644
--- a/dataset.py
+++ b/dataset.py
@@ -137,7 +137,7 @@ def image_data_augmentation(mat, w, h, pleft, ptop, swidth, sheight, flip, dhue,
         if dsat != 1 or dexp != 1 or dhue != 0:
             if img.shape[2] >= 3:
                 hsv_src = cv2.cvtColor(sized.astype(np.float32), cv2.COLOR_RGB2HSV)  # RGB to HSV
-                hsv = cv2.split(hsv_src)
+                hsv = list(cv2.split(hsv_src))
                 hsv[1] *= dsat
                 hsv[2] *= dexp
                 hsv[0] += 179 * dhue
@@ -431,8 +431,8 @@ def get_image_id(filename:str) -> int:
     print("You could also create your own 'get_image_id' function.")
     # print(filename)
-    parts = filename.split('/')
-    id = int(parts[-1][0:-4])
+    parts = filename.split('/')[-1].split('_')[-1]
+    id = int(parts[0:-4])
     # print(id)
     return id
 
 
diff --git a/tool/utils.py b/tool/utils.py
index a42e6264..427f285c 100644
--- a/tool/utils.py
+++ b/tool/utils.py
@@ -137,10 +137,13 @@ def get_color(c, x, max_val):
             t_size = cv2.getTextSize(msg, 0, 0.7, thickness=bbox_thick // 2)[0]
             c1, c2 = (x1,y1), (x2, y2)
             c3 = (c1[0] + t_size[0], c1[1] - t_size[1] - 3)
-            cv2.rectangle(img, (x1,y1), (np.float32(c3[0]), np.float32(c3[1])), rgb, -1)
-            img = cv2.putText(img, msg, (c1[0], np.float32(c1[1] - 2)), cv2.FONT_HERSHEY_SIMPLEX,0.7, (0,0,0), bbox_thick//2,lineType=cv2.LINE_AA)
+            cv2.rectangle(img, (int(x1),int(y1)),
+                          (int(np.float32(c3[0])), int(np.float32(c3[1]))), rgb, -1)
+            img = cv2.putText(img, msg, (int(c1[0]), int(np.float32(c1[1] - 2))),
+                              cv2.FONT_HERSHEY_SIMPLEX,0.7, (0,0,0),
+                              bbox_thick//2,lineType=cv2.LINE_AA)
 
-        img = cv2.rectangle(img, (x1, y1), (x2, y2), rgb, bbox_thick)
+        img = cv2.rectangle(img, (int(x1), int(y1)), (int(x2), int(y2)), rgb, bbox_thick)
     if savename:
         print("save plot results to %s" % savename)
         cv2.imwrite(savename, img)
diff --git a/train.py b/train.py
index 1eda27a4..a7870e85 100644
--- a/train.py
+++ b/train.py
@@ -37,6 +37,8 @@
 from tool.tv_reference.coco_utils import convert_to_coco_api
 from tool.tv_reference.coco_eval import CocoEvaluator
 
+from torchvision.ops import nms
+
 
 def bboxes_iou(bboxes_a, bboxes_b, xyxy=True, GIoU=False, DIoU=False, CIoU=False):
     """Calculate the Intersection of Unions (IoUs) between bounding boxes.
@@ -128,11 +130,11 @@ def bboxes_iou(bboxes_a, bboxes_b, xyxy=True, GIoU=False, DIoU=False, CIoU=False
 
 
 class Yolo_loss(nn.Module):
-    def __init__(self, n_classes=80, n_anchors=3, device=None, batch=2):
+    def __init__(self, n_classes=80, img_size=608, n_anchors=3, device=None, batch=2):
         super(Yolo_loss, self).__init__()
         self.device = device
         self.strides = [8, 16, 32]
-        image_size = 608
+        self.image_size = img_size
         self.n_classes = n_classes
         self.n_anchors = n_anchors
 
@@ -149,7 +151,7 @@ def __init__(self, n_classes=80, n_anchors=3, device=None, batch=2):
             ref_anchors[:, 2:] = np.array(all_anchors_grid, dtype=np.float32)
             ref_anchors = torch.from_numpy(ref_anchors)
             # calculate pred - xywh obj cls
-            fsize = image_size // self.strides[i]
+            fsize = self.image_size // self.strides[i]
             grid_x = torch.arange(fsize, dtype=torch.float).repeat(batch, 3, fsize, 1).to(device)
             grid_y = torch.arange(fsize, dtype=torch.float).repeat(batch, 3, fsize, 1).permute(0, 1, 3, 2).to(device)
             anchor_w = torch.from_numpy(masked_anchors[:, 0]).repeat(batch, fsize, fsize, 1).permute(0, 3, 1, 2).to(
@@ -354,7 +356,7 @@ def burnin_schedule(i):
     )
     scheduler = optim.lr_scheduler.LambdaLR(optimizer, burnin_schedule)
 
-    criterion = Yolo_loss(device=device, batch=config.batch // config.subdivisions, n_classes=config.classes)
+    criterion = Yolo_loss(device=device, img_size=config.width, batch=config.batch // config.subdivisions, n_classes=config.classes)
     # scheduler = ReduceLROnPlateau(optimizer, mode='max', verbose=True, patience=6, min_lr=1e-7)
     # scheduler = CosineAnnealingWarmRestarts(optimizer, 0.001, 1e-6, 20)
 
@@ -448,7 +450,7 @@ def burnin_schedule(i):
                 pass
             save_path = os.path.join(config.checkpoints, f'{save_prefix}{epoch + 1}.pth')
             if isinstance(model, torch.nn.DataParallel):
-                torch.save(model.moduel,state_dict(), save_path)
+                torch.save(model.module.state_dict(), save_path)
             else:
                 torch.save(model.state_dict(), save_path)
             logging.info(f'Checkpoint {epoch + 1} saved !')
@@ -509,6 +511,21 @@ def evaluate(model, data_loader, cfg, device, logger=None, **kwargs):
             labels = torch.as_tensor(labels, dtype=torch.int64)
             scores = np.max(confs, axis=1).flatten()
             scores = torch.as_tensor(scores, dtype=torch.float32)
+
+            # TODO NMS
+            # print('boxes.shape: {}'.format(boxes.size()))
+            # print('scores.shape: {}'.format(scores.size()))
+            keep = nms(boxes=boxes.squeeze(), scores=scores, iou_threshold=0.5)
+            boxes = boxes[keep]
+            scores = scores[keep]
+            labels = labels[keep]
+
+            # TODO filter out low-confidence detections
+            mask = scores > 0.5
+            boxes = boxes[mask]
+            scores = scores[mask]
+            labels = labels[mask]
+
             res[target["image_id"].item()] = {
                 "boxes": boxes,
                 "scores": scores,

From d9aba51357ebb453bd3d24b3d73a1af4ed2af684 Mon Sep 17 00:00:00 2001
From: KeepTryingTo <96606723+KeepTryingTo@users.noreply.github.com>
Date: Sat, 18 Jan 2025 13:56:33 +0800
Subject: [PATCH 2/3] Delete dataset.py

---
 dataset.py | 452 -------------------------------------------------------
 1 file changed, 452 deletions(-)
 delete mode 100644 dataset.py

diff --git a/dataset.py b/dataset.py
deleted file mode 100644
index 0e57e0ea..00000000
--- a/dataset.py
+++ /dev/null
@@ -1,452 +0,0 @@
-# -*- coding: utf-8 -*-
-'''
-@Time          : 2020/05/06 21:09
-@Author        : Tianxiaomo
-@File          : dataset.py
-@Noice         :
-@Modificattion :
-    @Author    :
-    @Time      :
-    @Detail    :
-
-'''
-import os
-import random
-import sys
-
-import cv2
-import numpy as np
-
-import torch
-from torch.utils.data.dataset import Dataset
-
-
-def rand_uniform_strong(min, max):
-    if min > max:
-        swap = min
-        min = max
-        max = swap
-    return random.random() * (max - min) + min
-
-
-def rand_scale(s):
-    scale = rand_uniform_strong(1, s)
-    if random.randint(0, 1) % 2:
-        return scale
-    return 1. / scale
-
-
-def rand_precalc_random(min, max, random_part):
-    if max < min:
-        swap = min
-        min = max
-        max = swap
-    return (random_part * (max - min)) + min
-
-
-def fill_truth_detection(bboxes, num_boxes, classes, flip, dx, dy, sx, sy, net_w, net_h):
-    if bboxes.shape[0] == 0:
-        return bboxes, 10000
-    np.random.shuffle(bboxes)
-    bboxes[:, 0] -= dx
-    bboxes[:, 2] -= dx
-    bboxes[:, 1] -= dy
-    bboxes[:, 3] -= dy
-
-    bboxes[:, 0] = np.clip(bboxes[:, 0], 0, sx)
-    bboxes[:, 2] = np.clip(bboxes[:, 2], 0, sx)
-
-    bboxes[:, 1] = np.clip(bboxes[:, 1], 0, sy)
-    bboxes[:, 3] = np.clip(bboxes[:, 3], 0, sy)
-
-    out_box = list(np.where(((bboxes[:, 1] == sy) & (bboxes[:, 3] == sy)) |
-                            ((bboxes[:, 0] == sx) & (bboxes[:, 2] == sx)) |
-                            ((bboxes[:, 1] == 0) & (bboxes[:, 3] == 0)) |
-                            ((bboxes[:, 0] == 0) & (bboxes[:, 2] == 0)))[0])
-    list_box = list(range(bboxes.shape[0]))
-    for i in out_box:
-        list_box.remove(i)
-    bboxes = bboxes[list_box]
-
-    if bboxes.shape[0] == 0:
-        return bboxes, 10000
-
-    bboxes = bboxes[np.where((bboxes[:, 4] < classes) & (bboxes[:, 4] >= 0))[0]]
-
-    if bboxes.shape[0] > num_boxes:
-        bboxes = bboxes[:num_boxes]
-
-    min_w_h = np.array([bboxes[:, 2] - bboxes[:, 0], bboxes[:, 3] - bboxes[:, 1]]).min()
-
-    bboxes[:, 0] *= (net_w / sx)
-    bboxes[:, 2] *= (net_w / sx)
-    bboxes[:, 1] *= (net_h / sy)
-    bboxes[:, 3] *= (net_h / sy)
-
-    if flip:
-        temp = net_w - bboxes[:, 0]
-        bboxes[:, 0] = net_w - bboxes[:, 2]
-        bboxes[:, 2] = temp
-
-    return bboxes, min_w_h
-
-
-def rect_intersection(a, b):
-    minx = max(a[0], b[0])
-    miny = max(a[1], b[1])
-
-    maxx = min(a[2], b[2])
-    maxy = min(a[3], b[3])
-    return [minx, miny, maxx, maxy]
-
-
-def image_data_augmentation(mat, w, h, pleft, ptop, swidth, sheight, flip, dhue, dsat, dexp, gaussian_noise, blur,
-                            truth):
-    try:
-        img = mat
-        oh, ow, _ = img.shape
-        pleft, ptop, swidth, sheight = int(pleft), int(ptop), int(swidth), int(sheight)
-        # crop
-        src_rect = [pleft, ptop, swidth + pleft, sheight + ptop]  # x1,y1,x2,y2
-        img_rect = [0, 0, ow, oh]
-        new_src_rect = rect_intersection(src_rect, img_rect)  # 交集
-
-        dst_rect = [max(0, -pleft), max(0, -ptop), max(0, -pleft) + new_src_rect[2] - new_src_rect[0],
-                    max(0, -ptop) + new_src_rect[3] - new_src_rect[1]]
-        # cv2.Mat sized
-
-        if (src_rect[0] == 0 and src_rect[1] == 0 and src_rect[2] == img.shape[0] and src_rect[3] == img.shape[1]):
-            sized = cv2.resize(img, (w, h), cv2.INTER_LINEAR)
-        else:
-            cropped = np.zeros([sheight, swidth, 3])
-            cropped[:, :, ] = np.mean(img, axis=(0, 1))
-
-            cropped[dst_rect[1]:dst_rect[3], dst_rect[0]:dst_rect[2]] = \
-                img[new_src_rect[1]:new_src_rect[3], new_src_rect[0]:new_src_rect[2]]
-
-            # resize
-            sized = cv2.resize(cropped, (w, h), cv2.INTER_LINEAR)
-
-        # flip
-        if flip:
-            # cv2.Mat cropped
-            sized = cv2.flip(sized, 1)  # 0 - x-axis, 1 - y-axis, -1 - both axes (x & y)
-
-        # HSV augmentation
-        # cv2.COLOR_BGR2HSV, cv2.COLOR_RGB2HSV, cv2.COLOR_HSV2BGR, cv2.COLOR_HSV2RGB
-        if dsat != 1 or dexp != 1 or dhue != 0:
-            if img.shape[2] >= 3:
-                hsv_src = cv2.cvtColor(sized.astype(np.float32), cv2.COLOR_RGB2HSV)  # RGB to HSV
-                hsv = list(cv2.split(hsv_src))
-                hsv[1] *= dsat
-                hsv[2] *= dexp
-                hsv[0] += 179 * dhue
-                hsv_src = cv2.merge(hsv)
-                sized = np.clip(cv2.cvtColor(hsv_src, cv2.COLOR_HSV2RGB), 0, 255)  # HSV to RGB (the same as previous)
-            else:
-                sized *= dexp
-
-        if blur:
-            if blur == 1:
-                dst = cv2.GaussianBlur(sized, (17, 17), 0)
-                # cv2.bilateralFilter(sized, dst, 17, 75, 75)
-            else:
-                ksize = (blur / 2) * 2 + 1
-                dst = cv2.GaussianBlur(sized, (ksize, ksize), 0)
-
-            if blur == 1:
-                img_rect = [0, 0, sized.cols, sized.rows]
-                for b in truth:
-                    left = (b.x - b.w / 2.) * sized.shape[1]
-                    width = b.w * sized.shape[1]
-                    top = (b.y - b.h / 2.) * sized.shape[0]
-                    height = b.h * sized.shape[0]
-                    roi(left, top, width, height)
-                    roi = roi & img_rect
-                    dst[roi[0]:roi[0] + roi[2], roi[1]:roi[1] + roi[3]] = sized[roi[0]:roi[0] + roi[2],
-                                                                                roi[1]:roi[1] + roi[3]]
-
-            sized = dst
-
-        if gaussian_noise:
-            noise = np.array(sized.shape)
-            gaussian_noise = min(gaussian_noise, 127)
-            gaussian_noise = max(gaussian_noise, 0)
-            cv2.randn(noise, 0, gaussian_noise)  # mean and variance
-            sized = sized + noise
-    except:
-        print("OpenCV can't augment image: " + str(w) + " x " + str(h))
-        sized = mat
-
-    return sized
-
-
-def filter_truth(bboxes, dx, dy, sx, sy, xd, yd):
-    bboxes[:, 0] -= dx
-    bboxes[:, 2] -= dx
-    bboxes[:, 1] -= dy
-    bboxes[:, 3] -= dy
-
-    bboxes[:, 0] = np.clip(bboxes[:, 0], 0, sx)
-    bboxes[:, 2] = np.clip(bboxes[:, 2], 0, sx)
-
-    bboxes[:, 1] = np.clip(bboxes[:, 1], 0, sy)
-    bboxes[:, 3] = np.clip(bboxes[:, 3], 0, sy)
-
-    out_box = list(np.where(((bboxes[:, 1] == sy) & (bboxes[:, 3] == sy)) |
-                            ((bboxes[:, 0] == sx) & (bboxes[:, 2] == sx)) |
-                            ((bboxes[:, 1] == 0) & (bboxes[:, 3] == 0)) |
-                            ((bboxes[:, 0] == 0) & (bboxes[:, 2] == 0)))[0])
-    list_box = list(range(bboxes.shape[0]))
-    for i in out_box:
-        list_box.remove(i)
-    bboxes = bboxes[list_box]
-
-    bboxes[:, 0] += xd
-    bboxes[:, 2] += xd
-    bboxes[:, 1] += yd
-    bboxes[:, 3] += yd
-
-    return bboxes
-
-
-def blend_truth_mosaic(out_img, img, bboxes, w, h, cut_x, cut_y, i_mixup,
-                       left_shift, right_shift, top_shift, bot_shift):
-    left_shift = min(left_shift, w - cut_x)
-    top_shift = min(top_shift, h - cut_y)
-    right_shift = min(right_shift, cut_x)
-    bot_shift = min(bot_shift, cut_y)
-
-    if i_mixup == 0:
-        bboxes = filter_truth(bboxes, left_shift, top_shift, cut_x, cut_y, 0, 0)
-        out_img[:cut_y, :cut_x] = img[top_shift:top_shift + cut_y, left_shift:left_shift + cut_x]
-    if i_mixup == 1:
-        bboxes = filter_truth(bboxes, cut_x - right_shift, top_shift, w - cut_x, cut_y, cut_x, 0)
-        out_img[:cut_y, cut_x:] = img[top_shift:top_shift + cut_y, cut_x - right_shift:w - right_shift]
-    if i_mixup == 2:
-        bboxes = filter_truth(bboxes, left_shift, cut_y - bot_shift, cut_x, h - cut_y, 0, cut_y)
-        out_img[cut_y:, :cut_x] = img[cut_y - bot_shift:h - bot_shift, left_shift:left_shift + cut_x]
-    if i_mixup == 3:
-        bboxes = filter_truth(bboxes, cut_x - right_shift, cut_y - bot_shift, w - cut_x, h - cut_y, cut_x, cut_y)
-        out_img[cut_y:, cut_x:] = img[cut_y - bot_shift:h - bot_shift, cut_x - right_shift:w - right_shift]
-
-    return out_img, bboxes
-
-
-def draw_box(img, bboxes):
-    for b in bboxes:
-        img = cv2.rectangle(img, (b[0], b[1]), (b[2], b[3]), (0, 255, 0), 2)
-    return img
-
-
-class Yolo_dataset(Dataset):
-    def __init__(self, label_path, cfg, train=True):
-        super(Yolo_dataset, self).__init__()
-        if cfg.mixup == 2:
-            print("cutmix=1 - isn't supported for Detector")
-            raise
-        elif cfg.mixup == 2 and cfg.letter_box:
-            print("Combination: letter_box=1 & mosaic=1 - isn't supported, use only 1 of these parameters")
-            raise
-
-        self.cfg = cfg
-        self.train = train
-
-        truth = {}
-        f = open(label_path, 'r', encoding='utf-8')
-        for line in f.readlines():
-            data = line.split(" ")
-            truth[data[0]] = []
-            for i in data[1:]:
-                truth[data[0]].append([int(float(j)) for j in i.split(',')])
-
-        self.truth = truth
-        self.imgs = list(self.truth.keys())
-
-    def __len__(self):
-        return len(self.truth.keys())
-
-    def __getitem__(self, index):
-        if not self.train:
-            return self._get_val_item(index)
-        img_path = self.imgs[index]
-        bboxes = np.array(self.truth.get(img_path), dtype=np.float)
-        img_path = os.path.join(self.cfg.dataset_dir, img_path)
-        use_mixup = self.cfg.mixup
-        if random.randint(0, 1):
-            use_mixup = 0
-
-        if use_mixup == 3:
-            min_offset = 0.2
-            cut_x = random.randint(int(self.cfg.w * min_offset), int(self.cfg.w * (1 - min_offset)))
-            cut_y = random.randint(int(self.cfg.h * min_offset), int(self.cfg.h * (1 - min_offset)))
-
-        r1, r2, r3, r4, r_scale = 0, 0, 0, 0, 0
-        dhue, dsat, dexp, flip, blur = 0, 0, 0, 0, 0
-        gaussian_noise = 0
-
-        out_img = np.zeros([self.cfg.h, self.cfg.w, 3])
-        out_bboxes = []
-
-        for i in range(use_mixup + 1):
-            if i != 0:
-                img_path = random.choice(list(self.truth.keys()))
-                bboxes = np.array(self.truth.get(img_path), dtype=np.float)
-                img_path = os.path.join(self.cfg.dataset_dir, img_path)
-            img = cv2.imread(img_path)
-            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
-            if img is None:
-                continue
-            oh, ow, oc = img.shape
-            dh, dw, dc = np.array(np.array([oh, ow, oc]) * self.cfg.jitter, dtype=np.int)
-
-            dhue = rand_uniform_strong(-self.cfg.hue, self.cfg.hue)
-            dsat = rand_scale(self.cfg.saturation)
-            dexp = rand_scale(self.cfg.exposure)
-
-            pleft = random.randint(-dw, dw)
-            pright = random.randint(-dw, dw)
-            ptop = random.randint(-dh, dh)
-            pbot = random.randint(-dh, dh)
-
-            flip = random.randint(0, 1) if self.cfg.flip else 0
-
-            if (self.cfg.blur):
-                tmp_blur = random.randint(0, 2)  # 0 - disable, 1 - blur background, 2 - blur the whole image
-                if tmp_blur == 0:
-                    blur = 0
-                elif tmp_blur == 1:
-                    blur = 1
-                else:
-                    blur = self.cfg.blur
-
-            if self.cfg.gaussian and random.randint(0, 1):
-                gaussian_noise = self.cfg.gaussian
-            else:
-                gaussian_noise = 0
-
-            if self.cfg.letter_box:
-                img_ar = ow / oh
-                net_ar = self.cfg.w / self.cfg.h
-                result_ar = img_ar / net_ar
-                # print(" ow = %d, oh = %d, w = %d, h = %d, img_ar = %f, net_ar = %f, result_ar = %f \n", ow, oh, w, h, img_ar, net_ar, result_ar);
-                if result_ar > 1:  # sheight - should be increased
-                    oh_tmp = ow / net_ar
-                    delta_h = (oh_tmp - oh) / 2
-                    ptop = ptop - delta_h
-                    pbot = pbot - delta_h
-                    # print(" result_ar = %f, oh_tmp = %f, delta_h = %d, ptop = %f, pbot = %f \n", result_ar, oh_tmp, delta_h, ptop, pbot);
-                else:  # swidth - should be increased
-                    ow_tmp = oh * net_ar
-                    delta_w = (ow_tmp - ow) / 2
-                    pleft = pleft - delta_w
-                    pright = pright - delta_w
-                    # printf(" result_ar = %f, ow_tmp = %f, delta_w = %d, pleft = %f, pright = %f \n", result_ar, ow_tmp, delta_w, pleft, pright);
-
-            swidth = ow - pleft - pright
-            sheight = oh - ptop - pbot
-
-            truth, min_w_h = fill_truth_detection(bboxes, self.cfg.boxes, self.cfg.classes, flip, pleft, ptop, swidth,
-                                                  sheight, self.cfg.w, self.cfg.h)
-            if (min_w_h / 8) < blur and blur > 1:  # disable blur if one of the objects is too small
-                blur = min_w_h / 8
-
-            ai = image_data_augmentation(img, self.cfg.w, self.cfg.h, pleft, ptop, swidth, sheight, flip,
-                                         dhue, dsat, dexp, gaussian_noise, blur, truth)
-
-            if use_mixup == 0:
-                out_img = ai
-                out_bboxes = truth
-            if use_mixup == 1:
-                if i == 0:
-                    old_img = ai.copy()
-                    old_truth = truth.copy()
-                elif i == 1:
-                    out_img = cv2.addWeighted(ai, 0.5, old_img, 0.5)
-                    out_bboxes = np.concatenate([old_truth, truth], axis=0)
-            elif use_mixup == 3:
-                if flip:
-                    tmp = pleft
-                    pleft = pright
-                    pright = tmp
-
-                left_shift = int(min(cut_x, max(0, (-int(pleft) * self.cfg.w / swidth))))
-                top_shift = int(min(cut_y, max(0, (-int(ptop) * self.cfg.h / sheight))))
-
-                right_shift = int(min((self.cfg.w - cut_x), max(0, (-int(pright) * self.cfg.w / swidth))))
-                bot_shift = int(min(self.cfg.h - cut_y, max(0, (-int(pbot) * self.cfg.h / sheight))))
-
-                out_img, out_bbox = blend_truth_mosaic(out_img, ai, truth.copy(), self.cfg.w, self.cfg.h, cut_x,
-                                                       cut_y, i, left_shift, right_shift, top_shift, bot_shift)
-                out_bboxes.append(out_bbox)
-                # print(img_path)
-        if use_mixup == 3:
-            out_bboxes = np.concatenate(out_bboxes, axis=0)
-        out_bboxes1 = np.zeros([self.cfg.boxes, 5])
-        out_bboxes1[:min(out_bboxes.shape[0], self.cfg.boxes)] = out_bboxes[:min(out_bboxes.shape[0], self.cfg.boxes)]
-        return out_img, out_bboxes1
-
-    def _get_val_item(self, index):
-        """
-        """
-        img_path = self.imgs[index]
-        bboxes_with_cls_id = np.array(self.truth.get(img_path), dtype=np.float)
-        img = cv2.imread(os.path.join(self.cfg.dataset_dir, img_path))
-        # img_height, img_width = img.shape[:2]
-        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
-        # img = cv2.resize(img, (self.cfg.w, self.cfg.h))
-        # img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0)
-        num_objs = len(bboxes_with_cls_id)
-        target = {}
-        # boxes to coco format
-        boxes = bboxes_with_cls_id[...,:4]
-        boxes[..., 2:] = boxes[..., 2:] - boxes[..., :2]  # box width, box height
-        target['boxes'] = torch.as_tensor(boxes, dtype=torch.float32)
-        target['labels'] = torch.as_tensor(bboxes_with_cls_id[...,-1].flatten(), dtype=torch.int64)
-        target['image_id'] = torch.tensor([get_image_id(img_path)])
-        target['area'] = (target['boxes'][:,3])*(target['boxes'][:,2])
-        target['iscrowd'] = torch.zeros((num_objs,), dtype=torch.int64)
-        return img, target
-
-
-def get_image_id(filename:str) -> int:
-    """
-    Convert a string to a integer.
-    Make sure that the images and the `image_id`s are in one-one correspondence.
-    There are already `image_id`s in annotations of the COCO dataset,
-    in which case this function is unnecessary.
-
-    For creating one's own `get_image_id` function, one can refer to
-    https://github.com/google/automl/blob/master/efficientdet/dataset/create_pascal_tfrecord.py#L86
-    or refer to the following code (where the filenames are like 'level1_123.jpg')
-    >>> lv, no = os.path.splitext(os.path.basename(filename))[0].split("_")
-    >>> lv = lv.replace("level", "")
-    >>> no = f"{int(no):04d}"
-    >>> return int(lv+no)
-    """
-    # raise NotImplementedError("Create your own 'get_image_id' function")
-    # lv, no = os.path.splitext(os.path.basename(filename))[0].split("_")
-    # lv = lv.replace("level", "")
-    # no = f"{int(no):04d}"
-    # return int(lv+no)
-
-    print("You could also create your own 'get_image_id' function.")
-    # print(filename)
-    parts = filename.split('/')[-1].split('_')[-1]
-    id = int(parts[0:-4])
-    # print(id)
-    return id
-
-
-if __name__ == "__main__":
-    from cfg import Cfg
-    import matplotlib.pyplot as plt
-
-    random.seed(2020)
-    np.random.seed(2020)
-    Cfg.dataset_dir = '/mnt/e/Dataset'
-    dataset = Yolo_dataset(Cfg.train_label, Cfg)
-    for i in range(100):
-        out_img, out_bboxes = dataset.__getitem__(i)
-        a = draw_box(out_img.copy(), out_bboxes.astype(np.int32))
-        plt.imshow(a.astype(np.int32))
-        plt.show()

From 269b2d9c90fab2ff8b52a01b2fdc8cf9d7f5863c Mon Sep 17 00:00:00 2001
From: KeepTryingTo <196637235@qq.com>
Date: Sat, 18 Jan 2025 13:57:58 +0800
Subject: [PATCH 3/3] train.py & dataset.py & tool/utils.py changes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 train.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/train.py b/train.py
index a7870e85..8ed2c3a1 100644
--- a/train.py
+++ b/train.py
@@ -499,7 +499,6 @@ def evaluate(model, data_loader, cfg, device, logger=None, **kwargs):
             img_height, img_width = img.shape[:2]
             # boxes = output[...,:4].copy()  # output boxes in yolo format
             boxes = boxes.squeeze(2).cpu().detach().numpy()
-            boxes[...,2:] = boxes[...,2:] - boxes[...,:2]  # Transform [x1, y1, x2, y2] to [x1, y1, w, h]
             boxes[...,0] = boxes[...,0]*img_width
             boxes[...,1] = boxes[...,1]*img_height
             boxes[...,2] = boxes[...,2]*img_width
@@ -515,6 +514,7 @@ def evaluate(model, data_loader, cfg, device, logger=None, **kwargs):
             # TODO NMS
             # print('boxes.shape: {}'.format(boxes.size()))
             # print('scores.shape: {}'.format(scores.size()))
+            from torchvision.ops import nms
             keep = nms(boxes=boxes.squeeze(), scores=scores, iou_threshold=0.5)
             boxes = boxes[keep]
             scores = scores[keep]
@@ -526,6 +526,8 @@ def evaluate(model, data_loader, cfg, device, logger=None, **kwargs):
             scores = scores[mask]
             labels = labels[mask]
 
+            boxes[..., 2:] = boxes[..., 2:] - boxes[..., :2]  # Transform [x1, y1, x2, y2] to [x1, y1, w, h]
+
             res[target["image_id"].item()] = {
                 "boxes": boxes,
                 "scores": scores,
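
Notes on the series

Patches 1/3 and 3/3 combine to change `evaluate`: patch 1 adds `torchvision.ops.nms` plus a score filter at a point where the boxes had already been converted to COCO's [x, y, w, h], while `nms` expects corner-format [x1, y1, x2, y2] boxes; patch 3 therefore drops the early conversion and re-applies it only after suppression and filtering. A minimal standalone sketch of the post-processing order the series converges on (the 0.5 thresholds are the patch's; the function name and shapes are illustrative):

    import torch
    from torchvision.ops import nms

    def postprocess(boxes_xyxy, confs, iou_thresh=0.5, conf_thresh=0.5):
        # boxes_xyxy: (N, 4) corner boxes in pixels; confs: (N, num_classes) scores
        scores, labels = confs.max(dim=1)           # best class and score per box
        keep = nms(boxes_xyxy, scores, iou_thresh)  # NMS on corner-format boxes
        boxes = boxes_xyxy[keep]
        scores, labels = scores[keep], labels[keep]
        mask = scores > conf_thresh                 # drop low-confidence detections
        boxes, scores, labels = boxes[mask], scores[mask], labels[mask]
        boxes[:, 2:] -= boxes[:, :2]                # corner -> COCO [x, y, w, h], last
        return boxes, scores, labels

Running NMS on [x, y, w, h] tensors makes it compute IoUs over rectangles whose "second corner" is really a width and height, so suppression is unreliable; that is why the conversion has to come after `nms`.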
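In `get_image_id` (patch 1/3), the new parsing keeps only the last '_'-separated token of the basename and slices off a four-character extension, so 'level1_123.jpg' maps to 123. A sketch of the same mapping with `os.path` handling the extension instead of the fixed `[0:-4]` slice (illustrative, not part of the patch):

    import os

    def get_image_id(filename: str) -> int:
        # patched behavior: 'path/level1_123.jpg' -> 'level1_123' -> 123
        stem = os.path.splitext(os.path.basename(filename))[0]
        return int(stem.split('_')[-1])

Note that discarding the 'level1' prefix can collide when two levels share a numeric suffix; the scheme in the docstring (concatenating the level with a zero-padded number) avoids that.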
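The checkpoint fix in patch 1/3 repairs `torch.save(model.moduel,state_dict(), save_path)`, which had both a misspelled attribute and a stray comma. For a `torch.nn.DataParallel` wrapper the underlying network lives in `.module`, and saving the unwrapped state dict keeps checkpoint keys free of the 'module.' prefix so they load into a plain model later. A small sketch of the pattern (helper name is illustrative):

    import torch

    def save_checkpoint(model, save_path):
        # unwrap DataParallel so state-dict keys carry no 'module.' prefix
        if isinstance(model, torch.nn.DataParallel):
            torch.save(model.module.state_dict(), save_path)
        else:
            torch.save(model.state_dict(), save_path)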
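The one-line dataset.py change in patch 1/3, `hsv = list(cv2.split(hsv_src))`, matters because `cv2.split` returns a tuple in current OpenCV Python bindings, so the following `hsv[1] *= dsat` fails with a TypeError without the `list(...)` copy. A self-contained sketch of that HSV jitter step (function name and signature are illustrative; the `179 * dhue` shift mirrors the original code):

    import cv2
    import numpy as np

    def jitter_hsv(rgb, dhue, dsat, dexp):
        hsv_src = cv2.cvtColor(rgb.astype(np.float32), cv2.COLOR_RGB2HSV)
        hsv = list(cv2.split(hsv_src))  # tuple -> list so channels can be reassigned
        hsv[1] *= dsat                  # scale saturation
        hsv[2] *= dexp                  # scale value (exposure)
        hsv[0] += 179 * dhue            # hue shift, as in the original code
        merged = cv2.merge(hsv)
        return np.clip(cv2.cvtColor(merged, cv2.COLOR_HSV2RGB), 0, 255)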
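Finally, `Yolo_loss` previously hard-coded `image_size = 608` when sizing its per-head prediction grids, so any other `config.width` produced grids that did not match the network output; patch 1/3 threads `img_size=config.width` through instead. The grid arithmetic, for illustration:

    # grid size per YOLO head: fsize = img_size // stride
    strides = [8, 16, 32]
    img_size = 512                              # e.g. config.width; 608 was hard-coded
    fsizes = [img_size // s for s in strides]   # -> [64, 32, 16]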