Skip to content

[QUESTION] Does CV-CUDA support multi-GPU? #212

Open
@zhengjs

Description

@zhengjs

Hi, I want to use this great work to speed up torch-based distributed training. It works well when using a single GPU, but when using more than one GPU it crashes with the following error:
terminate called after throwing an instance of 'pybind11::error_already_set' what(): ValueError: Hold resources failed: cudaErrorInvalidResourceHandle: invalid resource handle
I added some print statements to debug this problem. They show that everything is fine in rank_0, but cvcuda crashes in rank_1:
image

The main code is shown below:
`

 # Define the cuda device, context and streams.
        # The rank index is used directly as the GPU ordinal for this process.
        # NOTE(review): `cuda` is presumably pycuda.driver -- confirm against the imports.
        cuda_device = cuda.Device(self.rank)
        # Retain the device's primary context and push it; it is popped again
        # after training at the end of this snippet.
        cuda_ctx = cuda_device.retain_primary_context()
        cuda_ctx.push()
        # NOTE(review): this constructs a new cvcuda.Stream and then reads
        # `.current` from it. If `.current` refers to the globally current
        # stream rather than the one just constructed, the stream used below may
        # not belong to this rank's device -- a possible source of the
        # "invalid resource handle" error on ranks > 0. TODO confirm.
        cvcuda_stream = cvcuda.Stream().current
        # NOTE(review): `cuda_device` is the object returned by cuda.Device(...),
        # not an int or a torch.device -- confirm torch.cuda.default_stream
        # accepts it and resolves to the intended GPU.
        torch_stream = torch.cuda.default_stream(device=cuda_device)

        print(f'rank_{self.rank} start train, cvcuda stream: {cvcuda_stream}, torch_stream: {torch_stream}')
        # The preprocessor receives the same rank, context and CV-CUDA stream
        # that were set up above.
        self.data_preprocessor = PreprocessorCvcuda(
            self.rank, 
            cuda_ctx,
            cvcuda_stream,
        )

        #  Do everything in streams.
        # Both stream objects are entered as context managers for the whole
        # training run.
        with cvcuda_stream, torch.cuda.stream(torch_stream):
            self.train(train_dataloaders, test_dataloaders, iterations=iterations)
            # NOTE(review): pop() runs inside the `with` block and only after
            # train() returns; it is not reached if train() raises.
            cuda_ctx.pop()
`
````python
class ImageBatchDecoder:
    """Decode a batch of encoded images and augment each one on the GPU.

    Each call decodes the images with nvimgcodec, then applies a per-image
    resize + crop followed by a rotation with CV-CUDA, and finally stacks the
    results into one tensor that is reformatted to "NCHW".

    All GPU work is submitted on the stream passed to the constructor.
    NOTE(review): that stream must belong to the same device as ``device_id``;
    a mismatch is a plausible cause of "invalid resource handle" errors in
    multi-GPU runs -- confirm against the caller.
    """

    def __init__(
        self,
        device_id,
        cuda_ctx,
        cuda_stream,
        cvcuda_perf=None,
    ):
        """Store the execution resources and create the nvimgcodec decoder.

        Args:
            device_id: GPU ordinal handed to ``nvimgcodec.Decoder``; also used
                as the rank label in the debug output.
            cuda_ctx: CUDA context object; stored but not used by this class.
            cuda_stream: Stream passed to the decode call and every CV-CUDA
                operator.
            cvcuda_perf: Optional profiler exposing ``push_range(name)`` and
                ``pop_range()``; when ``None`` no ranges are recorded.
        """
        self.device_id = device_id
        self.cuda_ctx = cuda_ctx
        self.cuda_stream = cuda_stream
        self.cvcuda_perf = cvcuda_perf
        self.decoder = nvimgcodec.Decoder(device_id=device_id)

    def __call__(self, batch: list, aug_params: dict):
        """Decode ``batch`` and apply the augmentations in ``aug_params``.

        Args:
            batch: Batch of undecoded image bytes, organised as an iterable of
                frames where each frame is an iterable of encoded images. It is
                flattened frame by frame before decoding.
            aug_params: Mapping with the keys ``'resize'``, ``'crop'`` and
                ``'rotate'``. Each value must support
                ``.view(...).cpu().numpy()`` (presumably torch tensors -- TODO
                confirm). They are viewed as ``(-1, 2)``, ``(-1, 4)`` and
                ``(-1,)`` respectively, one row/entry per decoded image.
                A crop row is read as two corner points: entries 0/1 are the
                origin and entries 2/3 the opposite corner. Rotation angles are
                in degrees.

        Returns:
            The result of ``cvcuda.reformat(cvcuda.stack(...), "NCHW")`` over
            all augmented images.
        """
        perf = self.cvcuda_perf
        if perf is not None:
            perf.push_range("decoder.nvimagecodec")

        # try/finally keeps the profiler range balanced even when decoding or
        # one of the CV-CUDA operators raises.
        try:
            # Flatten [frame][image] into a single list of encoded images.
            data_batch = [img for frame in batch for img in frame]

            print(f'rank_{self.device_id} start decode, stream: {self.cuda_stream}...', flush=True)
            image_list = self.decoder.decode(data_batch, cuda_stream=self.cuda_stream)
            print(f'rank_{self.device_id} end decode...', flush=True)

            # Bring the augmentation parameters to host memory once, up front.
            resize = aug_params['resize'].view(-1, 2).cpu().numpy()
            crop = aug_params['crop'].view(-1, 4).cpu().numpy()
            rotate = aug_params['rotate'].view(-1).cpu().numpy()
            rotate_rad = np.deg2rad(rotate)
            sin_r = np.sin(rotate_rad)
            cos_r = np.cos(rotate_rad)

            # Convert the decoded images to nvcv tensors in a list.
            tensor_list = []
            for i, image in enumerate(image_list):
                print(f'rank_{self.device_id} start resize_crop_convert_reformat...', flush=True)
                aug_img = cvcuda.resize_crop_convert_reformat(
                    cvcuda.as_tensor(image, "HWC"),
                    (resize[i, 0], resize[i, 1]),
                    cvcuda.Interp.LINEAR,
                    # The rectangle takes an origin plus a size, so the size is
                    # derived from the two crop corners.
                    cvcuda.RectI(
                        crop[i, 0],
                        crop[i, 1],
                        round(crop[i, 2] - crop[i, 0]),
                        round(crop[i, 3] - crop[i, 1])),
                    layout="HWC",
                    data_type=nvcv.Type.U8,
                    stream=self.cuda_stream,
                )

                print(f'rank_{self.device_id} start rotate...', flush=True)
                # Layout is HWC, so shape[0] is the height and shape[1] the width.
                height = aug_img.shape[0]
                width = aug_img.shape[1]
                # Shift that makes the rotation pivot around the image centre
                # instead of the origin.
                shift = [
                    0.5 * (width - width * cos_r[i] - height * sin_r[i]),
                    0.5 * (height + width * sin_r[i] - height * cos_r[i]),
                ]
                aug_img = cvcuda.rotate(
                    aug_img,
                    rotate[i],
                    shift,
                    cvcuda.Interp.LINEAR,
                    stream=self.cuda_stream,
                )
                tensor_list.append(aug_img)

            # Stack the list of tensors to a single NHWC tensor and convert to NCHW.
            print(f'rank_{self.device_id} start reformat...', flush=True)
            cvcuda_decoded_tensor = cvcuda.reformat(
                cvcuda.stack(tensor_list), "NCHW", stream=self.cuda_stream
            )
        finally:
            if perf is not None:
                perf.pop_range()

        print(f'rank_{self.device_id} end of ImageBatchDecoder...', flush=True)
        return cvcuda_decoded_tensor

Metadata

Metadata

Labels

question: Question(s) from user.

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions