[QUESTION] Does CV-CUDA support multi-GPU?

[//]: # "SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved."
[//]: # "SPDX-License-Identifier: Apache-2.0"
[//]: # ""
[//]: # "Licensed under the Apache License, Version 2.0 (the 'License');"
[//]: # "you may not use this file except in compliance with the License."
[//]: # "You may obtain a copy of the License at"
[//]: # "http://www.apache.org/licenses/LICENSE-2.0"
[//]: # ""
[//]: # "Unless required by applicable law or agreed to in writing, software"
[//]: # "distributed under the License is distributed on an 'AS IS' BASIS"
[//]: # "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied."
[//]: # "See the License for the specific language governing permissions and"
[//]: # "limitations under the License."

Hi, I want to use this great work in torch-based distributed training to speed things up. It works well when using only a single GPU, but when using more than one GPU it crashes with the following error:
`terminate called after throwing an instance of 'pybind11::error_already_set'
what(): ValueError: Hold resources failed: cudaErrorInvalidResourceHandle: invalid resource handle`
I added some print statements to debug the problem. They show that everything is fine on rank_0, but cvcuda crashes on rank_1:
![image](https://github.com/user-attachments/assets/03520fa9-9f73-4af6-bdc8-297cde085c31)

The main code is shown below:

````python       
 # Define the cuda device, context and streams.
        cuda_device = cuda.Device(self.rank)
        cuda_ctx = cuda_device.retain_primary_context()
        cuda_ctx.push()
        cvcuda_stream = cvcuda.Stream().current
        torch_stream = torch.cuda.default_stream(device=cuda_device)

        print(f'rank_{self.rank} start train, cvcuda stream: {cvcuda_stream}, torch_stream: {torch_stream}')
        self.data_preprocessor = PreprocessorCvcuda(
            self.rank, 
            cuda_ctx,
            cvcuda_stream,
        )

        #  Do everything in streams.
        with cvcuda_stream, torch.cuda.stream(torch_stream):
            self.train(train_dataloaders, test_dataloaders, iterations=iterations)
            cuda_ctx.pop()
````

````python
class ImageBatchDecoder:
    """Decode a batch of encoded images and augment each one with CV-CUDA.

    Calling an instance decodes the images with nvImageCodec, applies a
    per-image resize/crop followed by a rotation, stacks the results and
    returns them reformatted to NCHW.
    """

    def __init__(
        self,
        device_id,
        cuda_ctx,
        cuda_stream,
        cvcuda_perf=None,
    ):
        """Store the CUDA handles and create the nvImageCodec decoder.

        Args:
            device_id: Passed to ``nvimgcodec.Decoder`` as ``device_id``; also
                used as the rank label in the debug prints.
            cuda_ctx: Stored on the instance; not used by this class itself.
            cuda_stream: Stream handed to the decoder and to every CV-CUDA
                operator call.
            cvcuda_perf: Optional profiler object exposing ``push_range`` and
                ``pop_range``. When ``None`` no profiling range is recorded.
        """
        self.device_id = device_id
        self.cuda_ctx = cuda_ctx
        self.cuda_stream = cuda_stream
        self.cvcuda_perf = cvcuda_perf
        self.decoder = nvimgcodec.Decoder(device_id=device_id)

    def __call__(self, batch: list, aug_params: dict):
        """Decode and augment ``batch``, returning one stacked NCHW tensor.

        Args:
            batch: Nested collection of undecoded image bytes; it is flattened
                as ``[img for frame in batch for img in frame]`` before
                decoding.
            aug_params: Mapping with the keys ``'resize'``, ``'crop'`` and
                ``'rotate'``. Each value is consumed through
                ``.view(...).cpu().numpy()`` (presumably torch tensors --
                TODO confirm) and reshaped to ``(-1, 2)``, ``(-1, 4)`` and
                ``(-1,)`` respectively, with one row per decoded image.
                NOTE(review): the crop rows look like ``(x1, y1, x2, y2)``
                because width/height are derived as differences -- confirm
                against the caller.

        Returns:
            The result of ``cvcuda.reformat(cvcuda.stack(...), "NCHW")``.
        """
        if self.cvcuda_perf is not None:
            self.cvcuda_perf.push_range("decoder.nvimagecodec")

        # The profiler range is closed in ``finally`` so that an exception in
        # decoding or in any operator does not leave it open.
        try:
            data_batch = [img for frame in batch for img in frame]

            print(f'rank_{self.device_id} start decode, stream: {self.cuda_stream}...', flush=True)
            image_list = self.decoder.decode(data_batch, cuda_stream=self.cuda_stream)
            print(f'rank_{self.device_id} end decode...', flush=True)

            resize = aug_params['resize'].view(-1, 2).cpu().numpy()
            crop = aug_params['crop'].view(-1, 4).cpu().numpy()
            rotate = aug_params['rotate'].view(-1).cpu().numpy()
            # ``rotate`` itself is passed to cvcuda.rotate unchanged; the
            # radian copy is only needed for the shift computation below.
            rotate_rad = rotate * np.pi / 180
            sin_r = np.sin(rotate_rad)
            cos_r = np.cos(rotate_rad)

            # Convert the decoded images to nvcv tensors in a list.
            tensor_list = []
            for i, image in enumerate(image_list):
                print(f'rank_{self.device_id} start resize_crop_convert_reformat...', flush=True)
                aug_img = cvcuda.resize_crop_convert_reformat(
                    cvcuda.as_tensor(image, "HWC"),
                    (resize[i, 0], resize[i, 1]),
                    cvcuda.Interp.LINEAR,
                    cvcuda.RectI(
                        crop[i, 0],
                        crop[i, 1],
                        round(crop[i, 2] - crop[i, 0]),
                        round(crop[i, 3] - crop[i, 1])),
                    layout="HWC",
                    data_type=nvcv.Type.U8,
                    # manip=cvcuda.ChannelManip.REVERSE,
                    # scale=1. / 255,
                    stream=self.cuda_stream,
                )
                print(f'rank_{self.device_id} start rotate...', flush=True)
                # Layout is HWC, so shape[0] is the height and shape[1] the width.
                height = aug_img.shape[0]
                width = aug_img.shape[1]
                # Shift passed to cvcuda.rotate; presumably chosen so that the
                # rotation is about the image centre -- TODO confirm.
                shift = [
                    0.5 * (width - width * cos_r[i] - height * sin_r[i]),
                    0.5 * (height + width * sin_r[i] - height * cos_r[i]),
                ]
                aug_img = cvcuda.rotate(
                    aug_img,
                    rotate[i],
                    shift,
                    cvcuda.Interp.LINEAR,
                    stream=self.cuda_stream,
                )
                tensor_list.append(aug_img)

            # Stack the list of tensors to a single NHWC tensor and convert to NCHW.
            print(f'rank_{self.device_id} start reformat...', flush=True)
            cvcuda_decoded_tensor = cvcuda.reformat(
                cvcuda.stack(tensor_list), "NCHW", stream=self.cuda_stream
            )
        finally:
            if self.cvcuda_perf is not None:
                self.cvcuda_perf.pop_range()

        print(f'rank_{self.device_id} end of ImageBatchDecoder...', flush=True)
        return cvcuda_decoded_tensor

````

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[QUESTION] Does CV-CUDA support for multigpu? #212

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

[QUESTION] Does CV-CUDA support for multigpu? #212

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions