Open
Description
Hi, I want to use this great work to speed up torch-based distributed training. It works well on a single GPU, but with more than one GPU it crashes with the following error:
terminate called after throwing an instance of 'pybind11::error_already_set' what(): ValueError: Hold resources failed: cudaErrorInvalidResourceHandle: invalid resource handle
I added some print statements to debug this problem. They show that everything is fine on rank_0, but cvcuda crashes on rank_1.
The main code is shown below:
`
# Define the cuda device, context and streams.
# NOTE(review): `cuda` is presumably pycuda.driver; its import is not shown in this excerpt.
# One device is selected per rank, indexed by self.rank.
cuda_device = cuda.Device(self.rank)
cuda_ctx = cuda_device.retain_primary_context()
# Push the retained primary context; it is popped again after training below.
cuda_ctx.push()
# NOTE(review): this constructs a new cvcuda.Stream and then reads `.current`
# from it -- confirm which stream object that yields on ranks other than 0.
cvcuda_stream = cvcuda.Stream().current
# NOTE(review): a `cuda.Device` object (rather than self.rank or a torch.device)
# is passed as `device` -- confirm torch resolves it to this rank's GPU.
torch_stream = torch.cuda.default_stream(device=cuda_device)
print(f'rank_{self.rank} start train, cvcuda stream: {cvcuda_stream}, torch_stream: {torch_stream}')
# The preprocessor receives the same rank, context and cvcuda stream created above.
self.data_preprocessor = PreprocessorCvcuda(
self.rank,
cuda_ctx,
cvcuda_stream,
)
# Do everything in streams.
with cvcuda_stream, torch.cuda.stream(torch_stream):
self.train(train_dataloaders, test_dataloaders, iterations=iterations)
# NOTE(review): pop() is not inside try/finally, so it is skipped if train() raises.
cuda_ctx.pop()
`
````python
class ImageBatchDecoder:
    """Decode a batch of encoded images and resize/crop/rotate each one.

    Decoding is delegated to an ``nvimgcodec.Decoder``; the per-image
    augmentations and the final stacking are cvcuda operators. Every call is
    issued on the stream supplied at construction time.
    """

    def __init__(
        self,
        device_id,
        cuda_ctx,
        cuda_stream,
        cvcuda_perf=None,
    ):
        """Store the execution handles and create the decoder.

        Args:
            device_id: Device index handed to ``nvimgcodec.Decoder``; also used
                as the rank label in the progress messages.
            cuda_ctx: CUDA context object. It is only stored; no method of
                this class reads it.
            cuda_stream: Stream passed to the decoder and to every cvcuda call.
            cvcuda_perf: Optional profiler exposing ``push_range(name)`` and
                ``pop_range()``. ``None`` disables the range markers.
        """
        self.device_id = device_id
        self.cuda_ctx = cuda_ctx
        self.cuda_stream = cuda_stream
        self.cvcuda_perf = cvcuda_perf
        self.decoder = nvimgcodec.Decoder(device_id=device_id)

    def __call__(self, batch: list, aug_params: dict):
        """Decode ``batch`` and return the augmented images as one NCHW tensor.

        Args:
            batch: Sequence of frames, each a sequence of undecoded image
                bytes. Frames are flattened in order before decoding.
            aug_params: Mapping with entries ``'resize'``, ``'crop'`` and
                ``'rotate'``. Each must support ``.view(...).cpu().numpy()``
                (presumably torch tensors) and hold one row per decoded image.
                ``'rotate'`` is in degrees.

        Returns:
            The result of stacking the augmented images and reformatting the
            stack to ``"NCHW"``.

        Raises:
            Whatever the decoder or a cvcuda operator raises. The profiler
            range opened here is closed before the exception propagates.
        """
        perf = self.cvcuda_perf
        if perf is not None:
            perf.push_range("decoder.nvimagecodec")
        try:
            cvcuda_decoded_tensor = self._decode_and_augment(batch, aug_params)
        finally:
            # Always close the range, otherwise a failed batch leaves the
            # profiler's push/pop nesting unbalanced for later batches.
            if perf is not None:
                perf.pop_range()
        print(f'rank_{self.device_id} end of ImageBatchDecoder...', flush=True)
        return cvcuda_decoded_tensor

    def _decode_and_augment(self, batch, aug_params):
        """Decode the flattened batch, augment each image and stack to NCHW."""
        data_batch = [img for frame in batch for img in frame]

        print(f'rank_{self.device_id} start decode, stream: {self.cuda_stream}...', flush=True)
        image_list = self.decoder.decode(data_batch, cuda_stream=self.cuda_stream)
        print(f'rank_{self.device_id} end decode...', flush=True)

        # One row of parameters per image, moved to host memory once up front.
        resize = aug_params['resize'].view(-1, 2).cpu().numpy()
        crop = aug_params['crop'].view(-1, 4).cpu().numpy()
        rotate = aug_params['rotate'].view(-1).cpu().numpy()
        rotate_rad = rotate * np.pi / 180
        sin_r = np.sin(rotate_rad)
        cos_r = np.cos(rotate_rad)

        # Convert the decoded images to nvcv tensors in a list.
        tensor_list = []
        for i, image in enumerate(image_list):
            print(f'rank_{self.device_id} start resize_crop_convert_reformat...', flush=True)
            aug_img = cvcuda.resize_crop_convert_reformat(
                cvcuda.as_tensor(image, "HWC"),
                (resize[i, 0], resize[i, 1]),
                cvcuda.Interp.LINEAR,
                # The first two crop columns give the rectangle origin; the
                # last two are the opposite corner, so the extent is their
                # difference.
                cvcuda.RectI(
                    crop[i, 0],
                    crop[i, 1],
                    round(crop[i, 2] - crop[i, 0]),
                    round(crop[i, 3] - crop[i, 1]),
                ),
                layout="HWC",
                data_type=nvcv.Type.U8,
                stream=self.cuda_stream,
            )

            print(f'rank_{self.device_id} start rotate...', flush=True)
            # The tensor was produced with layout "HWC": axis 0 is height and
            # axis 1 is width.
            height = aug_img.shape[0]
            width = aug_img.shape[1]
            # Shift built from half the image extent; presumably this keeps
            # the rotation centred on the image centre -- verify against the
            # cvcuda.rotate documentation.
            shift = [
                0.5 * (width - width * cos_r[i] - height * sin_r[i]),
                0.5 * (height + width * sin_r[i] - height * cos_r[i]),
            ]
            aug_img = cvcuda.rotate(
                aug_img,
                rotate[i],
                shift,
                cvcuda.Interp.LINEAR,
                stream=self.cuda_stream,
            )
            tensor_list.append(aug_img)

        # Stack the list of tensors to a single NHWC tensor and convert to NCHW.
        print(f'rank_{self.device_id} start reformat...', flush=True)
        return cvcuda.reformat(cvcuda.stack(tensor_list), "NCHW", stream=self.cuda_stream)