@@ -91,6 +91,91 @@ namespace mesos {
91
91
namespace internal {
92
92
namespace slave {
93
93
94
+ namespace {
95
+
96
+ Try<Nothing> allowDevice (
97
+ const std::string& hierarchy,
98
+ const std::string& cgroup,
99
+ unsigned int major,
100
+ unsigned int minor)
101
+ {
102
+ cgroups::devices::Entry entry;
103
+ entry.selector .type = Entry::Selector::Type::CHARACTER;
104
+ entry.selector .major = major;
105
+ entry.selector .minor = minor;
106
+ entry.access .read = true ;
107
+ entry.access .write = true ;
108
+ entry.access .mknod = true ;
109
+
110
+ Try<Nothing> allow = cgroups::devices::allow (
111
+ hierarchy, cgroup, entry);
112
+
113
+ if (allow.isError ()) {
114
+ return Error (" Failed to allow device '" + stringify (entry)
115
+ + " ': " + allow.error ());
116
+ }
117
+
118
+ return Nothing ();
119
+ }
120
+
121
+
122
+ Try<Nothing> denyDevice (
123
+ const std::string& hierarchy,
124
+ const std::string& cgroup,
125
+ unsigned int major,
126
+ unsigned int minor)
127
+ {
128
+ cgroups::devices::Entry entry;
129
+ entry.selector .type = Entry::Selector::Type::CHARACTER;
130
+ entry.selector .major = major;
131
+ entry.selector .minor = minor;
132
+ entry.access .read = true ;
133
+ entry.access .write = true ;
134
+ entry.access .mknod = true ;
135
+
136
+ Try<Nothing> deny = cgroups::devices::deny (
137
+ hierarchy, cgroup, entry);
138
+
139
+ if (deny.isError ()) {
140
+ return Error (" Failed to deny device '" + stringify (entry)
141
+ + " ': " + deny.error ());
142
+ }
143
+
144
+ return Nothing ();
145
+ }
146
+
147
+
148
+ Try<Nothing> addDeviceToContainer (
149
+ const string& device,
150
+ const string& devicesDir,
151
+ const string& rootfsDir,
152
+ ContainerLaunchInfo& launchInfo)
153
+ {
154
+ const string devicePath = path::join (
155
+ devicesDir, strings::remove (device, " /dev/" , strings::PREFIX), device);
156
+
157
+ Try<Nothing> mknod =
158
+ fs::chroot::copyDeviceNode (device, devicePath);
159
+ if (mknod .isError ()) {
160
+ return Error (" Failed to copy device: " + mknod .error ());
161
+ }
162
+
163
+ // Since we are adding the GPU devices to the container, make
164
+ // them read/write to guarantee that they are accessible inside
165
+ // the container.
166
+ Try<Nothing> chmod = os::chmod (devicePath, 0666 );
167
+ if (chmod.isError ()) {
168
+ return Error (" Failed to set permissions: " + chmod.error ());
169
+ }
170
+
171
+ *launchInfo.add_mounts () = protobuf::slave::createContainerMount (
172
+ devicePath, path::join (rootfsDir, device), MS_BIND);
173
+
174
+ return Nothing ();
175
+ }
176
+
177
+ } // namespace {
178
+
94
179
NvidiaGpuIsolatorProcess::NvidiaGpuIsolatorProcess (
95
180
const Flags& _flags,
96
181
const string& _hierarchy,
@@ -297,9 +382,24 @@ Future<Nothing> NvidiaGpuIsolatorProcess::recover(
297
382
foreach (const Gpu& gpu, available) {
298
383
if (entry.selector .major == gpu.major &&
299
384
entry.selector .minor == gpu.minor ) {
300
- containerGpus.insert (gpu);
301
- break ;
302
- }
385
+ if (gpu.ismig ) {
386
+ // The GPU device itself; only a match with a GPU that
387
+ // isn't a MIG instance, as MIG instances need access to
388
+ // the GPU device and the MIG devices.
389
+ continue ;
390
+ }
391
+
392
+ containerGpus.insert (gpu);
393
+ break ;
394
+ }
395
+
396
+ // Match up MIG devices
397
+ if ((entry.selector .major == gpu.caps_major )
398
+ && ((entry.selector .minor == gpu.gi_minor )
399
+ || (entry.selector .minor == gpu.ci_minor ))) {
400
+ containerGpus.insert (gpu);
401
+ break ;
402
+ }
303
403
}
304
404
}
305
405
@@ -443,39 +543,23 @@ Future<Option<ContainerLaunchInfo>> NvidiaGpuIsolatorProcess::_prepare(
443
543
}
444
544
445
545
foreach (const string& device, nvidia.get ()) {
446
- // The directory `/dev/nvidia-caps` was introduced in CUDA 11.0, just
447
- // ignore it since we only care about the Nvidia GPU device files.
448
- //
449
- // TODO(qianzhang): Figure out how to handle the directory
450
- // `/dev/nvidia-caps` more properly.
546
+ // Ignore /dev/nvidia-caps, we'll handle that directory later on
451
547
if (device == " /dev/nvidia-caps" ) {
452
548
continue ;
453
549
}
454
550
455
- const string devicePath = path::join (
456
- devicesDir, strings::remove (device, " /dev/" , strings::PREFIX), device);
457
-
458
- Try<Nothing> mknod =
459
- fs::chroot::copyDeviceNode (device, devicePath);
460
- if (mknod .isError ()) {
461
- return Failure (
462
- " Failed to copy device '" + device + " ': " + mknod .error ());
551
+ Try<Nothing> added = addDeviceToContainer (device, devicesDir, containerConfig.rootfs (), launchInfo);
552
+ if (added.isError ()) {
553
+ return Failure (" Could not add device '" + device + " ' to container: " + added.error ());
463
554
}
555
+ }
464
556
465
- // Since we are adding the GPU devices to the container, make
466
- // them read/write to guarantee that they are accessible inside
467
- // the container.
468
- Try<Nothing> chmod = os::chmod (devicePath, 0666 );
469
- if (chmod.isError ()) {
470
- return Failure (
471
- " Failed to set permissions on device '" + device + " ': " +
472
- chmod.error ());
557
+ Try<list<string>> caps = os::glob (" /dev/nvidia-caps/*" );
558
+ foreach (const string& device, caps.get ()) {
559
+ Try<Nothing> added = addDeviceToContainer (device, devicesDir, containerConfig.rootfs (), launchInfo);
560
+ if (added.isError ()) {
561
+ return Failure (" Could not add device '" + device + " ' to container: " + added.error ());
473
562
}
474
-
475
- *launchInfo.add_mounts () = protobuf::slave::createContainerMount (
476
- devicePath,
477
- path::join (containerConfig.rootfs (), device),
478
- MS_BIND);
479
563
}
480
564
481
565
return launchInfo;
@@ -520,31 +604,55 @@ Future<Nothing> NvidiaGpuIsolatorProcess::update(
520
604
} else if (requested < info->allocated .size ()) {
521
605
size_t fewer = info->allocated .size () - requested;
522
606
607
+ set<std::pair<unsigned int , unsigned int >> deallocated_devs;
523
608
set<Gpu> deallocated;
524
609
525
610
for (size_t i = 0 ; i < fewer; i++) {
526
611
const auto gpu = info->allocated .begin ();
527
612
528
- cgroups::devices::Entry entry;
529
- entry.selector .type = Entry::Selector::Type::CHARACTER;
530
- entry.selector .major = gpu->major ;
531
- entry.selector .minor = gpu->minor ;
532
- entry.access .read = true ;
533
- entry.access .write = true ;
534
- entry.access .mknod = true ;
535
-
536
- Try<Nothing> deny = cgroups::devices::deny (
537
- hierarchy, info->cgroup , entry);
538
-
539
- if (deny.isError ()) {
540
- return Failure (" Failed to deny cgroups access to GPU device"
541
- " '" + stringify (entry) + " ': " + deny.error ());
613
+ // We can't blindly deny the main GPU device, as it is needed
614
+ // by other MIG devices on that same GPU.
615
+ deallocated_devs.insert (std::make_pair (gpu->major , gpu->minor ));
616
+
617
+ if (gpu->ismig ) {
618
+ // MIG GPU instance
619
+ Try<Nothing> deny = denyDevice (hierarchy, info->cgroup , gpu->caps_major , gpu->gi_minor );
620
+ if (deny.isError ()) {
621
+ return Failure (" Failed to deny cgroups access to MIG GI device: " + deny.error ());
622
+ }
623
+
624
+ // MIG Compute instance
625
+ deny = denyDevice (hierarchy, info->cgroup , gpu->caps_major , gpu->ci_minor );
626
+ if (deny.isError ()) {
627
+ return Failure (" Failed to deny cgroups access to MIG CI device: " + deny.error ());
628
+ }
542
629
}
543
630
544
631
deallocated.insert (*gpu);
545
632
info->allocated .erase (gpu);
546
633
}
547
634
635
+ set<std::pair<unsigned int , unsigned int >> allocated_devs;
636
+ foreach (Gpu gpu, info->allocated ) {
637
+ allocated_devs.insert (std::make_pair (gpu.major , gpu.minor ));
638
+ }
639
+
640
+ // Any GPU device present in the difference of the two sets can now
641
+ // be denied, as it is not needed by any of the remaining allocated
642
+ // GPUs.
643
+ set<std::pair<unsigned int , unsigned int >> safe_deny;
644
+ std::set_difference (deallocated_devs.begin (), deallocated_devs.end (),
645
+ allocated_devs.begin (), allocated_devs.end (),
646
+ std::inserter (safe_deny, safe_deny.begin ()));
647
+
648
+ foreach (auto dev, safe_deny) {
649
+ // Main GPU device node
650
+ Try<Nothing> deny = denyDevice (hierarchy, info->cgroup , dev.first , dev.second );
651
+ if (deny.isError ()) {
652
+ return Failure (" Failed to deny cgroups access to GPU device: " + deny.error ());
653
+ }
654
+ }
655
+
548
656
return allocator.deallocate (deallocated);
549
657
}
550
658
@@ -563,20 +671,21 @@ Future<Nothing> NvidiaGpuIsolatorProcess::_update(
563
671
Info* info = CHECK_NOTNULL (infos.at (containerId));
564
672
565
673
foreach (const Gpu& gpu, allocation) {
566
- cgroups::devices::Entry entry;
567
- entry.selector .type = Entry::Selector::Type::CHARACTER;
568
- entry.selector .major = gpu.major ;
569
- entry.selector .minor = gpu.minor ;
570
- entry.access .read = true ;
571
- entry.access .write = true ;
572
- entry.access .mknod = true ;
674
+ Try<Nothing> allow = allowDevice (hierarchy, info->cgroup , gpu.major , gpu.minor );
675
+ if (allow.isError ()) {
676
+ return Failure (" Failed to grant cgroups access to GPU device: " + allow.error ());
677
+ }
573
678
574
- Try<Nothing> allow = cgroups::devices::allow (
575
- hierarchy, info->cgroup , entry);
679
+ if (gpu.ismig ) {
680
+ allow = allowDevice (hierarchy, info->cgroup , gpu.caps_major , gpu.gi_minor );
681
+ if (allow.isError ()) {
682
+ return Failure (" Failed to grant cgroups access to MIG GI device: " + allow.error ());
683
+ }
576
684
577
- if (allow.isError ()) {
578
- return Failure (" Failed to grant cgroups access to GPU device"
579
- " '" + stringify (entry) + " ': " + allow.error ());
685
+ allow = allowDevice (hierarchy, info->cgroup , gpu.caps_major , gpu.ci_minor );
686
+ if (allow.isError ()) {
687
+ return Failure (" Failed to grant cgroups access to MIG CI device: " + allow.error ());
688
+ }
580
689
}
581
690
}
582
691
0 commit comments