Skip to content

Commit 7147fe9

Browse files
committed
MIG support in GPU isolator
1 parent c77bc4b commit 7147fe9

File tree

1 file changed

+164
-55
lines changed

1 file changed

+164
-55
lines changed

src/slave/containerizer/mesos/isolators/gpu/isolator.cpp

Lines changed: 164 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,91 @@ namespace mesos {
9191
namespace internal {
9292
namespace slave {
9393

94+
namespace {
95+
96+
// Whitelists a single character device for a cgroup.
//
// Builds a devices-cgroup entry for the character device `major`:`minor`
// with read, write and mknod access, and passes it to
// `cgroups::devices::allow` for `cgroup` in `hierarchy`.
//
// Returns `Nothing` on success, or an `Error` naming the entry and
// carrying the underlying failure reason.
Try<Nothing> allowDevice(
    const std::string& hierarchy,
    const std::string& cgroup,
    unsigned int major,
    unsigned int minor)
{
  cgroups::devices::Entry device;

  // Which device node: character device `major`:`minor`.
  device.selector.type = Entry::Selector::Type::CHARACTER;
  device.selector.major = major;
  device.selector.minor = minor;

  // Which operations: everything (read, write, mknod).
  device.access.read = true;
  device.access.write = true;
  device.access.mknod = true;

  Try<Nothing> outcome = cgroups::devices::allow(hierarchy, cgroup, device);

  if (!outcome.isError()) {
    return Nothing();
  }

  return Error(
      "Failed to allow device '" + stringify(device) + "': " +
      outcome.error());
}
120+
121+
122+
// Revokes a cgroup's access to a single character device.
//
// Builds a devices-cgroup entry for the character device `major`:`minor`
// covering read, write and mknod access, and passes it to
// `cgroups::devices::deny` for `cgroup` in `hierarchy`.
//
// Returns `Nothing` on success, or an `Error` naming the entry and
// carrying the underlying failure reason.
Try<Nothing> denyDevice(
    const std::string& hierarchy,
    const std::string& cgroup,
    unsigned int major,
    unsigned int minor)
{
  cgroups::devices::Entry device;

  // Which device node: character device `major`:`minor`.
  device.selector.type = Entry::Selector::Type::CHARACTER;
  device.selector.major = major;
  device.selector.minor = minor;

  // Which operations: everything (read, write, mknod).
  device.access.read = true;
  device.access.write = true;
  device.access.mknod = true;

  Try<Nothing> outcome = cgroups::devices::deny(hierarchy, cgroup, device);

  if (!outcome.isError()) {
    return Nothing();
  }

  return Error(
      "Failed to deny device '" + stringify(device) + "': " +
      outcome.error());
}
146+
147+
148+
// Exposes the host device node `device` inside a container.
//
// The node is copied to a private location below `devicesDir`, its mode
// is set to 0666, and a bind mount from that copy to `device` under
// `rootfsDir` is appended to `launchInfo`.
//
// Returns `Nothing` on success, or an `Error` if copying the node or
// changing its permissions fails. In that case no mount is added.
Try<Nothing> addDeviceToContainer(
    const string& device,
    const string& devicesDir,
    const string& rootfsDir,
    ContainerLaunchInfo& launchInfo)
{
  // The copy lives at <devicesDir>/<device without "/dev/">/<device>.
  const string relativeName =
    strings::remove(device, "/dev/", strings::PREFIX);

  const string devicePath = path::join(devicesDir, relativeName, device);

  Try<Nothing> copied = fs::chroot::copyDeviceNode(device, devicePath);
  if (copied.isError()) {
    return Error("Failed to copy device: " + copied.error());
  }

  // The device is being handed to the container, so open up the
  // permissions of the copy to guarantee that it is readable and
  // writable from inside the container.
  Try<Nothing> permissions = os::chmod(devicePath, 0666);
  if (permissions.isError()) {
    return Error("Failed to set permissions: " + permissions.error());
  }

  // Bind mount the copy onto the device's path within the container's
  // root filesystem.
  const string mountTarget = path::join(rootfsDir, device);

  *launchInfo.add_mounts() = protobuf::slave::createContainerMount(
      devicePath, mountTarget, MS_BIND);

  return Nothing();
}
176+
177+
} // namespace {
178+
94179
NvidiaGpuIsolatorProcess::NvidiaGpuIsolatorProcess(
95180
const Flags& _flags,
96181
const string& _hierarchy,
@@ -297,9 +382,24 @@ Future<Nothing> NvidiaGpuIsolatorProcess::recover(
297382
foreach (const Gpu& gpu, available) {
298383
if (entry.selector.major == gpu.major &&
299384
entry.selector.minor == gpu.minor) {
300-
containerGpus.insert(gpu);
301-
break;
302-
}
385+
if (gpu.ismig) {
386+
// The GPU device itself; only a match with a GPU that
387+
// isn't a MIG instance, as MIG instances need access to
388+
// the GPU device and the MIG devices.
389+
continue;
390+
}
391+
392+
containerGpus.insert(gpu);
393+
break;
394+
}
395+
396+
// Match up MIG devices
397+
if ((entry.selector.major == gpu.caps_major)
398+
&& ((entry.selector.minor == gpu.gi_minor)
399+
|| (entry.selector.minor == gpu.ci_minor))) {
400+
containerGpus.insert(gpu);
401+
break;
402+
}
303403
}
304404
}
305405

@@ -443,39 +543,23 @@ Future<Option<ContainerLaunchInfo>> NvidiaGpuIsolatorProcess::_prepare(
443543
}
444544

445545
foreach (const string& device, nvidia.get()) {
446-
// The directory `/dev/nvidia-caps` was introduced in CUDA 11.0, just
447-
// ignore it since we only care about the Nvidia GPU device files.
448-
//
449-
// TODO(qianzhang): Figure out how to handle the directory
450-
// `/dev/nvidia-caps` more properly.
546+
// Ignore /dev/nvidia-caps, we'll handle that directory later on
451547
if (device == "/dev/nvidia-caps") {
452548
continue;
453549
}
454550

455-
const string devicePath = path::join(
456-
devicesDir, strings::remove(device, "/dev/", strings::PREFIX), device);
457-
458-
Try<Nothing> mknod =
459-
fs::chroot::copyDeviceNode(device, devicePath);
460-
if (mknod.isError()) {
461-
return Failure(
462-
"Failed to copy device '" + device + "': " + mknod.error());
551+
Try<Nothing> added = addDeviceToContainer(device, devicesDir, containerConfig.rootfs(), launchInfo);
552+
if (added.isError()) {
553+
return Failure("Could not add device '" + device + "' to container: " + added.error());
463554
}
555+
}
464556

465-
// Since we are adding the GPU devices to the container, make
466-
// them read/write to guarantee that they are accessible inside
467-
// the container.
468-
Try<Nothing> chmod = os::chmod(devicePath, 0666);
469-
if (chmod.isError()) {
470-
return Failure(
471-
"Failed to set permissions on device '" + device + "': " +
472-
chmod.error());
557+
Try<list<string>> caps = os::glob("/dev/nvidia-caps/*");
558+
foreach (const string& device, caps.get()) {
559+
Try<Nothing> added = addDeviceToContainer(device, devicesDir, containerConfig.rootfs(), launchInfo);
560+
if (added.isError()) {
561+
return Failure("Could not add device '" + device + "' to container: " + added.error());
473562
}
474-
475-
*launchInfo.add_mounts() = protobuf::slave::createContainerMount(
476-
devicePath,
477-
path::join(containerConfig.rootfs(), device),
478-
MS_BIND);
479563
}
480564

481565
return launchInfo;
@@ -520,31 +604,55 @@ Future<Nothing> NvidiaGpuIsolatorProcess::update(
520604
} else if (requested < info->allocated.size()) {
521605
size_t fewer = info->allocated.size() - requested;
522606

607+
set<std::pair<unsigned int, unsigned int>> deallocated_devs;
523608
set<Gpu> deallocated;
524609

525610
for (size_t i = 0; i < fewer; i++) {
526611
const auto gpu = info->allocated.begin();
527612

528-
cgroups::devices::Entry entry;
529-
entry.selector.type = Entry::Selector::Type::CHARACTER;
530-
entry.selector.major = gpu->major;
531-
entry.selector.minor = gpu->minor;
532-
entry.access.read = true;
533-
entry.access.write = true;
534-
entry.access.mknod = true;
535-
536-
Try<Nothing> deny = cgroups::devices::deny(
537-
hierarchy, info->cgroup, entry);
538-
539-
if (deny.isError()) {
540-
return Failure("Failed to deny cgroups access to GPU device"
541-
" '" + stringify(entry) + "': " + deny.error());
613+
// We can't blindly deny the main GPU device, as it is needed
614+
// by other MIG devices on that same GPU.
615+
deallocated_devs.insert(std::make_pair(gpu->major, gpu->minor));
616+
617+
if (gpu->ismig) {
618+
// MIG GPU instance
619+
Try<Nothing> deny = denyDevice(hierarchy, info->cgroup, gpu->caps_major, gpu->gi_minor);
620+
if (deny.isError()) {
621+
return Failure("Failed to deny cgroups access to MIG GI device: " + deny.error());
622+
}
623+
624+
// MIG Compute instance
625+
deny = denyDevice(hierarchy, info->cgroup, gpu->caps_major, gpu->ci_minor);
626+
if (deny.isError()) {
627+
return Failure("Failed to deny cgroups access to MIG CI device: " + deny.error());
628+
}
542629
}
543630

544631
deallocated.insert(*gpu);
545632
info->allocated.erase(gpu);
546633
}
547634

635+
set<std::pair<unsigned int, unsigned int>> allocated_devs;
636+
foreach (Gpu gpu, info->allocated) {
637+
allocated_devs.insert(std::make_pair(gpu.major, gpu.minor));
638+
}
639+
640+
// Any GPU device present in the difference of the two sets can now
641+
// be denied, as it is not needed by any of the remaining allocated
642+
// GPUs.
643+
set<std::pair<unsigned int, unsigned int>> safe_deny;
644+
std::set_difference(deallocated_devs.begin(), deallocated_devs.end(),
645+
allocated_devs.begin(), allocated_devs.end(),
646+
std::inserter(safe_deny, safe_deny.begin()));
647+
648+
foreach (auto dev, safe_deny) {
649+
// Main GPU device node
650+
Try<Nothing> deny = denyDevice(hierarchy, info->cgroup, dev.first, dev.second);
651+
if (deny.isError()) {
652+
return Failure("Failed to deny cgroups access to GPU device: " + deny.error());
653+
}
654+
}
655+
548656
return allocator.deallocate(deallocated);
549657
}
550658

@@ -563,20 +671,21 @@ Future<Nothing> NvidiaGpuIsolatorProcess::_update(
563671
Info* info = CHECK_NOTNULL(infos.at(containerId));
564672

565673
foreach (const Gpu& gpu, allocation) {
566-
cgroups::devices::Entry entry;
567-
entry.selector.type = Entry::Selector::Type::CHARACTER;
568-
entry.selector.major = gpu.major;
569-
entry.selector.minor = gpu.minor;
570-
entry.access.read = true;
571-
entry.access.write = true;
572-
entry.access.mknod = true;
674+
Try<Nothing> allow = allowDevice(hierarchy, info->cgroup, gpu.major, gpu.minor);
675+
if (allow.isError()) {
676+
return Failure("Failed to grant cgroups access to GPU device: " + allow.error());
677+
}
573678

574-
Try<Nothing> allow = cgroups::devices::allow(
575-
hierarchy, info->cgroup, entry);
679+
if (gpu.ismig) {
680+
allow = allowDevice(hierarchy, info->cgroup, gpu.caps_major, gpu.gi_minor);
681+
if (allow.isError()) {
682+
return Failure("Failed to grant cgroups access to MIG GI device: " + allow.error());
683+
}
576684

577-
if (allow.isError()) {
578-
return Failure("Failed to grant cgroups access to GPU device"
579-
" '" + stringify(entry) + "': " + allow.error());
685+
allow = allowDevice(hierarchy, info->cgroup, gpu.caps_major, gpu.ci_minor);
686+
if (allow.isError()) {
687+
return Failure("Failed to grant cgroups access to MIG CI device: " + allow.error());
688+
}
580689
}
581690
}
582691

0 commit comments

Comments
 (0)