diff --git a/cmd/nvidia-cdi-hook/commands/commands.go b/cmd/nvidia-cdi-hook/commands/commands.go index e5d7c78a..8917c25d 100644 --- a/cmd/nvidia-cdi-hook/commands/commands.go +++ b/cmd/nvidia-cdi-hook/commands/commands.go @@ -22,6 +22,7 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/chmod" symlinks "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/create-symlinks" "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/cudacompat" + disabledevicenodemodification "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/disable-device-node-modification" ldcache "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/update-ldcache" "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" ) @@ -34,6 +35,7 @@ func New(logger logger.Interface) []*cli.Command { symlinks.NewCommand(logger), chmod.NewCommand(logger), cudacompat.NewCommand(logger), + disabledevicenodemodification.NewCommand(logger), } } diff --git a/cmd/nvidia-cdi-hook/disable-device-node-modification/disable-device-node-modification.go b/cmd/nvidia-cdi-hook/disable-device-node-modification/disable-device-node-modification.go new file mode 100644 index 00000000..961c0e44 --- /dev/null +++ b/cmd/nvidia-cdi-hook/disable-device-node-modification/disable-device-node-modification.go @@ -0,0 +1,144 @@ +/** +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package disabledevicenodemodification + +import ( + "bufio" + "bytes" + "errors" + "fmt" + "io" + "os" + "strings" + + "github.com/urfave/cli/v2" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" + "github.com/NVIDIA/nvidia-container-toolkit/internal/oci" +) + +const ( + nvidiaDriverParamsPath = "/proc/driver/nvidia/params" +) + +type options struct { + containerSpec string +} + +// NewCommand constructs an disable-device-node-modification subcommand with the specified logger +func NewCommand(logger logger.Interface) *cli.Command { + cfg := options{} + + c := cli.Command{ + Name: "disable-device-node-modification", + Usage: "Ensure that the /proc/driver/nvidia/params file present in the container does not allow device node modifications.", + Before: func(c *cli.Context) error { + return validateFlags(c, &cfg) + }, + Action: func(c *cli.Context) error { + return run(c, &cfg) + }, + } + + c.Flags = []cli.Flag{ + &cli.StringFlag{ + Name: "container-spec", + Hidden: true, + Usage: "Specify the path to the OCI container spec. 
If empty or '-' the spec will be read from STDIN", + Destination: &cfg.containerSpec, + }, + } + + return &c +} + +func validateFlags(c *cli.Context, cfg *options) error { + return nil +} + +func run(_ *cli.Context, cfg *options) error { + modifiedParamsFileContents, err := getModifiedNVIDIAParamsContents() + if err != nil { + return fmt.Errorf("failed to get modified params file contents: %w", err) + } + if len(modifiedParamsFileContents) == 0 { + return nil + } + + s, err := oci.LoadContainerState(cfg.containerSpec) + if err != nil { + return fmt.Errorf("failed to load container state: %w", err) + } + + containerRootDirPath, err := s.GetContainerRoot() + if err != nil { + return fmt.Errorf("failed to determined container root: %w", err) + } + + return createParamsFileInContainer(containerRootDirPath, modifiedParamsFileContents) +} + +func getModifiedNVIDIAParamsContents() ([]byte, error) { + hostNvidiaParamsFile, err := os.Open(nvidiaDriverParamsPath) + if errors.Is(err, os.ErrNotExist) { + return nil, nil + } + if err != nil { + return nil, fmt.Errorf("failed to load params file: %w", err) + } + defer hostNvidiaParamsFile.Close() + + modifiedContents, err := getModifiedParamsFileContentsFromReader(hostNvidiaParamsFile) + if err != nil { + return nil, fmt.Errorf("failed to get modfied params file contents: %w", err) + } + + return modifiedContents, nil +} + +// getModifiedParamsFileContentsFromReader returns the contents of a modified params file from the specified reader. 
// getModifiedParamsFileContentsFromReader returns the contents of a modified
// params file from the specified reader.
//
// The input is processed line by line. A line that is exactly
// "ModifyDeviceFiles: 1" is replaced by "ModifyDeviceFiles: 0"; every other
// line is copied unchanged. Each output line is terminated by "\n", so the
// result ends in a newline even if the input did not.
//
// Nil contents and a nil error are returned when no override is needed, that
// is when:
//   - a line that is exactly "ModifyDeviceFiles: 0" is encountered (scanning
//     stops at that line), or
//   - the input contains no line that is exactly "ModifyDeviceFiles: 1".
//
// Lines that merely resemble the setting (leading or trailing whitespace, or a
// different value such as "11") are treated like any other line.
//
// A non-nil error is returned only if reading from r fails.
func getModifiedParamsFileContentsFromReader(r io.Reader) ([]byte, error) {
	const (
		modifyDeviceFilesPrefix   = "ModifyDeviceFiles: "
		modifyDeviceFilesEnabled  = modifyDeviceFilesPrefix + "1"
		modifyDeviceFilesDisabled = modifyDeviceFilesPrefix + "0"
	)

	var (
		modified             bytes.Buffer
		requiresModification bool
	)

	scanner := bufio.NewScanner(r)
	for scanner.Scan() {
		line := scanner.Text()
		if strings.HasPrefix(line, modifyDeviceFilesPrefix) {
			switch line {
			case modifyDeviceFilesDisabled:
				// Device node modification is already disabled.
				return nil, nil
			case modifyDeviceFilesEnabled:
				line = modifyDeviceFilesDisabled
				requiresModification = true
			}
		}
		// The write methods of bytes.Buffer are documented to always return a
		// nil error, so there is no error to handle here.
		modified.WriteString(line)
		modified.WriteByte('\n')
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("failed to read params file: %w", err)
	}

	if !requiresModification {
		return nil, nil
	}

	return modified.Bytes(), nil
}
+**/ + +package disabledevicenodemodification + +import ( + "bytes" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestGetModifiedParamsFileContentsFromReader(t *testing.T) { + testCases := map[string]struct { + contents []byte + expectedError error + expectedContents []byte + }{ + "no contents": { + contents: nil, + expectedError: nil, + expectedContents: nil, + }, + "other contents are ignored": { + contents: []byte(`# Some other content + that we don't care about + `), + expectedError: nil, + expectedContents: nil, + }, + "already zero requires no modification": { + contents: []byte("ModifyDeviceFiles: 0"), + expectedError: nil, + expectedContents: nil, + }, + "leading spaces require no modification": { + contents: []byte(" ModifyDeviceFiles: 1"), + }, + "Trailing spaces require no modification": { + contents: []byte("ModifyDeviceFiles: 1 "), + }, + "Not 1 require no modification": { + contents: []byte("ModifyDeviceFiles: 11"), + }, + "single line requires modification": { + contents: []byte("ModifyDeviceFiles: 1"), + expectedError: nil, + expectedContents: []byte("ModifyDeviceFiles: 0\n"), + }, + "single line with trailing newline requires modification": { + contents: []byte("ModifyDeviceFiles: 1\n"), + expectedError: nil, + expectedContents: []byte("ModifyDeviceFiles: 0\n"), + }, + "other content is maintained": { + contents: []byte(`ModifyDeviceFiles: 1 + other content + that + is maintained`), + expectedError: nil, + expectedContents: []byte(`ModifyDeviceFiles: 0 + other content + that + is maintained +`), + }, + } + + for description, tc := range testCases { + t.Run(description, func(t *testing.T) { + contents, err := getModifiedParamsFileContentsFromReader(bytes.NewReader(tc.contents)) + require.EqualValues(t, tc.expectedError, err) + require.EqualValues(t, string(tc.expectedContents), string(contents)) + }) + } + +} diff --git a/cmd/nvidia-cdi-hook/disable-device-node-modification/params_linux.go 
b/cmd/nvidia-cdi-hook/disable-device-node-modification/params_linux.go new file mode 100644 index 00000000..b14e1821 --- /dev/null +++ b/cmd/nvidia-cdi-hook/disable-device-node-modification/params_linux.go @@ -0,0 +1,63 @@ +//go:build linux + +/** +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package disabledevicenodemodification + +import ( + "fmt" + "os" + "path/filepath" + + "github.com/opencontainers/runc/libcontainer/utils" + "golang.org/x/sys/unix" +) + +func createParamsFileInContainer(containerRootDirPath string, contents []byte) error { + tmpRoot, err := os.MkdirTemp("", "nvct-empty-dir*") + if err != nil { + return fmt.Errorf("failed to create temp root: %w", err) + } + + if err := createTmpFs(tmpRoot, len(contents)); err != nil { + return fmt.Errorf("failed to create tmpfs mount for params file: %w", err) + } + + modifiedParamsFile, err := os.OpenFile(filepath.Join(tmpRoot, "nvct-params"), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0444) + if err != nil { + return fmt.Errorf("failed to open modified params file: %w", err) + } + defer modifiedParamsFile.Close() + + if _, err := modifiedParamsFile.Write(contents); err != nil { + return fmt.Errorf("failed to write temporary params file: %w", err) + } + + err = utils.WithProcfd(containerRootDirPath, nvidiaDriverParamsPath, func(nvidiaDriverParamsFdPath string) error { + 
return unix.Mount(modifiedParamsFile.Name(), nvidiaDriverParamsFdPath, "", unix.MS_BIND|unix.MS_RDONLY|unix.MS_NODEV|unix.MS_PRIVATE|unix.MS_NOSYMFOLLOW, "") + }) + if err != nil { + return fmt.Errorf("failed to mount modified params file: %w", err) + } + + return nil +} + +func createTmpFs(target string, size int) error { + return unix.Mount("tmpfs", target, "tmpfs", 0, fmt.Sprintf("size=%d", size)) +} diff --git a/cmd/nvidia-cdi-hook/disable-device-node-modification/params_other.go b/cmd/nvidia-cdi-hook/disable-device-node-modification/params_other.go new file mode 100644 index 00000000..9032e514 --- /dev/null +++ b/cmd/nvidia-cdi-hook/disable-device-node-modification/params_other.go @@ -0,0 +1,27 @@ +//go:build !linux +// +build !linux + +/** +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
// createParamsFileInContainer is the fallback used on platforms other than
// Linux. It performs no work and always reports that the operation is not
// supported; both arguments are ignored.
func createParamsFileInContainer(containerRootDirPath string, contents []byte) error {
	errNotSupported := fmt.Errorf("not supported")
	return errNotSupported
}
CreateSymlinksHook = HookName("create-symlinks") + // DisableDeviceNodeModificationHook refers to the hook used to ensure that + // device nodes are not created by libnvidia-ml.so or nvidia-smi in a + // container. + // Added in v1.17.8 + DisableDeviceNodeModificationHook = HookName("disable-device-node-modification") // An EnableCudaCompatHook is used to enabled CUDA Forward Compatibility. // Added in v1.17.5 EnableCudaCompatHook = HookName("enable-cuda-compat") diff --git a/pkg/nvcdi/api.go b/pkg/nvcdi/api.go index 4ff11e47..fa05029c 100644 --- a/pkg/nvcdi/api.go +++ b/pkg/nvcdi/api.go @@ -46,6 +46,11 @@ const ( // A CreateSymlinksHook is used to create symlinks in the container. CreateSymlinksHook = discover.CreateSymlinksHook + // DisableDeviceNodeModificationHook refers to the hook used to ensure that + // device nodes are not created by libnvidia-ml.so or nvidia-smi in a + // container. + // Added in v1.17.8 + DisableDeviceNodeModificationHook = discover.DisableDeviceNodeModificationHook // An EnableCudaCompatHook is used to enabled CUDA Forward Compatibility. // Added in v1.17.5 EnableCudaCompatHook = discover.EnableCudaCompatHook diff --git a/pkg/nvcdi/driver-nvml.go b/pkg/nvcdi/driver-nvml.go index ff02ac72..764b648a 100644 --- a/pkg/nvcdi/driver-nvml.go +++ b/pkg/nvcdi/driver-nvml.go @@ -113,6 +113,9 @@ func (l *nvcdilib) NewDriverLibraryDiscoverer(version string) (discover.Discover updateLDCache, _ := discover.NewLDCacheUpdateHook(l.logger, libraries, l.hookCreator, l.ldconfigPath) discoverers = append(discoverers, updateLDCache) + disableDeviceNodeModification := l.hookCreator.Create(DisableDeviceNodeModificationHook) + discoverers = append(discoverers, disableDeviceNodeModification) + d := discover.Merge(discoverers...) 
return d, nil diff --git a/pkg/nvcdi/lib.go b/pkg/nvcdi/lib.go index 409721ef..8d7177fc 100644 --- a/pkg/nvcdi/lib.go +++ b/pkg/nvcdi/lib.go @@ -130,7 +130,7 @@ func New(opts ...Option) (Interface, error) { l.vendor = "management.nvidia.com" } // Management containers in general do not require CUDA Forward compatibility. - l.disabledHooks = append(l.disabledHooks, HookEnableCudaCompat) + l.disabledHooks = append(l.disabledHooks, HookEnableCudaCompat, DisableDeviceNodeModificationHook) lib = (*managementlib)(l) case ModeNvml: lib = (*nvmllib)(l) diff --git a/tests/e2e/nvidia-container-toolkit_test.go b/tests/e2e/nvidia-container-toolkit_test.go index 2b1ef289..9fcee330 100644 --- a/tests/e2e/nvidia-container-toolkit_test.go +++ b/tests/e2e/nvidia-container-toolkit_test.go @@ -216,4 +216,23 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() { Expect(ldconfigOut).To(ContainSubstring("/usr/lib64")) }) }) + + Describe("Disabling device node creation", Ordered, func() { + BeforeAll(func(ctx context.Context) { + _, _, err := runner.Run("docker pull ubuntu") + Expect(err).ToNot(HaveOccurred()) + }) + + It("should work with nvidia-container-runtime-hook", func(ctx context.Context) { + output, _, err := runner.Run("docker run --rm -i --runtime=runc --gpus=all ubuntu bash -c \"grep ModifyDeviceFiles: /proc/driver/nvidia/params\"") + Expect(err).ToNot(HaveOccurred()) + Expect(output).To(Equal("ModifyDeviceFiles: 0\n")) + }) + + It("should work with automatic CDI spec generation", func(ctx context.Context) { + output, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu bash -c \"grep ModifyDeviceFiles: /proc/driver/nvidia/params\"") + Expect(err).ToNot(HaveOccurred()) + Expect(output).To(Equal("ModifyDeviceFiles: 0\n")) + }) + }) })