Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Skip injection of nvidia-persistenced socket by default #694

Merged
merged 4 commits into from
Sep 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions cmd/nvidia-container-runtime-hook/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,12 @@ func doPrestart() {
rootfs := getRootfsPath(container)

args := []string{getCLIPath(cli)}

// Only include the nvidia-persistenced socket if it is explicitly enabled.
if !hook.Features.IncludePersistencedSocket.IsEnabled() {
args = append(args, "--no-persistenced")
}

if cli.Root != "" {
args = append(args, fmt.Sprintf("--root=%s", cli.Root))
}
Expand Down
8 changes: 8 additions & 0 deletions cmd/nvidia-ctk/cdi/generate/generate.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ type options struct {
files cli.StringSlice
ignorePatterns cli.StringSlice
}

includePersistencedSocket bool
}

// NewCommand constructs a generate-cdi command with the specified logger
Expand Down Expand Up @@ -169,6 +171,11 @@ func (m command) build() *cli.Command {
Usage: "Specify a pattern the CSV mount specifications.",
Destination: &opts.csv.ignorePatterns,
},
&cli.BoolFlag{
Name: "include-persistenced-socket",
Usage: "Include the nvidia-persistenced socket in the generated CDI specification.",
Destination: &opts.includePersistencedSocket,
},
}

return &c
Expand Down Expand Up @@ -273,6 +280,7 @@ func (m command) generateSpec(opts *options) (spec.Interface, error) {
nvcdi.WithLibrarySearchPaths(opts.librarySearchPaths.Value()),
nvcdi.WithCSVFiles(opts.csv.files.Value()),
nvcdi.WithCSVIgnorePatterns(opts.csv.ignorePatterns.Value()),
nvcdi.WithOptInFeature("include-persistenced-socket", opts.includePersistencedSocket),
)
if err != nil {
return nil, fmt.Errorf("failed to create CDI library: %v", err)
Expand Down
65 changes: 35 additions & 30 deletions internal/config/features.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,11 @@ package config
type featureName string

const (
FeatureGDS = featureName("gds")
FeatureMOFED = featureName("mofed")
FeatureNVSWITCH = featureName("nvswitch")
FeatureGDRCopy = featureName("gdrcopy")
FeatureGDS = featureName("gds")
FeatureMOFED = featureName("mofed")
FeatureNVSWITCH = featureName("nvswitch")
FeatureGDRCopy = featureName("gdrcopy")
FeatureIncludePersistencedSocket = featureName("include-persistenced-socket")
)

// features specifies a set of named features.
Expand All @@ -31,53 +32,57 @@ type features struct {
MOFED *feature `toml:"mofed,omitempty"`
NVSWITCH *feature `toml:"nvswitch,omitempty"`
GDRCopy *feature `toml:"gdrcopy,omitempty"`
// IncludePersistencedSocket enables the injection of the nvidia-persistenced
// socket into containers.
IncludePersistencedSocket *feature `toml:"include-persistenced-socket,omitempty"`
}

type feature bool

// IsEnabled checks whether a specified named feature is enabled.
// IsEnabledInEnvironment checks whether a specified named feature is enabled.
// An optional list of environments to check for feature-specific environment
// variables can also be supplied.
func (fs features) IsEnabled(n featureName, in ...getenver) bool {
featureEnvvars := map[featureName]string{
FeatureGDS: "NVIDIA_GDS",
FeatureMOFED: "NVIDIA_MOFED",
FeatureNVSWITCH: "NVIDIA_NVSWITCH",
FeatureGDRCopy: "NVIDIA_GDRCOPY",
}

envvar := featureEnvvars[n]
func (fs features) IsEnabledInEnvironment(n featureName, in ...getenver) bool {
switch n {
// Features with envvar overrides
case FeatureGDS:
return fs.GDS.isEnabled(envvar, in...)
return fs.GDS.isEnabledWithEnvvarOverride("NVIDIA_GDS", in...)
case FeatureMOFED:
return fs.MOFED.isEnabled(envvar, in...)
return fs.MOFED.isEnabledWithEnvvarOverride("NVIDIA_MOFED", in...)
case FeatureNVSWITCH:
return fs.NVSWITCH.isEnabled(envvar, in...)
return fs.NVSWITCH.isEnabledWithEnvvarOverride("NVIDIA_NVSWITCH", in...)
case FeatureGDRCopy:
return fs.GDRCopy.isEnabled(envvar, in...)
return fs.GDRCopy.isEnabledWithEnvvarOverride("NVIDIA_GDRCOPY", in...)
// Features without envvar overrides
case FeatureIncludePersistencedSocket:
return fs.IncludePersistencedSocket.IsEnabled()
default:
return false
}
}

// isEnabled checks whether a feature is enabled.
// If the enabled value is explicitly set, this is returned, otherwise the
// associated envvar is checked in the specified getenver for the string "enabled"
// A CUDA container / image can be passed here.
func (f *feature) isEnabled(envvar string, ins ...getenver) bool {
// IsEnabled checks whether a feature is enabled.
func (f *feature) IsEnabled() bool {
if f != nil {
return bool(*f)
}
if envvar == "" {
return false
}
for _, in := range ins {
if in.Getenv(envvar) == "enabled" {
return true
return false
}

// isEnabledWithEnvvarOverride checks whether a feature is enabled and allows an envvar to overide the feature.
// If the enabled value is explicitly set, this is returned, otherwise the
// associated envvar is checked in the specified getenver for the string "enabled"
// A CUDA container / image can be passed here.
func (f *feature) isEnabledWithEnvvarOverride(envvar string, ins ...getenver) bool {
if envvar != "" {
for _, in := range ins {
if in.Getenv(envvar) == "enabled" {
return true
}
}
}
return false

return f.IsEnabled()
}

type getenver interface {
Expand Down
13 changes: 8 additions & 5 deletions internal/discover/ipc.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,13 @@ import (
type ipcMounts mounts

// NewIPCDiscoverer creats a discoverer for NVIDIA IPC sockets.
func NewIPCDiscoverer(logger logger.Interface, driverRoot string) (Discover, error) {
func NewIPCDiscoverer(logger logger.Interface, driverRoot string, includePersistencedSocket bool) (Discover, error) {
var requiredSockets []string
if includePersistencedSocket {
requiredSockets = append(requiredSockets, "/nvidia-persistenced/socket")
}
requiredSockets = append(requiredSockets, "/nvidia-fabricmanager/socket")

sockets := newMounts(
logger,
lookup.NewFileLocator(
Expand All @@ -34,10 +40,7 @@ func NewIPCDiscoverer(logger logger.Interface, driverRoot string) (Discover, err
lookup.WithCount(1),
),
driverRoot,
[]string{
"/nvidia-persistenced/socket",
"/nvidia-fabricmanager/socket",
cdesiniotis marked this conversation as resolved.
Show resolved Hide resolved
},
requiredSockets,
)

mps := newMounts(
Expand Down
1 change: 1 addition & 0 deletions internal/modifier/cdi.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devic
nvcdi.WithDriverRoot(cfg.NVIDIAContainerCLIConfig.Root),
nvcdi.WithVendor("runtime.nvidia.com"),
nvcdi.WithClass("gpu"),
nvcdi.WithOptInFeature("include-persistenced-socket", cfg.Features.IncludePersistencedSocket.IsEnabled()),
)
if err != nil {
return nil, fmt.Errorf("failed to construct CDI library: %w", err)
Expand Down
8 changes: 4 additions & 4 deletions internal/modifier/gated.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,31 +46,31 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image
driverRoot := cfg.NVIDIAContainerCLIConfig.Root
devRoot := cfg.NVIDIAContainerCLIConfig.Root

if cfg.Features.IsEnabled(config.FeatureGDS, image) {
if cfg.Features.IsEnabledInEnvironment(config.FeatureGDS, image) {
cdesiniotis marked this conversation as resolved.
Show resolved Hide resolved
d, err := discover.NewGDSDiscoverer(logger, driverRoot, devRoot)
if err != nil {
return nil, fmt.Errorf("failed to construct discoverer for GDS devices: %w", err)
}
discoverers = append(discoverers, d)
}

if cfg.Features.IsEnabled(config.FeatureMOFED, image) {
if cfg.Features.IsEnabledInEnvironment(config.FeatureMOFED, image) {
d, err := discover.NewMOFEDDiscoverer(logger, devRoot)
if err != nil {
return nil, fmt.Errorf("failed to construct discoverer for MOFED devices: %w", err)
}
discoverers = append(discoverers, d)
}

if cfg.Features.IsEnabled(config.FeatureNVSWITCH, image) {
if cfg.Features.IsEnabledInEnvironment(config.FeatureNVSWITCH, image) {
d, err := discover.NewNvSwitchDiscoverer(logger, devRoot)
if err != nil {
return nil, fmt.Errorf("failed to construct discoverer for NVSWITCH devices: %w", err)
}
discoverers = append(discoverers, d)
}

if cfg.Features.IsEnabled(config.FeatureGDRCopy, image) {
if cfg.Features.IsEnabledInEnvironment(config.FeatureGDRCopy, image) {
d, err := discover.NewGDRCopyDiscoverer(logger, devRoot)
if err != nil {
return nil, fmt.Errorf("failed to construct discoverer for GDRCopy devices: %w", err)
Expand Down
2 changes: 1 addition & 1 deletion pkg/nvcdi/common-nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ func (l *nvmllib) newCommonNVMLDiscoverer() (discover.Discover, error) {
l.logger.Warningf("failed to create discoverer for graphics mounts: %v", err)
}

driverFiles, err := NewDriverDiscoverer(l.logger, l.driver, l.nvidiaCDIHookPath, l.ldconfigPath, l.nvmllib)
driverFiles, err := l.NewDriverDiscoverer()
if err != nil {
return nil, fmt.Errorf("failed to create discoverer for driver files: %v", err)
}
Expand Down
22 changes: 11 additions & 11 deletions pkg/nvcdi/driver-nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,41 +34,41 @@ import (

// NewDriverDiscoverer creates a discoverer for the libraries and binaries associated with a driver installation.
// The supplied NVML Library is used to query the expected driver version.
func NewDriverDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath string, ldconfigPath string, nvmllib nvml.Interface) (discover.Discover, error) {
if r := nvmllib.Init(); r != nvml.SUCCESS {
func (l *nvmllib) NewDriverDiscoverer() (discover.Discover, error) {
if r := l.nvmllib.Init(); r != nvml.SUCCESS {
return nil, fmt.Errorf("failed to initialize NVML: %v", r)
}
defer func() {
if r := nvmllib.Shutdown(); r != nvml.SUCCESS {
logger.Warningf("failed to shutdown NVML: %v", r)
if r := l.nvmllib.Shutdown(); r != nvml.SUCCESS {
l.logger.Warningf("failed to shutdown NVML: %v", r)
}
}()

version, r := nvmllib.SystemGetDriverVersion()
version, r := l.nvmllib.SystemGetDriverVersion()
if r != nvml.SUCCESS {
return nil, fmt.Errorf("failed to determine driver version: %v", r)
}

return newDriverVersionDiscoverer(logger, driver, nvidiaCDIHookPath, ldconfigPath, version)
return (*nvcdilib)(l).newDriverVersionDiscoverer(version)
}

func newDriverVersionDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath, ldconfigPath, version string) (discover.Discover, error) {
libraries, err := NewDriverLibraryDiscoverer(logger, driver, nvidiaCDIHookPath, ldconfigPath, version)
func (l *nvcdilib) newDriverVersionDiscoverer(version string) (discover.Discover, error) {
libraries, err := NewDriverLibraryDiscoverer(l.logger, l.driver, l.nvidiaCDIHookPath, l.ldconfigPath, version)
if err != nil {
return nil, fmt.Errorf("failed to create discoverer for driver libraries: %v", err)
}

ipcs, err := discover.NewIPCDiscoverer(logger, driver.Root)
ipcs, err := discover.NewIPCDiscoverer(l.logger, l.driver.Root, l.optInFeatures["include-persistenced-socket"])
if err != nil {
return nil, fmt.Errorf("failed to create discoverer for IPC sockets: %v", err)
}

firmwares, err := NewDriverFirmwareDiscoverer(logger, driver.Root, version)
firmwares, err := NewDriverFirmwareDiscoverer(l.logger, l.driver.Root, version)
if err != nil {
return nil, fmt.Errorf("failed to create discoverer for GSP firmware: %v", err)
}

binaries := NewDriverBinariesDiscoverer(logger, driver.Root)
binaries := NewDriverBinariesDiscoverer(l.logger, l.driver.Root)

d := discover.Merge(
libraries,
Expand Down
5 changes: 5 additions & 0 deletions pkg/nvcdi/lib.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ type nvcdilib struct {
infolib info.Interface

mergedDeviceOptions []transform.MergedDeviceOption

optInFeatures map[string]bool
}

// New creates a new nvcdi library
Expand Down Expand Up @@ -130,6 +132,9 @@ func New(opts ...Option) (Interface, error) {
if l.vendor == "" {
l.vendor = "management.nvidia.com"
}
// For management specifications we always allow the fabricmanager and
// persistenced sockets.
WithOptInFeature("include-persistenced-socket", true)(l)
lib = (*managementlib)(l)
case ModeNvml:
lib = (*nvmllib)(l)
Expand Down
2 changes: 1 addition & 1 deletion pkg/nvcdi/management.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
return nil, fmt.Errorf("failed to get CUDA version: %v", err)
}

driver, err := newDriverVersionDiscoverer(m.logger, m.driver, m.nvidiaCDIHookPath, m.ldconfigPath, version)
driver, err := (*nvcdilib)(m).newDriverVersionDiscoverer(version)
if err != nil {
return nil, fmt.Errorf("failed to create driver library discoverer: %v", err)
}
Expand Down
11 changes: 11 additions & 0 deletions pkg/nvcdi/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,3 +155,14 @@ func WithLibrarySearchPaths(paths []string) Option {
o.librarySearchPaths = paths
}
}

// WithOptInFeature sets a specific opt-in feature.
// Note that previous opt-in-features are not removed.
func WithOptInFeature(feature string, enabled bool) Option {
return func(n *nvcdilib) {
if n.optInFeatures == nil {
n.optInFeatures = make(map[string]bool)
}
n.optInFeatures[feature] = enabled
}
}
12 changes: 12 additions & 0 deletions tools/container/toolkit/toolkit.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ type options struct {
acceptNVIDIAVisibleDevicesAsVolumeMounts bool

ignoreErrors bool

optInFeatures cli.StringSlice
}

func main() {
Expand Down Expand Up @@ -250,6 +252,12 @@ func main() {
Destination: &opts.createDeviceNodes,
EnvVars: []string{"CREATE_DEVICE_NODES"},
},
&cli.StringSliceFlag{
Name: "opt-in-feature",
Hidden: true,
Destination: &opts.optInFeatures,
EnvVars: []string{"NVIDIA_CONTAINER_TOOLKIT_OPT_IN_FEATURES"},
},
}

// Update the subcommand flags with the common subcommand flags
Expand Down Expand Up @@ -518,6 +526,10 @@ func installToolkitConfig(c *cli.Context, toolkitConfigPath string, nvidiaContai
"nvidia-container-runtime.runtimes": opts.ContainerRuntimeRuntimes,
"nvidia-container-cli.debug": opts.ContainerCLIDebug,
}
for _, feature := range opts.optInFeatures.Value() {
optionalConfigValues["features."+feature] = true
}

for key, value := range optionalConfigValues {
if !c.IsSet(key) {
log.Infof("Skipping unset option: %v", key)
Expand Down