# Gluttony-Cluster/cluster/nvidia/helmrelease-nvidia-operator.yaml

# Flux HelmRelease that installs the NVIDIA gpu-operator chart from the
# `nvidia-operator` HelmRepository into the `nvidia-system` namespace.
# NOTE(review): helm.toolkit.fluxcd.io/v2beta1 is deprecated in newer Flux
# releases - confirm the CRD versions served by the cluster before moving to v2.
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: gpu-operator
  namespace: nvidia-system
spec:
  chart:
    spec:
      chart: gpu-operator
      # Pin the chart release. The `values` of this HelmRelease hard-code
      # operand image versions (driver, toolkit, device plugin, ...), so an
      # unpinned chart would let Flux roll the operator forward while the
      # operands stay on these older images.
      # NOTE(review): v24.3.0 is inferred from the pinned operand versions and
      # the "starting from v24.3.0" notes in the values - confirm it matches
      # the release currently deployed before applying.
      version: v24.3.0
      sourceRef:
        kind: HelmRepository
        name: nvidia-operator
        namespace: flux-system
  # Reconcile every 15 minutes; a single Helm action may run for up to 5 minutes.
  interval: 15m0s
  timeout: 5m
  releaseName: gpu-operator
  values:
    # Values passed to the gpu-operator chart.
    # Each top-level key below is one group of chart settings.

    # Platform flags: this release does not target OpenShift.
    platform:
      openshift: false

    # Node feature discovery is switched on, without node feature rules.
    nfd:
      nodefeaturerules: false
      enabled: true

    psa:
      enabled: false

    # CDI is off and is not used by default.
    cdi:
      default: false
      enabled: false

    # Sandbox workloads are off; the default workload type is "container".
    sandboxWorkloads:
      defaultWorkload: container
      enabled: false

    # Settings applied to the GPU operand DaemonSets.
    daemonsets:
      priorityClassName: system-node-critical
      annotations: {}
      labels: {}
      tolerations:
        - effect: NoSchedule
          key: nvidia.com/gpu
          operator: Exists
      # Update strategy for GPU operands: "OnDelete" or "RollingUpdate".
      # The driver DaemonSet is always OnDelete to avoid unintended disruptions.
      updateStrategy: RollingUpdate
      # Rolling-update tuning for GPU operands.
      rollingUpdate:
        # How many nodes may have their pods updated at the same time,
        # given as a number or a percentage of nodes. Default 1.
        maxUnavailable: "1"

    # Image and runtime settings for the operator validator.
    validator:
      image: gpu-operator-validator
      repository: nvcr.io/nvidia/cloud-native
      # Leaving version unset makes the chart use chart.AppVersion.
      # version: ""
      imagePullSecrets: []
      imagePullPolicy: IfNotPresent
      resources: {}
      args: []
      env: []
      plugin:
        env:
          - value: "false"
            name: WITH_WORKLOAD

    # Settings for the gpu-operator controller workload itself: image,
    # scheduling (tolerations/affinity), logging and resource limits.
    operator:
      repository: nvcr.io/nvidia
      image: gpu-operator
      # If version is not specified, then default is to use chart.AppVersion
      # version: ""
      imagePullPolicy: IfNotPresent
      imagePullSecrets: []
      priorityClassName: system-node-critical
      # The toolkit section of this release points at the k3s containerd
      # config file and socket, so the runtime named here is containerd
      # rather than docker.
      # NOTE(review): presumably this is only a fallback for when the operator
      # cannot detect the node's runtime - confirm against the chart docs.
      defaultRuntime: containerd
      runtimeClass: nvidia
      use_ocp_driver_toolkit: false
      # cleanup CRD on chart un-install
      cleanupCRD: false
      # upgrade CRD on chart upgrade, requires --disable-openapi-validation flag
      # to be passed during helm upgrade.
      upgradeCRD: false
      initContainer:
        image: cuda
        repository: nvcr.io/nvidia
        version: 12.3.2-base-ubi8
        imagePullPolicy: IfNotPresent
      # Tolerate the NoSchedule taints keyed on the master / control-plane
      # node roles.
      tolerations:
        - key: "node-role.kubernetes.io/master"
          operator: "Equal"
          value: ""
          effect: "NoSchedule"
        - key: "node-role.kubernetes.io/control-plane"
          operator: "Equal"
          value: ""
          effect: "NoSchedule"
      annotations:
        openshift.io/scc: restricted-readonly
      # Soft preference (weight 1 each) for nodes labelled as master or
      # control-plane; scheduling elsewhere is still allowed.
      affinity:
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 1
              preference:
                matchExpressions:
                  - key: "node-role.kubernetes.io/master"
                    operator: In
                    values: [""]
            - weight: 1
              preference:
                matchExpressions:
                  - key: "node-role.kubernetes.io/control-plane"
                    operator: In
                    values: [""]
      logging:
        # Zap time encoding (one of 'epoch', 'millis', 'nano', 'iso8601', 'rfc3339' or 'rfc3339nano')
        timeEncoding: epoch
        # Zap level controlling log verbosity. One of 'debug', 'info', 'error',
        # or any integer value > 0 for custom debug levels of increasing verbosity.
        level: info
        # Development Mode defaults(encoder=consoleEncoder,logLevel=Debug,stackTraceLevel=Warn)
        # Production Mode defaults(encoder=jsonEncoder,logLevel=Info,stackTraceLevel=Error)
        develMode: false
      resources:
        limits:
          cpu: 500m
          memory: 350Mi
        requests:
          cpu: 200m
          memory: 100Mi

    # MIG strategy used by this release.
    mig:
      strategy: single

    # NVIDIA driver settings: image, startup probe, upgrade policy,
    # driver manager and optional config references.
    driver:
      enabled: true
      nvidiaDriverCRD:
        deployDefaultCR: true
        driverType: gpu
        enabled: false
        nodeSelector: {}
      useOpenKernelModules: false
      # Install the NVIDIA driver from pre-compiled packages.
      # Only supported as a tech-preview feature on ubuntu22.04 kernels.
      usePrecompiled: false
      image: driver
      repository: nvcr.io/nvidia
      version: "550.54.15"
      imagePullSecrets: []
      imagePullPolicy: IfNotPresent
      startupProbe:
        failureThreshold: 120
        initialDelaySeconds: 60
        periodSeconds: 10
        # nvidia-smi can take longer than 30s in some cases,
        # so make sure the timeout is generous enough.
        timeoutSeconds: 60
      rdma:
        useHostMofed: false
        enabled: false
      upgradePolicy:
        # Global switch for the automatic upgrade feature;
        # when false, every other option here is ignored.
        autoUpgrade: true
        # Number of nodes that can be upgraded in parallel.
        # 0 means no limit: all nodes are upgraded in parallel.
        maxParallelUpgrades: 1
        # Maximum number of nodes with the driver installed that can be
        # unavailable during the upgrade. Either an absolute number (ex: 5)
        # or a percentage of total nodes at the start of the upgrade
        # (ex: 10%). The absolute number is calculated from a percentage by
        # rounding up. By default, a fixed value of 25% is used.
        maxUnavailable: "25%"
        # Options for waiting on pod (job) completions.
        waitForCompletion:
          podSelector: ""
          timeoutSeconds: 0
        # Options for GPU pod deletion.
        gpuPodDeletion:
          deleteEmptyDir: false
          force: false
          timeoutSeconds: 300
        # Options for node drain (`kubectl drain`) before the driver reload.
        # Only required when the default GPU pod deletions done by the
        # operator are not sufficient to re-install the driver.
        drain:
          deleteEmptyDir: false
          enable: false
          force: false
          podSelector: ""
          # A timeout is recommended to avoid an infinite drain when a
          # non-fatal error keeps happening on retries.
          timeoutSeconds: 300
      manager:
        repository: nvcr.io/nvidia/cloud-native
        image: k8s-driver-manager
        # When choosing a different k8s-driver-manager version, DO NOT go
        # below v0.6.4, so that k8s-driver-manager stays compatible with
        # gpu-operator starting from v24.3.0.
        version: "v0.6.7"
        imagePullPolicy: IfNotPresent
        env:
          - name: ENABLE_GPU_POD_EVICTION
            value: "true"
          - name: ENABLE_AUTO_DRAIN
            value: "false"
          - name: DRAIN_USE_FORCE
            value: "false"
          - name: DRAIN_POD_SELECTOR_LABEL
            value: ""
          - name: DRAIN_TIMEOUT_SECONDS
            value: "0s"
          - name: DRAIN_DELETE_EMPTYDIR_DATA
            value: "false"
      resources: {}
      env: []
      # Private mirror repository configuration.
      repoConfig:
        configMapName: ""
      # Custom SSL key/certificate configuration.
      certConfig:
        name: ""
      # vGPU licensing configuration.
      licensingConfig:
        nlsEnabled: true
        configMapName: ""
      # vGPU topology daemon configuration.
      virtualTopology:
        config: ""
      # Kernel module configuration for the NVIDIA driver.
      kernelModuleConfig:
        name: ""

    # Container toolkit settings. The env entries point the toolkit at the
    # containerd config file and socket under the k3s paths.
    toolkit:
      enabled: true
      image: container-toolkit
      repository: nvcr.io/nvidia/k8s
      version: "v1.15.0-rc.4-ubuntu20.04"
      imagePullSecrets: []
      imagePullPolicy: IfNotPresent
      installDir: /usr/local/nvidia
      resources: {}
      env:
        - name: CONTAINERD_CONFIG
          value: "/var/lib/rancher/k3s/agent/etc/containerd/config.toml"
        - name: CONTAINERD_SOCKET
          value: "/run/k3s/containerd/containerd.sock"

    # Device plugin settings: image, environment, optional ConfigMap-based
    # plugin configuration and the MPS root path.
    devicePlugin:
      enabled: true
      image: k8s-device-plugin
      repository: nvcr.io/nvidia
      version: "v0.15.0-rc.2-ubi8"
      imagePullSecrets: []
      imagePullPolicy: IfNotPresent
      resources: {}
      args: []
      env:
        - name: PASS_DEVICE_SPECS
          value: "true"
        - name: FAIL_ON_INIT_ERROR
          value: "true"
        - name: DEVICE_LIST_STRATEGY
          value: "envvar"
        - name: DEVICE_ID_STRATEGY
          value: "uuid"
        - name: NVIDIA_VISIBLE_DEVICES
          value: "all"
        - name: NVIDIA_DRIVER_CAPABILITIES
          value: "all"
      # Plugin configuration.
      # "name" either points to an existing ConfigMap or names a new one to
      # be created from a list of configurations (i.e. with create=true).
      # "data" builds an integrated ConfigMap from a set of configurations as
      # part of this helm chart. An example of setting "data" might be:
      # config:
      #   name: device-plugin-config
      #   create: true
      #   data:
      #     default: |-
      #       version: v1
      #       flags:
      #         migStrategy: none
      #     mig-single: |-
      #       version: v1
      #       flags:
      #         migStrategy: single
      #     mig-mixed: |-
      #       version: v1
      #       flags:
      #         migStrategy: mixed
      config:
        # Whether to create a ConfigMap (default: false).
        create: false
        # ConfigMap name (either existing, or the one to create when create=true above).
        name: ""
        # Name of the default config within the ConfigMap.
        default: ""
        # Data for the ConfigMap to create (only applies when create=true).
        data: {}
      # MPS-related configuration for the plugin.
      mps:
        # MPS root path on the host.
        root: /run/nvidia/mps

    # Standalone dcgm hostengine.
    dcgm:
      # Off by default so the exporter uses its embedded nv-hostengine.
      enabled: false
      image: dcgm
      repository: nvcr.io/nvidia/cloud-native
      version: "3.3.3-1-ubuntu22.04"
      imagePullPolicy: IfNotPresent
      hostPort: 5555
      resources: {}
      env: []
      args: []

    # dcgm-exporter settings; the ServiceMonitor is not created.
    dcgmExporter:
      enabled: true
      image: dcgm-exporter
      repository: nvcr.io/nvidia/k8s
      version: "3.3.5-3.4.0-ubuntu22.04"
      imagePullPolicy: IfNotPresent
      resources: {}
      env:
        - name: DCGM_EXPORTER_LISTEN
          value: ":9400"
        - name: DCGM_EXPORTER_KUBERNETES
          value: "true"
        - name: DCGM_EXPORTER_COLLECTORS
          value: /etc/dcgm-exporter/dcp-metrics-included.csv
      serviceMonitor:
        enabled: false
        honorLabels: false
        interval: "15s"
        additionalLabels: {}
        relabelings: []
        # Example relabeling entry:
        # - source_labels:
        #     - __meta_kubernetes_pod_node_name
        #   regex: (.*)
        #   target_label: instance
        #   replacement: $1
        #   action: replace

    # gfd settings; it uses the k8s-device-plugin image.
    gfd:
      enabled: true
      image: k8s-device-plugin
      repository: nvcr.io/nvidia
      version: "v0.15.0-rc.2-ubi8"
      imagePullSecrets: []
      imagePullPolicy: IfNotPresent
      resources: {}
      env:
        - name: GFD_SLEEP_INTERVAL
          value: "60s"
        - name: GFD_FAIL_ON_INIT_ERROR
          value: "true"

    # MIG manager settings; the default config entry is "all-disabled".
    migManager:
      enabled: true
      image: k8s-mig-manager
      repository: nvcr.io/nvidia/cloud-native
      version: "v0.6.0-ubuntu20.04"
      imagePullSecrets: []
      imagePullPolicy: IfNotPresent
      resources: {}
      env:
        - name: WITH_REBOOT
          value: "false"
      config:
        default: all-disabled
        name: default-mig-parted-config
      gpuClientsConfig:
        name: ""

    # Node status exporter (disabled); it uses the gpu-operator-validator image.
    nodeStatusExporter:
      enabled: false
      image: gpu-operator-validator
      repository: nvcr.io/nvidia/cloud-native
      # Leaving version unset makes the chart use chart.AppVersion.
      # version: ""
      imagePullSecrets: []
      imagePullPolicy: IfNotPresent
      resources: {}

    # gds (disabled), backed by the nvidia-fs image.
    gds:
      enabled: false
      image: nvidia-fs
      repository: nvcr.io/nvidia/cloud-native
      version: "2.17.5"
      imagePullSecrets: []
      imagePullPolicy: IfNotPresent
      args: []
      env: []

    # gdrcopy (disabled), backed by the gdrdrv image.
    gdrcopy:
      enabled: false
      image: gdrdrv
      repository: nvcr.io/nvidia/cloud-native
      version: "v2.4.1"
      imagePullSecrets: []
      imagePullPolicy: IfNotPresent
      args: []
      env: []

    # vGPU manager (disabled); repository and version are left empty.
    vgpuManager:
      enabled: false
      image: vgpu-manager
      repository: ""
      version: ""
      imagePullSecrets: []
      imagePullPolicy: IfNotPresent
      resources: {}
      env: []
      driverManager:
        repository: nvcr.io/nvidia/cloud-native
        image: k8s-driver-manager
        # When choosing a different k8s-driver-manager version, DO NOT go
        # below v0.6.4, so that k8s-driver-manager stays compatible with
        # gpu-operator starting from v24.3.0.
        version: "v0.6.7"
        imagePullPolicy: IfNotPresent
        env:
          - name: ENABLE_GPU_POD_EVICTION
            value: "false"
          - name: ENABLE_AUTO_DRAIN
            value: "false"

    # vGPU device manager settings.
    vgpuDeviceManager:
      enabled: true
      image: vgpu-device-manager
      repository: nvcr.io/nvidia/cloud-native
      version: "v0.2.5"
      imagePullSecrets: []
      imagePullPolicy: IfNotPresent
      env: []
      config:
        default: default
        name: ""

    # VFIO manager settings; it runs from the cuda base image.
    vfioManager:
      enabled: true
      image: cuda
      repository: nvcr.io/nvidia
      version: "12.3.2-base-ubi8"
      imagePullSecrets: []
      imagePullPolicy: IfNotPresent
      resources: {}
      env: []
      driverManager:
        repository: nvcr.io/nvidia/cloud-native
        image: k8s-driver-manager
        # When choosing a different k8s-driver-manager version, DO NOT go
        # below v0.6.4, so that k8s-driver-manager stays compatible with
        # gpu-operator starting from v24.3.0.
        version: "v0.6.7"
        imagePullPolicy: IfNotPresent
        env:
          - name: ENABLE_GPU_POD_EVICTION
            value: "false"
          - name: ENABLE_AUTO_DRAIN
            value: "false"

    # Kata manager (disabled) together with the runtime classes it would
    # set up and where each one's artifacts are pulled from.
    kataManager:
      enabled: false
      image: k8s-kata-manager
      repository: nvcr.io/nvidia/cloud-native
      version: "v0.1.2"
      imagePullSecrets: []
      imagePullPolicy: IfNotPresent
      resources: {}
      env: []
      config:
        artifactsDir: /opt/nvidia-gpu-operator/artifacts/runtimeclasses
        runtimeClasses:
          - name: kata-qemu-nvidia-gpu
            nodeSelector: {}
            artifacts:
              pullSecret: ""
              url: "nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.54.03"
          # The SNP class is restricted to nodes labelled nvidia.com/cc.capable=true.
          - name: kata-qemu-nvidia-gpu-snp
            nodeSelector:
              nvidia.com/cc.capable: "true"
            artifacts:
              pullSecret: ""
              url: "nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.86.10-snp"

    # Sandbox device plugin, using the kubevirt-gpu-device-plugin image.
    sandboxDevicePlugin:
      enabled: true
      image: kubevirt-gpu-device-plugin
      repository: nvcr.io/nvidia
      version: "v1.2.6"
      imagePullSecrets: []
      imagePullPolicy: IfNotPresent
      resources: {}
      env: []
      args: []

    # CC manager (disabled); the default mode is the string "off".
    ccManager:
      enabled: false
      defaultMode: "off"
      image: k8s-cc-manager
      repository: nvcr.io/nvidia/cloud-native
      version: "v0.1.1"
      imagePullSecrets: []
      imagePullPolicy: IfNotPresent
      resources: {}
      env:
        - name: CC_CAPABLE_DEVICE_IDS
          value: "0x2339,0x2331,0x2330,0x2324,0x2322,0x233d"

    # Values for the node-feature-discovery settings. gc, worker and master
    # all use one "node-feature-discovery" ServiceAccount, and only the
    # master section creates it.
    node-feature-discovery:
      enableNodeFeatureApi: true
      gc:
        enable: true
        replicaCount: 1
        serviceAccount:
          create: false
          name: node-feature-discovery
      worker:
        serviceAccount:
          # Creation is off here so the ServiceAccount is not created twice;
          # the master section below creates it.
          create: false
          name: node-feature-discovery
        tolerations:
          - key: node-role.kubernetes.io/master
            operator: Equal
            value: ""
            effect: NoSchedule
          - key: node-role.kubernetes.io/control-plane
            operator: Equal
            value: ""
            effect: NoSchedule
          - key: nvidia.com/gpu
            operator: Exists
            effect: NoSchedule
        config:
          sources:
            pci:
              # PCI device classes to report, labelled by vendor only.
              deviceLabelFields:
                - vendor
              deviceClassWhitelist:
                - "02"
                - "0200"
                - "0207"
                - "0300"
                - "0302"
      master:
        serviceAccount:
          create: true
          name: node-feature-discovery
        config:
          extraLabelNs:
            - nvidia.com
          # Further options, left commented out:
          # noPublish: false
          # resourceLabels: ["nvidia.com/feature-1","nvidia.com/feature-2"]
          # enableTaints: false
          # labelWhiteList: "nvidia.com/gpu"
Add nvidia operator 2024-11-03 20:52:15 +00:00			`apiVersion: helm.toolkit.fluxcd.io/v2beta1`
			`kind: HelmRelease`
			`metadata:`
			`name: gpu-operator`
			`namespace: nvidia-system`
			`spec:`
			`chart:`
			`spec:`
			`chart: gpu-operator`
			`sourceRef:`
			`kind: HelmRepository`
			`name: nvidia-operator`
			`namespace: flux-system`
			`interval: 15m0s`
			`timeout: 5m`
			`releaseName: gpu-operator`
			`values:`
			`# Default values for gpu-operator.`
			`# This is a YAML-formatted file.`
			`# Declare variables to be passed into your templates.`

			`platform:`
			`openshift: false`

			`nfd:`
			`enabled: true`
			`nodefeaturerules: false`

			`psa:`
			`enabled: false`

			`cdi:`
			`enabled: false`
			`default: false`

			`sandboxWorkloads:`
			`enabled: false`
			`defaultWorkload: "container"`

			`daemonsets:`
			`labels: {}`
			`annotations: {}`
			`priorityClassName: system-node-critical`
			`tolerations:`
			`- key: nvidia.com/gpu`
			`operator: Exists`
			`effect: NoSchedule`
			`# configuration for controlling update strategy("OnDelete" or "RollingUpdate") of GPU Operands`
			`# note that driver Daemonset is always set with OnDelete to avoid unintended disruptions`
			`updateStrategy: "RollingUpdate"`
			`# configuration for controlling rolling update of GPU Operands`
			`rollingUpdate:`
			`# maximum number of nodes to simultaneously apply pod updates on.`
			`# can be specified either as number or percentage of nodes. Default 1.`
			`maxUnavailable: "1"`

			`validator:`
			`repository: nvcr.io/nvidia/cloud-native`
			`image: gpu-operator-validator`
			`# If version is not specified, then default is to use chart.AppVersion`
			`#version: ""`
			`imagePullPolicy: IfNotPresent`
			`imagePullSecrets: []`
			`env: []`
			`args: []`
			`resources: {}`
			`plugin:`
			`env:`
			`- name: WITH_WORKLOAD`
			`value: "false"`

			`operator:`
			`repository: nvcr.io/nvidia`
			`image: gpu-operator`
			`# If version is not specified, then default is to use chart.AppVersion`
			`#version: ""`
			`imagePullPolicy: IfNotPresent`
			`imagePullSecrets: []`
			`priorityClassName: system-node-critical`
			`defaultRuntime: docker`
			`runtimeClass: nvidia`
			`use_ocp_driver_toolkit: false`
			`# cleanup CRD on chart un-install`
			`cleanupCRD: false`
			`# upgrade CRD on chart upgrade, requires --disable-openapi-validation flag`
			`# to be passed during helm upgrade.`
			`upgradeCRD: false`
			`initContainer:`
			`image: cuda`
			`repository: nvcr.io/nvidia`
			`version: 12.3.2-base-ubi8`
			`imagePullPolicy: IfNotPresent`
			`tolerations:`
			`- key: "node-role.kubernetes.io/master"`
			`operator: "Equal"`
			`value: ""`
			`effect: "NoSchedule"`
			`- key: "node-role.kubernetes.io/control-plane"`
			`operator: "Equal"`
			`value: ""`
			`effect: "NoSchedule"`
			`annotations:`
			`openshift.io/scc: restricted-readonly`
			`affinity:`
			`nodeAffinity:`
			`preferredDuringSchedulingIgnoredDuringExecution:`
			`- weight: 1`
			`preference:`
			`matchExpressions:`
			`- key: "node-role.kubernetes.io/master"`
			`operator: In`
			`values: [""]`
			`- weight: 1`
			`preference:`
			`matchExpressions:`
			`- key: "node-role.kubernetes.io/control-plane"`
			`operator: In`
			`values: [""]`
			`logging:`
			`# Zap time encoding (one of 'epoch', 'millis', 'nano', 'iso8601', 'rfc3339' or 'rfc3339nano')`
			`timeEncoding: epoch`
			`# Zap Level to configure the verbosity of logging. Can be one of 'debug', 'info', 'error', or any integer value > 0 which corresponds to custom debug levels of increasing verbosity`
			`level: info`
			`# Development Mode defaults(encoder=consoleEncoder,logLevel=Debug,stackTraceLevel=Warn)`
			`# Production Mode defaults(encoder=jsonEncoder,logLevel=Info,stackTraceLevel=Error)`
			`develMode: false`
			`resources:`
			`limits:`
			`cpu: 500m`
			`memory: 350Mi`
			`requests:`
			`cpu: 200m`
			`memory: 100Mi`

			`mig:`
			`strategy: single`

			`driver:`
			`enabled: true`
			`nvidiaDriverCRD:`
			`enabled: false`
			`deployDefaultCR: true`
			`driverType: gpu`
			`nodeSelector: {}`
			`useOpenKernelModules: false`
			`# use pre-compiled packages for NVIDIA driver installation.`
			`# only supported for as a tech-preview feature on ubuntu22.04 kernels.`
			`usePrecompiled: false`
			`repository: nvcr.io/nvidia`
			`image: driver`
			`version: "550.54.15"`
			`imagePullPolicy: IfNotPresent`
			`imagePullSecrets: []`
			`startupProbe:`
			`initialDelaySeconds: 60`
			`periodSeconds: 10`
			`# nvidia-smi can take longer than 30s in some cases`
			`# ensure enough timeout is set`
			`timeoutSeconds: 60`
			`failureThreshold: 120`
			`rdma:`
			`enabled: false`
			`useHostMofed: false`
			`upgradePolicy:`
			`# global switch for automatic upgrade feature`
			`# if set to false all other options are ignored`
			`autoUpgrade: true`
			`# how many nodes can be upgraded in parallel`
			`# 0 means no limit, all nodes will be upgraded in parallel`
			`maxParallelUpgrades: 1`
			`# maximum number of nodes with the driver installed, that can be unavailable during`
			`# the upgrade. Value can be an absolute number (ex: 5) or`
			`# a percentage of total nodes at the start of upgrade (ex:`
			`# 10%). Absolute number is calculated from percentage by rounding`
			`# up. By default, a fixed value of 25% is used.'`
			`maxUnavailable: 25%`
			`# options for waiting on pod(job) completions`
			`waitForCompletion:`
			`timeoutSeconds: 0`
			`podSelector: ""`
			`# options for gpu pod deletion`
			`gpuPodDeletion:`
			`force: false`
			`timeoutSeconds: 300`
			`deleteEmptyDir: false`
			# options for node drain (`kubectl drain`) before the driver reload
			`# this is required only if default GPU pod deletions done by the operator`
			`# are not sufficient to re-install the driver`
			`drain:`
			`enable: false`
			`force: false`
			`podSelector: ""`
			`# It's recommended to set a timeout to avoid infinite drain in case non-fatal error keeps happening on retries`
			`timeoutSeconds: 300`
			`deleteEmptyDir: false`
			`manager:`
			`image: k8s-driver-manager`
			`repository: nvcr.io/nvidia/cloud-native`
			`# When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4`
			`# to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0`
			`version: v0.6.7`
			`imagePullPolicy: IfNotPresent`
			`env:`
			`- name: ENABLE_GPU_POD_EVICTION`
			`value: "true"`
			`- name: ENABLE_AUTO_DRAIN`
			`value: "false"`
			`- name: DRAIN_USE_FORCE`
			`value: "false"`
			`- name: DRAIN_POD_SELECTOR_LABEL`
			`value: ""`
			`- name: DRAIN_TIMEOUT_SECONDS`
			`value: "0s"`
			`- name: DRAIN_DELETE_EMPTYDIR_DATA`
			`value: "false"`
			`env: []`
			`resources: {}`
			`# Private mirror repository configuration`
			`repoConfig:`
			`configMapName: ""`
			`# custom ssl key/certificate configuration`
			`certConfig:`
			`name: ""`
			`# vGPU licensing configuration`
			`licensingConfig:`
			`configMapName: ""`
			`nlsEnabled: true`
			`# vGPU topology daemon configuration`
			`virtualTopology:`
			`config: ""`
			`# kernel module configuration for NVIDIA driver`
			`kernelModuleConfig:`
			`name: ""`

			`toolkit:`
			`enabled: true`
			`repository: nvcr.io/nvidia/k8s`
			`image: container-toolkit`
			`version: v1.15.0-rc.4-ubuntu20.04`
			`imagePullPolicy: IfNotPresent`
			`imagePullSecrets: []`
			`env:`
			`- name: CONTAINERD_CONFIG`
			`value: /var/lib/rancher/k3s/agent/etc/containerd/config.toml`
			`- name: CONTAINERD_SOCKET`
			`value: /run/k3s/containerd/containerd.sock`
			`resources: {}`
			`installDir: "/usr/local/nvidia"`

			`devicePlugin:`
			`enabled: true`
			`repository: nvcr.io/nvidia`
			`image: k8s-device-plugin`
			`version: v0.15.0-rc.2-ubi8`
			`imagePullPolicy: IfNotPresent`
			`imagePullSecrets: []`
			`args: []`
			`env:`
			`- name: PASS_DEVICE_SPECS`
			`value: "true"`
			`- name: FAIL_ON_INIT_ERROR`
			`value: "true"`
			`- name: DEVICE_LIST_STRATEGY`
			`value: envvar`
			`- name: DEVICE_ID_STRATEGY`
			`value: uuid`
			`- name: NVIDIA_VISIBLE_DEVICES`
			`value: all`
			`- name: NVIDIA_DRIVER_CAPABILITIES`
			`value: all`
			`resources: {}`
			`# Plugin configuration`
			`# Use "name" to either point to an existing ConfigMap or to create a new one with a list of configurations(i.e with create=true).`
			`# Use "data" to build an integrated ConfigMap from a set of configurations as`
			`# part of this helm chart. An example of setting "data" might be:`
			`# config:`
			`# name: device-plugin-config`
			`# create: true`
			`# data:`
			`# default: \|-`
			`# version: v1`
			`# flags:`
			`# migStrategy: none`
			`# mig-single: \|-`
			`# version: v1`
			`# flags:`
			`# migStrategy: single`
			`# mig-mixed: \|-`
			`# version: v1`
			`# flags:`
			`# migStrategy: mixed`
			`config:`
			`# Create a ConfigMap (default: false)`
			`create: false`
			`# ConfigMap name (either exiting or to create a new one with create=true above)`
			`name: ""`
			`# Default config name within the ConfigMap`
			`default: ""`
			`# Data section for the ConfigMap to create (i.e only applies when create=true)`
			`data: {}`
			`# MPS related configuration for the plugin`
			`mps:`
			`# MPS root path on the host`
			`root: "/run/nvidia/mps"`

			`# standalone dcgm hostengine`
			`dcgm:`
			`# disabled by default to use embedded nv-hostengine by exporter`
			`enabled: false`
			`repository: nvcr.io/nvidia/cloud-native`
			`image: dcgm`
			`version: 3.3.3-1-ubuntu22.04`
			`imagePullPolicy: IfNotPresent`
			`hostPort: 5555`
			`args: []`
			`env: []`
			`resources: {}`

			`dcgmExporter:`
			`enabled: true`
			`repository: nvcr.io/nvidia/k8s`
			`image: dcgm-exporter`
			`version: 3.3.5-3.4.0-ubuntu22.04`
			`imagePullPolicy: IfNotPresent`
			`env:`
			`- name: DCGM_EXPORTER_LISTEN`
			`value: ":9400"`
			`- name: DCGM_EXPORTER_KUBERNETES`
			`value: "true"`
			`- name: DCGM_EXPORTER_COLLECTORS`
			`value: "/etc/dcgm-exporter/dcp-metrics-included.csv"`
			`resources: {}`
			`serviceMonitor:`
			`enabled: false`
			`interval: 15s`
			`honorLabels: false`
			`additionalLabels: {}`
			`relabelings: []`
			`# - source_labels:`
			`# - __meta_kubernetes_pod_node_name`
			`# regex: (.*)`
			`# target_label: instance`
			`# replacement: $1`
			`# action: replace`

			`gfd:`
			`enabled: true`
			`repository: nvcr.io/nvidia`
			`image: k8s-device-plugin`
			`version: v0.15.0-rc.2-ubi8`
			`imagePullPolicy: IfNotPresent`
			`imagePullSecrets: []`
			`env:`
			`- name: GFD_SLEEP_INTERVAL`
			`value: 60s`
			`- name: GFD_FAIL_ON_INIT_ERROR`
			`value: "true"`
			`resources: {}`

			`migManager:`
			`enabled: true`
			`repository: nvcr.io/nvidia/cloud-native`
			`image: k8s-mig-manager`
			`version: v0.6.0-ubuntu20.04`
			`imagePullPolicy: IfNotPresent`
			`imagePullSecrets: []`
			`env:`
			`- name: WITH_REBOOT`
			`value: "false"`
			`resources: {}`
			`config:`
			`name: "default-mig-parted-config"`
			`default: "all-disabled"`
			`gpuClientsConfig:`
			`name: ""`

			`nodeStatusExporter:`
			`enabled: false`
			`repository: nvcr.io/nvidia/cloud-native`
			`image: gpu-operator-validator`
			`# If version is not specified, then default is to use chart.AppVersion`
			`#version: ""`
			`imagePullPolicy: IfNotPresent`
			`imagePullSecrets: []`
			`resources: {}`

			`gds:`
			`enabled: false`
			`repository: nvcr.io/nvidia/cloud-native`
			`image: nvidia-fs`
			`version: "2.17.5"`
			`imagePullPolicy: IfNotPresent`
			`imagePullSecrets: []`
			`env: []`
			`args: []`

			`gdrcopy:`
			`enabled: false`
			`repository: nvcr.io/nvidia/cloud-native`
			`image: gdrdrv`
			`version: "v2.4.1"`
			`imagePullPolicy: IfNotPresent`
			`imagePullSecrets: []`
			`env: []`
			`args: []`

			`vgpuManager:`
			`enabled: false`
			`repository: ""`
			`image: vgpu-manager`
			`version: ""`
			`imagePullPolicy: IfNotPresent`
			`imagePullSecrets: []`
			`env: []`
			`resources: {}`
			`driverManager:`
			`image: k8s-driver-manager`
			`repository: nvcr.io/nvidia/cloud-native`
			`# When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4`
			`# to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0`
			`version: v0.6.7`
			`imagePullPolicy: IfNotPresent`
			`env:`
			`- name: ENABLE_GPU_POD_EVICTION`
			`value: "false"`
			`- name: ENABLE_AUTO_DRAIN`
			`value: "false"`

			`vgpuDeviceManager:`
			`enabled: true`
			`repository: nvcr.io/nvidia/cloud-native`
			`image: vgpu-device-manager`
			`version: "v0.2.5"`
			`imagePullPolicy: IfNotPresent`
			`imagePullSecrets: []`
			`env: []`
			`config:`
			`name: ""`
			`default: "default"`

			`vfioManager:`
			`enabled: true`
			`repository: nvcr.io/nvidia`
			`image: cuda`
			`version: 12.3.2-base-ubi8`
			`imagePullPolicy: IfNotPresent`
			`imagePullSecrets: []`
			`env: []`
			`resources: {}`
			`driverManager:`
			`image: k8s-driver-manager`
			`repository: nvcr.io/nvidia/cloud-native`
			`# When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4`
			`# to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0`
			`version: v0.6.7`
			`imagePullPolicy: IfNotPresent`
			`env:`
			`- name: ENABLE_GPU_POD_EVICTION`
			`value: "false"`
			`- name: ENABLE_AUTO_DRAIN`
			`value: "false"`

			`kataManager:`
			`enabled: false`
			`config:`
			`artifactsDir: "/opt/nvidia-gpu-operator/artifacts/runtimeclasses"`
			`runtimeClasses:`
			`- name: kata-qemu-nvidia-gpu`
			`nodeSelector: {}`
			`artifacts:`
			`url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.54.03`
			`pullSecret: ""`
			`- name: kata-qemu-nvidia-gpu-snp`
			`nodeSelector:`
			`"nvidia.com/cc.capable": "true"`
			`artifacts:`
			`url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.86.10-snp`
			`pullSecret: ""`
			`repository: nvcr.io/nvidia/cloud-native`
			`image: k8s-kata-manager`
			`version: v0.1.2`
			`imagePullPolicy: IfNotPresent`
			`imagePullSecrets: []`
			`env: []`
			`resources: {}`

			`sandboxDevicePlugin:`
			`enabled: true`
			`repository: nvcr.io/nvidia`
			`image: kubevirt-gpu-device-plugin`
			`version: v1.2.6`
			`imagePullPolicy: IfNotPresent`
			`imagePullSecrets: []`
			`args: []`
			`env: []`
			`resources: {}`

			`ccManager:`
			`enabled: false`
			`defaultMode: "off"`
			`repository: nvcr.io/nvidia/cloud-native`
			`image: k8s-cc-manager`
			`version: v0.1.1`
			`imagePullPolicy: IfNotPresent`
			`imagePullSecrets: []`
			`env:`
			`- name: CC_CAPABLE_DEVICE_IDS`
			`value: "0x2339,0x2331,0x2330,0x2324,0x2322,0x233d"`
			`resources: {}`

			`node-feature-discovery:`
			`enableNodeFeatureApi: true`
			`gc:`
			`enable: true`
			`replicaCount: 1`
			`serviceAccount:`
			`name: node-feature-discovery`
			`create: false`
			`worker:`
			`serviceAccount:`
			`name: node-feature-discovery`
			`# disable creation to avoid duplicate serviceaccount creation by master spec below`
			`create: false`
			`tolerations:`
			`- key: "node-role.kubernetes.io/master"`
			`operator: "Equal"`
			`value: ""`
			`effect: "NoSchedule"`
			`- key: "node-role.kubernetes.io/control-plane"`
			`operator: "Equal"`
			`value: ""`
			`effect: "NoSchedule"`
			`- key: nvidia.com/gpu`
			`operator: Exists`
			`effect: NoSchedule`
			`config:`
			`sources:`
			`pci:`
			`deviceClassWhitelist:`
			`- "02"`
			`- "0200"`
			`- "0207"`
			`- "0300"`
			`- "0302"`
			`deviceLabelFields:`
			`- vendor`
			`master:`
			`serviceAccount:`
			`name: node-feature-discovery`
			`create: true`
			`config:`
			`extraLabelNs: ["nvidia.com"]`
			`# noPublish: false`
			`# resourceLabels: ["nvidia.com/feature-1","nvidia.com/feature-2"]`
			`# enableTaints: false`
			`# labelWhiteList: "nvidia.com/gpu"`