Add nvidia operator

2024-10-16 09:55:38 -04:00 · 2024-10-16 09:55:38 -04:00 · c51dc2e2d7
commit c51dc2e2d7
parent 019293c13f
1 changed files with 556 additions and 0 deletions
--- a/infra/nvidia/helmrelease-nvidia-operator.yaml
+++ b/infra/nvidia/helmrelease-nvidia-operator.yaml
@ -0,0 +1,556 @@
+# Flux HelmRelease that installs the `gpu-operator` chart from the
+# `nvidia-operator` HelmRepository (flux-system) as release `gpu-operator`.
+# NOTE(review): helm.toolkit.fluxcd.io/v2beta1 is a deprecated API version;
+# move to helm.toolkit.fluxcd.io/v2 once the cluster's Flux release serves it.
+apiVersion: helm.toolkit.fluxcd.io/v2beta1
+kind: HelmRelease
+metadata:
+  name: gpu-operator
+  namespace: nvidia-system
+spec:
+  chart:
+    spec:
+      chart: gpu-operator
+      # Pin the chart. Without a version Flux follows the newest chart in the
+      # repository, while the operand image tags under `values` stay frozen.
+      # NOTE(review): those tags look like the v24.3.0 chart defaults - confirm,
+      # and bump this field together with them.
+      version: v24.3.0
+      sourceRef:
+        kind: HelmRepository
+        name: nvidia-operator
+        namespace: flux-system
+  interval: 15m0s
+  timeout: 5m
+  releaseName: gpu-operator
+  values:
+    # Values handed to the gpu-operator chart. They started out as the chart's
+    # default values, so re-sync them whenever the chart version changes.
+    # Cluster-specific changes are called out in comments where they occur.
+
+    # Cluster-wide switches. Only Node Feature Discovery (`nfd.enabled`) is on;
+    # OpenShift mode, `psa`, `cdi` and sandbox workloads are all disabled.
+    platform:
+      openshift: false
+
+    nfd:
+      enabled: true
+      nodefeaturerules: false
+
+    psa:
+      enabled: false
+
+    cdi:
+      enabled: false
+      default: false
+
+    sandboxWorkloads:
+      enabled: false
+      defaultWorkload: "container"
+
+    # Settings shared by the operand DaemonSets.
+    daemonsets:
+      labels: {}
+      annotations: {}
+      priorityClassName: system-node-critical
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+      # Update strategy for the GPU operands: "OnDelete" or "RollingUpdate".
+      # The driver DaemonSet always uses OnDelete to avoid unintended
+      # disruptions.
+      updateStrategy: RollingUpdate
+      # Rolling-update tuning for the GPU operands.
+      rollingUpdate:
+        # Upper bound on the nodes receiving pod updates at the same time.
+        # Accepts a number or a percentage of nodes. Default 1.
+        # Kept quoted so the value stays a string.
+        maxUnavailable: "1"
+
+    # Image and runtime settings for the gpu-operator-validator image.
+    validator:
+      repository: nvcr.io/nvidia/cloud-native
+      image: gpu-operator-validator
+      # If version is not specified, then default is to use chart.AppVersion
+      #version: ""
+      imagePullPolicy: IfNotPresent
+      imagePullSecrets: []
+      env: []
+      args: []
+      resources: {}
+      plugin:
+        env:
+          # Quoted so the value is the string "false", not a YAML boolean.
+          - name: WITH_WORKLOAD
+            value: "false"
+
+    # Settings for the operator itself (image nvcr.io/nvidia/gpu-operator).
+    operator:
+      repository: nvcr.io/nvidia
+      image: gpu-operator
+      # If version is not specified, then default is to use chart.AppVersion
+      #version: ""
+      imagePullPolicy: IfNotPresent
+      imagePullSecrets: []
+      priorityClassName: system-node-critical
+      # The toolkit section of this file points at the k3s containerd config
+      # and socket, so the runtime here must be containerd as well (this
+      # was previously `docker`, which contradicted that).
+      defaultRuntime: containerd
+      runtimeClass: nvidia
+      use_ocp_driver_toolkit: false
+      # cleanup CRD on chart un-install
+      cleanupCRD: false
+      # upgrade CRD on chart upgrade, requires --disable-openapi-validation flag
+      # to be passed during helm upgrade.
+      upgradeCRD: false
+      initContainer:
+        image: cuda
+        repository: nvcr.io/nvidia
+        version: 12.3.2-base-ubi8
+        imagePullPolicy: IfNotPresent
+      # Tolerate both the legacy `master` and the current `control-plane`
+      # node-role taints.
+      tolerations:
+      - key: "node-role.kubernetes.io/master"
+        operator: "Equal"
+        value: ""
+        effect: "NoSchedule"
+      - key: "node-role.kubernetes.io/control-plane"
+        operator: "Equal"
+        value: ""
+        effect: "NoSchedule"
+      annotations:
+        openshift.io/scc: restricted-readonly
+      # Soft preference (weight 1 each) for nodes labelled as master or
+      # control-plane; scheduling elsewhere is still allowed.
+      affinity:
+        nodeAffinity:
+          preferredDuringSchedulingIgnoredDuringExecution:
+            - weight: 1
+              preference:
+                matchExpressions:
+                  - key: "node-role.kubernetes.io/master"
+                    operator: In
+                    values: [""]
+            - weight: 1
+              preference:
+                matchExpressions:
+                  - key: "node-role.kubernetes.io/control-plane"
+                    operator: In
+                    values: [""]
+      logging:
+        # Zap time encoding (one of 'epoch', 'millis', 'nano', 'iso8601', 'rfc3339' or 'rfc3339nano')
+        timeEncoding: epoch
+        # Zap Level to configure the verbosity of logging. Can be one of 'debug', 'info', 'error', or any integer value > 0 which corresponds to custom debug levels of increasing verbosity
+        level: info
+        # Development Mode defaults(encoder=consoleEncoder,logLevel=Debug,stackTraceLevel=Warn)
+        # Production Mode defaults(encoder=jsonEncoder,logLevel=Info,stackTraceLevel=Error)
+        develMode: false
+      resources:
+        limits:
+          cpu: 500m
+          memory: 350Mi
+        requests:
+          cpu: 200m
+          memory: 100Mi
+
+    # MIG strategy passed to the chart; set to "single" here.
+    mig:
+      strategy: single
+
+    # NVIDIA driver operand: image, startup probe, upgrade policy and the
+    # k8s-driver-manager helper image.
+    driver:
+      enabled: true
+      nvidiaDriverCRD:
+        enabled: false
+        deployDefaultCR: true
+        driverType: gpu
+        nodeSelector: {}
+      useOpenKernelModules: false
+      # use pre-compiled packages for NVIDIA driver installation.
+      # only supported as a tech-preview feature on ubuntu22.04 kernels.
+      usePrecompiled: false
+      repository: nvcr.io/nvidia
+      image: driver
+      # Quoted so the version stays a string.
+      version: "550.54.15"
+      imagePullPolicy: IfNotPresent
+      imagePullSecrets: []
+      startupProbe:
+        initialDelaySeconds: 60
+        periodSeconds: 10
+        # nvidia-smi can take longer than 30s in some cases
+        # ensure enough timeout is set
+        timeoutSeconds: 60
+        failureThreshold: 120
+      rdma:
+        enabled: false
+        useHostMofed: false
+      upgradePolicy:
+        # global switch for automatic upgrade feature
+        # if set to false all other options are ignored
+        autoUpgrade: true
+        # how many nodes can be upgraded in parallel
+        # 0 means no limit, all nodes will be upgraded in parallel
+        maxParallelUpgrades: 1
+        # maximum number of nodes with the driver installed, that can be unavailable during
+        # the upgrade. Value can be an absolute number (ex: 5) or
+        # a percentage of total nodes at the start of upgrade (ex:
+        # 10%). Absolute number is calculated from percentage by rounding
+        # up. By default, a fixed value of 25% is used.
+        maxUnavailable: 25%
+        # options for waiting on pod(job) completions
+        waitForCompletion:
+          timeoutSeconds: 0
+          podSelector: ""
+        # options for gpu pod deletion
+        gpuPodDeletion:
+          force: false
+          timeoutSeconds: 300
+          deleteEmptyDir: false
+        # options for node drain (`kubectl drain`) before the driver reload
+        # this is required only if default GPU pod deletions done by the operator
+        # are not sufficient to re-install the driver
+        drain:
+          enable: false
+          force: false
+          podSelector: ""
+          # It's recommended to set a timeout to avoid infinite drain in case non-fatal error keeps happening on retries
+          timeoutSeconds: 300
+          deleteEmptyDir: false
+      manager:
+        image: k8s-driver-manager
+        repository: nvcr.io/nvidia/cloud-native
+        # When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4
+        # to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0
+        version: v0.6.7
+        imagePullPolicy: IfNotPresent
+        # All values are quoted strings, including the boolean-looking ones.
+        env:
+          - name: ENABLE_GPU_POD_EVICTION
+            value: "true"
+          - name: ENABLE_AUTO_DRAIN
+            value: "false"
+          - name: DRAIN_USE_FORCE
+            value: "false"
+          - name: DRAIN_POD_SELECTOR_LABEL
+            value: ""
+          # NOTE(review): "0s" here differs from upgradePolicy.drain.timeoutSeconds
+          # (300) above - confirm which one is intended to apply.
+          - name: DRAIN_TIMEOUT_SECONDS
+            value: "0s"
+          - name: DRAIN_DELETE_EMPTYDIR_DATA
+            value: "false"
+      env: []
+      resources: {}
+      # Private mirror repository configuration
+      repoConfig:
+        configMapName: ""
+      # custom ssl key/certificate configuration
+      certConfig:
+        name: ""
+      # vGPU licensing configuration
+      licensingConfig:
+        configMapName: ""
+        nlsEnabled: true
+      # vGPU topology daemon configuration
+      virtualTopology:
+        config: ""
+      # kernel module configuration for NVIDIA driver
+      kernelModuleConfig:
+        name: ""
+    # NVIDIA container-toolkit operand.
+    toolkit:
+      enabled: true
+      repository: nvcr.io/nvidia/k8s
+      image: container-toolkit
+      version: v1.15.0-rc.4-ubuntu20.04
+      imagePullPolicy: IfNotPresent
+      imagePullSecrets: []
+      # Cluster-specific: point the toolkit at the k3s containerd config file
+      # and socket paths.
+      env:
+        - name: CONTAINERD_CONFIG
+          value: /var/lib/rancher/k3s/agent/etc/containerd/config.toml
+        - name: CONTAINERD_SOCKET
+          value: /run/k3s/containerd/containerd.sock
+      resources: {}
+      installDir: "/usr/local/nvidia"
+
+    # k8s-device-plugin operand: image, environment and optional ConfigMap.
+    devicePlugin:
+      enabled: true
+      repository: nvcr.io/nvidia
+      image: k8s-device-plugin
+      version: v0.15.0-rc.2-ubi8
+      imagePullPolicy: IfNotPresent
+      imagePullSecrets: []
+      args: []
+      # Boolean-looking values are quoted so they stay strings.
+      env:
+        - name: PASS_DEVICE_SPECS
+          value: "true"
+        - name: FAIL_ON_INIT_ERROR
+          value: "true"
+        - name: DEVICE_LIST_STRATEGY
+          value: envvar
+        - name: DEVICE_ID_STRATEGY
+          value: uuid
+        - name: NVIDIA_VISIBLE_DEVICES
+          value: all
+        - name: NVIDIA_DRIVER_CAPABILITIES
+          value: all
+      resources: {}
+      # Plugin configuration
+      # Use "name" to either point to an existing ConfigMap or to create a new one with a list of configurations (i.e. with create=true).
+      # Use "data" to build an integrated ConfigMap from a set of configurations as
+      # part of this helm chart. An example of setting "data" might be:
+      # config:
+      #   name: device-plugin-config
+      #   create: true
+      #   data:
+      #     default: |-
+      #       version: v1
+      #       flags:
+      #         migStrategy: none
+      #     mig-single: |-
+      #       version: v1
+      #       flags:
+      #         migStrategy: single
+      #     mig-mixed: |-
+      #       version: v1
+      #       flags:
+      #         migStrategy: mixed
+      config:
+        # Create a ConfigMap (default: false)
+        create: false
+        # ConfigMap name (either existing or to create a new one with create=true above)
+        name: ""
+        # Default config name within the ConfigMap
+        default: ""
+        # Data section for the ConfigMap to create (i.e. only applies when create=true)
+        data: {}
+      # MPS related configuration for the plugin
+      mps:
+        # MPS root path on the host
+        root: "/run/nvidia/mps"
+
+    # standalone dcgm hostengine
+    dcgm:
+      # disabled by default to use embedded nv-hostengine by exporter
+      enabled: false
+      repository: nvcr.io/nvidia/cloud-native
+      image: dcgm
+      version: 3.3.3-1-ubuntu22.04
+      imagePullPolicy: IfNotPresent
+      # Only relevant when `enabled` above is switched on.
+      hostPort: 5555
+      args: []
+      env: []
+      resources: {}
+
+    # dcgm-exporter operand; listens on :9400 (see DCGM_EXPORTER_LISTEN).
+    dcgmExporter:
+      enabled: true
+      repository: nvcr.io/nvidia/k8s
+      image: dcgm-exporter
+      version: 3.3.5-3.4.0-ubuntu22.04
+      imagePullPolicy: IfNotPresent
+      env:
+        # Quoted: a plain scalar starting with ":" would be fragile.
+        - name: DCGM_EXPORTER_LISTEN
+          value: ":9400"
+        - name: DCGM_EXPORTER_KUBERNETES
+          value: "true"
+        - name: DCGM_EXPORTER_COLLECTORS
+          value: "/etc/dcgm-exporter/dcp-metrics-included.csv"
+      resources: {}
+      # ServiceMonitor creation is off; the fields below only matter once
+      # `enabled` is set to true.
+      serviceMonitor:
+        enabled: false
+        interval: 15s
+        honorLabels: false
+        additionalLabels: {}
+        relabelings: []
+        # - source_labels:
+        #     - __meta_kubernetes_pod_node_name
+        #   regex: (.*)
+        #   target_label: instance
+        #   replacement: $1
+        #   action: replace
+
+    # gfd operand. It uses the same image and tag as devicePlugin, so keep
+    # the two versions in step.
+    gfd:
+      enabled: true
+      repository: nvcr.io/nvidia
+      image: k8s-device-plugin
+      version: v0.15.0-rc.2-ubi8
+      imagePullPolicy: IfNotPresent
+      imagePullSecrets: []
+      # Env values are written as quoted strings throughout.
+      env:
+        - name: GFD_SLEEP_INTERVAL
+          value: "60s"
+        - name: GFD_FAIL_ON_INIT_ERROR
+          value: "true"
+      resources: {}
+
+    # k8s-mig-manager operand.
+    migManager:
+      enabled: true
+      repository: nvcr.io/nvidia/cloud-native
+      image: k8s-mig-manager
+      version: v0.6.0-ubuntu20.04
+      imagePullPolicy: IfNotPresent
+      imagePullSecrets: []
+      env:
+        - name: WITH_REBOOT
+          value: "false"
+      resources: {}
+      # Name of the MIG config and the default profile selected from it.
+      config:
+        name: "default-mig-parted-config"
+        default: "all-disabled"
+      gpuClientsConfig:
+        name: ""
+
+    # Node status exporter (disabled). Uses the gpu-operator-validator image.
+    nodeStatusExporter:
+      enabled: false
+      repository: nvcr.io/nvidia/cloud-native
+      image: gpu-operator-validator
+      # If version is not specified, then default is to use chart.AppVersion
+      #version: ""
+      imagePullPolicy: IfNotPresent
+      imagePullSecrets: []
+      resources: {}
+
+    # gds operand (disabled); image nvidia-fs.
+    gds:
+      enabled: false
+      repository: nvcr.io/nvidia/cloud-native
+      image: nvidia-fs
+      # Quoted so the version stays a string.
+      version: "2.17.5"
+      imagePullPolicy: IfNotPresent
+      imagePullSecrets: []
+      env: []
+      args: []
+
+    # gdrcopy operand (disabled); image gdrdrv.
+    gdrcopy:
+      enabled: false
+      repository: nvcr.io/nvidia/cloud-native
+      image: gdrdrv
+      version: "v2.4.1"
+      imagePullPolicy: IfNotPresent
+      imagePullSecrets: []
+      env: []
+      args: []
+
+    # vGPU manager operand (disabled). `repository` and `version` are empty
+    # and would have to be filled in before enabling it.
+    vgpuManager:
+      enabled: false
+      repository: ""
+      image: vgpu-manager
+      version: ""
+      imagePullPolicy: IfNotPresent
+      imagePullSecrets: []
+      env: []
+      resources: {}
+      driverManager:
+        image: k8s-driver-manager
+        repository: nvcr.io/nvidia/cloud-native
+        # When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4
+        # to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0
+        version: v0.6.7
+        imagePullPolicy: IfNotPresent
+        env:
+          - name: ENABLE_GPU_POD_EVICTION
+            value: "false"
+          - name: ENABLE_AUTO_DRAIN
+            value: "false"
+
+    # vgpu-device-manager operand.
+    # NOTE(review): enabled here while sandboxWorkloads.enabled is false in
+    # this file - confirm it is only deployed for sandbox workloads.
+    vgpuDeviceManager:
+      enabled: true
+      repository: nvcr.io/nvidia/cloud-native
+      image: vgpu-device-manager
+      version: "v0.2.5"
+      imagePullPolicy: IfNotPresent
+      imagePullSecrets: []
+      env: []
+      config:
+        name: ""
+        default: "default"
+
+    # VFIO manager operand; runs from the cuda base image with a
+    # k8s-driver-manager helper image.
+    # NOTE(review): enabled here while sandboxWorkloads.enabled is false in
+    # this file - confirm it is only deployed for sandbox workloads.
+    vfioManager:
+      enabled: true
+      repository: nvcr.io/nvidia
+      image: cuda
+      version: 12.3.2-base-ubi8
+      imagePullPolicy: IfNotPresent
+      imagePullSecrets: []
+      env: []
+      resources: {}
+      driverManager:
+        image: k8s-driver-manager
+        repository: nvcr.io/nvidia/cloud-native
+        # When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4
+        # to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0
+        version: v0.6.7
+        imagePullPolicy: IfNotPresent
+        env:
+          - name: ENABLE_GPU_POD_EVICTION
+            value: "false"
+          - name: ENABLE_AUTO_DRAIN
+            value: "false"
+
+    # Kata manager operand (disabled). `config` lists the runtime classes and
+    # the artifact image each one pulls.
+    kataManager:
+      enabled: false
+      config:
+        artifactsDir: /opt/nvidia-gpu-operator/artifacts/runtimeclasses
+        runtimeClasses:
+          # Runtime class with no node selector.
+          - name: kata-qemu-nvidia-gpu
+            nodeSelector: {}
+            artifacts:
+              url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.54.03
+              pullSecret: ""
+          # SNP variant, selected only on nodes labelled cc.capable=true.
+          - name: kata-qemu-nvidia-gpu-snp
+            nodeSelector:
+              # Label value is the string "true", hence the quotes.
+              nvidia.com/cc.capable: "true"
+            artifacts:
+              url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.86.10-snp
+              pullSecret: ""
+      repository: nvcr.io/nvidia/cloud-native
+      image: k8s-kata-manager
+      version: v0.1.2
+      imagePullPolicy: IfNotPresent
+      imagePullSecrets: []
+      env: []
+      resources: {}
+
+    # Sandbox device plugin; image kubevirt-gpu-device-plugin.
+    # NOTE(review): enabled here while sandboxWorkloads.enabled is false in
+    # this file - confirm it is only deployed for sandbox workloads.
+    sandboxDevicePlugin:
+      enabled: true
+      repository: nvcr.io/nvidia
+      image: kubevirt-gpu-device-plugin
+      version: v1.2.6
+      imagePullPolicy: IfNotPresent
+      imagePullSecrets: []
+      args: []
+      env: []
+      resources: {}
+
+    # k8s-cc-manager operand (disabled).
+    ccManager:
+      enabled: false
+      # Quoted: a bare `off` is read as boolean false by YAML 1.1 parsers.
+      defaultMode: "off"
+      repository: nvcr.io/nvidia/cloud-native
+      image: k8s-cc-manager
+      version: v0.1.1
+      imagePullPolicy: IfNotPresent
+      imagePullSecrets: []
+      env:
+        # Comma-separated hex device IDs, kept as one quoted string.
+        - name: CC_CAPABLE_DEVICE_IDS
+          value: "0x2339,0x2331,0x2330,0x2324,0x2322,0x233d"
+      resources: {}
+
+    # Values for the node-feature-discovery section. gc, worker and master
+    # all use one ServiceAccount named `node-feature-discovery`; only `master`
+    # creates it.
+    node-feature-discovery:
+      enableNodeFeatureApi: true
+      gc:
+        enable: true
+        replicaCount: 1
+        serviceAccount:
+          name: node-feature-discovery
+          create: false
+      worker:
+        serviceAccount:
+          name: node-feature-discovery
+          # Not created here: the master spec below creates the shared
+          # ServiceAccount, and creating it twice would clash.
+          create: false
+        # Tolerate the master and control-plane node-role taints and the
+        # nvidia.com/gpu taint.
+        tolerations:
+          - key: node-role.kubernetes.io/master
+            operator: Equal
+            value: ""
+            effect: NoSchedule
+          - key: node-role.kubernetes.io/control-plane
+            operator: Equal
+            value: ""
+            effect: NoSchedule
+          - key: nvidia.com/gpu
+            operator: Exists
+            effect: NoSchedule
+        config:
+          sources:
+            pci:
+              # PCI class codes. They must stay quoted, otherwise YAML reads
+              # them as integers and drops the leading zero.
+              deviceClassWhitelist:
+                - "02"
+                - "0200"
+                - "0207"
+                - "0300"
+                - "0302"
+              deviceLabelFields:
+                - vendor
+      master:
+        serviceAccount:
+          name: node-feature-discovery
+          create: true
+        config:
+          extraLabelNs:
+            - nvidia.com
+          # noPublish: false
+          # resourceLabels: ["nvidia.com/feature-1","nvidia.com/feature-2"]
+          # enableTaints: false
+          # labelWhiteList: "nvidia.com/gpu"