Add nvidia operator
This commit is contained in:
parent 2e2fd890a8
commit 3c809e6437
cluster/nvidia/helmrelease-nvidia-operator.yaml (new normal file, 556 lines)
@@ -0,0 +1,556 @@
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: gpu-operator
  namespace: nvidia-system
spec:
  chart:
    spec:
      chart: gpu-operator
      sourceRef:
        kind: HelmRepository
        name: nvidia-operator
        namespace: flux-system
  interval: 15m0s
  timeout: 5m
  releaseName: gpu-operator
  values:
    # Default values for gpu-operator.
    # This is a YAML-formatted file.
    # Declare variables to be passed into your templates.

    platform:
      openshift: false

    nfd:
      enabled: true
      nodefeaturerules: false

    psa:
      enabled: false

    cdi:
      enabled: false
      default: false

    sandboxWorkloads:
      enabled: false
      defaultWorkload: "container"

    daemonsets:
      labels: {}
      annotations: {}
      priorityClassName: system-node-critical
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      # configuration for controlling the update strategy ("OnDelete" or "RollingUpdate") of GPU operands
      # note that the driver DaemonSet is always set to OnDelete to avoid unintended disruptions
      updateStrategy: "RollingUpdate"
      # configuration for controlling the rolling update of GPU operands
      rollingUpdate:
        # maximum number of nodes to simultaneously apply pod updates on.
        # can be specified either as a number or a percentage of nodes. Default 1.
        maxUnavailable: "1"

    validator:
      repository: nvcr.io/nvidia/cloud-native
      image: gpu-operator-validator
      # If version is not specified, the default is to use chart.AppVersion
      #version: ""
      imagePullPolicy: IfNotPresent
      imagePullSecrets: []
      env: []
      args: []
      resources: {}
      plugin:
        env:
          - name: WITH_WORKLOAD
            value: "false"

    operator:
      repository: nvcr.io/nvidia
      image: gpu-operator
      # If version is not specified, the default is to use chart.AppVersion
      #version: ""
      imagePullPolicy: IfNotPresent
      imagePullSecrets: []
      priorityClassName: system-node-critical
      defaultRuntime: docker
      runtimeClass: nvidia
      use_ocp_driver_toolkit: false
      # clean up the CRD on chart uninstall
      cleanupCRD: false
      # upgrade the CRD on chart upgrade; requires the --disable-openapi-validation flag
      # to be passed during helm upgrade.
      upgradeCRD: false
      initContainer:
        image: cuda
        repository: nvcr.io/nvidia
        version: 12.3.2-base-ubi8
        imagePullPolicy: IfNotPresent
      tolerations:
        - key: "node-role.kubernetes.io/master"
          operator: "Equal"
          value: ""
          effect: "NoSchedule"
        - key: "node-role.kubernetes.io/control-plane"
          operator: "Equal"
          value: ""
          effect: "NoSchedule"
      annotations:
        openshift.io/scc: restricted-readonly
      affinity:
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 1
              preference:
                matchExpressions:
                  - key: "node-role.kubernetes.io/master"
                    operator: In
                    values: [""]
            - weight: 1
              preference:
                matchExpressions:
                  - key: "node-role.kubernetes.io/control-plane"
                    operator: In
                    values: [""]
      logging:
        # Zap time encoding (one of 'epoch', 'millis', 'nano', 'iso8601', 'rfc3339' or 'rfc3339nano')
        timeEncoding: epoch
        # Zap level to configure the verbosity of logging. Can be one of 'debug', 'info', 'error', or any integer value > 0, which corresponds to custom debug levels of increasing verbosity
        level: info
        # Development mode defaults (encoder=consoleEncoder, logLevel=Debug, stackTraceLevel=Warn)
        # Production mode defaults (encoder=jsonEncoder, logLevel=Info, stackTraceLevel=Error)
        develMode: false
      resources:
        limits:
          cpu: 500m
          memory: 350Mi
        requests:
          cpu: 200m
          memory: 100Mi

    mig:
      strategy: single

    driver:
      enabled: true
      nvidiaDriverCRD:
        enabled: false
        deployDefaultCR: true
        driverType: gpu
        nodeSelector: {}
      useOpenKernelModules: false
      # use pre-compiled packages for NVIDIA driver installation.
      # only supported as a tech-preview feature on ubuntu22.04 kernels.
      usePrecompiled: false
      repository: nvcr.io/nvidia
      image: driver
      version: "550.54.15"
      imagePullPolicy: IfNotPresent
      imagePullSecrets: []
      startupProbe:
        initialDelaySeconds: 60
        periodSeconds: 10
        # nvidia-smi can take longer than 30s in some cases;
        # ensure a sufficient timeout is set
        timeoutSeconds: 60
        failureThreshold: 120
      rdma:
        enabled: false
        useHostMofed: false
      upgradePolicy:
        # global switch for the automatic upgrade feature
        # if set to false, all other options are ignored
        autoUpgrade: true
        # how many nodes can be upgraded in parallel
        # 0 means no limit; all nodes will be upgraded in parallel
        maxParallelUpgrades: 1
        # maximum number of nodes with the driver installed that can be unavailable during
        # the upgrade. The value can be an absolute number (e.g. 5) or
        # a percentage of the total nodes at the start of the upgrade (e.g.
        # 10%). An absolute number is calculated from the percentage by rounding
        # up. By default, a fixed value of 25% is used.
        maxUnavailable: 25%
        # options for waiting on pod (job) completions
        waitForCompletion:
          timeoutSeconds: 0
          podSelector: ""
        # options for GPU pod deletion
        gpuPodDeletion:
          force: false
          timeoutSeconds: 300
          deleteEmptyDir: false
        # options for node drain (`kubectl drain`) before the driver reload;
        # this is required only if the default GPU pod deletions done by the operator
        # are not sufficient to re-install the driver
        drain:
          enable: false
          force: false
          podSelector: ""
          # It's recommended to set a timeout to avoid an infinite drain in case a non-fatal error keeps happening on retries
          timeoutSeconds: 300
          deleteEmptyDir: false
      manager:
        image: k8s-driver-manager
        repository: nvcr.io/nvidia/cloud-native
        # When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4,
        # to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0
        version: v0.6.7
        imagePullPolicy: IfNotPresent
        env:
          - name: ENABLE_GPU_POD_EVICTION
            value: "true"
          - name: ENABLE_AUTO_DRAIN
            value: "false"
          - name: DRAIN_USE_FORCE
            value: "false"
          - name: DRAIN_POD_SELECTOR_LABEL
            value: ""
          - name: DRAIN_TIMEOUT_SECONDS
            value: "0s"
          - name: DRAIN_DELETE_EMPTYDIR_DATA
            value: "false"
      env: []
      resources: {}
      # Private mirror repository configuration
      repoConfig:
        configMapName: ""
      # custom SSL key/certificate configuration
      certConfig:
        name: ""
      # vGPU licensing configuration
      licensingConfig:
        configMapName: ""
        nlsEnabled: true
      # vGPU topology daemon configuration
      virtualTopology:
        config: ""
      # kernel module configuration for the NVIDIA driver
      kernelModuleConfig:
        name: ""

    toolkit:
      enabled: true
      repository: nvcr.io/nvidia/k8s
      image: container-toolkit
      version: v1.15.0-rc.4-ubuntu20.04
      imagePullPolicy: IfNotPresent
      imagePullSecrets: []
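      # note: these two overrides point the toolkit at k3s's containerd
      # config and socket instead of the stock /etc/containerd paths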
      env:
        - name: CONTAINERD_CONFIG
          value: /var/lib/rancher/k3s/agent/etc/containerd/config.toml
        - name: CONTAINERD_SOCKET
          value: /run/k3s/containerd/containerd.sock
      resources: {}
      installDir: "/usr/local/nvidia"

    devicePlugin:
      enabled: true
      repository: nvcr.io/nvidia
      image: k8s-device-plugin
      version: v0.15.0-rc.2-ubi8
      imagePullPolicy: IfNotPresent
      imagePullSecrets: []
      args: []
      env:
        - name: PASS_DEVICE_SPECS
          value: "true"
        - name: FAIL_ON_INIT_ERROR
          value: "true"
        - name: DEVICE_LIST_STRATEGY
          value: envvar
        - name: DEVICE_ID_STRATEGY
          value: uuid
        - name: NVIDIA_VISIBLE_DEVICES
          value: all
        - name: NVIDIA_DRIVER_CAPABILITIES
          value: all
      resources: {}
      # Plugin configuration
      # Use "name" to either point to an existing ConfigMap or to create a new one with a list of configurations (i.e. with create=true).
      # Use "data" to build an integrated ConfigMap from a set of configurations as
      # part of this helm chart. An example of setting "data" might be:
      # config:
      #   name: device-plugin-config
      #   create: true
      #   data:
      #     default: |-
      #       version: v1
      #       flags:
      #         migStrategy: none
      #     mig-single: |-
      #       version: v1
      #       flags:
      #         migStrategy: single
      #     mig-mixed: |-
      #       version: v1
      #       flags:
      #         migStrategy: mixed
      config:
        # Create a ConfigMap (default: false)
        create: false
        # ConfigMap name (either existing, or to create a new one with create=true above)
        name: ""
        # Default config name within the ConfigMap
        default: ""
        # Data section for the ConfigMap to create (i.e. only applies when create=true)
        data: {}
      # MPS-related configuration for the plugin
      mps:
        # MPS root path on the host
        root: "/run/nvidia/mps"

    # standalone DCGM hostengine
    dcgm:
      # disabled by default to use the embedded nv-hostengine in the exporter
      enabled: false
      repository: nvcr.io/nvidia/cloud-native
      image: dcgm
      version: 3.3.3-1-ubuntu22.04
      imagePullPolicy: IfNotPresent
      hostPort: 5555
      args: []
      env: []
      resources: {}

    dcgmExporter:
      enabled: true
      repository: nvcr.io/nvidia/k8s
      image: dcgm-exporter
      version: 3.3.5-3.4.0-ubuntu22.04
      imagePullPolicy: IfNotPresent
      env:
        - name: DCGM_EXPORTER_LISTEN
          value: ":9400"
        - name: DCGM_EXPORTER_KUBERNETES
          value: "true"
        - name: DCGM_EXPORTER_COLLECTORS
          value: "/etc/dcgm-exporter/dcp-metrics-included.csv"
      resources: {}
      serviceMonitor:
        enabled: false
        interval: 15s
        honorLabels: false
        additionalLabels: {}
        relabelings: []
        # - source_labels:
        #     - __meta_kubernetes_pod_node_name
        #   regex: (.*)
        #   target_label: instance
        #   replacement: $1
        #   action: replace

    gfd:
      enabled: true
      repository: nvcr.io/nvidia
      image: k8s-device-plugin
      version: v0.15.0-rc.2-ubi8
      imagePullPolicy: IfNotPresent
      imagePullSecrets: []
      env:
        - name: GFD_SLEEP_INTERVAL
          value: 60s
        - name: GFD_FAIL_ON_INIT_ERROR
          value: "true"
      resources: {}

    migManager:
      enabled: true
      repository: nvcr.io/nvidia/cloud-native
      image: k8s-mig-manager
      version: v0.6.0-ubuntu20.04
      imagePullPolicy: IfNotPresent
      imagePullSecrets: []
      env:
        - name: WITH_REBOOT
          value: "false"
      resources: {}
      config:
        name: "default-mig-parted-config"
        default: "all-disabled"
      gpuClientsConfig:
        name: ""

    nodeStatusExporter:
      enabled: false
      repository: nvcr.io/nvidia/cloud-native
      image: gpu-operator-validator
      # If version is not specified, the default is to use chart.AppVersion
      #version: ""
      imagePullPolicy: IfNotPresent
      imagePullSecrets: []
      resources: {}

    gds:
      enabled: false
      repository: nvcr.io/nvidia/cloud-native
      image: nvidia-fs
      version: "2.17.5"
      imagePullPolicy: IfNotPresent
      imagePullSecrets: []
      env: []
      args: []

    gdrcopy:
      enabled: false
      repository: nvcr.io/nvidia/cloud-native
      image: gdrdrv
      version: "v2.4.1"
      imagePullPolicy: IfNotPresent
      imagePullSecrets: []
      env: []
      args: []

    vgpuManager:
      enabled: false
      repository: ""
      image: vgpu-manager
      version: ""
      imagePullPolicy: IfNotPresent
      imagePullSecrets: []
      env: []
      resources: {}
      driverManager:
        image: k8s-driver-manager
        repository: nvcr.io/nvidia/cloud-native
        # When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4,
        # to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0
        version: v0.6.7
        imagePullPolicy: IfNotPresent
        env:
          - name: ENABLE_GPU_POD_EVICTION
            value: "false"
          - name: ENABLE_AUTO_DRAIN
            value: "false"

    vgpuDeviceManager:
      enabled: true
      repository: nvcr.io/nvidia/cloud-native
      image: vgpu-device-manager
      version: "v0.2.5"
      imagePullPolicy: IfNotPresent
      imagePullSecrets: []
      env: []
      config:
        name: ""
        default: "default"

    vfioManager:
      enabled: true
      repository: nvcr.io/nvidia
      image: cuda
      version: 12.3.2-base-ubi8
      imagePullPolicy: IfNotPresent
      imagePullSecrets: []
      env: []
      resources: {}
      driverManager:
        image: k8s-driver-manager
        repository: nvcr.io/nvidia/cloud-native
        # When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4,
        # to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0
        version: v0.6.7
        imagePullPolicy: IfNotPresent
        env:
          - name: ENABLE_GPU_POD_EVICTION
            value: "false"
          - name: ENABLE_AUTO_DRAIN
            value: "false"

    kataManager:
      enabled: false
      config:
        artifactsDir: "/opt/nvidia-gpu-operator/artifacts/runtimeclasses"
        runtimeClasses:
          - name: kata-qemu-nvidia-gpu
            nodeSelector: {}
            artifacts:
              url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.54.03
              pullSecret: ""
          - name: kata-qemu-nvidia-gpu-snp
            nodeSelector:
              "nvidia.com/cc.capable": "true"
            artifacts:
              url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.86.10-snp
              pullSecret: ""
      repository: nvcr.io/nvidia/cloud-native
      image: k8s-kata-manager
      version: v0.1.2
      imagePullPolicy: IfNotPresent
      imagePullSecrets: []
      env: []
      resources: {}

    sandboxDevicePlugin:
      enabled: true
      repository: nvcr.io/nvidia
      image: kubevirt-gpu-device-plugin
      version: v1.2.6
      imagePullPolicy: IfNotPresent
      imagePullSecrets: []
      args: []
      env: []
      resources: {}

    ccManager:
      enabled: false
      defaultMode: "off"
      repository: nvcr.io/nvidia/cloud-native
      image: k8s-cc-manager
      version: v0.1.1
      imagePullPolicy: IfNotPresent
      imagePullSecrets: []
      env:
        - name: CC_CAPABLE_DEVICE_IDS
          value: "0x2339,0x2331,0x2330,0x2324,0x2322,0x233d"
      resources: {}

    node-feature-discovery:
      enableNodeFeatureApi: true
      gc:
        enable: true
        replicaCount: 1
        serviceAccount:
          name: node-feature-discovery
          create: false
      worker:
        serviceAccount:
          name: node-feature-discovery
          # disable creation to avoid duplicate ServiceAccount creation by the master spec below
          create: false
        tolerations:
          - key: "node-role.kubernetes.io/master"
            operator: "Equal"
            value: ""
            effect: "NoSchedule"
          - key: "node-role.kubernetes.io/control-plane"
            operator: "Equal"
            value: ""
            effect: "NoSchedule"
          - key: nvidia.com/gpu
            operator: Exists
            effect: NoSchedule
        config:
          sources:
            pci:
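              # PCI device classes NFD will label nodes for: 02xx are network
              # controllers (0207 is InfiniBand) and 03xx are display/3D
              # controllers, i.e. GPUs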
              deviceClassWhitelist:
                - "02"
                - "0200"
                - "0207"
                - "0300"
                - "0302"
              deviceLabelFields:
                - vendor
      master:
        serviceAccount:
          name: node-feature-discovery
          create: true
        config:
          extraLabelNs: ["nvidia.com"]
          # noPublish: false
          # resourceLabels: ["nvidia.com/feature-1","nvidia.com/feature-2"]
          # enableTaints: false
          # labelWhiteList: "nvidia.com/gpu"
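
Note: this HelmRelease pulls the gpu-operator chart from a HelmRepository named nvidia-operator in flux-system, which is not part of this commit. A minimal sketch of that source, assuming the upstream NVIDIA chart repository URL (the apiVersion may differ with the Flux version in use):

apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
  name: nvidia-operator
  namespace: flux-system
spec:
  interval: 1h
  url: https://helm.ngc.nvidia.com/nvidia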
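
Once the release reconciles, GPU scheduling can be smoke-tested with a throwaway pod that requests an nvidia.com/gpu resource. This is a sketch, with the pod name chosen for illustration and the CUDA image tag borrowed from the values above:

apiVersion: v1
kind: Pod
metadata:
  name: cuda-smoke-test
spec:
  restartPolicy: Never
  runtimeClassName: nvidia
  containers:
    - name: cuda
      image: nvcr.io/nvidia/cuda:12.3.2-base-ubi8
      command: ["nvidia-smi"]
      resources:
        limits:
          nvidia.com/gpu: 1

If the driver and toolkit operands are healthy, the pod log shows the nvidia-smi table for the node's GPU.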