From c51dc2e2d709312f293ce8a7e84b61399f52470e Mon Sep 17 00:00:00 2001 From: Tyler Perkins Date: Wed, 16 Oct 2024 09:55:38 -0400 Subject: [PATCH] Add nvidia operator --- infra/nvidia/helmrelease-nvidia-operator.yaml | 556 ++++++++++++++++++ 1 file changed, 556 insertions(+) create mode 100644 infra/nvidia/helmrelease-nvidia-operator.yaml diff --git a/infra/nvidia/helmrelease-nvidia-operator.yaml b/infra/nvidia/helmrelease-nvidia-operator.yaml new file mode 100644 index 0000000..424eace --- /dev/null +++ b/infra/nvidia/helmrelease-nvidia-operator.yaml @@ -0,0 +1,556 @@ +apiVersion: helm.toolkit.fluxcd.io/v2beta1 +kind: HelmRelease +metadata: + name: gpu-operator + namespace: nvidia-system +spec: + chart: + spec: + chart: gpu-operator + sourceRef: + kind: HelmRepository + name: nvidia-operator + namespace: flux-system + interval: 15m0s + timeout: 5m + releaseName: gpu-operator + values: + # Default values for gpu-operator. + # This is a YAML-formatted file. + # Declare variables to be passed into your templates. + + platform: + openshift: false + + nfd: + enabled: true + nodefeaturerules: false + + psa: + enabled: false + + cdi: + enabled: false + default: false + + sandboxWorkloads: + enabled: false + defaultWorkload: "container" + + daemonsets: + labels: {} + annotations: {} + priorityClassName: system-node-critical + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + # configuration for controlling update strategy("OnDelete" or "RollingUpdate") of GPU Operands + # note that driver Daemonset is always set with OnDelete to avoid unintended disruptions + updateStrategy: "RollingUpdate" + # configuration for controlling rolling update of GPU Operands + rollingUpdate: + # maximum number of nodes to simultaneously apply pod updates on. + # can be specified either as number or percentage of nodes. Default 1. + maxUnavailable: "1" + + validator: + repository: nvcr.io/nvidia/cloud-native + image: gpu-operator-validator + # If version is not specified, then default is to use chart.AppVersion + #version: "" + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + args: [] + resources: {} + plugin: + env: + - name: WITH_WORKLOAD + value: "false" + + operator: + repository: nvcr.io/nvidia + image: gpu-operator + # If version is not specified, then default is to use chart.AppVersion + #version: "" + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + priorityClassName: system-node-critical + defaultRuntime: docker + runtimeClass: nvidia + use_ocp_driver_toolkit: false + # cleanup CRD on chart un-install + cleanupCRD: false + # upgrade CRD on chart upgrade, requires --disable-openapi-validation flag + # to be passed during helm upgrade. 
+ upgradeCRD: false + initContainer: + image: cuda + repository: nvcr.io/nvidia + version: 12.3.2-base-ubi8 + imagePullPolicy: IfNotPresent + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/control-plane" + operator: "Equal" + value: "" + effect: "NoSchedule" + annotations: + openshift.io/scc: restricted-readonly + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: "node-role.kubernetes.io/master" + operator: In + values: [""] + - weight: 1 + preference: + matchExpressions: + - key: "node-role.kubernetes.io/control-plane" + operator: In + values: [""] + logging: + # Zap time encoding (one of 'epoch', 'millis', 'nano', 'iso8601', 'rfc3339' or 'rfc3339nano') + timeEncoding: epoch + # Zap Level to configure the verbosity of logging. Can be one of 'debug', 'info', 'error', or any integer value > 0 which corresponds to custom debug levels of increasing verbosity + level: info + # Development Mode defaults(encoder=consoleEncoder,logLevel=Debug,stackTraceLevel=Warn) + # Production Mode defaults(encoder=jsonEncoder,logLevel=Info,stackTraceLevel=Error) + develMode: false + resources: + limits: + cpu: 500m + memory: 350Mi + requests: + cpu: 200m + memory: 100Mi + + mig: + strategy: single + + driver: + enabled: true + nvidiaDriverCRD: + enabled: false + deployDefaultCR: true + driverType: gpu + nodeSelector: {} + useOpenKernelModules: false + # use pre-compiled packages for NVIDIA driver installation. + # only supported for as a tech-preview feature on ubuntu22.04 kernels. + usePrecompiled: false + repository: nvcr.io/nvidia + image: driver + version: "550.54.15" + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + startupProbe: + initialDelaySeconds: 60 + periodSeconds: 10 + # nvidia-smi can take longer than 30s in some cases + # ensure enough timeout is set + timeoutSeconds: 60 + failureThreshold: 120 + rdma: + enabled: false + useHostMofed: false + upgradePolicy: + # global switch for automatic upgrade feature + # if set to false all other options are ignored + autoUpgrade: true + # how many nodes can be upgraded in parallel + # 0 means no limit, all nodes will be upgraded in parallel + maxParallelUpgrades: 1 + # maximum number of nodes with the driver installed, that can be unavailable during + # the upgrade. Value can be an absolute number (ex: 5) or + # a percentage of total nodes at the start of upgrade (ex: + # 10%). Absolute number is calculated from percentage by rounding + # up. By default, a fixed value of 25% is used.' 
+ maxUnavailable: 25% + # options for waiting on pod(job) completions + waitForCompletion: + timeoutSeconds: 0 + podSelector: "" + # options for gpu pod deletion + gpuPodDeletion: + force: false + timeoutSeconds: 300 + deleteEmptyDir: false + # options for node drain (`kubectl drain`) before the driver reload + # this is required only if default GPU pod deletions done by the operator + # are not sufficient to re-install the driver + drain: + enable: false + force: false + podSelector: "" + # It's recommended to set a timeout to avoid infinite drain in case non-fatal error keeps happening on retries + timeoutSeconds: 300 + deleteEmptyDir: false + manager: + image: k8s-driver-manager + repository: nvcr.io/nvidia/cloud-native + # When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4 + # to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0 + version: v0.6.7 + imagePullPolicy: IfNotPresent + env: + - name: ENABLE_GPU_POD_EVICTION + value: "true" + - name: ENABLE_AUTO_DRAIN + value: "false" + - name: DRAIN_USE_FORCE + value: "false" + - name: DRAIN_POD_SELECTOR_LABEL + value: "" + - name: DRAIN_TIMEOUT_SECONDS + value: "0s" + - name: DRAIN_DELETE_EMPTYDIR_DATA + value: "false" + env: [] + resources: {} + # Private mirror repository configuration + repoConfig: + configMapName: "" + # custom ssl key/certificate configuration + certConfig: + name: "" + # vGPU licensing configuration + licensingConfig: + configMapName: "" + nlsEnabled: true + # vGPU topology daemon configuration + virtualTopology: + config: "" + # kernel module configuration for NVIDIA driver + kernelModuleConfig: + name: "" + + toolkit: + enabled: true + repository: nvcr.io/nvidia/k8s + image: container-toolkit + version: v1.15.0-rc.4-ubuntu20.04 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: + - name: CONTAINERD_CONFIG + value: /var/lib/rancher/k3s/agent/etc/containerd/config.toml + - name: CONTAINERD_SOCKET + value: /run/k3s/containerd/containerd.sock + resources: {} + installDir: "/usr/local/nvidia" + + devicePlugin: + enabled: true + repository: nvcr.io/nvidia + image: k8s-device-plugin + version: v0.15.0-rc.2-ubi8 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + args: [] + env: + - name: PASS_DEVICE_SPECS + value: "true" + - name: FAIL_ON_INIT_ERROR + value: "true" + - name: DEVICE_LIST_STRATEGY + value: envvar + - name: DEVICE_ID_STRATEGY + value: uuid + - name: NVIDIA_VISIBLE_DEVICES + value: all + - name: NVIDIA_DRIVER_CAPABILITIES + value: all + resources: {} + # Plugin configuration + # Use "name" to either point to an existing ConfigMap or to create a new one with a list of configurations(i.e with create=true). + # Use "data" to build an integrated ConfigMap from a set of configurations as + # part of this helm chart. 
An example of setting "data" might be: + # config: + # name: device-plugin-config + # create: true + # data: + # default: |- + # version: v1 + # flags: + # migStrategy: none + # mig-single: |- + # version: v1 + # flags: + # migStrategy: single + # mig-mixed: |- + # version: v1 + # flags: + # migStrategy: mixed + config: + # Create a ConfigMap (default: false) + create: false + # ConfigMap name (either exiting or to create a new one with create=true above) + name: "" + # Default config name within the ConfigMap + default: "" + # Data section for the ConfigMap to create (i.e only applies when create=true) + data: {} + # MPS related configuration for the plugin + mps: + # MPS root path on the host + root: "/run/nvidia/mps" + + # standalone dcgm hostengine + dcgm: + # disabled by default to use embedded nv-hostengine by exporter + enabled: false + repository: nvcr.io/nvidia/cloud-native + image: dcgm + version: 3.3.3-1-ubuntu22.04 + imagePullPolicy: IfNotPresent + hostPort: 5555 + args: [] + env: [] + resources: {} + + dcgmExporter: + enabled: true + repository: nvcr.io/nvidia/k8s + image: dcgm-exporter + version: 3.3.5-3.4.0-ubuntu22.04 + imagePullPolicy: IfNotPresent + env: + - name: DCGM_EXPORTER_LISTEN + value: ":9400" + - name: DCGM_EXPORTER_KUBERNETES + value: "true" + - name: DCGM_EXPORTER_COLLECTORS + value: "/etc/dcgm-exporter/dcp-metrics-included.csv" + resources: {} + serviceMonitor: + enabled: false + interval: 15s + honorLabels: false + additionalLabels: {} + relabelings: [] + # - source_labels: + # - __meta_kubernetes_pod_node_name + # regex: (.*) + # target_label: instance + # replacement: $1 + # action: replace + + gfd: + enabled: true + repository: nvcr.io/nvidia + image: k8s-device-plugin + version: v0.15.0-rc.2-ubi8 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: + - name: GFD_SLEEP_INTERVAL + value: 60s + - name: GFD_FAIL_ON_INIT_ERROR + value: "true" + resources: {} + + migManager: + enabled: true + repository: nvcr.io/nvidia/cloud-native + image: k8s-mig-manager + version: v0.6.0-ubuntu20.04 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: + - name: WITH_REBOOT + value: "false" + resources: {} + config: + name: "default-mig-parted-config" + default: "all-disabled" + gpuClientsConfig: + name: "" + + nodeStatusExporter: + enabled: false + repository: nvcr.io/nvidia/cloud-native + image: gpu-operator-validator + # If version is not specified, then default is to use chart.AppVersion + #version: "" + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + resources: {} + + gds: + enabled: false + repository: nvcr.io/nvidia/cloud-native + image: nvidia-fs + version: "2.17.5" + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + args: [] + + gdrcopy: + enabled: false + repository: nvcr.io/nvidia/cloud-native + image: gdrdrv + version: "v2.4.1" + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + args: [] + + vgpuManager: + enabled: false + repository: "" + image: vgpu-manager + version: "" + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + resources: {} + driverManager: + image: k8s-driver-manager + repository: nvcr.io/nvidia/cloud-native + # When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4 + # to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0 + version: v0.6.7 + imagePullPolicy: IfNotPresent + env: + - name: ENABLE_GPU_POD_EVICTION + value: "false" + - name: ENABLE_AUTO_DRAIN + value: "false" + + vgpuDeviceManager: + 
enabled: true + repository: nvcr.io/nvidia/cloud-native + image: vgpu-device-manager + version: "v0.2.5" + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + config: + name: "" + default: "default" + + vfioManager: + enabled: true + repository: nvcr.io/nvidia + image: cuda + version: 12.3.2-base-ubi8 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + resources: {} + driverManager: + image: k8s-driver-manager + repository: nvcr.io/nvidia/cloud-native + # When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4 + # to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0 + version: v0.6.7 + imagePullPolicy: IfNotPresent + env: + - name: ENABLE_GPU_POD_EVICTION + value: "false" + - name: ENABLE_AUTO_DRAIN + value: "false" + + kataManager: + enabled: false + config: + artifactsDir: "/opt/nvidia-gpu-operator/artifacts/runtimeclasses" + runtimeClasses: + - name: kata-qemu-nvidia-gpu + nodeSelector: {} + artifacts: + url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.54.03 + pullSecret: "" + - name: kata-qemu-nvidia-gpu-snp + nodeSelector: + "nvidia.com/cc.capable": "true" + artifacts: + url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.86.10-snp + pullSecret: "" + repository: nvcr.io/nvidia/cloud-native + image: k8s-kata-manager + version: v0.1.2 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + resources: {} + + sandboxDevicePlugin: + enabled: true + repository: nvcr.io/nvidia + image: kubevirt-gpu-device-plugin + version: v1.2.6 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + args: [] + env: [] + resources: {} + + ccManager: + enabled: false + defaultMode: "off" + repository: nvcr.io/nvidia/cloud-native + image: k8s-cc-manager + version: v0.1.1 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: + - name: CC_CAPABLE_DEVICE_IDS + value: "0x2339,0x2331,0x2330,0x2324,0x2322,0x233d" + resources: {} + + node-feature-discovery: + enableNodeFeatureApi: true + gc: + enable: true + replicaCount: 1 + serviceAccount: + name: node-feature-discovery + create: false + worker: + serviceAccount: + name: node-feature-discovery + # disable creation to avoid duplicate serviceaccount creation by master spec below + create: false + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/control-plane" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + config: + sources: + pci: + deviceClassWhitelist: + - "02" + - "0200" + - "0207" + - "0300" + - "0302" + deviceLabelFields: + - vendor + master: + serviceAccount: + name: node-feature-discovery + create: true + config: + extraLabelNs: ["nvidia.com"] + # noPublish: false + # resourceLabels: ["nvidia.com/feature-1","nvidia.com/feature-2"] + # enableTaints: false + # labelWhiteList: "nvidia.com/gpu"
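
The HelmRelease above points its sourceRef at a HelmRepository named nvidia-operator in the flux-system namespace and installs into nvidia-system, but neither object is included in this patch. A minimal companion manifest could look like the sketch below; the source.toolkit.fluxcd.io API version and the public NGC chart URL are assumptions and should be checked against the Flux source-controller version running in the cluster.

apiVersion: v1
kind: Namespace
metadata:
  name: nvidia-system
---
# HelmRepository matching the sourceRef used by the HelmRelease above.
apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
  name: nvidia-operator
  namespace: flux-system
spec:
  interval: 1h
  # Public NVIDIA NGC Helm repository that hosts the gpu-operator chart (assumed here).
  url: https://helm.ngc.nvidia.com/nvidia

Once the operator has reconciled, a quick end-to-end check is to schedule a one-off pod that runs nvidia-smi. The pod name is hypothetical; the image matches the CUDA base image the chart values already reference, and runtimeClassName assumes the nvidia RuntimeClass configured by the values above.

apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke-test
spec:
  restartPolicy: Never
  runtimeClassName: nvidia
  containers:
    - name: cuda
      # Same CUDA base image the chart values use for the driver init container.
      image: nvcr.io/nvidia/cuda:12.3.2-base-ubi8
      command: ["nvidia-smi"]
      resources:
        limits:
          nvidia.com/gpu: 1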