Add nvidia operator
This commit is contained in:
parent
019293c13f
commit
c51dc2e2d7
556
infra/nvidia/helmrelease-nvidia-operator.yaml
Normal file
556
infra/nvidia/helmrelease-nvidia-operator.yaml
Normal file
@ -0,0 +1,556 @@
|
||||
---
# Flux HelmRelease for the gpu-operator chart, released as "gpu-operator"
# into the nvidia-system namespace. The chart is resolved through the
# nvidia-operator HelmRepository that lives in flux-system.
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: gpu-operator
  namespace: nvidia-system
spec:
  chart:
    spec:
      chart: gpu-operator
      # Pin the chart release so an automatic chart upgrade cannot drift away
      # from the operand image tags that are pinned under values.
      # NOTE(review): v24.3.0 is the operator release named in the
      # k8s-driver-manager comments of this file; confirm it is the release
      # these values were copied from before merging.
      version: v24.3.0
      sourceRef:
        kind: HelmRepository
        name: nvidia-operator
        namespace: flux-system
  # Reconciliation interval and per-action Helm timeout.
  interval: 15m0s
  timeout: 5m
  releaseName: gpu-operator
  values:
|
||||
# Default values for gpu-operator.
|
||||
# This is a YAML-formatted file.
|
||||
# Declare variables to be passed into your templates.
|
||||
|
||||
# Platform switches; the openshift flag is off.
platform:
  openshift: false
|
||||
|
||||
# nfd toggles (presumably Node Feature Discovery, cf. the
# node-feature-discovery key in this file - confirm).
nfd:
  enabled: true
  nodefeaturerules: false
|
||||
|
||||
# psa toggle, left disabled.
psa:
  enabled: false
|
||||
|
||||
# cdi toggles; neither enabled nor used as the default.
cdi:
  enabled: false
  default: false
|
||||
|
||||
# Sandbox workloads are disabled; the default workload type is "container".
sandboxWorkloads:
  enabled: false
  defaultWorkload: container
|
||||
|
||||
# Settings under the daemonsets key: metadata, priority class, tolerations
# and the update behaviour of the GPU operands.
daemonsets:
  labels: {}
  annotations: {}
  priorityClassName: system-node-critical
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
  # Update strategy of the GPU operands: either 'OnDelete' or 'RollingUpdate'.
  # The driver DaemonSet always uses OnDelete so that it is never disrupted
  # unintentionally.
  updateStrategy: RollingUpdate
  # Tuning for rolling updates of the GPU operands.
  rollingUpdate:
    # Upper bound on the nodes that receive pod updates at the same time.
    # Accepts a number or a percentage of nodes. Default 1.
    maxUnavailable: '1'
|
||||
|
||||
# gpu-operator-validator image settings plus the env of its plugin check.
validator:
  repository: nvcr.io/nvidia/cloud-native
  image: gpu-operator-validator
  # When no version is given, chart.AppVersion is used.
  # version: ""
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env: []
  args: []
  resources: {}
  plugin:
    env:
      - name: WITH_WORKLOAD
        value: 'false'
|
||||
|
||||
# Settings for the gpu-operator controller itself: image, scheduling,
# CRD lifecycle, logging and resource budget.
operator:
  repository: nvcr.io/nvidia
  image: gpu-operator
  # When no version is given, chart.AppVersion is used.
  # version: ""
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  priorityClassName: system-node-critical
  # Set to containerd (was docker) so it agrees with the toolkit env in this
  # file, which points at the k3s containerd config file and socket.
  defaultRuntime: containerd
  runtimeClass: nvidia
  use_ocp_driver_toolkit: false
  # Remove the CRD when the chart is uninstalled.
  cleanupCRD: false
  # Upgrade the CRD on chart upgrade; this needs the
  # --disable-openapi-validation flag to be passed to helm upgrade.
  upgradeCRD: false
  initContainer:
    image: cuda
    repository: nvcr.io/nvidia
    version: 12.3.2-base-ubi8
    imagePullPolicy: IfNotPresent
  # Tolerate the master / control-plane NoSchedule taints.
  tolerations:
    - key: node-role.kubernetes.io/master
      operator: Equal
      value: ''
      effect: NoSchedule
    - key: node-role.kubernetes.io/control-plane
      operator: Equal
      value: ''
      effect: NoSchedule
  annotations:
    openshift.io/scc: restricted-readonly
  # Soft preference (weight 1 each) for nodes carrying the master or
  # control-plane role label.
  affinity:
    nodeAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 1
          preference:
            matchExpressions:
              - key: node-role.kubernetes.io/master
                operator: In
                values:
                  - ''
        - weight: 1
          preference:
            matchExpressions:
              - key: node-role.kubernetes.io/control-plane
                operator: In
                values:
                  - ''
  logging:
    # Zap time encoding: one of 'epoch', 'millis', 'nano', 'iso8601',
    # 'rfc3339' or 'rfc3339nano'.
    timeEncoding: epoch
    # Zap level controlling verbosity: 'debug', 'info', 'error', or any
    # integer > 0 for custom debug levels of increasing verbosity.
    level: info
    # Development mode defaults: encoder=consoleEncoder, logLevel=Debug,
    # stackTraceLevel=Warn.
    # Production mode defaults: encoder=jsonEncoder, logLevel=Info,
    # stackTraceLevel=Error.
    develMode: false
  resources:
    limits:
      cpu: 500m
      memory: 350Mi
    requests:
      cpu: 200m
      memory: 100Mi
|
||||
|
||||
# MIG strategy is set to "single".
mig:
  strategy: single
|
||||
|
||||
# Driver settings: image, startup probe, upgrade policy, the
# k8s-driver-manager helper and optional config references.
driver:
  enabled: true
  nvidiaDriverCRD:
    enabled: false
    deployDefaultCR: true
    driverType: gpu
    nodeSelector: {}
  useOpenKernelModules: false
  # Install the NVIDIA driver from pre-compiled packages.
  # Supported only as a tech-preview feature on ubuntu22.04 kernels.
  usePrecompiled: false
  repository: nvcr.io/nvidia
  image: driver
  version: '550.54.15'
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  startupProbe:
    initialDelaySeconds: 60
    periodSeconds: 10
    # nvidia-smi can need more than 30s in some cases, so keep the timeout
    # large enough.
    timeoutSeconds: 60
    failureThreshold: 120
  rdma:
    enabled: false
    useHostMofed: false
  upgradePolicy:
    # Global switch of the automatic upgrade feature; when false, every
    # other option in this section is ignored.
    autoUpgrade: true
    # Number of nodes that may be upgraded in parallel.
    # 0 means no limit: all nodes are upgraded in parallel.
    maxParallelUpgrades: 1
    # Maximum number of nodes with the driver installed that may be
    # unavailable during the upgrade. Either an absolute number (ex: 5) or a
    # percentage of the total nodes at the start of the upgrade (ex: 10%).
    # The absolute number is derived from a percentage by rounding up.
    # By default, a fixed value of 25% is used.
    maxUnavailable: '25%'
    # Options for waiting on pod (job) completions.
    waitForCompletion:
      timeoutSeconds: 0
      podSelector: ''
    # Options for GPU pod deletion.
    gpuPodDeletion:
      force: false
      timeoutSeconds: 300
      deleteEmptyDir: false
    # Options for draining the node (`kubectl drain`) before the driver is
    # reloaded. Needed only when the operator's default GPU pod deletion is
    # not enough to re-install the driver.
    drain:
      enable: false
      force: false
      podSelector: ''
      # A timeout is recommended so that a non-fatal error recurring on
      # retries cannot cause an infinite drain.
      timeoutSeconds: 300
      deleteEmptyDir: false
  manager:
    image: k8s-driver-manager
    repository: nvcr.io/nvidia/cloud-native
    # If you pick another k8s-driver-manager version, DO NOT go below v0.6.4:
    # that keeps it compatible with gpu-operator from v24.3.0 onwards.
    version: v0.6.7
    imagePullPolicy: IfNotPresent
    env:
      - name: ENABLE_GPU_POD_EVICTION
        value: 'true'
      - name: ENABLE_AUTO_DRAIN
        value: 'false'
      - name: DRAIN_USE_FORCE
        value: 'false'
      - name: DRAIN_POD_SELECTOR_LABEL
        value: ''
      - name: DRAIN_TIMEOUT_SECONDS
        value: '0s'
      - name: DRAIN_DELETE_EMPTYDIR_DATA
        value: 'false'
  env: []
  resources: {}
  # Configuration of a private mirror repository.
  repoConfig:
    configMapName: ''
  # Configuration of a custom SSL key/certificate.
  certConfig:
    name: ''
  # vGPU licensing configuration.
  licensingConfig:
    configMapName: ''
    nlsEnabled: true
  # vGPU topology daemon configuration.
  virtualTopology:
    config: ''
  # Kernel module configuration for the NVIDIA driver.
  kernelModuleConfig:
    name: ''
|
||||
|
||||
# container-toolkit image settings and install location.
toolkit:
  enabled: true
  repository: nvcr.io/nvidia/k8s
  image: container-toolkit
  version: v1.15.0-rc.4-ubuntu20.04
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  # Containerd config file and socket, both given as k3s paths.
  env:
    - name: CONTAINERD_CONFIG
      value: /var/lib/rancher/k3s/agent/etc/containerd/config.toml
    - name: CONTAINERD_SOCKET
      value: /run/k3s/containerd/containerd.sock
  resources: {}
  installDir: /usr/local/nvidia
|
||||
|
||||
# k8s-device-plugin image, environment and optional ConfigMap/MPS settings.
devicePlugin:
  enabled: true
  repository: nvcr.io/nvidia
  image: k8s-device-plugin
  version: v0.15.0-rc.2-ubi8
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  args: []
  env:
    - name: PASS_DEVICE_SPECS
      value: 'true'
    - name: FAIL_ON_INIT_ERROR
      value: 'true'
    - name: DEVICE_LIST_STRATEGY
      value: envvar
    - name: DEVICE_ID_STRATEGY
      value: uuid
    - name: NVIDIA_VISIBLE_DEVICES
      value: all
    - name: NVIDIA_DRIVER_CAPABILITIES
      value: all
  resources: {}
  # Plugin configuration.
  # "name" either points to an existing ConfigMap or names a new one to be
  # created with a list of configurations (i.e. with create=true).
  # "data" builds an integrated ConfigMap from a set of configurations as
  # part of this helm chart. An example of setting "data" might be:
  # config:
  #   name: device-plugin-config
  #   create: true
  #   data:
  #     default: |-
  #       version: v1
  #       flags:
  #         migStrategy: none
  #     mig-single: |-
  #       version: v1
  #       flags:
  #         migStrategy: single
  #     mig-mixed: |-
  #       version: v1
  #       flags:
  #         migStrategy: mixed
  config:
    # Whether to create a ConfigMap (default: false).
    create: false
    # Name of the ConfigMap: an existing one, or the one to create when
    # create=true above.
    name: ''
    # Name of the default config inside the ConfigMap.
    default: ''
    # Data section of the ConfigMap to create (applies only when create=true).
    data: {}
  # MPS-related configuration of the plugin.
  mps:
    # Root path of MPS on the host.
    root: /run/nvidia/mps
|
||||
|
||||
# Standalone DCGM hostengine.
dcgm:
  # Off by default, so the exporter's embedded nv-hostengine is used instead.
  enabled: false
  repository: nvcr.io/nvidia/cloud-native
  image: dcgm
  version: 3.3.3-1-ubuntu22.04
  imagePullPolicy: IfNotPresent
  hostPort: 5555
  args: []
  env: []
  resources: {}
|
||||
|
||||
# dcgm-exporter image, listener/collector environment and ServiceMonitor.
dcgmExporter:
  enabled: true
  repository: nvcr.io/nvidia/k8s
  image: dcgm-exporter
  version: 3.3.5-3.4.0-ubuntu22.04
  imagePullPolicy: IfNotPresent
  env:
    - name: DCGM_EXPORTER_LISTEN
      value: ':9400'
    - name: DCGM_EXPORTER_KUBERNETES
      value: 'true'
    - name: DCGM_EXPORTER_COLLECTORS
      value: /etc/dcgm-exporter/dcp-metrics-included.csv
  resources: {}
  serviceMonitor:
    enabled: false
    interval: 15s
    honorLabels: false
    additionalLabels: {}
    relabelings: []
    # Example relabeling entry:
    # - source_labels:
    #     - __meta_kubernetes_pod_node_name
    #   regex: (.*)
    #   target_label: instance
    #   replacement: $1
    #   action: replace
|
||||
|
||||
# gfd settings; it uses the k8s-device-plugin image.
gfd:
  enabled: true
  repository: nvcr.io/nvidia
  image: k8s-device-plugin
  version: v0.15.0-rc.2-ubi8
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env:
    - name: GFD_SLEEP_INTERVAL
      value: 60s
    - name: GFD_FAIL_ON_INIT_ERROR
      value: 'true'
  resources: {}
|
||||
|
||||
# k8s-mig-manager image and the names of its configuration objects.
migManager:
  enabled: true
  repository: nvcr.io/nvidia/cloud-native
  image: k8s-mig-manager
  version: v0.6.0-ubuntu20.04
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env:
    - name: WITH_REBOOT
      value: 'false'
  resources: {}
  config:
    name: default-mig-parted-config
    default: all-disabled
  gpuClientsConfig:
    name: ''
|
||||
|
||||
# Node status exporter (disabled); it uses the gpu-operator-validator image.
nodeStatusExporter:
  enabled: false
  repository: nvcr.io/nvidia/cloud-native
  image: gpu-operator-validator
  # When no version is given, chart.AppVersion is used.
  # version: ""
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  resources: {}
|
||||
|
||||
# gds (nvidia-fs image), disabled.
gds:
  enabled: false
  repository: nvcr.io/nvidia/cloud-native
  image: nvidia-fs
  version: '2.17.5'
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env: []
  args: []
|
||||
|
||||
# gdrcopy (gdrdrv image), disabled.
gdrcopy:
  enabled: false
  repository: nvcr.io/nvidia/cloud-native
  image: gdrdrv
  version: v2.4.1
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env: []
  args: []
|
||||
|
||||
# vGPU manager (disabled); repository and version are left empty.
vgpuManager:
  enabled: false
  repository: ''
  image: vgpu-manager
  version: ''
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env: []
  resources: {}
  driverManager:
    image: k8s-driver-manager
    repository: nvcr.io/nvidia/cloud-native
    # If you pick another k8s-driver-manager version, DO NOT go below v0.6.4:
    # that keeps it compatible with gpu-operator from v24.3.0 onwards.
    version: v0.6.7
    imagePullPolicy: IfNotPresent
    env:
      - name: ENABLE_GPU_POD_EVICTION
        value: 'false'
      - name: ENABLE_AUTO_DRAIN
        value: 'false'
|
||||
|
||||
# vgpu-device-manager image and the name of its default configuration.
vgpuDeviceManager:
  enabled: true
  repository: nvcr.io/nvidia/cloud-native
  image: vgpu-device-manager
  version: v0.2.5
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env: []
  config:
    name: ''
    default: default
|
||||
|
||||
# vfio manager (cuda base image) and its k8s-driver-manager helper.
vfioManager:
  enabled: true
  repository: nvcr.io/nvidia
  image: cuda
  version: 12.3.2-base-ubi8
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env: []
  resources: {}
  driverManager:
    image: k8s-driver-manager
    repository: nvcr.io/nvidia/cloud-native
    # If you pick another k8s-driver-manager version, DO NOT go below v0.6.4:
    # that keeps it compatible with gpu-operator from v24.3.0 onwards.
    version: v0.6.7
    imagePullPolicy: IfNotPresent
    env:
      - name: ENABLE_GPU_POD_EVICTION
        value: 'false'
      - name: ENABLE_AUTO_DRAIN
        value: 'false'
|
||||
|
||||
# Kata manager (disabled): runtime classes with their artifact images,
# followed by the k8s-kata-manager image settings.
kataManager:
  enabled: false
  config:
    artifactsDir: /opt/nvidia-gpu-operator/artifacts/runtimeclasses
    runtimeClasses:
      - name: kata-qemu-nvidia-gpu
        nodeSelector: {}
        artifacts:
          url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.54.03
          pullSecret: ''
      - name: kata-qemu-nvidia-gpu-snp
        nodeSelector:
          nvidia.com/cc.capable: 'true'
        artifacts:
          url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.86.10-snp
          pullSecret: ''
  repository: nvcr.io/nvidia/cloud-native
  image: k8s-kata-manager
  version: v0.1.2
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env: []
  resources: {}
|
||||
|
||||
# Sandbox device plugin; it uses the kubevirt-gpu-device-plugin image.
sandboxDevicePlugin:
  enabled: true
  repository: nvcr.io/nvidia
  image: kubevirt-gpu-device-plugin
  version: v1.2.6
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  args: []
  env: []
  resources: {}
|
||||
|
||||
# k8s-cc-manager (disabled); the default mode is the string "off".
ccManager:
  enabled: false
  defaultMode: 'off'
  repository: nvcr.io/nvidia/cloud-native
  image: k8s-cc-manager
  version: v0.1.1
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env:
    - name: CC_CAPABLE_DEVICE_IDS
      value: '0x2339,0x2331,0x2330,0x2324,0x2322,0x233d'
  resources: {}
|
||||
|
||||
# Values under the node-feature-discovery key: gc, worker and master.
# All three reference the same "node-feature-discovery" service account,
# which only the master section creates.
node-feature-discovery:
  enableNodeFeatureApi: true
  gc:
    enable: true
    replicaCount: 1
    serviceAccount:
      name: node-feature-discovery
      create: false
  worker:
    serviceAccount:
      name: node-feature-discovery
      # Creation is disabled here so that the service account is not created
      # twice; the master spec below creates it.
      create: false
    tolerations:
      - key: node-role.kubernetes.io/master
        operator: Equal
        value: ''
        effect: NoSchedule
      - key: node-role.kubernetes.io/control-plane
        operator: Equal
        value: ''
        effect: NoSchedule
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
    config:
      sources:
        pci:
          deviceClassWhitelist:
            - '02'
            - '0200'
            - '0207'
            - '0300'
            - '0302'
          deviceLabelFields:
            - vendor
  master:
    serviceAccount:
      name: node-feature-discovery
      create: true
    config:
      extraLabelNs:
        - nvidia.com
      # noPublish: false
      # resourceLabels: ["nvidia.com/feature-1","nvidia.com/feature-2"]
      # enableTaints: false
      # labelWhiteList: "nvidia.com/gpu"
|
Loading…
Reference in New Issue
Block a user