apiVersion: apps/v1
kind: Deployment
metadata:
  name: openllm-deployment
  namespace: openllm-ns
spec:
  replicas: 1
  selector:
    matchLabels:
      app: openllm
  template:
    metadata:
      labels:
        app: openllm
    spec:
      runtimeClassName: nvidia
      containers:
        - name: openllm-container
          image: ghcr.io/bentoml/openllm
          # Set this to the desired deployment model
          args: ["start", "HuggingFaceH4/zephyr-7b-beta", "--backend", "vllm"]
          env:
            - name: TRUST_REMOTE_CODE
              value: "True"
            - name: OPENLLM_DO_NOT_TRACK
              value: "True"
          ports:
            - containerPort: 3000
          resources:
            limits:
              nvidia.com/gpu: 1
      nodeSelector:
        kubernetes.io/os: linux
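---
# A minimal companion Service sketch, assuming the Deployment above: it
# routes in-cluster traffic to the OpenLLM pod on port 3000. The name
# "openllm-service" and the ClusterIP type are illustrative assumptions,
# not part of the original manifest; switch to NodePort or add an Ingress
# if the endpoint must be reachable from outside the cluster.
apiVersion: v1
kind: Service
metadata:
  name: openllm-service
  namespace: openllm-ns
spec:
  type: ClusterIP
  selector:
    app: openllm
  ports:
    - port: 3000
      targetPort: 3000
# For a quick local test once the pod is Ready, one option is:
#   kubectl port-forward -n openllm-ns svc/openllm-service 3000:3000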