---
# Deployment that serves an LLM via OpenLLM with the vLLM backend on a GPU node.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: openllm-deployment
  namespace: openllm-ns
spec:
  replicas: 1
  selector:
    matchLabels:
      app: openllm
  template:
    metadata:
      labels:
        app: openllm
    spec:
      # Requires the NVIDIA runtime class (e.g. installed by the NVIDIA GPU Operator).
      runtimeClassName: nvidia
      containers:
        - name: openllm-container
          # NOTE(review): image is untagged (implicitly :latest) — consider pinning a
          # specific version tag for reproducible deployments.
          image: ghcr.io/bentoml/openllm
          command: ["start"]
          # Kubernetes expands env references in args with $(VAR), not shell-style
          # ${VAR} — the original "${MODEL_NAME}" was passed through literally.
          args: ["$(MODEL_NAME)", "--backend", "vllm"]
          env:
            # Set this to the desired deployment model.
            - name: MODEL_NAME
              value: "meta-llama/Llama-2-13b-hf"
            # Was misspelled TRUST_REMOVE_CODE, which OpenLLM never reads;
            # the variable it honors is TRUST_REMOTE_CODE.
            - name: TRUST_REMOTE_CODE
              value: "True"
            # Disable OpenLLM usage analytics.
            - name: OPENLLM_DO_NOT_TRACK
              value: "True"
          ports:
            - containerPort: 3000
          resources:
            limits:
              # Request exactly one GPU via the NVIDIA device plugin.
              nvidia.com/gpu: 1
      nodeSelector:
        kubernetes.io/os: linux