apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-server
  namespace: vllm-ns
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm-inference-server
  template:
    metadata:
      labels:
        app: vllm-inference-server
    spec:
      runtimeClassName: nvidia
      containers:
        - name: vllm-inference-server
          image: vllm/vllm-openai:v0.3.3
          imagePullPolicy: IfNotPresent
          resources:
            limits:
              nvidia.com/gpu: 2
          env:
            - name: HUGGING_FACE_HUB_TOKEN
              value: ""  # set a token that has access to the gated Mistral repo
            - name: TRANSFORMERS_CACHE
              value: /.cache
          # Debug alternative: keep the container alive without starting the server.
          # command: ["/bin/bash", "-c"]
          # args:
          #   - while true; do sleep 2600; done
          command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
          args:
            - --model=mistralai/Mistral-7B-v0.1
            - --gpu-memory-utilization=0.95
            - --disable-log-requests
            - --trust-remote-code
            - --port=8000
            - --dtype=half
            - --tensor-parallel-size=2
          ports:
            - containerPort: 8000
              name: http
          securityContext:
            runAsUser: 1000
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
            - mountPath: /.cache
              name: cache
      volumes:
        - name: cache
          emptyDir: {}
        - name: dshm
          emptyDir:
            medium: Memory
            # Shared memory for NCCL with tensor-parallel-size=2; a `shm-size`
            # env var has no effect in Kubernetes, so the intended 1g limit is
            # expressed here instead.
            sizeLimit: 1Gi
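---
# The Deployment alone is not reachable from other pods. Below is a minimal
# companion Service, sketched as an assumption (it is not part of the original
# manifest): it selects the pod label app: vllm-inference-server defined above
# and forwards cluster traffic to the container's named http port (8000). The
# name vllm-server-svc is hypothetical.
apiVersion: v1
kind: Service
metadata:
  name: vllm-server-svc
  namespace: vllm-ns
spec:
  type: ClusterIP
  selector:
    app: vllm-inference-server
  ports:
    - name: http
      protocol: TCP
      port: 8000
      targetPort: http
# Usage sketch: after `kubectl apply -f` on this file, the OpenAI-compatible
# API can be exercised from inside the cluster, e.g.:
#   curl http://vllm-server-svc.vllm-ns.svc.cluster.local:8000/v1/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "mistralai/Mistral-7B-v0.1", "prompt": "Hello", "max_tokens": 16}'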