# Gluttony-Cluster/vllm/vllm-deployment.yaml

apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-server
  namespace: vllm-ns
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm-inference-server
  template:
    metadata:
      labels:
        app: vllm-inference-server
    spec:
      # Use the NVIDIA runtime class so the container can see the node's GPUs.
      runtimeClassName: nvidia
      containers:
      - name: vllm-inference-server
        image: vllm/vllm-openai:latest
        imagePullPolicy: IfNotPresent
        resources:
          limits:
            nvidia.com/gpu: 2
        env:
        # Leave empty unless the model is gated on the Hugging Face Hub.
        - name: HUGGING_FACE_HUB_TOKEN
          value: ""
        # Point the Hugging Face cache at the writable emptyDir mounted below,
        # since the container runs as a non-root user.
        - name: TRANSFORMERS_CACHE
          value: /.cache
        # Debug override: keeps the container alive without starting the server.
        #command: ["/bin/bash", "-c"]
        #args:
        #- while true; do sleep 2600; done
        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
        args: ["--model=openai-community/gpt2",
               "--gpu-memory-utilization=0.95",
               "--disable-log-requests",
               "--trust-remote-code",
               "--port=8000",
               "--dtype=half",
               "--tensor-parallel-size=2"]
        ports:
        - containerPort: 8000
          name: http
        securityContext:
          runAsUser: 1000
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
        - mountPath: /.cache
          name: cache
      volumes:
      - name: cache
        emptyDir: {}
      # Memory-backed emptyDir enlarges /dev/shm for NCCL, which tensor
      # parallelism needs. A "shm-size" env var (dropped from env above) has
      # no effect in Kubernetes, so its 1g intent moves to sizeLimit here.
      - name: dshm
        emptyDir:
          medium: Memory
          sizeLimit: 1Gi
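---
# A minimal companion Service sketch, added here as an illustration rather
# than part of the original manifest: it shows one way to reach the
# OpenAI-compatible API the Deployment serves on port 8000. The Service name
# is an assumption; the selector and port come from the Deployment above.
apiVersion: v1
kind: Service
metadata:
  name: vllm-server
  namespace: vllm-ns
spec:
  selector:
    app: vllm-inference-server
  ports:
  - name: http
    port: 8000
    targetPort: 8000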