Add back vllm
All checks were successful (continuous-integration/drone/push: build passing)

Tyler Perkins 2024-03-30 22:25:35 -04:00
parent d0717f588d
commit efa0019bd1
Signed by: tyler
GPG Key ID: 03B27509E17EFDC8
2 changed files with 66 additions and 0 deletions

vllm/vllm-deployment.yaml (new file, 54 additions)

@@ -0,0 +1,54 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-server
  namespace: vllm-ns
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm-inference-server
  template:
    metadata:
      labels:
        app: vllm-inference-server
    spec:
      containers:
        - name: vllm-inference-server
          image: vllm/vllm-openai
          imagePullPolicy: IfNotPresent
          resources:
            limits:
              nvidia.com/gpu: 1
          env:
            - name: HUGGING_FACE_HUB_TOKEN
              value: ""
            - name: TRANSFORMERS_CACHE
              value: /.cache
            - name: shm-size
              value: 1g
          command: ["watch", "/bin/ls"]
          #command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
          #args: ["--model=meta-llama/Llama-2-7b-hf",
          #       "--gpu-memory-utilization=0.95",
          #       "--disable-log-requests",
          #       "--trust-remote-code",
          #       "--port=8000",
          #       "--tensor-parallel-size=1"]
          ports:
            - containerPort: 8000
              name: http
          securityContext:
            runAsUser: 1000
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
            - mountPath: /.cache
              name: cache
      volumes:
        - name: cache
          emptyDir: {}
        - name: dshm
          emptyDir:
            medium: Memory
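
Note: HUGGING_FACE_HUB_TOKEN is committed above as an empty literal. A minimal sketch of sourcing it from a Kubernetes Secret instead, assuming a Secret named hf-token with a token key exists in vllm-ns (both names are hypothetical, not part of this commit):

          env:
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token   # hypothetical Secret name, create separately
                  key: token       # hypothetical key holding the HF access token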

vllm/vllm-service.yaml (new file, 12 additions)

@@ -0,0 +1,12 @@
apiVersion: v1
kind: Service
metadata:
  name: vllm-inference-server
  namespace: vllm-ns
spec:
  selector:
    app: vllm-inference-server
  type: LoadBalancer
  ports:
    - port: 8000
      targetPort: http
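
Both manifests reference the vllm-ns namespace, which is not created by this commit. A minimal sketch of the corresponding Namespace resource, in case it is not already managed elsewhere in the cluster:

# Sketch only: namespace referenced by the Deployment and Service above,
# assuming it is not defined in another manifest.
apiVersion: v1
kind: Namespace
metadata:
  name: vllm-ns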