From efa0019bd10da602ddf70c967ea1e98898a14bda Mon Sep 17 00:00:00 2001
From: Tyler Perkins
Date: Sat, 30 Mar 2024 22:25:35 -0400
Subject: [PATCH] Add back vllm

---
 vllm/vllm-deployment.yaml | 54 +++++++++++++++++++++++++++++++++++++++
 vllm/vllm-service.yaml    | 12 +++++++++
 2 files changed, 66 insertions(+)
 create mode 100644 vllm/vllm-deployment.yaml
 create mode 100644 vllm/vllm-service.yaml

diff --git a/vllm/vllm-deployment.yaml b/vllm/vllm-deployment.yaml
new file mode 100644
index 0000000..4c8e733
--- /dev/null
+++ b/vllm/vllm-deployment.yaml
@@ -0,0 +1,54 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-server
+  namespace: vllm-ns
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-inference-server
+  template:
+    metadata:
+      labels:
+        app: vllm-inference-server
+    spec:
+      containers:
+      - name: vllm-inference-server
+        image: vllm/vllm-openai
+        imagePullPolicy: IfNotPresent
+
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+        env:
+        - name: HUGGING_FACE_HUB_TOKEN
+          value: ""
+        - name: TRANSFORMERS_CACHE
+          value: /.cache
+        - name: shm-size
+          value: 1g
+        command: ["watch", "/bin/ls"]
+        #command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+        #args: ["--model=meta-llama/Llama-2-7b-hf",
+        #       "--gpu-memory-utilization=0.95",
+        #       "--disable-log-requests",
+        #       "--trust-remote-code",
+        #       "--port=8000",
+        #       "--tensor-parallel-size=1"]
+        ports:
+        - containerPort: 8000
+          name: http
+        securityContext:
+          runAsUser: 1000
+        volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+        - mountPath: /.cache
+          name: cache
+      volumes:
+      - name: cache
+        emptyDir: {}
+      - name: dshm
+        emptyDir:
+          medium: Memory
diff --git a/vllm/vllm-service.yaml b/vllm/vllm-service.yaml
new file mode 100644
index 0000000..26ce96a
--- /dev/null
+++ b/vllm/vllm-service.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-inference-server
+  namespace: vllm-ns
+spec:
+  selector:
+    app: vllm-inference-server
+  type: LoadBalancer
+  ports:
+  - port: 8000
+    targetPort: http
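
Note (not part of the patch): a minimal sketch of how the Service added above could be smoke-tested from outside the cluster, using the openai Python client against vLLM's OpenAI-compatible API on port 8000. It assumes the commented-out api_server command/args in vllm-deployment.yaml have been re-enabled (the committed command is "watch /bin/ls", which serves no API), that the LoadBalancer has been assigned an external address (the <vllm-loadbalancer-ip> placeholder below is hypothetical), and that the model name matches the commented --model argument.

# Hypothetical smoke test for the vllm-inference-server Service.
from openai import OpenAI

# vLLM's OpenAI-compatible server does not require a real API key by default.
client = OpenAI(
    base_url="http://<vllm-loadbalancer-ip>:8000/v1",  # placeholder: LoadBalancer external address
    api_key="unused",
)

# Model name taken from the commented --model arg in the Deployment.
completion = client.completions.create(
    model="meta-llama/Llama-2-7b-hf",
    prompt="Kubernetes is",
    max_tokens=32,
)
print(completion.choices[0].text)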