diff --git a/vllm/vllm-deployment.yaml b/vllm/vllm-deployment.yaml
index dd9d994..9cb4da7 100644
--- a/vllm/vllm-deployment.yaml
+++ b/vllm/vllm-deployment.yaml
@@ -15,7 +15,7 @@ spec:
     spec:
       containers:
       - name: vllm-inference-server
-        image: vllm/vllm-openai
+        image: vllm/vllm-openai:v0.3.3
         imagePullPolicy: IfNotPresent
         resources:
@@ -28,16 +28,16 @@ spec:
           value: /.cache
         - name: shm-size
          value: 1g
-        command: ["/bin/bash", "-c"]
-        args:
-        - while true; do sleep 2600; done
-        #command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
-        #args: ["--model=meta-llama/Llama-2-7b-hf",
-        #       "--gpu-memory-utilization=0.95",
-        #       "--disable-log-requests",
-        #       "--trust-remote-code",
-        #       "--port=8000",
-        #       "--tensor-parallel-size=1"]
+        #command: ["/bin/bash", "-c"]
+        #args:
+        #- while true; do sleep 2600; done
+        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+        args: ["--model=meta-llama/Llama-2-7b-hf",
+               "--gpu-memory-utilization=0.95",
+               "--disable-log-requests",
+               "--trust-remote-code",
+               "--port=8000",
+               "--tensor-parallel-size=1"]
         ports:
         - containerPort: 8000
           name: http
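
With this change the container starts the vLLM OpenAI-compatible API server on port 8000 instead of idling in a sleep loop. As a quick sanity check, the endpoint can be exercised with a request like the one below (a sketch: the Deployment name and the use of kubectl port-forward are assumptions, not part of this diff):

# Forward the container port locally; "vllm-inference-server" here is illustrative,
# substitute the actual Deployment or Service name in your cluster.
kubectl port-forward deployment/vllm-inference-server 8000:8000

# Query the OpenAI-compatible completions endpoint served by vLLM.
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "meta-llama/Llama-2-7b-hf", "prompt": "Hello", "max_tokens": 16}'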