ci: add GPU tests (#1095)

* ci: test GPU Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * ci: show logs Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Debug * debug Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * split extra/core images Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * split extra/core images Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * consider runner host dir Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-06-07 19:40:48 +00:00 · 2023-10-19 13:50:40 +02:00 · 2023-10-19 13:50:40 +02:00 · 432513c3ba
commit 432513c3ba
parent 45370c212b
6 changed files with 242 additions and 30 deletions
--- a/.github/workflows/test-gpu.yml
+++ b/.github/workflows/test-gpu.yml
@ -0,0 +1,60 @@
+---
+# Builds the LocalAI test image with BUILD_TYPE=cublas on a self-hosted runner
+# and runs the e2e suite against it through the prepare-e2e / run-e2e-image /
+# test-e2e make targets.
+name: 'GPU tests'
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+    tags:
+      - '*'
+
+concurrency:
+  group: ci-gpu-tests-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
+jobs:
+  ubuntu-latest:
+    runs-on: self-hosted
+    env:
+      # The ref name is attacker-controlled on pull requests. Hand it to the
+      # shell through the environment instead of expanding the expression
+      # inside the script text.
+      E2E_TEST_DIR: /host/tests/${{ github.head_ref || github.ref }}
+    strategy:
+      matrix:
+        go-version: ['1.21.x']
+    steps:
+      - name: Clone
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v4
+        with:
+          go-version: ${{ matrix.go-version }}
+      # You can test your matrix by printing the current Go version
+      - name: Display Go version
+        run: go version
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo DEBIAN_FRONTEND=noninteractive apt-get install -y make wget
+      - name: Build
+        run: |
+          if [ ! -e /run/systemd/system ]; then
+            # -p: also create /run/systemd when it does not exist yet
+            sudo mkdir -p /run/systemd/system
+          fi
+          make \
+            TEST_DIR="${E2E_TEST_DIR}" \
+            BUILD_TYPE=cublas \
+            prepare-e2e run-e2e-image test-e2e
+      - name: Release space from worker ♻
+        if: always()
+        run: |
+          sudo rm -rf build || true
+          sudo rm -rf bin || true
+          sudo rm -rf dist || true
+          # Best effort: with no running test container `docker logs` exits
+          # non-zero, which must not abort the rest of the cleanup.
+          sudo docker logs $(sudo docker ps -q --filter ancestor=localai-tests) > logs.txt || true
+          sudo cat logs.txt || true
+          sudo rm -rf logs.txt
+          make clean || true
+          make \
+            TEST_DIR="${E2E_TEST_DIR}" \
+            teardown-e2e || true
+          docker system prune -f -a --volumes || true
--- a/80
+++ b/80
@ -1,6 +1,9 @@
 ARG GO_VERSION=1.21-bullseye
+ARG IMAGE_TYPE=extras
+# IMAGE_TYPE selects the requirements stage that the later stages build FROM
+# (requirements-${IMAGE_TYPE}): "extras" or "core".

-FROM golang:$GO_VERSION as requirements
+
+FROM golang:$GO_VERSION as requirements-core

 ARG BUILD_TYPE
 ARG CUDA_MAJOR_VERSION=11
@ -35,24 +38,6 @@ RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
    ; fi
 ENV PATH /usr/local/cuda/bin:${PATH}

-# Extras requirements
-COPY extra/requirements.txt /build/extra/requirements.txt
-ENV PATH="/root/.cargo/bin:${PATH}"
-RUN pip install --upgrade pip
-RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
-RUN if [ "${TARGETARCH}" = "amd64" ]; then \
-        pip install git+https://github.com/suno-ai/bark.git diffusers invisible_watermark transformers accelerate safetensors;\
-    fi
-RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "amd64" ]; then \
-        pip install torch vllm && pip install auto-gptq https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}-cp39-cp39-linux_x86_64.whl;\
-    fi
-RUN pip install -r /build/extra/requirements.txt && rm -rf /build/extra/requirements.txt
-
-# Vall-e-X
-RUN git clone https://github.com/Plachtaa/VALL-E-X.git /usr/lib/vall-e-x && cd /usr/lib/vall-e-x && pip install -r requirements.txt
-
-WORKDIR /build
-
 # OpenBLAS requirements
 RUN apt-get install -y libopenblas-dev

@ -61,6 +46,8 @@ RUN apt-get install -y libopencv-dev && \
    ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2


+WORKDIR /build
+
 # piper requirements
 # Use pre-compiled Piper phonemization library (includes onnxruntime)
 #RUN if echo "${GO_TAGS}" | grep -q "tts"; then \
@ -80,17 +67,40 @@ RUN curl -L "https://github.com/gabime/spdlog/archive/refs/tags/v${SPDLOG_VERSIO
    tar -C "lib/Linux-$(uname -m)/piper_phonemize" -xzvf - && ls -liah /build/lib/Linux-$(uname -m)/piper_phonemize/ && \
    cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /usr/lib/ && \
    ln -s /usr/lib/libpiper_phonemize.so /usr/lib/libpiper_phonemize.so.1 && \
-    cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/
+    cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/ && \
+    rm spdlog-${SPDLOG_VERSION} -rf && \
+    rm /build/lib/Linux-$(uname -m)/piper_phonemize -rf
+
+# Extras requirements: everything from requirements-core plus the Python
+# packages, the Rust toolchain and VALL-E-X installed below. Used when
+# IMAGE_TYPE=extras.
+# NOTE(review): BUILD_TYPE, TARGETARCH and CUDA_MAJOR_VERSION/CUDA_MINOR_VERSION
+# are referenced below but no ARG for them is declared in this stage - confirm
+# they are still in scope now that this is a separate stage.
+FROM requirements-core as requirements-extras
+
+COPY extra/requirements.txt /build/extra/requirements.txt
+ENV PATH="/root/.cargo/bin:${PATH}"
+RUN pip install --upgrade pip
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+RUN if [ "${TARGETARCH}" = "amd64" ]; then \
+        pip install git+https://github.com/suno-ai/bark.git diffusers invisible_watermark transformers accelerate safetensors;\
+    fi
+RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "amd64" ]; then \
+        pip install torch vllm && pip install auto-gptq https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}-cp39-cp39-linux_x86_64.whl;\
+    fi
+RUN pip install -r /build/extra/requirements.txt && rm -rf /build/extra/requirements.txt
+
+# Vall-e-X: cloned into /usr/lib/vall-e-x; the final stage copies it from there when present
+RUN git clone https://github.com/Plachtaa/VALL-E-X.git /usr/lib/vall-e-x && cd /usr/lib/vall-e-x && pip install -r requirements.txt
+
 # \
 #    ; fi

 ###################################
 ###################################

-FROM requirements as builder
+FROM requirements-${IMAGE_TYPE} as builder

 ARG GO_TAGS="stablediffusion tts"
-
+# GRPC_BACKENDS is re-exported as an environment variable for the build;
+# BUILD_GRPC=true enables the from-source gRPC build later in this stage.
+ARG GRPC_BACKENDS
+ARG BUILD_GRPC=true
+ENV GRPC_BACKENDS=${GRPC_BACKENDS}
 ENV GO_TAGS=${GO_TAGS}
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
@ -108,10 +118,12 @@ COPY .git .
 # stablediffusion does not tolerate a newer version of abseil, build it first
 RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build

-RUN git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+# Build and install gRPC from source only when BUILD_GRPC=true. The build runs
+# inside grpc/cmake/build, so step back out of the clone before deleting it.
+RUN if [ "${BUILD_GRPC}" = "true" ]; then \
+    git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
    cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
      -DgRPC_BUILD_TESTS=OFF \
-       ../.. && make -j12 install && rm -rf grpc
+       ../.. && make -j12 install && cd ../../.. && rm -rf grpc \
+    ; fi

 # Rebuild with defaults backends
 RUN ESPEAK_DATA=/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data make build
@ -119,7 +131,7 @@ RUN ESPEAK_DATA=/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data
 ###################################
 ###################################

-FROM requirements
+FROM requirements-${IMAGE_TYPE}

 ARG FFMPEG
 ARG BUILD_TYPE
@ -129,6 +141,11 @@ ENV BUILD_TYPE=${BUILD_TYPE}
 ENV REBUILD=false
 ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz

+# NVIDIA_* settings for the final image (presumably read by the NVIDIA
+# container runtime - confirm). NVIDIA_DRIVER_CAPABILITIES and
+# NVIDIA_REQUIRE_CUDA repeat the values set in the builder stage.
+ARG CUDA_MAJOR_VERSION=11
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
+ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
+ENV NVIDIA_VISIBLE_DEVICES=all
+
 # Add FFmpeg
 RUN if [ "${FFMPEG}" = "true" ]; then \
    apt-get install -y ffmpeg \
@ -146,16 +163,19 @@ RUN make prepare-sources
 # Copy the binary
 COPY --from=builder /build/local-ai ./

-# do not let piper rebuild (requires an older version of absl)
-COPY --from=builder /build/backend-assets/grpc/piper ./backend-assets/grpc/piper
+# do not let stablediffusion rebuild (requires an older version of absl)
+COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion

 # Copy VALLE-X as it's not a real "lib"
-RUN cp -rfv /usr/lib/vall-e-x/* ./
+# /usr/lib/vall-e-x is only created by the requirements-extras stage, so the
+# copy is skipped when the directory is absent.
+RUN if [ -d /usr/lib/vall-e-x ]; then \
+    cp -rfv /usr/lib/vall-e-x/* ./ ; \
+    fi

-# To resolve exllama import error
-RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH:-$(go env GOARCH)}" = "amd64" ]; then \
+# Copy the exllama libs over to resolve the exllama import error. The check is
+# on the installed package directory, so images without exllama skip this step.
+RUN if [ -d /usr/local/lib/python3.9/dist-packages/exllama ]; then \
        cp -rfv /usr/local/lib/python3.9/dist-packages/exllama extra/grpc/exllama/;\
    fi
+
 # Define the health check command
 HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
  CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1
--- a/27
+++ b/27
@ -47,6 +47,10 @@ CUDA_LIBPATH?=/usr/local/cuda/lib64/
 GO_TAGS?=
 BUILD_ID?=git

+# Host directory for the e2e fixtures (gpu.yaml and the downloaded model);
+# run-e2e-image mounts it into the container as /models.
+TEST_DIR=/tmp/test
+
+# Random number from bash, evaluated once when the Makefile is read; used as
+# the suffix of the e2e container name.
+RANDOM := $(shell bash -c 'echo $$RANDOM')
+
 VERSION?=$(shell git describe --always --tags || echo "dev" )
 # go tool nm ./local-ai | grep Commit
 LD_FLAGS?=
@ -64,6 +68,9 @@ WHITE  := $(shell tput -Txterm setaf 7)
 CYAN   := $(shell tput -Txterm setaf 6)
 RESET  := $(shell tput -Txterm sgr0)

+# Default Docker bridge IP; test-e2e uses it to reach port 5390, which
+# run-e2e-image publishes for the test container.
+E2E_BRIDGE_IP?=172.17.0.1
+
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif
@ -329,6 +336,26 @@ test: prepare test-models/testmodel grpcs
 	$(MAKE) test-tts
 	$(MAKE) test-stablediffusion

+# End-to-end test targets: build the localai-tests image, start it, run the
+# ./tests/e2e suite against it, and clean up afterwards.
+.PHONY: prepare-e2e run-e2e-image test-e2e teardown-e2e
+
+# Stage the fixtures in TEST_DIR and build the test image. The model is
+# downloaded under a temporary name and renamed on success, so an interrupted
+# download is never mistaken for a cached model on the next run.
+prepare-e2e:
+	mkdir -p $(TEST_DIR)
+	cp -rfv $(abspath ./tests/e2e-fixtures)/gpu.yaml $(TEST_DIR)/gpu.yaml
+	test -e $(TEST_DIR)/ggllm-test-model.bin || { \
+		wget -q https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q2_K.gguf -O $(TEST_DIR)/ggllm-test-model.bin.part && \
+		mv $(TEST_DIR)/ggllm-test-model.bin.part $(TEST_DIR)/ggllm-test-model.bin; \
+	}
+	docker build --build-arg BUILD_GRPC=true --build-arg GRPC_BACKENDS="$(GRPC_BACKENDS)" --build-arg IMAGE_TYPE=core --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg CUDA_MAJOR_VERSION=11 --build-arg CUDA_MINOR_VERSION=7 --build-arg FFMPEG=true -t localai-tests .
+
+# Start the test image detached, with TEST_DIR mounted as /models and the API
+# published on host port 5390.
+run-e2e-image:
+	ls -liah $(abspath ./tests/e2e-fixtures)
+	docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests
+
+# Run the Ginkgo suite against the container started by run-e2e-image.
+test-e2e:
+	@echo 'Running e2e tests'
+	BUILD_TYPE=$(BUILD_TYPE) \
+	LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e
+
+# Remove the fixtures and stop any running test container. Succeeds when no
+# container is running, so teardown can be invoked unconditionally.
+teardown-e2e:
+	rm -rf $(TEST_DIR) || true
+	containers=$$(docker ps -q --filter ancestor=localai-tests); \
+	if [ -n "$$containers" ]; then docker stop $$containers; fi
+
+
 test-gpt4all: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts 5 -v -r ./api ./pkg
--- a/tests/e2e-fixtures/gpu.yaml
+++ b/tests/e2e-fixtures/gpu.yaml
@ -0,0 +1,17 @@
+# Model definition for the GPU e2e tests; `make prepare-e2e` copies it into
+# TEST_DIR next to the downloaded model.
+context_size: 2048
+mirostat: 2
+mirostat_tau: 5.0
+mirostat_eta: 0.1
+f16: true
+threads: 1
+# Presumably the number of layers to offload to the GPU - confirm against the
+# backend documentation.
+gpu_layers: 90
+# Model name the e2e test sends in its chat completion request.
+name: gpt-4
+mmap: true
+parameters:
+  # File name under which `make prepare-e2e` stores the downloaded model.
+  model: ggllm-test-model.bin
+  rope_freq_base: 10000
+  max_tokens: 20
+  rope_freq_scale: 1
+  temperature: 0.2
+  top_k: 40
+  top_p: 0.95
--- a/tests/e2e/e2e_suite_test.go
+++ b/tests/e2e/e2e_suite_test.go
@ -0,0 +1,18 @@
+package e2e_test
+
+import (
+	"os"
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// localAIURL is the base URL of the LocalAI API under test, read from the
+// LOCALAI_API environment variable (the test-e2e make target sets it).
+var (
+	localAIURL = os.Getenv("LOCALAI_API")
+)
+
+// TestLocalAI is the `go test` entry point for the suite: it routes Gomega
+// failures to Ginkgo's Fail handler and runs the specs of this package.
+func TestLocalAI(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "LocalAI E2E test suite")
+}
--- a/tests/e2e/e2e_test.go
+++ b/tests/e2e/e2e_test.go
@ -0,0 +1,70 @@
+package e2e_test
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"os/exec"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	openaigo "github.com/otiai10/openaigo"
+	"github.com/sashabaranov/go-openai"
+)
+
+// E2E specs: they talk to an already running LocalAI container (image
+// localai-tests) through the API at localAIURL and then inspect the
+// container logs.
+var _ = Describe("E2E test", func() {
+	var client *openai.Client
+	var client2 *openaigo.Client
+
+	Context("API with ephemeral models", func() {
+		BeforeEach(func() {
+			// Fail fast with a clear message instead of polling an empty
+			// base URL until the readiness timeout below expires.
+			Expect(localAIURL).ToNot(BeEmpty(), "LOCALAI_API must point to the LocalAI API under test")
+
+			defaultConfig := openai.DefaultConfig("")
+			defaultConfig.BaseURL = localAIURL
+
+			// client2 is configured for the same endpoint; no spec in this
+			// file uses it yet.
+			client2 = openaigo.NewClient("")
+			client2.BaseURL = defaultConfig.BaseURL
+
+			// Wait for API to be ready
+			client = openai.NewClientWithConfig(defaultConfig)
+			Eventually(func() error {
+				_, err := client.ListModels(context.TODO())
+				return err
+			}, "2m").ShouldNot(HaveOccurred())
+		})
+
+		// After each spec, read the logs of the running localai-tests
+		// container. With BUILD_TYPE=cublas they must show that CUDA was
+		// used; otherwise only the llama-cpp load messages are required.
+		AfterEach(func() {
+			cmd := exec.Command("/bin/bash", "-xce", "docker logs $(docker ps -q --filter ancestor=localai-tests)")
+			out, err := cmd.CombinedOutput()
+			logs := string(out)
+			Expect(err).ToNot(HaveOccurred(), logs)
+
+			if os.Getenv("BUILD_TYPE") == "cublas" {
+				Expect(logs).To(ContainSubstring("found 1 CUDA devices"), logs)
+				Expect(logs).To(ContainSubstring("using CUDA for GPU acceleration"), logs)
+				return
+			}
+
+			fmt.Println("Skipping GPU check")
+			Expect(logs).To(ContainSubstring("[llama-cpp] Loads OK"), logs)
+			Expect(logs).To(ContainSubstring("llama_model_loader"), logs)
+		})
+
+		Context("Generates text", func() {
+			// The request below is a plain (non-streaming) chat completion.
+			It("returns a chat completion", func() {
+				model := "gpt-4"
+				resp, err := client.CreateChatCompletion(context.TODO(),
+					openai.ChatCompletionRequest{
+						Model: model, Messages: []openai.ChatCompletionMessage{
+							{
+								Role:    "user",
+								Content: "How much is 2+2?",
+							},
+						}})
+				Expect(err).ToNot(HaveOccurred())
+				Expect(resp.Choices).To(HaveLen(1), fmt.Sprint(resp))
+				Expect(resp.Choices[0].Message.Content).To(Or(ContainSubstring("4"), ContainSubstring("four")), resp.Choices[0].Message.Content)
+			})
+		})
+	})
+})