From 596cf761356e23f4325b86785056acb3a19e23a5 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Thu, 6 Jun 2024 08:40:51 +0200
Subject: [PATCH 1/2] build(intel): bundle intel variants in single-binary (#2494)

* wip: try to build also intel variants

Signed-off-by: Ettore Di Giacinto

* Add dependencies

* Select automatically intel backend

---------

Signed-off-by: Ettore Di Giacinto
---
 .github/workflows/release.yaml |  7 +++++++
 Makefile                       | 16 +++++++++++++++
 pkg/model/initializers.go      | 37 +++++++++++++++++++++++++++++-----
 3 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index aae9c6b5..aadd8685 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -70,6 +70,12 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install build-essential ffmpeg protobuf-compiler ccache
+      - name: Intel Dependencies
+        run: |
+          wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
+          echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
+          sudo apt update
+          sudo apt install -y intel-basekit
       - name: Install CUDA Dependencies
         run: |
           curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
@@ -127,6 +133,7 @@ jobs:
           export PATH=$PATH:$GOPATH/bin
           export PATH=/usr/local/cuda/bin:$PATH
           export PATH=/opt/rocm/bin:$PATH
+          source /opt/intel/oneapi/setvars.sh
           GO_TAGS=p2p make dist
       - uses: actions/upload-artifact@v4
         with:
diff --git a/Makefile b/Makefile
index 2f5b3f08..45e2d291 100644
--- a/Makefile
+++ b/Makefile
@@ -328,6 +328,8 @@ ifeq ($(OS),Darwin)
 else
 	$(MAKE) backend-assets/grpc/llama-cpp-cuda
 	$(MAKE) backend-assets/grpc/llama-cpp-hipblas
+	$(MAKE) backend-assets/grpc/llama-cpp-sycl_f16
+	$(MAKE) backend-assets/grpc/llama-cpp-sycl_f32
 endif
 	$(MAKE) build
 	mkdir -p release
@@ -720,6 +722,20 @@ backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc
 	BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-hipblas/grpc-server backend-assets/grpc/llama-cpp-hipblas
 
+backend-assets/grpc/llama-cpp-sycl_f16: backend-assets/grpc
+	cp -rf backend/cpp/llama backend/cpp/llama-sycl_f16
+	$(MAKE) -C backend/cpp/llama-sycl_f16 purge
+	$(info ${GREEN}I llama-cpp build info:sycl_f16${RESET})
+	BUILD_TYPE="sycl_f16" $(MAKE) VARIANT="llama-sycl_f16" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-sycl_f16/grpc-server backend-assets/grpc/llama-cpp-sycl_f16
+
+backend-assets/grpc/llama-cpp-sycl_f32: backend-assets/grpc
+	cp -rf backend/cpp/llama backend/cpp/llama-sycl_f32
+	$(MAKE) -C backend/cpp/llama-sycl_f32 purge
+	$(info ${GREEN}I llama-cpp build info:sycl_f32${RESET})
+	BUILD_TYPE="sycl_f32" $(MAKE) VARIANT="llama-sycl_f32" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-sycl_f32/grpc-server backend-assets/grpc/llama-cpp-sycl_f32
+
 backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-grpc
 	$(MAKE) -C backend/cpp/llama-grpc purge
diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go
index ec58c279..7572735e 100644
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -38,7 +38,10 @@ const (
 	LLamaCPPFallback = "llama-cpp-fallback"
 	LLamaCPPCUDA     = "llama-cpp-cuda"
 	LLamaCPPHipblas  = "llama-cpp-hipblas"
-	LLamaCPPGRPC     = "llama-cpp-grpc"
+	LLamaCPPSycl16   = "llama-cpp-sycl_16"
+	LLamaCPPSycl32   = "llama-cpp-sycl_32"
+
+	LLamaCPPGRPC = "llama-cpp-grpc"
 
 	Gpt4AllLlamaBackend = "gpt4all-llama"
 	Gpt4AllMptBackend   = "gpt4all-mpt"
@@ -94,7 +97,7 @@ ENTRY:
 	if autoDetect {
 		// if we find the llama.cpp variants, show them of as a single backend (llama-cpp) as later we are going to pick that up
 		// when starting the service
-		foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas := false, false, false, false, false, false
+		foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas, foundSycl16, foundSycl32 := false, false, false, false, false, false, false, false
 		if _, ok := backends[LLamaCPP]; !ok {
 			for _, e := range entry {
 				if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 {
@@ -121,6 +124,14 @@ ENTRY:
 					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPHipblas)
 					foundLCPPHipblas = true
 				}
+				if strings.Contains(e.Name(), LLamaCPPSycl16) && !foundSycl16 {
+					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPSycl16)
+					foundSycl16 = true
+				}
+				if strings.Contains(e.Name(), LLamaCPPSycl32) && !foundSycl32 {
+					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPSycl32)
+					foundSycl32 = true
+				}
 			}
 		}
 	}
@@ -172,9 +183,10 @@ ENTRY:
 }
 
 // selectGRPCProcess selects the GRPC process to start based on system capabilities
-func selectGRPCProcess(backend, assetDir string) string {
+func selectGRPCProcess(backend, assetDir string, f16 bool) string {
 	foundCUDA := false
 	foundAMDGPU := false
+	foundIntelGPU := false
 	var grpcProcess string
 
 	// Select backend now just for llama.cpp
@@ -211,10 +223,24 @@ func selectGRPCProcess(backend, assetDir string) string {
 					log.Info().Msgf("GPU device found but no HIPBLAS backend present")
 				}
 			}
+			if strings.Contains(gpu.String(), "intel") {
+				backend := LLamaCPPSycl16
+				if !f16 {
+					backend = LLamaCPPSycl32
+				}
+				p := backendPath(assetDir, backend)
+				if _, err := os.Stat(p); err == nil {
+					log.Info().Msgf("[%s] attempting to load with Intel variant", backend)
+					grpcProcess = p
+					foundIntelGPU = true
+				} else {
+					log.Info().Msgf("GPU device found but no Intel backend present")
+				}
+			}
 		}
 	}
 
-	if foundCUDA || foundAMDGPU {
+	if foundCUDA || foundAMDGPU || foundIntelGPU {
 		return grpcProcess
 	}
@@ -236,6 +262,7 @@ func selectGRPCProcess(backend, assetDir string) string {
 // It also loads the model
 func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string) (ModelAddress, error) {
 	return func(modelName, modelFile string) (ModelAddress, error) {
+		log.Debug().Msgf("Loading Model %s with gRPC (file: %s) (backend: %s): %+v", modelName, modelFile, backend, *o)
 
 		var client ModelAddress
 
@@ -284,7 +311,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
 
 		if autoDetect {
 			// autoDetect GRPC process to start based on system capabilities
-			if selectedProcess := selectGRPCProcess(backend, o.assetDir); selectedProcess != "" {
+			if selectedProcess := selectGRPCProcess(backend, o.assetDir, o.gRPCOptions.F16Memory); selectedProcess != "" {
 				grpcProcess = selectedProcess
 			}
 		}

From 4c9623f50d15e2b9fd92a6ddbc57fab489679a53 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Thu, 6 Jun 2024 08:41:04 +0200
Subject: [PATCH 2/2] deps(whisper): update, add libcufft-dev (#2501)

* arrow_up: Update ggerganov/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* fix(build): add libcufft-dev

Signed-off-by: Ettore Di Giacinto

---------
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Signed-off-by: Ettore Di Giacinto
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 Dockerfile | 2 ++
 Makefile   | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 60df78d1..ba48a837 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -120,6 +120,7 @@ RUN <