mirror of https://github.com/mudler/LocalAI.git (synced 2024-06-07 19:40:48 +00:00)

commit d2cf353eb0
Merge branch 'master' into rf-service-wiring-1c

.github/workflows/release.yaml (vendored) | 7 changes
@@ -70,6 +70,12 @@ jobs:
       run: |
         sudo apt-get update
         sudo apt-get install build-essential ffmpeg protobuf-compiler ccache
+    - name: Intel Dependencies
+      run: |
+        wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
+        echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
+        sudo apt update
+        sudo apt install -y intel-basekit
     - name: Install CUDA Dependencies
       run: |
         curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
@@ -127,6 +133,7 @@ jobs:
           export PATH=$PATH:$GOPATH/bin
           export PATH=/usr/local/cuda/bin:$PATH
           export PATH=/opt/rocm/bin:$PATH
+          source /opt/intel/oneapi/setvars.sh
           GO_TAGS=p2p make dist
       - uses: actions/upload-artifact@v4
         with:
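Note: the release workflow now installs the Intel oneAPI base toolkit and sources its environment before the dist build, so the new SYCL llama.cpp variants introduced below can be compiled into the release artifacts. Reproducing this locally on an Intel machine would roughly amount to running "source /opt/intel/oneapi/setvars.sh && GO_TAGS=p2p make dist", assuming intel-basekit is installed as in the workflow step above.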
Dockerfile | 2 changes

@@ -120,6 +120,7 @@ RUN <<EOT bash
     apt-get update && \
     apt-get install -y --no-install-recommends \
         cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+        libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
         libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
         libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
         libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
@@ -139,6 +140,7 @@ RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
     apt-get update && \
     apt-get install -y --no-install-recommends \
         cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+        libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
         libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
         libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
         libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
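Both CUDA install paths in the image now pull in libcufft-dev alongside the other CUDA dev packages. This pairs with the Makefile changes below, which bump the pinned whisper.cpp revision and add -lcufft to the whisper link flags, suggesting the newer whisper.cpp CUDA backend links against cuFFT.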
Makefile | 20 changes

@@ -16,7 +16,7 @@ RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
 RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6

 # whisper.cpp version
-WHISPER_CPP_VERSION?=af5833e29819810f2d83228228a9a3077e5ccd93
+WHISPER_CPP_VERSION?=ffef323c4cfa8596cb91cf92d6f791f01a44335e

 # bert.cpp version
 BERT_VERSION?=710044b124545415f555e4260d16b146c725a6e4
@@ -100,7 +100,7 @@ ifeq ($(BUILD_TYPE),cublas)
 	CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
 	export LLAMA_CUBLAS=1
 	export WHISPER_CUDA=1
-	CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda
+	CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda -lcufft
 endif

 ifeq ($(BUILD_TYPE),hipblas)
@@ -328,6 +328,8 @@ ifeq ($(OS),Darwin)
 else
 	$(MAKE) backend-assets/grpc/llama-cpp-cuda
 	$(MAKE) backend-assets/grpc/llama-cpp-hipblas
+	$(MAKE) backend-assets/grpc/llama-cpp-sycl_f16
+	$(MAKE) backend-assets/grpc/llama-cpp-sycl_f32
 endif
 	$(MAKE) build
 	mkdir -p release
@@ -720,6 +722,20 @@ backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc
 	BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-hipblas/grpc-server backend-assets/grpc/llama-cpp-hipblas

+backend-assets/grpc/llama-cpp-sycl_f16: backend-assets/grpc
+	cp -rf backend/cpp/llama backend/cpp/llama-sycl_f16
+	$(MAKE) -C backend/cpp/llama-sycl_f16 purge
+	$(info ${GREEN}I llama-cpp build info:sycl_f16${RESET})
+	BUILD_TYPE="sycl_f16" $(MAKE) VARIANT="llama-sycl_f16" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-sycl_f16/grpc-server backend-assets/grpc/llama-cpp-sycl_f16
+
+backend-assets/grpc/llama-cpp-sycl_f32: backend-assets/grpc
+	cp -rf backend/cpp/llama backend/cpp/llama-sycl_f32
+	$(MAKE) -C backend/cpp/llama-sycl_f32 purge
+	$(info ${GREEN}I llama-cpp build info:sycl_f32${RESET})
+	BUILD_TYPE="sycl_f32" $(MAKE) VARIANT="llama-sycl_f32" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-sycl_f32/grpc-server backend-assets/grpc/llama-cpp-sycl_f32
+
 backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-grpc
 	$(MAKE) -C backend/cpp/llama-grpc purge
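Each new SYCL target follows the existing per-variant pattern: copy the backend/cpp/llama sources into a variant directory, purge stale build output, build the gRPC server with the matching BUILD_TYPE, and copy the resulting binary into backend-assets/grpc. A single variant can also be built directly, e.g. "make backend-assets/grpc/llama-cpp-sycl_f16" from a oneAPI-enabled shell.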
pkg/model/initializers.go (inferred)

@@ -38,7 +38,10 @@ const (
 	LLamaCPPFallback = "llama-cpp-fallback"
 	LLamaCPPCUDA = "llama-cpp-cuda"
 	LLamaCPPHipblas = "llama-cpp-hipblas"
-	LLamaCPPGRPC = "llama-cpp-grpc"
+	LLamaCPPSycl16 = "llama-cpp-sycl_16"
+	LLamaCPPSycl32 = "llama-cpp-sycl_32"
+
+	LLamaCPPGRPC = "llama-cpp-grpc"

 	Gpt4AllLlamaBackend = "gpt4all-llama"
 	Gpt4AllMptBackend = "gpt4all-mpt"
@@ -94,7 +97,7 @@ ENTRY:
 	if autoDetect {
 		// if we find the llama.cpp variants, show them off as a single backend (llama-cpp) as later we are going to pick that up
 		// when starting the service
-		foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas := false, false, false, false, false, false
+		foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas, foundSycl16, foundSycl32 := false, false, false, false, false, false, false, false
 		if _, ok := backends[LLamaCPP]; !ok {
 			for _, e := range entry {
 				if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 {
@@ -121,6 +124,14 @@ ENTRY:
 					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPHipblas)
 					foundLCPPHipblas = true
 				}
+				if strings.Contains(e.Name(), LLamaCPPSycl16) && !foundSycl16 {
+					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPSycl16)
+					foundSycl16 = true
+				}
+				if strings.Contains(e.Name(), LLamaCPPSycl32) && !foundSycl32 {
+					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPSycl32)
+					foundSycl32 = true
+				}
 			}
 		}
 	}
@@ -172,9 +183,10 @@ ENTRY:
 }

 // selectGRPCProcess selects the GRPC process to start based on system capabilities
-func selectGRPCProcess(backend, assetDir string) string {
+func selectGRPCProcess(backend, assetDir string, f16 bool) string {
 	foundCUDA := false
 	foundAMDGPU := false
+	foundIntelGPU := false
 	var grpcProcess string

 	// Select backend now just for llama.cpp
@@ -211,10 +223,24 @@ func selectGRPCProcess(backend, assetDir string) string {
 						log.Info().Msgf("GPU device found but no HIPBLAS backend present")
 					}
 				}
+				if strings.Contains(gpu.String(), "intel") {
+					backend := LLamaCPPSycl16
+					if !f16 {
+						backend = LLamaCPPSycl32
+					}
+					p := backendPath(assetDir, backend)
+					if _, err := os.Stat(p); err == nil {
+						log.Info().Msgf("[%s] attempting to load with Intel variant", backend)
+						grpcProcess = p
+						foundIntelGPU = true
+					} else {
+						log.Info().Msgf("GPU device found but no Intel backend present")
+					}
+				}
 			}
 		}

-		if foundCUDA || foundAMDGPU {
+		if foundCUDA || foundAMDGPU || foundIntelGPU {
 			return grpcProcess
 		}
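For orientation, below is a minimal standalone sketch of the selection logic this hunk adds: enumerate GPUs and, when an Intel device is found, prefer the SYCL f16 binary unless f16 is disabled, returning a path only if the asset actually exists. The sketch assumes the github.com/jaypipes/ghw library for GPU enumeration (the diff itself only shows a gpu.String() vendor check) and a hypothetical asset layout; it is not the project's actual code.

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"

	"github.com/jaypipes/ghw"
)

// pickIntelVariant mirrors the diff's idea: prefer the f16 SYCL build,
// drop to f32 when f16 is disabled, and only return a path that exists.
func pickIntelVariant(assetDir string, f16 bool) string {
	info, err := ghw.GPU()
	if err != nil {
		return "" // no GPU info available; caller falls back to CPU variants
	}
	for _, card := range info.GraphicsCards {
		if card.DeviceInfo == nil || card.DeviceInfo.Vendor == nil {
			continue
		}
		if !strings.Contains(strings.ToLower(card.DeviceInfo.Vendor.Name), "intel") {
			continue
		}
		variant := "llama-cpp-sycl_f16" // hypothetical asset name
		if !f16 {
			variant = "llama-cpp-sycl_f32"
		}
		p := filepath.Join(assetDir, "backend-assets", "grpc", variant)
		if _, err := os.Stat(p); err == nil {
			return p // Intel backend present: launch this gRPC server binary
		}
	}
	return ""
}

func main() {
	fmt.Println(pickIntelVariant("/opt/localai", true))
}

Deciding between f16 and f32 at launch time keeps a single auto-detected "llama-cpp" backend name while still matching the binary to the model's precision setting.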
@@ -236,6 +262,7 @@ func selectGRPCProcess(backend, assetDir string) string {
 // It also loads the model
 func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string) (ModelAddress, error) {
 	return func(modelName, modelFile string) (ModelAddress, error) {
+
 		log.Debug().Msgf("Loading Model %s with gRPC (file: %s) (backend: %s): %+v", modelName, modelFile, backend, *o)

 		var client ModelAddress
@@ -284,7 +311,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string

 		if autoDetect {
 			// autoDetect GRPC process to start based on system capabilities
-			if selectedProcess := selectGRPCProcess(backend, o.assetDir); selectedProcess != "" {
+			if selectedProcess := selectGRPCProcess(backend, o.assetDir, o.gRPCOptions.F16Memory); selectedProcess != "" {
 				grpcProcess = selectedProcess
 			}
 		}
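Net effect: the model's existing f16 option (carried in o.gRPCOptions.F16Memory) now also selects which llama.cpp binary is launched on Intel GPUs, so a model configured with f16 enabled runs on the sycl_f16 build and everything else on sycl_f32.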