mirror of https://github.com/mudler/LocalAI.git (synced 2024-06-07 19:40:48 +00:00)

commit d2cf353eb0
Merge branch 'master' into rf-service-wiring-1c

.github/workflows/release.yaml (vendored) | 7 changes
@@ -70,6 +70,12 @@ jobs:
       run: |
         sudo apt-get update
         sudo apt-get install build-essential ffmpeg protobuf-compiler ccache
+    - name: Intel Dependencies
+      run: |
+        wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
+        echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
+        sudo apt update
+        sudo apt install -y intel-basekit
     - name: Install CUDA Dependencies
       run: |
         curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
@@ -127,6 +133,7 @@ jobs:
           export PATH=$PATH:$GOPATH/bin
           export PATH=/usr/local/cuda/bin:$PATH
           export PATH=/opt/rocm/bin:$PATH
+          source /opt/intel/oneapi/setvars.sh
           GO_TAGS=p2p make dist
       - uses: actions/upload-artifact@v4
         with:
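Note: the release workflow now installs the Intel oneAPI base toolkit and sources its environment before the dist build, so the new SYCL llama.cpp variants introduced below can be compiled into the release artifacts. Reproducing this locally on an Intel machine would roughly amount to running "source /opt/intel/oneapi/setvars.sh && GO_TAGS=p2p make dist", assuming intel-basekit is installed as in the workflow step above.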
Dockerfile | 2 changes

@@ -120,6 +120,7 @@ RUN <<EOT bash
     apt-get update && \
     apt-get install -y --no-install-recommends \
         cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+        libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
         libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
         libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
         libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
@@ -139,6 +140,7 @@ RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
     apt-get update && \
     apt-get install -y --no-install-recommends \
         cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+        libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
         libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
         libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
         libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
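Both CUDA install paths in the image now pull in libcufft-dev alongside the other CUDA dev packages. This pairs with the Makefile changes below, which bump the pinned whisper.cpp revision and add -lcufft to the whisper link flags, suggesting the newer whisper.cpp CUDA backend links against cuFFT.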
Makefile | 20 changes

@@ -16,7 +16,7 @@ RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
 RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6

 # whisper.cpp version
-WHISPER_CPP_VERSION?=af5833e29819810f2d83228228a9a3077e5ccd93
+WHISPER_CPP_VERSION?=ffef323c4cfa8596cb91cf92d6f791f01a44335e

 # bert.cpp version
 BERT_VERSION?=710044b124545415f555e4260d16b146c725a6e4
@@ -100,7 +100,7 @@ ifeq ($(BUILD_TYPE),cublas)
 	CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
 	export LLAMA_CUBLAS=1
 	export WHISPER_CUDA=1
-	CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda
+	CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda -lcufft
 endif

 ifeq ($(BUILD_TYPE),hipblas)
@@ -328,6 +328,8 @@ ifeq ($(OS),Darwin)
 else
 	$(MAKE) backend-assets/grpc/llama-cpp-cuda
 	$(MAKE) backend-assets/grpc/llama-cpp-hipblas
+	$(MAKE) backend-assets/grpc/llama-cpp-sycl_f16
+	$(MAKE) backend-assets/grpc/llama-cpp-sycl_f32
 endif
 	$(MAKE) build
 	mkdir -p release
@@ -720,6 +722,20 @@ backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc
 	BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-hipblas/grpc-server backend-assets/grpc/llama-cpp-hipblas

+backend-assets/grpc/llama-cpp-sycl_f16: backend-assets/grpc
+	cp -rf backend/cpp/llama backend/cpp/llama-sycl_f16
+	$(MAKE) -C backend/cpp/llama-sycl_f16 purge
+	$(info ${GREEN}I llama-cpp build info:sycl_f16${RESET})
+	BUILD_TYPE="sycl_f16" $(MAKE) VARIANT="llama-sycl_f16" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-sycl_f16/grpc-server backend-assets/grpc/llama-cpp-sycl_f16
+
+backend-assets/grpc/llama-cpp-sycl_f32: backend-assets/grpc
+	cp -rf backend/cpp/llama backend/cpp/llama-sycl_f32
+	$(MAKE) -C backend/cpp/llama-sycl_f32 purge
+	$(info ${GREEN}I llama-cpp build info:sycl_f32${RESET})
+	BUILD_TYPE="sycl_f32" $(MAKE) VARIANT="llama-sycl_f32" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-sycl_f32/grpc-server backend-assets/grpc/llama-cpp-sycl_f32
+
 backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-grpc
 	$(MAKE) -C backend/cpp/llama-grpc purge
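Each new SYCL target follows the existing per-variant pattern: copy the backend/cpp/llama sources into a variant directory, purge stale build output, build the gRPC server with the matching BUILD_TYPE, and copy the resulting binary into backend-assets/grpc. A single variant can also be built directly, e.g. "make backend-assets/grpc/llama-cpp-sycl_f16" from a oneAPI-enabled shell.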
pkg/model/initializers.go (inferred)

@@ -38,7 +38,10 @@ const (
 	LLamaCPPFallback = "llama-cpp-fallback"
 	LLamaCPPCUDA = "llama-cpp-cuda"
 	LLamaCPPHipblas = "llama-cpp-hipblas"
-	LLamaCPPGRPC = "llama-cpp-grpc"
+	LLamaCPPSycl16 = "llama-cpp-sycl_16"
+	LLamaCPPSycl32 = "llama-cpp-sycl_32"
+
+	LLamaCPPGRPC = "llama-cpp-grpc"

 	Gpt4AllLlamaBackend = "gpt4all-llama"
 	Gpt4AllMptBackend = "gpt4all-mpt"
@@ -94,7 +97,7 @@ ENTRY:
 	if autoDetect {
 		// if we find the llama.cpp variants, show them off as a single backend (llama-cpp) as later we are going to pick that up
 		// when starting the service
-		foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas := false, false, false, false, false, false
+		foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas, foundSycl16, foundSycl32 := false, false, false, false, false, false, false, false
 		if _, ok := backends[LLamaCPP]; !ok {
 			for _, e := range entry {
 				if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 {
@@ -121,6 +124,14 @@ ENTRY:
 					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPHipblas)
 					foundLCPPHipblas = true
 				}
+				if strings.Contains(e.Name(), LLamaCPPSycl16) && !foundSycl16 {
+					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPSycl16)
+					foundSycl16 = true
+				}
+				if strings.Contains(e.Name(), LLamaCPPSycl32) && !foundSycl32 {
+					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPSycl32)
+					foundSycl32 = true
+				}
 			}
 		}
 	}
@@ -172,9 +183,10 @@ ENTRY:
 }

 // selectGRPCProcess selects the GRPC process to start based on system capabilities
-func selectGRPCProcess(backend, assetDir string) string {
+func selectGRPCProcess(backend, assetDir string, f16 bool) string {
 	foundCUDA := false
 	foundAMDGPU := false
+	foundIntelGPU := false
 	var grpcProcess string

 	// Select backend now just for llama.cpp
@@ -211,10 +223,24 @@ func selectGRPCProcess(backend, assetDir string) string {
 						log.Info().Msgf("GPU device found but no HIPBLAS backend present")
 					}
 				}
+				if strings.Contains(gpu.String(), "intel") {
+					backend := LLamaCPPSycl16
+					if !f16 {
+						backend = LLamaCPPSycl32
+					}
+					p := backendPath(assetDir, backend)
+					if _, err := os.Stat(p); err == nil {
+						log.Info().Msgf("[%s] attempting to load with Intel variant", backend)
+						grpcProcess = p
+						foundIntelGPU = true
+					} else {
+						log.Info().Msgf("GPU device found but no Intel backend present")
+					}
+				}
 			}
 		}

-		if foundCUDA || foundAMDGPU {
+		if foundCUDA || foundAMDGPU || foundIntelGPU {
 			return grpcProcess
 		}
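For orientation, below is a minimal standalone sketch of the selection logic this hunk adds: enumerate GPUs and, when an Intel device is found, prefer the SYCL f16 binary unless f16 is disabled, returning a path only if the asset actually exists. The sketch assumes the github.com/jaypipes/ghw library for GPU enumeration (the diff itself only shows a gpu.String() vendor check) and a hypothetical asset layout; it is not the project's actual code.

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"

	"github.com/jaypipes/ghw"
)

// pickIntelVariant mirrors the diff's idea: prefer the f16 SYCL build,
// drop to f32 when f16 is disabled, and only return a path that exists.
func pickIntelVariant(assetDir string, f16 bool) string {
	info, err := ghw.GPU()
	if err != nil {
		return "" // no GPU info available; caller falls back to CPU variants
	}
	for _, card := range info.GraphicsCards {
		if card.DeviceInfo == nil || card.DeviceInfo.Vendor == nil {
			continue
		}
		if !strings.Contains(strings.ToLower(card.DeviceInfo.Vendor.Name), "intel") {
			continue
		}
		variant := "llama-cpp-sycl_f16" // hypothetical asset name
		if !f16 {
			variant = "llama-cpp-sycl_f32"
		}
		p := filepath.Join(assetDir, "backend-assets", "grpc", variant)
		if _, err := os.Stat(p); err == nil {
			return p // Intel backend present: launch this gRPC server binary
		}
	}
	return ""
}

func main() {
	fmt.Println(pickIntelVariant("/opt/localai", true))
}

Deciding between f16 and f32 at launch time keeps a single auto-detected "llama-cpp" backend name while still matching the binary to the model's precision setting.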
@@ -236,6 +262,7 @@ func selectGRPCProcess(backend, assetDir string) string {
 // It also loads the model
 func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string) (ModelAddress, error) {
 	return func(modelName, modelFile string) (ModelAddress, error) {
+
 		log.Debug().Msgf("Loading Model %s with gRPC (file: %s) (backend: %s): %+v", modelName, modelFile, backend, *o)

 		var client ModelAddress
@@ -284,7 +311,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string

 		if autoDetect {
 			// autoDetect GRPC process to start based on system capabilities
-			if selectedProcess := selectGRPCProcess(backend, o.assetDir); selectedProcess != "" {
+			if selectedProcess := selectGRPCProcess(backend, o.assetDir, o.gRPCOptions.F16Memory); selectedProcess != "" {
 				grpcProcess = selectedProcess
 			}
 		}
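Net effect: the model's existing f16 option (carried in o.gRPCOptions.F16Memory) now also selects which llama.cpp binary is launched on Intel GPUs, so a model configured with f16 enabled runs on the sycl_f16 build and everything else on sycl_f32.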