diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml
index 4344ac2b..51fd12cb 100644
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -12,6 +12,9 @@ jobs:
       - repository: "go-skynet/go-llama.cpp"
         variable: "GOLLAMA_VERSION"
         branch: "master"
+      - repository: "go-skynet/go-llama.cpp"
+        variable: "GOLLAMA_MASTER_VERSION"
+        branch: "master"
       - repository: "go-skynet/go-ggml-transformers.cpp"
         variable: "GOGGMLTRANSFORMERS_VERSION"
         branch: "master"
diff --git a/Makefile b/Makefile
index 67e0059b..8faf0c94 100644
--- a/Makefile
+++ b/Makefile
@@ -7,6 +7,7 @@ BINARY_NAME=local-ai
 
 # Temporarly pinned to https://github.com/go-skynet/go-llama.cpp/pull/124
 GOLLAMA_VERSION?=cb8d7cd4cb95725a04504a9e3a26dd72a12b69ac
+GOLLAMA_MASTER_VERSION?=6c97625cca76aa5fca98a5a138ee4e5fe4797ecb
 # Temporary set a specific version of llama.cpp
 # containing: https://github.com/ggerganov/llama.cpp/pull/1773 and
 # rebased on top of master.
@@ -204,17 +205,25 @@ ifneq ($(LLAMA_CPP_REPO),)
 	cd go-llama && rm -rf llama.cpp && git clone $(LLAMA_CPP_REPO) llama.cpp && cd llama.cpp && git checkout -b build $(LLAMA_CPP_VERSION) && git submodule update --init --recursive --depth 1
 endif
 
+go-llama-master:
+	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama-master
+	cd go-llama-master && git checkout -b build $(GOLLAMA_MASTER_VERSION) && git submodule update --init --recursive --depth 1
+
 go-llama/libbinding.a: go-llama
 	$(MAKE) -C go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a
 
+go-llama-master/libbinding.a: go-llama-master
+	$(MAKE) -C go-llama-master BUILD_TYPE=$(BUILD_TYPE) libbinding.a
+
 go-piper/libpiper_binding.a:
 	$(MAKE) -C go-piper libpiper_binding.a example/main
 
-get-sources: go-llama go-ggllm go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
+get-sources: go-llama go-ggllm go-llama-master go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
 	touch $@
 
 replace:
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp-master=$(shell pwd)/go-llama-master
 	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(shell pwd)/gpt4all/gpt4all-bindings/golang
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-ggml-transformers.cpp=$(shell pwd)/go-ggml-transformers
 	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv
@@ -232,6 +241,7 @@ prepare-sources: get-sources replace
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
 	$(MAKE) -C go-llama clean
+	$(MAKE) -C go-llama-master clean
 	$(MAKE) -C gpt4all/gpt4all-bindings/golang/ clean
 	$(MAKE) -C go-ggml-transformers clean
 	$(MAKE) -C go-rwkv clean
@@ -361,6 +371,10 @@ backend-assets/grpc/llama: backend-assets/grpc go-llama/libbinding.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama LIBRARY_PATH=$(shell pwd)/go-llama \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./cmd/grpc/llama/
 
+backend-assets/grpc/llama-master: backend-assets/grpc go-llama-master/libbinding.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama-master LIBRARY_PATH=$(shell pwd)/go-llama-master \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-master ./cmd/grpc/llama-master/
+
 backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all gpt4all/gpt4all-bindings/golang/libgpt4all.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./cmd/grpc/gpt4all/
 
@@ -424,4 +438,4 @@ backend-assets/grpc/whisper: backend-assets/grpc whisper.cpp/libwhisper.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/whisper.cpp LIBRARY_PATH=$(shell pwd)/whisper.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./cmd/grpc/whisper/
 
-grpcs: prepare backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
\ No newline at end of file
+grpcs: prepare backend-assets/grpc/langchain-huggingface backend-assets/grpc/llama-master backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
\ No newline at end of file
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ \ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./cmd/grpc/gpt4all/ @@ -424,4 +438,4 @@ backend-assets/grpc/whisper: backend-assets/grpc whisper.cpp/libwhisper.a CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/whisper.cpp LIBRARY_PATH=$(shell pwd)/whisper.cpp \ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./cmd/grpc/whisper/ -grpcs: prepare backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC) \ No newline at end of file +grpcs: prepare backend-assets/grpc/langchain-huggingface backend-assets/grpc/llama-master backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC) \ No newline at end of file diff --git a/api/api_test.go b/api/api_test.go index ca840b53..06e978ba 100644 --- a/api/api_test.go +++ b/api/api_test.go @@ -555,9 +555,10 @@ var _ = Describe("API test", func() { }) It("returns errors", func() { + backends := len(model.AutoLoadBackends) _, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "foomodel", Prompt: "abcdedfghikl"}) Expect(err).To(HaveOccurred()) - Expect(err.Error()).To(ContainSubstring("error, status code: 500, message: could not load model - all backends returned error: 12 errors occurred:")) + Expect(err.Error()).To(ContainSubstring(fmt.Sprintf("error, status code: 500, message: could not load model - all backends returned error: %d errors occurred:", backends))) }) It("transcribes audio", func() { if runtime.GOOS != "linux" { diff --git a/cmd/grpc/llama-master/main.go b/cmd/grpc/llama-master/main.go new file mode 100644 index 00000000..51cd00f5 --- /dev/null +++ b/cmd/grpc/llama-master/main.go @@ -0,0 +1,25 @@ +package main + +// GRPC Falcon server + +// Note: this is started internally by LocalAI and a server is allocated for each model + +import ( + "flag" + + llama "github.com/go-skynet/LocalAI/pkg/grpc/llm/llama-master" + + grpc "github.com/go-skynet/LocalAI/pkg/grpc" +) + +var ( + addr = flag.String("addr", "localhost:50051", "the address to connect to") +) + +func main() { + flag.Parse() + + if err := grpc.StartServer(*addr, &llama.LLM{}); err != nil { + panic(err) + } +} diff --git a/go.mod b/go.mod index b5f23c25..f56ce008 100644 --- a/go.mod +++ b/go.mod @@ -39,6 +39,7 @@ require ( require ( github.com/dlclark/regexp2 v1.8.1 // indirect github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 // indirect + github.com/go-skynet/go-llama.cpp-master v0.0.0-20230703203849-ffa57fbc3a12 // indirect github.com/golang/protobuf v1.5.3 // indirect github.com/golang/snappy v0.0.2 // indirect github.com/klauspost/pgzip 
diff --git a/go.mod b/go.mod
index b5f23c25..f56ce008 100644
--- a/go.mod
+++ b/go.mod
@@ -39,6 +39,7 @@ require (
 	github.com/dlclark/regexp2 v1.8.1 // indirect
 	github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 // indirect
+	github.com/go-skynet/go-llama.cpp-master v0.0.0-20230703203849-ffa57fbc3a12 // indirect
 	github.com/golang/protobuf v1.5.3 // indirect
 	github.com/golang/snappy v0.0.2 // indirect
 	github.com/klauspost/pgzip v1.2.5 // indirect
diff --git a/pkg/grpc/llm/llama-master/llama.go b/pkg/grpc/llm/llama-master/llama.go
new file mode 100644
index 00000000..43ae25f5
--- /dev/null
+++ b/pkg/grpc/llm/llama-master/llama.go
@@ -0,0 +1,168 @@
+package llama
+
+// This is a wrapper to satisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"fmt"
+
+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	"github.com/go-skynet/go-llama.cpp-master"
+)
+
+type LLM struct {
+	base.Base
+
+	llama *llama.LLama
+}
+
+func (llm *LLM) Load(opts *pb.ModelOptions) error {
+	llamaOpts := []llama.ModelOption{}
+
+	if opts.ContextSize != 0 {
+		llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
+	}
+	if opts.F16Memory {
+		llamaOpts = append(llamaOpts, llama.EnableF16Memory)
+	}
+	if opts.Embeddings {
+		llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
+	}
+	if opts.NGPULayers != 0 {
+		llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
+	}
+
+	llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
+	llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
+	llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
+	if opts.NBatch != 0 {
+		llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
+	} else {
+		llamaOpts = append(llamaOpts, llama.SetNBatch(512))
+	}
+
+	if opts.NUMA {
+		llamaOpts = append(llamaOpts, llama.EnableNUMA)
+	}
+
+	if opts.LowVRAM {
+		llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
+	}
+
+	model, err := llama.New(opts.Model, llamaOpts...)
+	llm.llama = model
+	return err
+}
+
+func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
+	predictOptions := []llama.PredictOption{
+		llama.SetTemperature(float64(opts.Temperature)),
+		llama.SetTopP(float64(opts.TopP)),
+		llama.SetTopK(int(opts.TopK)),
+		llama.SetTokens(int(opts.Tokens)),
+		llama.SetThreads(int(opts.Threads)),
+	}
+
+	if opts.PromptCacheAll {
+		predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
+	}
+
+	if opts.PromptCacheRO {
+		predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
+	}
+
+	// Expected absolute path
+	if opts.PromptCachePath != "" {
+		predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
+	}
+
+	if opts.Mirostat != 0 {
+		predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
+	}
+
+	if opts.MirostatETA != 0 {
+		predictOptions = append(predictOptions, llama.SetMirostatETA(float64(opts.MirostatETA)))
+	}
+
+	if opts.MirostatTAU != 0 {
+		predictOptions = append(predictOptions, llama.SetMirostatTAU(float64(opts.MirostatTAU)))
+	}
+
+	if opts.Debug {
+		predictOptions = append(predictOptions, llama.Debug)
+	}
+
+	predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
+
+	if opts.PresencePenalty != 0 {
+		predictOptions = append(predictOptions, llama.SetPenalty(float64(opts.PresencePenalty)))
+	}
+
+	if opts.NKeep != 0 {
+		predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
+	}
+
+	if opts.Batch != 0 {
+		predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
+	}
+
+	if opts.F16KV {
+		predictOptions = append(predictOptions, llama.EnableF16KV)
+	}
+
+	if opts.IgnoreEOS {
+		predictOptions = append(predictOptions, llama.IgnoreEOS)
+	}
+
+	if opts.Seed != 0 {
+		predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
+	}
+
+	//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
+
+	predictOptions = append(predictOptions, llama.SetFrequencyPenalty(float64(opts.FrequencyPenalty)))
+	predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
+	predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
+	predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
+	predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
+	predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(float64(opts.TailFreeSamplingZ)))
+	predictOptions = append(predictOptions, llama.SetTypicalP(float64(opts.TypicalP)))
+	return predictOptions
+}
+
+func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
+	return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
+}
+
+func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
+	predictOptions := buildPredictOptions(opts)
+
+	predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
+		results <- token
+		return true
+	}))
+
+	go func() {
+		_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
+		if err != nil {
+			fmt.Println("err: ", err)
+		}
+		close(results)
+	}()
+
+	return nil
+}
+
+func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
+	predictOptions := buildPredictOptions(opts)
+
+	if len(opts.EmbeddingTokens) > 0 {
+		tokens := []int{}
+		for _, t := range opts.EmbeddingTokens {
+			tokens = append(tokens, int(t))
+		}
+		return llm.llama.TokenEmbeddings(tokens, predictOptions...)
+	}
+
+	return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
+}
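
PredictStream above returns immediately: the wrapper pushes each token into the results channel from an internal goroutine and closes the channel when prediction finishes. A minimal sketch of the consuming side of that contract, against a hypothetical simplified interface (the real method takes *pb.PredictOptions rather than a plain prompt string):

package stream

import "strings"

// tokenStreamer is a simplified stand-in for the LLM wrapper above.
type tokenStreamer interface {
	// PredictStream returns at once; tokens arrive on results until the
	// producer closes the channel.
	PredictStream(prompt string, results chan string) error
}

// collect drains a token stream into a single string. Because the producer
// closes results when the prediction ends, the range loop needs no extra
// synchronization or done-channel.
func collect(s tokenStreamer, prompt string) (string, error) {
	results := make(chan string)
	if err := s.PredictStream(prompt, results); err != nil {
		return "", err
	}
	var sb strings.Builder
	for token := range results {
		sb.WriteString(token) // tokens arrive in generation order
	}
	return sb.String(), nil
}
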
diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go
index d91131d8..9d33a6ee 100644
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -37,6 +37,7 @@
 	Gpt4All               = "gpt4all"
 	FalconBackend         = "falcon"
 	FalconGGMLBackend     = "falcon-ggml"
+	LlamaMasterBackend    = "llama-master"
 	BertEmbeddingsBackend = "bert-embeddings"
 	RwkvBackend           = "rwkv"
@@ -47,14 +48,13 @@ const (
 	//GGLLMFalconBackend = "falcon"
 )
 
-var autoLoadBackends []string = []string{
+var AutoLoadBackends []string = []string{
 	LlamaBackend,
 	Gpt4All,
-	RwkvBackend,
 	FalconBackend,
-	WhisperBackend,
 	GPTNeoXBackend,
 	BertEmbeddingsBackend,
+	LlamaMasterBackend,
 	FalconGGMLBackend,
 	GPTJBackend,
 	Gpt2Backend,
@@ -62,7 +62,6 @@ var autoLoadBackends []string = []string{
 	MPTBackend,
 	ReplitBackend,
 	StarcoderBackend,
-	BloomzBackend,
 }
 
 func (ml *ModelLoader) StopGRPC() {
@@ -186,7 +185,7 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (model *grpc.Client, err er
 	backend := strings.ToLower(o.backendString)
 
 	switch backend {
-	case LlamaBackend, GPTJBackend, DollyBackend,
+	case LlamaBackend, LlamaMasterBackend, GPTJBackend, DollyBackend,
 		MPTBackend, Gpt2Backend, FalconBackend,
 		GPTNeoXBackend, ReplitBackend, StarcoderBackend, BloomzBackend,
 		RwkvBackend, LCHuggingFaceBackend, BertEmbeddingsBackend, FalconGGMLBackend, StableDiffusionBackend, WhisperBackend:
@@ -217,10 +216,7 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (*grpc.Client, error) {
 	ml.mu.Unlock()
 
 	var err error
-	for _, b := range autoLoadBackends {
-		if b == BloomzBackend || b == WhisperBackend || b == RwkvBackend { // do not autoload bloomz/whisper/rwkv
-			continue
-		}
+	for _, b := range AutoLoadBackends {
 		log.Debug().Msgf("[%s] Attempting to load", b)
 
 		model, modelerr := ml.BackendLoader(
@@ -236,6 +232,9 @@
 		} else if modelerr != nil {
 			err = multierror.Append(err, modelerr)
 			log.Debug().Msgf("[%s] Fails: %s", b, modelerr.Error())
+		} else if model == nil {
+			err = multierror.Append(err, fmt.Errorf("backend %s returned no usable model", b))
+			log.Debug().Msgf("[%s] Fails: %s", b, "backend returned no usable model")
 		}
 	}
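
Taken together, the GreedyLoader changes mean every entry of the now-exported AutoLoadBackends is tried in order, with every failure accumulated, which is what lets the updated api_test.go derive the expected "%d errors occurred:" count from len(model.AutoLoadBackends). A minimal self-contained sketch of that accumulation pattern, assuming only the hashicorp/go-multierror API the code above already uses; loadBackend and Model are illustrative stand-ins, not the project's real types:

package main

import (
	"errors"
	"fmt"

	"github.com/hashicorp/go-multierror"
)

type Model struct{}

// loadBackend stands in for ml.BackendLoader; here it always fails so the
// aggregate error path is exercised.
func loadBackend(name string) (*Model, error) {
	return nil, errors.New(name + ": not implemented")
}

// greedyLoad tries each backend in order, returns the first usable model, and
// folds every failure into one multierror so the caller can report
// "<N> errors occurred:".
func greedyLoad(backends []string) (*Model, error) {
	var err error
	for _, b := range backends {
		model, modelerr := loadBackend(b)
		switch {
		case modelerr == nil && model != nil:
			return model, nil // first backend that works wins
		case modelerr != nil:
			err = multierror.Append(err, modelerr)
		default: // model == nil with a nil error still counts as a failure
			err = multierror.Append(err, fmt.Errorf("[%s] backend returned no usable model", b))
		}
	}
	return nil, fmt.Errorf("could not load model - all backends returned error: %w", err)
}

func main() {
	_, err := greedyLoad([]string{"llama", "llama-master"})
	fmt.Println(err)
}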