LocalAI/api/backend/llm.go

package backend

import (
	"context"
	"os"
	"regexp"
	"strings"
	"sync"
"unicode/utf8"
config "github.com/go-skynet/LocalAI/api/config"
"github.com/go-skynet/LocalAI/api/options"
"github.com/go-skynet/LocalAI/pkg/gallery"
"github.com/go-skynet/LocalAI/pkg/grpc"
model "github.com/go-skynet/LocalAI/pkg/model"
"github.com/go-skynet/LocalAI/pkg/utils"
)
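
// LLMResponse bundles the text produced by a single inference call with the
// token accounting gathered while generating it.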
type LLMResponse struct {
	Response string // should this be []byte?
	Usage    TokenUsage
}
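
// TokenUsage counts the prompt and completion tokens consumed by a request.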
type TokenUsage struct {
	Prompt     int
	Completion int
}
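
// ModelInference resolves the backend for the given model configuration,
// installing the model from a gallery first when autoloading is enabled, and
// returns a closure that performs the actual prediction. A non-nil
// tokenCallback selects the streaming code path and is invoked once per
// decoded rune.
//
// A minimal calling sketch (ctx, loader, cfg and o are assumed to be set up
// by the caller; they are not defined in this file):
//
//	predict, err := ModelInference(ctx, "Hello", loader, cfg, o,
//		func(token string, usage TokenUsage) bool {
//			fmt.Print(token) // stream tokens as they arrive
//			return true
//		})
//	if err != nil {
//		return err
//	}
//	resp, err := predict()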
func ModelInference(ctx context.Context, s string, loader *model.ModelLoader, c config.Config, o *options.Option, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
	modelFile := c.Model
	grpcOpts := gRPCModelOpts(c)

	var inferenceModel *grpc.Client
	var err error

	opts := modelOpts(c, o, []model.Option{
		model.WithLoadGRPCLoadModelOpts(grpcOpts),
		model.WithThreads(uint32(c.Threads)), // some models use this to allocate threads during startup
		model.WithAssetDir(o.AssetsDestination),
		model.WithModel(modelFile),
		model.WithContext(o.Context),
	})

	if c.Backend != "" {
		opts = append(opts, model.WithBackendString(c.Backend))
	}

	// Check whether the modelFile exists; if it doesn't, try to install it from the gallery
	if o.AutoloadGalleries { // experimental
		if _, err := os.Stat(modelFile); os.IsNotExist(err) {
			utils.ResetDownloadTimers()
			// if we failed to load the model, we try to download it
			err := gallery.InstallModelFromGalleryByName(o.Galleries, modelFile, loader.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction)
			if err != nil {
				return nil, err
			}
		}
	}

	if c.Backend == "" {
		inferenceModel, err = loader.GreedyLoader(opts...)
	} else {
		inferenceModel, err = loader.BackendLoader(opts...)
	}
	if err != nil {
		return nil, err
	}

	// in gRPC, the backend is expected to answer with a single token when streaming is not supported
	fn := func() (LLMResponse, error) {
		opts := gRPCPredictOpts(c, loader.ModelPath)
		opts.Prompt = s

		tokenUsage := TokenUsage{}

		// Check the per-model feature flag for usage, since the token callbacks
		// have a cost. Defaults to off, as it is still experimental.
		if c.FeatureFlag.Enabled("usage") {
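			// Wrap any caller-supplied callback so completion tokens are counted;
			// when no callback was supplied, count against a no-op one.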
			userTokenCallback := tokenCallback
			if userTokenCallback == nil {
				userTokenCallback = func(token string, usage TokenUsage) bool {
					return true
				}
			}

			promptInfo, pErr := inferenceModel.TokenizeString(ctx, opts)
			if pErr == nil && promptInfo.Length > 0 {
				tokenUsage.Prompt = int(promptInfo.Length)
			}

			tokenCallback = func(token string, usage TokenUsage) bool {
				tokenUsage.Completion++
				return userTokenCallback(token, tokenUsage)
			}
		}

		if tokenCallback != nil {
			ss := ""

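			// partialRune buffers the bytes of a multi-byte UTF-8 sequence that
			// was split across streaming chunks, so multi-byte characters are
			// not emitted as U+FFFD replacement characters (see #981).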
			var partialRune []byte
			err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) {
				partialRune = append(partialRune, chars...)
				for len(partialRune) > 0 {
					r, size := utf8.DecodeRune(partialRune)
					if r == utf8.RuneError && !utf8.FullRune(partialRune) {
						// incomplete rune at the end of the buffer: wait for more
						// bytes (an invalid encoding decodes as a width-1 error
						// rune instead, so a bad byte cannot stall the stream)
						break
					}
					tokenCallback(string(r), tokenUsage)
					ss += string(r)
					partialRune = partialRune[size:]
				}
			})
			return LLMResponse{
				Response: ss,
				Usage:    tokenUsage,
			}, err
		} else {
			// TODO: Is the chicken bit the only way to get here? is that acceptable?
			reply, err := inferenceModel.Predict(ctx, opts)
			if err != nil {
				return LLMResponse{}, err
			}
			return LLMResponse{
				Response: string(reply.Message),
				Usage:    tokenUsage,
			}, err
		}
	}

	return fn, nil
}
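
// cutstrings caches the compiled Cutstrings regexps; mu guards the cache
// against concurrent requests.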
var cutstrings = make(map[string]*regexp.Regexp)
var mu sync.Mutex
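
// Finetune applies the model's post-processing to a raw prediction: echoing
// the input when configured, deleting every match of the configured cutstring
// regexps, and trimming the configured prefixes together with surrounding
// whitespace.
//
// A minimal sketch of a call (the field values are illustrative, not taken
// from an actual model config):
//
//	cfg := config.Config{}
//	cfg.Cutstrings = []string{`<\|.*?\|>`}
//	out := Finetune(cfg, prompt, raw)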
func Finetune(config config.Config, input, prediction string) string {
	if config.Echo {
		prediction = input + prediction
	}

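	// Compile each cutstring once and reuse the cached regexp; note that
	// regexp.MustCompile panics if a configured pattern is invalid.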
	for _, c := range config.Cutstrings {
		mu.Lock()
		reg, ok := cutstrings[c]
		if !ok {
			reg = regexp.MustCompile(c)
			cutstrings[c] = reg
		}
		mu.Unlock()
		prediction = reg.ReplaceAllString(prediction, "")
	}

	for _, c := range config.TrimSpace {
		prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c))
	}

	return prediction
}