LocalAI/core/backend/llm.go

package backend

import (
	"context"
	"os"
	"regexp"
	"strings"
	"sync"
	"unicode/utf8"

	"github.com/go-skynet/LocalAI/core/config"

	"github.com/go-skynet/LocalAI/pkg/gallery"
	"github.com/go-skynet/LocalAI/pkg/grpc"
	model "github.com/go-skynet/LocalAI/pkg/model"
	"github.com/go-skynet/LocalAI/pkg/utils"
)

type (
	// LLMResponse is the result of a single inference call: the generated
	// text together with the token accounting gathered while producing it.
	LLMResponse struct {
		Response string // should this be []byte?
		Usage    TokenUsage
	}

	// TokenUsage holds token counters for one inference call.
	TokenUsage struct {
		// Prompt is the number of tokens counted for the prompt.
		Prompt int
		// Completion is the number of tokens counted for the generated output.
		Completion int
	}
)

// ModelInference loads the model described by c and returns a closure that
// runs one prediction for prompt s (and optional images) when invoked.
//
// Loading happens eagerly: when c.Backend is empty the model is obtained via
// loader.GreedyLoader, otherwise via loader.BackendLoader. If
// o.AutoloadGalleries is set and os.Stat reports that c.Model does not exist,
// the model is first installed from the configured galleries.
//
// The returned closure streams the prediction through tokenCallback when one
// is supplied (or when the per-model "usage" feature flag is enabled), calling
// it once per decoded rune. Otherwise it performs a single non-streaming
// Predict call. The boolean returned by tokenCallback is currently not acted
// upon by this function.
//
// An error is returned if the gallery installation or the model load fails;
// prediction errors are returned by the closure.
func ModelInference(ctx context.Context, s string, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
	modelFile := c.Model

	// Fall back to the application-wide thread count when the model config
	// leaves it unset (nil) or zero; the nil guard avoids a panic on *threads.
	threads := c.Threads
	if threads == nil || (*threads == 0 && o.Threads != 0) {
		threads = &o.Threads
	}
	grpcOpts := gRPCModelOpts(c)

	opts := modelOpts(c, o, []model.Option{
		model.WithLoadGRPCLoadModelOpts(grpcOpts),
		model.WithThreads(uint32(*threads)), // some models uses this to allocate threads during startup
		model.WithAssetDir(o.AssetsDestination),
		model.WithModel(modelFile),
		model.WithContext(o.Context),
	})

	if c.Backend != "" {
		opts = append(opts, model.WithBackendString(c.Backend))
	}

	// Check if the modelFile exists, if it doesn't try to load it from the gallery
	// NOTE(review): modelFile is stat'ed as given, not joined with
	// loader.ModelPath — confirm this is the intended lookup location.
	if o.AutoloadGalleries { // experimental
		if _, statErr := os.Stat(modelFile); os.IsNotExist(statErr) {
			utils.ResetDownloadTimers()
			// the model file is missing, so we try to download it
			if err := gallery.InstallModelFromGalleryByName(o.Galleries, modelFile, loader.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction); err != nil {
				return nil, err
			}
		}
	}

	var inferenceModel grpc.Backend
	var err error
	if c.Backend == "" {
		inferenceModel, err = loader.GreedyLoader(opts...)
	} else {
		inferenceModel, err = loader.BackendLoader(opts...)
	}
	if err != nil {
		return nil, err
	}

	// in GRPC, the backend is supposed to answer to 1 single token if stream is not supported
	fn := func() (LLMResponse, error) {
		predictOpts := gRPCPredictOpts(c, loader.ModelPath)
		predictOpts.Prompt = s
		predictOpts.Images = images

		tokenUsage := TokenUsage{}

		// Use a per-call copy of the callback. Reassigning the captured
		// tokenCallback parameter would stack a new usage wrapper on top of the
		// previous one every time this closure is invoked.
		callback := tokenCallback

		// check the per-model feature flag for usage, since tokenCallback may have a cost.
		// Defaults to off as for now it is still experimental
		if c.FeatureFlag.Enabled("usage") {
			userTokenCallback := callback
			if userTokenCallback == nil {
				userTokenCallback = func(token string, usage TokenUsage) bool {
					return true
				}
			}

			// A tokenizer failure is not fatal: the prompt count simply stays 0.
			promptInfo, pErr := inferenceModel.TokenizeString(ctx, predictOpts)
			if pErr == nil && promptInfo.Length > 0 {
				tokenUsage.Prompt = int(promptInfo.Length)
			}

			// The wrapper ignores the usage it is handed and forwards the
			// freshly incremented counters instead.
			callback = func(token string, usage TokenUsage) bool {
				tokenUsage.Completion++
				return userTokenCallback(token, tokenUsage)
			}
		}

		if callback == nil {
			// TODO: Is the chicken bit the only way to get here? is that acceptable?
			reply, err := inferenceModel.Predict(ctx, predictOpts)
			if err != nil {
				return LLMResponse{}, err
			}
			return LLMResponse{
				Response: string(reply.Message),
				Usage:    tokenUsage,
			}, nil
		}

		var response strings.Builder
		// partialRune buffers bytes of a UTF-8 sequence that was split across
		// stream chunks.
		var partialRune []byte
		err := inferenceModel.PredictStream(ctx, predictOpts, func(chars []byte) {
			partialRune = append(partialRune, chars...)

			for len(partialRune) > 0 {
				// Only a genuinely incomplete sequence waits for more bytes.
				// Comparing the decoded rune with utf8.RuneError is not enough:
				// an invalid byte (or a real U+FFFD in the output) also decodes
				// to RuneError and would otherwise stall the stream forever.
				if !utf8.FullRune(partialRune) {
					break
				}

				// For an invalid byte this yields (utf8.RuneError, 1), so the
				// byte is surfaced as U+FFFD and decoding moves on.
				r, size := utf8.DecodeRune(partialRune)
				token := string(r)

				callback(token, tokenUsage)
				response.WriteString(token)

				partialRune = partialRune[size:]
			}
		})
		// Bytes of a truncated sequence still buffered when the stream ends are
		// dropped.
		return LLMResponse{
			Response: response.String(),
			Usage:    tokenUsage,
		}, err
	}

	return fn, nil
}

var (
	// cutstrings caches compiled regular expressions keyed by their source
	// pattern. Access must be guarded by mu.
	cutstrings = make(map[string]*regexp.Regexp)

	// mu serializes access to cutstrings.
	mu sync.Mutex
)

// Finetune post-processes a raw model prediction according to the model
// configuration and returns the cleaned-up text.
//
// Steps, in order:
//  1. If config.Echo is set, input is prepended to prediction.
//  2. Every match of each regular expression in config.Cutstrings is removed.
//  3. Each entry of config.TrimSpace is stripped as a prefix, followed by a
//     strings.TrimSpace of the result.
//  4. Each entry of config.TrimSuffix is stripped as a suffix, followed by a
//     strings.TrimSpace of the result.
//
// Compiled expressions are cached in the package-level cutstrings map.
// Finetune panics (via regexp.MustCompile) if a Cutstrings entry is not a
// valid regular expression.
func Finetune(config config.BackendConfig, input, prediction string) string {
	if config.Echo {
		prediction = input + prediction
	}

	// compiled returns the cached regexp for pattern, compiling it on first
	// use. The unlock is deferred so that a MustCompile panic cannot leave mu
	// locked and deadlock every later call.
	compiled := func(pattern string) *regexp.Regexp {
		mu.Lock()
		defer mu.Unlock()

		if reg, ok := cutstrings[pattern]; ok {
			return reg
		}
		reg := regexp.MustCompile(pattern)
		cutstrings[pattern] = reg
		return reg
	}

	for _, c := range config.Cutstrings {
		// The replacement itself runs outside the lock.
		prediction = compiled(c).ReplaceAllString(prediction, "")
	}

	for _, c := range config.TrimSpace {
		prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c))
	}

	for _, c := range config.TrimSuffix {
		prediction = strings.TrimSpace(strings.TrimSuffix(prediction, c))
	}
	return prediction
}
Revert "[Refactor]: Core/API Split" (#1550) Revert "[Refactor]: Core/API Split (#1506)" This reverts commit ab7b4d5ee9448e533a342bd1771393acd2967191. 2024-01-05 17:04:46 +00:00			`package backend`

			`import (`
			`"context"`
			`"os"`
			`"regexp"`
			`"strings"`
			`"sync"`
			`"unicode/utf8"`

refactor: move remaining api packages to core (#1731) * core 1 * api/openai/files fix * core 2 - core/config * move over core api.go and tests to the start of core/http * move over localai specific endpoints to core/http, begin the service/endpoint split there * refactor big chunk on the plane * refactor chunk 2 on plane, next step: port and modify changes to request.go * easy fixes for request.go, major changes not done yet * lintfix * json tag lintfix? * gitignore and .keep files * strange fix attempt: rename the config dir? 2024-03-01 15:19:53 +00:00			`"github.com/go-skynet/LocalAI/core/config"`

Revert "[Refactor]: Core/API Split" (#1550) Revert "[Refactor]: Core/API Split (#1506)" This reverts commit ab7b4d5ee9448e533a342bd1771393acd2967191. 2024-01-05 17:04:46 +00:00			`"github.com/go-skynet/LocalAI/pkg/gallery"`
			`"github.com/go-skynet/LocalAI/pkg/grpc"`
			`model "github.com/go-skynet/LocalAI/pkg/model"`
			`"github.com/go-skynet/LocalAI/pkg/utils"`
			`)`

			`type LLMResponse struct {`
			`Response string // should this be []byte?`
			`Usage TokenUsage`
			`}`

			`type TokenUsage struct {`
			`Prompt int`
			`Completion int`
			`}`

refactor: move remaining api packages to core (#1731) * core 1 * api/openai/files fix * core 2 - core/config * move over core api.go and tests to the start of core/http * move over localai specific endpoints to core/http, begin the service/endpoint split there * refactor big chunk on the plane * refactor chunk 2 on plane, next step: port and modify changes to request.go * easy fixes for request.go, major changes not done yet * lintfix * json tag lintfix? * gitignore and .keep files * strange fix attempt: rename the config dir? 2024-03-01 15:19:53 +00:00			`func ModelInference(ctx context.Context, s string, images []string, loader model.ModelLoader, c config.BackendConfig, o config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {`
Revert "[Refactor]: Core/API Split" (#1550) Revert "[Refactor]: Core/API Split (#1506)" This reverts commit ab7b4d5ee9448e533a342bd1771393acd2967191. 2024-01-05 17:04:46 +00:00			`modelFile := c.Model`
feat(intel): add diffusers/transformers support (#1746) * feat(intel): add diffusers support * try to consume upstream container image * Debug * Manually install deps * Map transformers/hf cache dir to modelpath if not specified * fix(compel): update initialization, pass by all gRPC options * fix: add dependencies, implement transformers for xpu * base it from the oneapi image * Add pillow * set threads if specified when launching the API * Skip conda install if intel * defaults to non-intel * ci: add to pipelines * prepare compel only if enabled * Skip conda install if intel * fix cleanup * Disable compel by default * Install torch 2.1.0 with Intel * Skip conda on some setups * Detect python * Quiet output * Do not override system python with conda * Prefer python3 * Fixups * exllama2: do not install without conda (overrides pytorch version) * exllama/exllama2: do not install if not using cuda * Add missing dataset dependency * Small fixups, symlink to python, add requirements * Add neural_speed to the deps * correctly handle model offloading * fix: device_map == xpu * go back at calling python, fixed at dockerfile level * Exllama2 restricted to only nvidia gpus * Tokenizer to xpu 2024-03-07 13:37:45 +00:00			`threads := c.Threads`
fix(config): set better defaults for inferencing (#1822) * fix(defaults): set better defaults for inferencing This changeset aim to have better defaults and to properly detect when no inference settings are provided with the model. If not specified, we defaults to mirostat sampling, and offload all the GPU layers (if a GPU is detected). Related to https://github.com/mudler/LocalAI/issues/1373 and https://github.com/mudler/LocalAI/issues/1723 * Adapt tests * Also pre-initialize default seed 2024-03-13 09:05:30 +00:00			`if *threads == 0 && o.Threads != 0 {`
			`threads = &o.Threads`
feat(intel): add diffusers/transformers support (#1746) * feat(intel): add diffusers support * try to consume upstream container image * Debug * Manually install deps * Map transformers/hf cache dir to modelpath if not specified * fix(compel): update initialization, pass by all gRPC options * fix: add dependencies, implement transformers for xpu * base it from the oneapi image * Add pillow * set threads if specified when launching the API * Skip conda install if intel * defaults to non-intel * ci: add to pipelines * prepare compel only if enabled * Skip conda install if intel * fix cleanup * Disable compel by default * Install torch 2.1.0 with Intel * Skip conda on some setups * Detect python * Quiet output * Do not override system python with conda * Prefer python3 * Fixups * exllama2: do not install without conda (overrides pytorch version) * exllama/exllama2: do not install if not using cuda * Add missing dataset dependency * Small fixups, symlink to python, add requirements * Add neural_speed to the deps * correctly handle model offloading * fix: device_map == xpu * go back at calling python, fixed at dockerfile level * Exllama2 restricted to only nvidia gpus * Tokenizer to xpu 2024-03-07 13:37:45 +00:00			`}`
Revert "[Refactor]: Core/API Split" (#1550) Revert "[Refactor]: Core/API Split (#1506)" This reverts commit ab7b4d5ee9448e533a342bd1771393acd2967191. 2024-01-05 17:04:46 +00:00			`grpcOpts := gRPCModelOpts(c)`

feat(grpc): backend SPI pluggable in embedding mode (#1621) * run server * grpc backend embedded support * backend providable 2024-01-23 07:56:36 +00:00			`var inferenceModel grpc.Backend`
Revert "[Refactor]: Core/API Split" (#1550) Revert "[Refactor]: Core/API Split (#1506)" This reverts commit ab7b4d5ee9448e533a342bd1771393acd2967191. 2024-01-05 17:04:46 +00:00			`var err error`

			`opts := modelOpts(c, o, []model.Option{`
			`model.WithLoadGRPCLoadModelOpts(grpcOpts),`
fix(config): set better defaults for inferencing (#1822) * fix(defaults): set better defaults for inferencing This changeset aim to have better defaults and to properly detect when no inference settings are provided with the model. If not specified, we defaults to mirostat sampling, and offload all the GPU layers (if a GPU is detected). Related to https://github.com/mudler/LocalAI/issues/1373 and https://github.com/mudler/LocalAI/issues/1723 * Adapt tests * Also pre-initialize default seed 2024-03-13 09:05:30 +00:00			`model.WithThreads(uint32(*threads)), // some models uses this to allocate threads during startup`
Revert "[Refactor]: Core/API Split" (#1550) Revert "[Refactor]: Core/API Split (#1506)" This reverts commit ab7b4d5ee9448e533a342bd1771393acd2967191. 2024-01-05 17:04:46 +00:00			`model.WithAssetDir(o.AssetsDestination),`
			`model.WithModel(modelFile),`
			`model.WithContext(o.Context),`
			`})`

			`if c.Backend != "" {`
			`opts = append(opts, model.WithBackendString(c.Backend))`
			`}`

			`// Check if the modelFile exists, if it doesn't try to load it from the gallery`
			`if o.AutoloadGalleries { // experimental`
			`if _, err := os.Stat(modelFile); os.IsNotExist(err) {`
			`utils.ResetDownloadTimers()`
			`// if we failed to load the model, we try to download it`
			`err := gallery.InstallModelFromGalleryByName(o.Galleries, modelFile, loader.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction)`
			`if err != nil {`
			`return nil, err`
			`}`
			`}`
			`}`

			`if c.Backend == "" {`
			`inferenceModel, err = loader.GreedyLoader(opts...)`
			`} else {`
			`inferenceModel, err = loader.BackendLoader(opts...)`
			`}`

			`if err != nil {`
			`return nil, err`
			`}`

			`// in GRPC, the backend is supposed to answer to 1 single token if stream is not supported`
			`fn := func() (LLMResponse, error) {`
			`opts := gRPCPredictOpts(c, loader.ModelPath)`
			`opts.Prompt = s`
			`opts.Images = images`

			`tokenUsage := TokenUsage{}`

			`// check the per-model feature flag for usage, since tokenCallback may have a cost.`
			`// Defaults to off as for now it is still experimental`
			`if c.FeatureFlag.Enabled("usage") {`
			`userTokenCallback := tokenCallback`
			`if userTokenCallback == nil {`
			`userTokenCallback = func(token string, usage TokenUsage) bool {`
			`return true`
			`}`
			`}`

			`promptInfo, pErr := inferenceModel.TokenizeString(ctx, opts)`
			`if pErr == nil && promptInfo.Length > 0 {`
			`tokenUsage.Prompt = int(promptInfo.Length)`
			`}`

			`tokenCallback = func(token string, usage TokenUsage) bool {`
			`tokenUsage.Completion++`
			`return userTokenCallback(token, tokenUsage)`
			`}`
			`}`

			`if tokenCallback != nil {`
			`ss := ""`

			`var partialRune []byte`
			`err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) {`
			`partialRune = append(partialRune, chars...)`

			`for len(partialRune) > 0 {`
			`r, size := utf8.DecodeRune(partialRune)`
			`if r == utf8.RuneError {`
			`// incomplete rune, wait for more bytes`
			`break`
			`}`

			`tokenCallback(string(r), tokenUsage)`
			`ss += string(r)`

			`partialRune = partialRune[size:]`
			`}`
			`})`
			`return LLMResponse{`
			`Response: ss,`
			`Usage: tokenUsage,`
			`}, err`
			`} else {`
			`// TODO: Is the chicken bit the only way to get here? is that acceptable?`
			`reply, err := inferenceModel.Predict(ctx, opts)`
			`if err != nil {`
			`return LLMResponse{}, err`
			`}`
			`return LLMResponse{`
			`Response: string(reply.Message),`
			`Usage: tokenUsage,`
			`}, err`
			`}`
			`}`

			`return fn, nil`
			`}`

			`var cutstrings map[string]regexp.Regexp = make(map[string]regexp.Regexp)`
			`var mu sync.Mutex = sync.Mutex{}`

refactor: move remaining api packages to core (#1731) * core 1 * api/openai/files fix * core 2 - core/config * move over core api.go and tests to the start of core/http * move over localai specific endpoints to core/http, begin the service/endpoint split there * refactor big chunk on the plane * refactor chunk 2 on plane, next step: port and modify changes to request.go * easy fixes for request.go, major changes not done yet * lintfix * json tag lintfix? * gitignore and .keep files * strange fix attempt: rename the config dir? 2024-03-01 15:19:53 +00:00			`func Finetune(config config.BackendConfig, input, prediction string) string {`
Revert "[Refactor]: Core/API Split" (#1550) Revert "[Refactor]: Core/API Split (#1506)" This reverts commit ab7b4d5ee9448e533a342bd1771393acd2967191. 2024-01-05 17:04:46 +00:00			`if config.Echo {`
			`prediction = input + prediction`
			`}`

			`for _, c := range config.Cutstrings {`
			`mu.Lock()`
			`reg, ok := cutstrings[c]`
			`if !ok {`
			`cutstrings[c] = regexp.MustCompile(c)`
			`reg = cutstrings[c]`
			`}`
			`mu.Unlock()`
			`prediction = reg.ReplaceAllString(prediction, "")`
			`}`

			`for _, c := range config.TrimSpace {`
			`prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c))`
			`}`

			`for _, c := range config.TrimSuffix {`
			`prediction = strings.TrimSpace(strings.TrimSuffix(prediction, c))`
			`}`
			`return prediction`
			`}`