LocalAI/api/backend/llm.go

package backend

import (
	"context"
	"os"
	"regexp"
	"strings"
	"sync"
"unicode/utf8"
config "github.com/go-skynet/LocalAI/api/config"
"github.com/go-skynet/LocalAI/api/options"
"github.com/go-skynet/LocalAI/pkg/gallery"
"github.com/go-skynet/LocalAI/pkg/grpc"
model "github.com/go-skynet/LocalAI/pkg/model"
"github.com/go-skynet/LocalAI/pkg/utils"
)
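
// LLMResponse bundles the text produced by a single inference call with the
// token accounting gathered while generating it.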
type LLMResponse struct {
	Response string // should this be []byte?
	Usage    TokenUsage
}
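
// TokenUsage counts the prompt and completion tokens consumed by a request.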
type TokenUsage struct {
	Prompt     int
	Completion int
}
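
// ModelInference resolves the backend for the given model configuration,
// installing the model from a gallery first when autoloading is enabled, and
// returns a closure that performs the actual prediction. A non-nil
// tokenCallback selects the streaming code path and is invoked once per
// decoded rune.
//
// A minimal calling sketch (ctx, loader, cfg and o are assumed to be set up
// by the caller; they are not defined in this file):
//
//	predict, err := ModelInference(ctx, "Hello", loader, cfg, o,
//		func(token string, usage TokenUsage) bool {
//			fmt.Print(token) // stream tokens as they arrive
//			return true
//		})
//	if err != nil {
//		return err
//	}
//	resp, err := predict()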
func ModelInference(ctx context.Context, s string, loader *model.ModelLoader, c config.Config, o *options.Option, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
	modelFile := c.Model
	grpcOpts := gRPCModelOpts(c)

	var inferenceModel *grpc.Client
	var err error

	opts := modelOpts(c, o, []model.Option{
		model.WithLoadGRPCLoadModelOpts(grpcOpts),
		model.WithThreads(uint32(c.Threads)), // some models use this to allocate threads during startup
		model.WithAssetDir(o.AssetsDestination),
		model.WithModel(modelFile),
		model.WithContext(o.Context),
	})

	if c.Backend != "" {
		opts = append(opts, model.WithBackendString(c.Backend))
	}

	// Check whether the modelFile exists; if it doesn't, try to install it from the gallery
	if o.AutoloadGalleries { // experimental
		if _, err := os.Stat(modelFile); os.IsNotExist(err) {
			utils.ResetDownloadTimers()
			// if we failed to load the model, we try to download it
			err := gallery.InstallModelFromGalleryByName(o.Galleries, modelFile, loader.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction)
			if err != nil {
				return nil, err
			}
		}
	}

	if c.Backend == "" {
		inferenceModel, err = loader.GreedyLoader(opts...)
	} else {
		inferenceModel, err = loader.BackendLoader(opts...)
	}
	if err != nil {
		return nil, err
	}

	// in gRPC, the backend is expected to answer with a single token when streaming is not supported
	fn := func() (LLMResponse, error) {
		opts := gRPCPredictOpts(c, loader.ModelPath)
		opts.Prompt = s

		tokenUsage := TokenUsage{}

		// Check the per-model feature flag for usage, since the token callbacks
		// have a cost. Defaults to off, as it is still experimental.
		if c.FeatureFlag.Enabled("usage") {
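			// Wrap any caller-supplied callback so completion tokens are counted;
			// when no callback was supplied, count against a no-op one.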
			userTokenCallback := tokenCallback
			if userTokenCallback == nil {
				userTokenCallback = func(token string, usage TokenUsage) bool {
					return true
				}
			}

			promptInfo, pErr := inferenceModel.TokenizeString(ctx, opts)
			if pErr == nil && promptInfo.Length > 0 {
				tokenUsage.Prompt = int(promptInfo.Length)
			}

			tokenCallback = func(token string, usage TokenUsage) bool {
				tokenUsage.Completion++
				return userTokenCallback(token, tokenUsage)
			}
		}

		if tokenCallback != nil {
			ss := ""

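			// partialRune buffers the bytes of a multi-byte UTF-8 sequence that
			// was split across streaming chunks, so multi-byte characters are
			// not emitted as U+FFFD replacement characters (see #981).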
			var partialRune []byte
			err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) {
				partialRune = append(partialRune, chars...)
				for len(partialRune) > 0 {
					r, size := utf8.DecodeRune(partialRune)
					if r == utf8.RuneError && !utf8.FullRune(partialRune) {
						// incomplete rune at the end of the buffer: wait for more
						// bytes (an invalid encoding decodes as a width-1 error
						// rune instead, so a bad byte cannot stall the stream)
						break
					}
					tokenCallback(string(r), tokenUsage)
					ss += string(r)
					partialRune = partialRune[size:]
				}
			})
			return LLMResponse{
				Response: ss,
				Usage:    tokenUsage,
			}, err
		} else {
			// TODO: Is the chicken bit the only way to get here? is that acceptable?
			reply, err := inferenceModel.Predict(ctx, opts)
			if err != nil {
				return LLMResponse{}, err
			}
			return LLMResponse{
				Response: string(reply.Message),
				Usage:    tokenUsage,
			}, err
		}
	}

	return fn, nil
}
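
// cutstrings caches the compiled Cutstrings regexps; mu guards the cache
// against concurrent requests.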
var cutstrings = make(map[string]*regexp.Regexp)
var mu sync.Mutex
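
// Finetune applies the model's post-processing to a raw prediction: echoing
// the input when configured, deleting every match of the configured cutstring
// regexps, and trimming the configured prefixes together with surrounding
// whitespace.
//
// A minimal sketch of a call (the field values are illustrative, not taken
// from an actual model config):
//
//	cfg := config.Config{}
//	cfg.Cutstrings = []string{`<\|.*?\|>`}
//	out := Finetune(cfg, prompt, raw)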
func Finetune(config config.Config, input, prediction string) string {
	if config.Echo {
		prediction = input + prediction
	}

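	// Compile each cutstring once and reuse the cached regexp; note that
	// regexp.MustCompile panics if a configured pattern is invalid.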
	for _, c := range config.Cutstrings {
		mu.Lock()
		reg, ok := cutstrings[c]
		if !ok {
			reg = regexp.MustCompile(c)
			cutstrings[c] = reg
		}
		mu.Unlock()
		prediction = reg.ReplaceAllString(prediction, "")
	}

	for _, c := range config.TrimSpace {
		prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c))
	}

	return prediction
}