mirror of
https://github.com/mudler/LocalAI.git
synced 2024-06-07 19:40:48 +00:00
862 lines
27 KiB
Go
862 lines
27 KiB
Go
|
package backend
|
||
|
|
||
|
import (
|
||
|
"context"
|
||
|
"encoding/json"
|
||
|
"errors"
|
||
|
"fmt"
|
||
|
"os"
|
||
|
"path/filepath"
|
||
|
"regexp"
|
||
|
"strings"
|
||
|
"sync"
|
||
|
"time"
|
||
|
"unicode/utf8"
|
||
|
|
||
|
"github.com/go-skynet/LocalAI/core/services"
|
||
|
"github.com/go-skynet/LocalAI/pkg/gallery"
|
||
|
"github.com/go-skynet/LocalAI/pkg/grammar"
|
||
|
"github.com/go-skynet/LocalAI/pkg/grpc"
|
||
|
"github.com/go-skynet/LocalAI/pkg/model"
|
||
|
"github.com/go-skynet/LocalAI/pkg/schema"
|
||
|
"github.com/go-skynet/LocalAI/pkg/utils"
|
||
|
"github.com/google/uuid"
|
||
|
"github.com/rs/zerolog/log"
|
||
|
)
|
||
|
|
||
|
////////// TYPES //////////////
|
||
|
|
||
|
// LLMResponse is the outcome of a single prediction: the generated text plus
// the token accounting collected while producing it.
type LLMResponse struct {
	Response string // should this be []byte?
	Usage    TokenUsage
}

// TokenUsage holds token counters for a prediction: Prompt for the input
// side and Completion for the generated side.
//
// TODO: Test removing this and using the variant in pkg/schema someday?
type TokenUsage struct {
	Prompt     int
	Completion int
}
|
||
|
|
||
|
// TemplateConfigBindingFn picks, out of a model configuration, the template
// setting a request should use and returns a pointer to it so the caller can
// both read it and overwrite it in place.
type TemplateConfigBindingFn func(*schema.Config) *string
|
||
|
|
||
|
// type LLMStreamProcessor func(s string, req *schema.OpenAIRequest, config *schema.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse)
|
||
|
|
||
|
/////// CONSTS ///////////
|
||
|
|
||
|
// Fallback name and description of the "no action" function, used when the
// model configuration leaves them empty.
const (
	DEFAULT_NO_ACTION_NAME        = "answer"
	DEFAULT_NO_ACTION_DESCRIPTION = "use this action to answer without performing any action"
)
|
||
|
|
||
|
////// INFERENCE /////////
|
||
|
|
||
|
func ModelInference(ctx context.Context, s string, images []string, loader *model.ModelLoader, c schema.Config, o *schema.StartupOptions, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
|
||
|
modelFile := c.Model
|
||
|
|
||
|
grpcOpts := gRPCModelOpts(c)
|
||
|
|
||
|
var inferenceModel *grpc.Client
|
||
|
var err error
|
||
|
|
||
|
opts := modelOpts(c, o, []model.Option{
|
||
|
model.WithLoadGRPCLoadModelOpts(grpcOpts),
|
||
|
model.WithThreads(uint32(c.Threads)), // some models uses this to allocate threads during startup
|
||
|
model.WithAssetDir(o.AssetsDestination),
|
||
|
model.WithModel(modelFile),
|
||
|
model.WithContext(o.Context),
|
||
|
model.WithExternalBackends(o.ExternalGRPCBackends, false),
|
||
|
})
|
||
|
|
||
|
if c.Backend != "" {
|
||
|
opts = append(opts, model.WithBackendString(c.Backend))
|
||
|
}
|
||
|
|
||
|
// Check if the modelFile exists, if it doesn't try to load it from the gallery
|
||
|
if o.AutoloadGalleries { // experimental
|
||
|
if _, err := os.Stat(modelFile); os.IsNotExist(err) {
|
||
|
utils.ResetDownloadTimers()
|
||
|
// if we failed to load the model, we try to download it
|
||
|
err := gallery.InstallModelFromGalleryByName(o.Galleries, modelFile, loader.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if c.Backend == "" {
|
||
|
inferenceModel, err = loader.GreedyLoader(opts...)
|
||
|
} else {
|
||
|
inferenceModel, err = loader.BackendLoader(opts...)
|
||
|
}
|
||
|
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
// in GRPC, the backend is supposed to answer to 1 single token if stream is not supported
|
||
|
fn := func() (LLMResponse, error) {
|
||
|
opts := gRPCPredictOpts(c, loader.ModelPath)
|
||
|
opts.Prompt = s
|
||
|
opts.Images = images
|
||
|
|
||
|
tokenUsage := TokenUsage{}
|
||
|
|
||
|
// check the per-model feature flag for usage, since tokenCallback may have a cost.
|
||
|
// Defaults to off as for now it is still experimental
|
||
|
if c.FeatureFlag.Enabled("usage") {
|
||
|
userTokenCallback := tokenCallback
|
||
|
if userTokenCallback == nil {
|
||
|
userTokenCallback = func(token string, usage TokenUsage) bool {
|
||
|
return true
|
||
|
}
|
||
|
}
|
||
|
|
||
|
promptInfo, pErr := inferenceModel.TokenizeString(ctx, opts)
|
||
|
if pErr == nil && promptInfo.Length > 0 {
|
||
|
tokenUsage.Prompt = int(promptInfo.Length)
|
||
|
}
|
||
|
|
||
|
tokenCallback = func(token string, usage TokenUsage) bool {
|
||
|
tokenUsage.Completion++
|
||
|
return userTokenCallback(token, tokenUsage)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if tokenCallback != nil {
|
||
|
ss := ""
|
||
|
|
||
|
var partialRune []byte
|
||
|
err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) {
|
||
|
partialRune = append(partialRune, chars...)
|
||
|
|
||
|
for len(partialRune) > 0 {
|
||
|
r, size := utf8.DecodeRune(partialRune)
|
||
|
if r == utf8.RuneError {
|
||
|
// incomplete rune, wait for more bytes
|
||
|
break
|
||
|
}
|
||
|
|
||
|
tokenCallback(string(r), tokenUsage)
|
||
|
ss += string(r)
|
||
|
|
||
|
partialRune = partialRune[size:]
|
||
|
}
|
||
|
})
|
||
|
return LLMResponse{
|
||
|
Response: ss,
|
||
|
Usage: tokenUsage,
|
||
|
}, err
|
||
|
} else {
|
||
|
// TODO: Is the chicken bit the only way to get here? is that acceptable?
|
||
|
reply, err := inferenceModel.Predict(ctx, opts)
|
||
|
if err != nil {
|
||
|
return LLMResponse{}, err
|
||
|
}
|
||
|
return LLMResponse{
|
||
|
Response: string(reply.Message),
|
||
|
Usage: tokenUsage,
|
||
|
}, err
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return fn, nil
|
||
|
}
|
||
|
|
||
|
var cutstrings map[string]*regexp.Regexp = make(map[string]*regexp.Regexp)
|
||
|
var mu sync.Mutex = sync.Mutex{}
|
||
|
|
||
|
func Finetune(config schema.Config, input, prediction string) string {
|
||
|
if config.Echo {
|
||
|
prediction = input + prediction
|
||
|
}
|
||
|
|
||
|
for _, c := range config.Cutstrings {
|
||
|
mu.Lock()
|
||
|
reg, ok := cutstrings[c]
|
||
|
if !ok {
|
||
|
cutstrings[c] = regexp.MustCompile(c)
|
||
|
reg = cutstrings[c]
|
||
|
}
|
||
|
mu.Unlock()
|
||
|
prediction = reg.ReplaceAllString(prediction, "")
|
||
|
}
|
||
|
|
||
|
for _, c := range config.TrimSpace {
|
||
|
prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c))
|
||
|
}
|
||
|
|
||
|
for _, c := range config.TrimSuffix {
|
||
|
prediction = strings.TrimSpace(strings.TrimSuffix(prediction, c))
|
||
|
}
|
||
|
return prediction
|
||
|
|
||
|
}
|
||
|
|
||
|
////// CONFIG AND REQUEST HANDLING ///////////////
|
||
|
|
||
|
func ReadConfigFromFileAndCombineWithOpenAIRequest(modelFile string, input *schema.OpenAIRequest, cm *services.ConfigLoader, startupOptions *schema.StartupOptions) (*schema.Config, *schema.OpenAIRequest, error) {
|
||
|
// Load a config file if present after the model name
|
||
|
modelConfig := filepath.Join(startupOptions.ModelPath, modelFile+".yaml")
|
||
|
|
||
|
var cfg *schema.Config
|
||
|
|
||
|
defaults := func() {
|
||
|
cfg = schema.DefaultConfig(modelFile)
|
||
|
cfg.ContextSize = startupOptions.ContextSize
|
||
|
cfg.Threads = startupOptions.Threads
|
||
|
cfg.F16 = startupOptions.F16
|
||
|
cfg.Debug = startupOptions.Debug
|
||
|
}
|
||
|
|
||
|
cfgExisting, exists := cm.GetConfig(modelFile)
|
||
|
if !exists {
|
||
|
if _, err := os.Stat(modelConfig); err == nil {
|
||
|
if err := cm.LoadConfig(modelConfig); err != nil {
|
||
|
return nil, nil, fmt.Errorf("failed loading model config (%s) %s", modelConfig, err.Error())
|
||
|
}
|
||
|
cfgExisting, exists = cm.GetConfig(modelFile)
|
||
|
if exists {
|
||
|
cfg = &cfgExisting
|
||
|
} else {
|
||
|
defaults()
|
||
|
}
|
||
|
} else {
|
||
|
defaults()
|
||
|
}
|
||
|
} else {
|
||
|
cfg = &cfgExisting
|
||
|
}
|
||
|
|
||
|
// Set the parameters for the language model prediction
|
||
|
schema.UpdateConfigFromOpenAIRequest(cfg, input)
|
||
|
|
||
|
// Don't allow 0 as setting
|
||
|
if cfg.Threads == 0 {
|
||
|
if startupOptions.Threads != 0 {
|
||
|
cfg.Threads = startupOptions.Threads
|
||
|
} else {
|
||
|
cfg.Threads = 4
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Enforce debug flag if passed from CLI
|
||
|
if startupOptions.Debug {
|
||
|
cfg.Debug = true
|
||
|
}
|
||
|
|
||
|
return cfg, input, nil
|
||
|
}
|
||
|
|
||
|
func ComputeChoices(
|
||
|
req *schema.OpenAIRequest,
|
||
|
predInput string,
|
||
|
config *schema.Config,
|
||
|
o *schema.StartupOptions,
|
||
|
loader *model.ModelLoader,
|
||
|
cb func(string, *[]schema.Choice),
|
||
|
tokenCallback func(string, TokenUsage) bool) ([]schema.Choice, TokenUsage, error) {
|
||
|
n := req.N // number of completions to return
|
||
|
result := []schema.Choice{}
|
||
|
|
||
|
if n == 0 {
|
||
|
n = 1
|
||
|
}
|
||
|
|
||
|
images := []string{}
|
||
|
for _, m := range req.Messages {
|
||
|
images = append(images, m.StringImages...)
|
||
|
}
|
||
|
|
||
|
// get the model function to call for the result
|
||
|
predFunc, err := ModelInference(req.Context, predInput, images, loader, *config, o, tokenCallback)
|
||
|
if err != nil {
|
||
|
return result, TokenUsage{}, err
|
||
|
}
|
||
|
|
||
|
tokenUsage := TokenUsage{}
|
||
|
|
||
|
for i := 0; i < n; i++ {
|
||
|
prediction, err := predFunc()
|
||
|
if err != nil {
|
||
|
return result, TokenUsage{}, err
|
||
|
}
|
||
|
|
||
|
tokenUsage.Prompt += prediction.Usage.Prompt
|
||
|
tokenUsage.Completion += prediction.Usage.Completion
|
||
|
|
||
|
finetunedResponse := Finetune(*config, predInput, prediction.Response)
|
||
|
cb(finetunedResponse, &result)
|
||
|
|
||
|
//result = append(result, Choice{Text: prediction})
|
||
|
|
||
|
}
|
||
|
return result, tokenUsage, err
|
||
|
}
|
||
|
|
||
|
// TODO: No functions???? Commonize with prepareChatGenerationOpenAIRequest below?
|
||
|
func prepareGenerationOpenAIRequest(bindingFn TemplateConfigBindingFn, modelName string, input *schema.OpenAIRequest, cl *services.ConfigLoader, ml *model.ModelLoader, startupOptions *schema.StartupOptions) (*schema.Config, error) {
|
||
|
config, input, err := ReadConfigFromFileAndCombineWithOpenAIRequest(modelName, input, cl, startupOptions)
|
||
|
if err != nil {
|
||
|
return nil, fmt.Errorf("failed reading parameters from request:%w", err)
|
||
|
}
|
||
|
|
||
|
if input.ResponseFormat.Type == "json_object" {
|
||
|
input.Grammar = grammar.JSONBNF
|
||
|
}
|
||
|
|
||
|
log.Debug().Msgf("Parameter Config: %+v", config)
|
||
|
|
||
|
configTemplate := bindingFn(config)
|
||
|
|
||
|
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||
|
if (*configTemplate == "") && (ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model))) {
|
||
|
*configTemplate = config.Model
|
||
|
}
|
||
|
if *configTemplate == "" {
|
||
|
return nil, fmt.Errorf(("failed to find templateConfig"))
|
||
|
}
|
||
|
|
||
|
return config, nil
|
||
|
}
|
||
|
|
||
|
////////// SPECIFIC REQUESTS //////////////
|
||
|
// TODO: For round one of the refactor, give each of the three primary text endpoints their own function?
|
||
|
// SEMITODO: During a merge, edit/completion were semi-combined - but remain nominally split
|
||
|
// Can cleanup into a common form later if possible easier if they are all here for now
|
||
|
// If they remain different, extract each of these named segments to a separate file
|
||
|
|
||
|
func prepareChatGenerationOpenAIRequest(modelName string, input *schema.OpenAIRequest, cl *services.ConfigLoader, ml *model.ModelLoader, startupOptions *schema.StartupOptions) (*schema.Config, string, bool, error) {
|
||
|
|
||
|
// IMPORTANT DEFS
|
||
|
funcs := grammar.Functions{}
|
||
|
|
||
|
// The Basic Begining
|
||
|
|
||
|
config, input, err := ReadConfigFromFileAndCombineWithOpenAIRequest(modelName, input, cl, startupOptions)
|
||
|
if err != nil {
|
||
|
return nil, "", false, fmt.Errorf("failed reading parameters from request:%w", err)
|
||
|
}
|
||
|
log.Debug().Msgf("Configuration read: %+v", config)
|
||
|
|
||
|
// Special Input/Config Handling
|
||
|
|
||
|
// Allow the user to set custom actions via config file
|
||
|
// to be "embedded" in each model - but if they are missing, use defaults.
|
||
|
if config.FunctionsConfig.NoActionFunctionName == "" {
|
||
|
config.FunctionsConfig.NoActionFunctionName = DEFAULT_NO_ACTION_NAME
|
||
|
}
|
||
|
if config.FunctionsConfig.NoActionDescriptionName == "" {
|
||
|
config.FunctionsConfig.NoActionDescriptionName = DEFAULT_NO_ACTION_DESCRIPTION
|
||
|
}
|
||
|
|
||
|
if input.ResponseFormat.Type == "json_object" {
|
||
|
input.Grammar = grammar.JSONBNF
|
||
|
}
|
||
|
|
||
|
processFunctions := len(input.Functions) > 0 && config.ShouldUseFunctions()
|
||
|
|
||
|
if processFunctions {
|
||
|
log.Debug().Msgf("Response needs to process functions")
|
||
|
|
||
|
noActionGrammar := grammar.Function{
|
||
|
Name: config.FunctionsConfig.NoActionFunctionName,
|
||
|
Description: config.FunctionsConfig.NoActionDescriptionName,
|
||
|
Parameters: map[string]interface{}{
|
||
|
"properties": map[string]interface{}{
|
||
|
"message": map[string]interface{}{
|
||
|
"type": "string",
|
||
|
"description": "The message to reply the user with",
|
||
|
}},
|
||
|
},
|
||
|
}
|
||
|
|
||
|
// Append the no action function
|
||
|
funcs = append(funcs, input.Functions...)
|
||
|
if !config.FunctionsConfig.DisableNoAction {
|
||
|
funcs = append(funcs, noActionGrammar)
|
||
|
}
|
||
|
|
||
|
// Force picking one of the functions by the request
|
||
|
if config.FunctionToCall() != "" {
|
||
|
funcs = funcs.Select(config.FunctionToCall())
|
||
|
}
|
||
|
|
||
|
// Update input grammar
|
||
|
jsStruct := funcs.ToJSONStructure()
|
||
|
config.Grammar = jsStruct.Grammar("")
|
||
|
} else if input.JSONFunctionGrammarObject != nil {
|
||
|
config.Grammar = input.JSONFunctionGrammarObject.Grammar("")
|
||
|
}
|
||
|
|
||
|
log.Debug().Msgf("Parameters: %+v", config)
|
||
|
|
||
|
var predInput string
|
||
|
|
||
|
suppressConfigSystemPrompt := false
|
||
|
mess := []string{}
|
||
|
for messageIndex, i := range input.Messages {
|
||
|
var content string
|
||
|
role := i.Role
|
||
|
|
||
|
// if function call, we might want to customize the role so we can display better that the "assistant called a json action"
|
||
|
// if an "assistant_function_call" role is defined, we use it, otherwise we use the role that is passed by in the request
|
||
|
if i.FunctionCall != nil && i.Role == "assistant" {
|
||
|
roleFn := "assistant_function_call"
|
||
|
r := config.Roles[roleFn]
|
||
|
if r != "" {
|
||
|
role = roleFn
|
||
|
}
|
||
|
}
|
||
|
r := config.Roles[role]
|
||
|
contentExists := i.Content != nil && i.StringContent != ""
|
||
|
// First attempt to populate content via a chat message specific template
|
||
|
if config.TemplateConfig.ChatMessage != "" {
|
||
|
chatMessageData := model.ChatMessageTemplateData{
|
||
|
SystemPrompt: config.SystemPrompt,
|
||
|
Role: r,
|
||
|
RoleName: role,
|
||
|
Content: i.StringContent,
|
||
|
MessageIndex: messageIndex,
|
||
|
}
|
||
|
templatedChatMessage, err := ml.EvaluateTemplateForChatMessage(config.TemplateConfig.ChatMessage, chatMessageData)
|
||
|
if err != nil {
|
||
|
log.Error().Msgf("error processing message %+v using template \"%s\": %v. Skipping!", chatMessageData, config.TemplateConfig.ChatMessage, err)
|
||
|
} else {
|
||
|
if templatedChatMessage == "" {
|
||
|
log.Warn().Msgf("template \"%s\" produced blank output for %+v. Skipping!", config.TemplateConfig.ChatMessage, chatMessageData)
|
||
|
continue // TODO: This continue is here intentionally to skip over the line `mess = append(mess, content)` below, and to prevent the sprintf
|
||
|
}
|
||
|
log.Debug().Msgf("templated message for chat: %s", templatedChatMessage)
|
||
|
content = templatedChatMessage
|
||
|
}
|
||
|
}
|
||
|
// If this model doesn't have such a template, or if that template fails to return a value, template at the message level.
|
||
|
if content == "" {
|
||
|
if r != "" {
|
||
|
if contentExists {
|
||
|
content = fmt.Sprint(r, i.StringContent)
|
||
|
}
|
||
|
if i.FunctionCall != nil {
|
||
|
j, err := json.Marshal(i.FunctionCall)
|
||
|
if err == nil {
|
||
|
if contentExists {
|
||
|
content += "\n" + fmt.Sprint(r, " ", string(j))
|
||
|
} else {
|
||
|
content = fmt.Sprint(r, " ", string(j))
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
if contentExists {
|
||
|
content = fmt.Sprint(i.StringContent)
|
||
|
}
|
||
|
if i.FunctionCall != nil {
|
||
|
j, err := json.Marshal(i.FunctionCall)
|
||
|
if err == nil {
|
||
|
if contentExists {
|
||
|
content += "\n" + string(j)
|
||
|
} else {
|
||
|
content = string(j)
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
// Special Handling: System. We care if it was printed at all, not the r branch, so check seperately
|
||
|
if contentExists && role == "system" {
|
||
|
suppressConfigSystemPrompt = true
|
||
|
}
|
||
|
}
|
||
|
|
||
|
mess = append(mess, content)
|
||
|
}
|
||
|
|
||
|
predInput = strings.Join(mess, "\n")
|
||
|
log.Debug().Msgf("Prompt (before templating): %s", predInput)
|
||
|
|
||
|
templateFile := ""
|
||
|
|
||
|
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||
|
if ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) {
|
||
|
templateFile = config.Model
|
||
|
}
|
||
|
|
||
|
if config.TemplateConfig.Chat != "" && !processFunctions {
|
||
|
templateFile = config.TemplateConfig.Chat
|
||
|
}
|
||
|
|
||
|
if config.TemplateConfig.Functions != "" && processFunctions {
|
||
|
templateFile = config.TemplateConfig.Functions
|
||
|
}
|
||
|
|
||
|
if templateFile != "" {
|
||
|
templatedInput, err := ml.EvaluateTemplateForPrompt(model.ChatPromptTemplate, templateFile, model.PromptTemplateData{
|
||
|
SystemPrompt: config.SystemPrompt,
|
||
|
SuppressSystemPrompt: suppressConfigSystemPrompt,
|
||
|
Input: predInput,
|
||
|
Functions: funcs,
|
||
|
})
|
||
|
if err == nil {
|
||
|
predInput = templatedInput
|
||
|
log.Debug().Msgf("Template found, input modified to: %s", predInput)
|
||
|
} else {
|
||
|
log.Debug().Msgf("Template failed loading: %s", err.Error())
|
||
|
}
|
||
|
}
|
||
|
|
||
|
log.Debug().Msgf("Prompt (after templating): %s", predInput)
|
||
|
if processFunctions {
|
||
|
log.Debug().Msgf("Grammar: %+v", config.Grammar)
|
||
|
}
|
||
|
|
||
|
return config, predInput, processFunctions, nil
|
||
|
|
||
|
}
|
||
|
|
||
|
func EditGenerationOpenAIRequest(modelName string, input *schema.OpenAIRequest, cl *services.ConfigLoader, ml *model.ModelLoader, startupOptions *schema.StartupOptions) (*schema.OpenAIResponse, error) {
|
||
|
id := uuid.New().String()
|
||
|
created := int(time.Now().Unix())
|
||
|
|
||
|
binding := func(config *schema.Config) *string {
|
||
|
return &config.TemplateConfig.Edit
|
||
|
}
|
||
|
|
||
|
config, err := prepareGenerationOpenAIRequest(binding, modelName, input, cl, ml, startupOptions)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
var result []schema.Choice
|
||
|
totalTokenUsage := TokenUsage{}
|
||
|
|
||
|
for _, i := range config.InputStrings {
|
||
|
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||
|
templatedInput, err := ml.EvaluateTemplateForPrompt(model.EditPromptTemplate, config.TemplateConfig.Edit, model.PromptTemplateData{
|
||
|
Input: i,
|
||
|
Instruction: input.Instruction,
|
||
|
SystemPrompt: config.SystemPrompt,
|
||
|
})
|
||
|
if err == nil {
|
||
|
i = templatedInput
|
||
|
log.Debug().Msgf("Template found, input modified to: %s", i)
|
||
|
}
|
||
|
|
||
|
r, tokenUsage, err := ComputeChoices(input, i, config, startupOptions, ml, func(s string, c *[]schema.Choice) {
|
||
|
*c = append(*c, schema.Choice{Text: s})
|
||
|
}, nil)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
totalTokenUsage.Prompt += tokenUsage.Prompt
|
||
|
totalTokenUsage.Completion += tokenUsage.Completion
|
||
|
|
||
|
result = append(result, r...)
|
||
|
}
|
||
|
|
||
|
return &schema.OpenAIResponse{
|
||
|
ID: id,
|
||
|
Created: created,
|
||
|
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||
|
Choices: result,
|
||
|
Object: "edit",
|
||
|
Usage: schema.OpenAIUsage{
|
||
|
PromptTokens: totalTokenUsage.Prompt,
|
||
|
CompletionTokens: totalTokenUsage.Completion,
|
||
|
TotalTokens: totalTokenUsage.Prompt + totalTokenUsage.Completion,
|
||
|
},
|
||
|
}, nil
|
||
|
}
|
||
|
|
||
|
func ChatGenerationOpenAIRequest(modelName string, input *schema.OpenAIRequest, cl *services.ConfigLoader, ml *model.ModelLoader, startupOptions *schema.StartupOptions) (*schema.OpenAIResponse, error) {
|
||
|
|
||
|
// DEFS
|
||
|
id := uuid.New().String()
|
||
|
created := int(time.Now().Unix())
|
||
|
|
||
|
// Prepare
|
||
|
config, predInput, processFunctions, err := prepareChatGenerationOpenAIRequest(modelName, input, cl, ml, startupOptions)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
result, tokenUsage, err := ComputeChoices(input, predInput, config, startupOptions, ml, func(s string, c *[]schema.Choice) {
|
||
|
if processFunctions {
|
||
|
// As we have to change the result before processing, we can't stream the answer (yet?)
|
||
|
ss := map[string]interface{}{}
|
||
|
// This prevent newlines to break JSON parsing for clients
|
||
|
s = utils.EscapeNewLines(s)
|
||
|
json.Unmarshal([]byte(s), &ss)
|
||
|
log.Debug().Msgf("Function return: %s %+v", s, ss)
|
||
|
|
||
|
// The grammar defines the function name as "function", while OpenAI returns "name"
|
||
|
func_name := ss["function"]
|
||
|
// Similarly, while here arguments is a map[string]interface{}, OpenAI actually want a stringified object
|
||
|
args := ss["arguments"] // arguments needs to be a string, but we return an object from the grammar result (TODO: fix)
|
||
|
d, _ := json.Marshal(args)
|
||
|
|
||
|
ss["arguments"] = string(d)
|
||
|
ss["name"] = func_name
|
||
|
|
||
|
// if do nothing, reply with a message
|
||
|
if func_name == config.FunctionsConfig.NoActionFunctionName {
|
||
|
log.Debug().Msgf("nothing to do, computing a reply")
|
||
|
|
||
|
// If there is a message that the LLM already sends as part of the JSON reply, use it
|
||
|
arguments := map[string]interface{}{}
|
||
|
json.Unmarshal([]byte(d), &arguments)
|
||
|
m, exists := arguments["message"]
|
||
|
if exists {
|
||
|
switch message := m.(type) {
|
||
|
case string:
|
||
|
if message != "" {
|
||
|
log.Debug().Msgf("Reply received from LLM: %s", message)
|
||
|
message = Finetune(*config, predInput, message)
|
||
|
log.Debug().Msgf("Reply received from LLM(finetuned): %s", message)
|
||
|
|
||
|
*c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &message}})
|
||
|
return
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
log.Debug().Msgf("No action received from LLM, without a message, computing a reply")
|
||
|
// Otherwise ask the LLM to understand the JSON output and the context, and return a message
|
||
|
// Note: This costs (in term of CPU) another computation
|
||
|
config.Grammar = ""
|
||
|
images := []string{}
|
||
|
for _, m := range input.Messages {
|
||
|
images = append(images, m.StringImages...)
|
||
|
}
|
||
|
predFunc, err := ModelInference(input.Context, predInput, images, ml, *config, startupOptions, nil)
|
||
|
if err != nil {
|
||
|
log.Error().Msgf("inference error: %s", err.Error())
|
||
|
return
|
||
|
}
|
||
|
|
||
|
prediction, err := predFunc()
|
||
|
if err != nil {
|
||
|
log.Error().Msgf("inference error: %s", err.Error())
|
||
|
return
|
||
|
}
|
||
|
|
||
|
fineTunedResponse := Finetune(*config, predInput, prediction.Response)
|
||
|
*c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &fineTunedResponse}})
|
||
|
} else {
|
||
|
// otherwise reply with the function call
|
||
|
*c = append(*c, schema.Choice{
|
||
|
FinishReason: "function_call",
|
||
|
Message: &schema.Message{Role: "assistant", FunctionCall: ss},
|
||
|
})
|
||
|
}
|
||
|
|
||
|
return
|
||
|
}
|
||
|
*c = append(*c, schema.Choice{FinishReason: "stop", Index: 0, Message: &schema.Message{Role: "assistant", Content: &s}})
|
||
|
}, nil)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
return &schema.OpenAIResponse{
|
||
|
ID: id,
|
||
|
Created: created,
|
||
|
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||
|
Choices: result,
|
||
|
Object: "chat.completion",
|
||
|
Usage: schema.OpenAIUsage{
|
||
|
PromptTokens: tokenUsage.Prompt,
|
||
|
CompletionTokens: tokenUsage.Completion,
|
||
|
TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
|
||
|
},
|
||
|
}, nil
|
||
|
|
||
|
}
|
||
|
|
||
|
func CompletionGenerationOpenAIRequest(modelName string, input *schema.OpenAIRequest, cl *services.ConfigLoader, ml *model.ModelLoader, startupOptions *schema.StartupOptions) (*schema.OpenAIResponse, error) {
|
||
|
// Prepare
|
||
|
id := uuid.New().String()
|
||
|
created := int(time.Now().Unix())
|
||
|
|
||
|
binding := func(config *schema.Config) *string {
|
||
|
return &config.TemplateConfig.Completion
|
||
|
}
|
||
|
|
||
|
config, err := prepareGenerationOpenAIRequest(binding, modelName, input, cl, ml, startupOptions)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
var result []schema.Choice
|
||
|
|
||
|
totalTokenUsage := TokenUsage{}
|
||
|
|
||
|
for k, i := range config.PromptStrings {
|
||
|
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||
|
templatedInput, err := ml.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, config.TemplateConfig.Completion, model.PromptTemplateData{
|
||
|
SystemPrompt: config.SystemPrompt,
|
||
|
Input: i,
|
||
|
})
|
||
|
if err == nil {
|
||
|
i = templatedInput
|
||
|
log.Debug().Msgf("Template found, input modified to: %s", i)
|
||
|
}
|
||
|
|
||
|
r, tokenUsage, err := ComputeChoices(
|
||
|
input, i, config, startupOptions, ml, func(s string, c *[]schema.Choice) {
|
||
|
*c = append(*c, schema.Choice{Text: s, FinishReason: "stop", Index: k})
|
||
|
}, nil)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
totalTokenUsage.Prompt += tokenUsage.Prompt
|
||
|
totalTokenUsage.Completion += tokenUsage.Completion
|
||
|
|
||
|
result = append(result, r...)
|
||
|
}
|
||
|
|
||
|
return &schema.OpenAIResponse{
|
||
|
ID: id,
|
||
|
Created: created,
|
||
|
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||
|
Choices: result,
|
||
|
Object: "text_completion",
|
||
|
Usage: schema.OpenAIUsage{
|
||
|
PromptTokens: totalTokenUsage.Prompt,
|
||
|
CompletionTokens: totalTokenUsage.Completion,
|
||
|
TotalTokens: totalTokenUsage.Prompt + totalTokenUsage.Completion,
|
||
|
},
|
||
|
}, nil
|
||
|
}
|
||
|
|
||
|
func StreamingChatGenerationOpenAIRequest(modelName string, input *schema.OpenAIRequest, cl *services.ConfigLoader, ml *model.ModelLoader, startupOptions *schema.StartupOptions) (chan schema.OpenAIResponse, error) {
|
||
|
|
||
|
// DEFS
|
||
|
emptyMessage := ""
|
||
|
id := uuid.New().String()
|
||
|
created := int(time.Now().Unix())
|
||
|
|
||
|
// Prepare
|
||
|
config, predInput, processFunctions, err := prepareChatGenerationOpenAIRequest(modelName, input, cl, ml, startupOptions)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
if processFunctions {
|
||
|
// TODO: unused variable means I did something wrong. investigate once stable
|
||
|
log.Debug().Msgf("StreamingChatGenerationOpenAIRequest with processFunctions=true for %s?", config.Name)
|
||
|
}
|
||
|
|
||
|
processor := func(s string, req *schema.OpenAIRequest, config *schema.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
|
||
|
initialMessage := schema.OpenAIResponse{
|
||
|
ID: id,
|
||
|
Created: created,
|
||
|
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||
|
Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: &emptyMessage}}},
|
||
|
Object: "chat.completion.chunk",
|
||
|
}
|
||
|
responses <- initialMessage
|
||
|
|
||
|
ComputeChoices(req, s, config, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage TokenUsage) bool {
|
||
|
resp := schema.OpenAIResponse{
|
||
|
ID: id,
|
||
|
Created: created,
|
||
|
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||
|
Choices: []schema.Choice{{Delta: &schema.Message{Content: &s}, Index: 0}},
|
||
|
Object: "chat.completion.chunk",
|
||
|
Usage: schema.OpenAIUsage{
|
||
|
PromptTokens: usage.Prompt,
|
||
|
CompletionTokens: usage.Completion,
|
||
|
TotalTokens: usage.Prompt + usage.Completion,
|
||
|
},
|
||
|
}
|
||
|
|
||
|
responses <- resp
|
||
|
return true
|
||
|
})
|
||
|
close(responses)
|
||
|
}
|
||
|
log.Trace().Msg("StreamingChatGenerationOpenAIRequest :: About to create response channel")
|
||
|
|
||
|
responses := make(chan schema.OpenAIResponse)
|
||
|
|
||
|
log.Trace().Msg("StreamingChatGenerationOpenAIRequest :: About to start processor goroutine")
|
||
|
|
||
|
go processor(predInput, input, config, ml, responses)
|
||
|
|
||
|
log.Trace().Msg("StreamingChatGenerationOpenAIRequest :: DONE! successfully returning to caller!")
|
||
|
|
||
|
return responses, nil
|
||
|
|
||
|
}
|
||
|
|
||
|
func StreamingCompletionGenerationOpenAIRequest(modelName string, input *schema.OpenAIRequest, cl *services.ConfigLoader, ml *model.ModelLoader, startupOptions *schema.StartupOptions) (chan schema.OpenAIResponse, error) {
|
||
|
// DEFS
|
||
|
id := uuid.New().String()
|
||
|
created := int(time.Now().Unix())
|
||
|
|
||
|
binding := func(config *schema.Config) *string {
|
||
|
return &config.TemplateConfig.Completion
|
||
|
}
|
||
|
|
||
|
// Prepare
|
||
|
|
||
|
config, err := prepareGenerationOpenAIRequest(binding, modelName, input, cl, ml, startupOptions)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
processor := func(s string, req *schema.OpenAIRequest, config *schema.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
|
||
|
ComputeChoices(req, s, config, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage TokenUsage) bool {
|
||
|
resp := schema.OpenAIResponse{
|
||
|
ID: id,
|
||
|
Created: created,
|
||
|
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||
|
Choices: []schema.Choice{
|
||
|
{
|
||
|
Index: 0,
|
||
|
Text: s,
|
||
|
},
|
||
|
},
|
||
|
Object: "text_completion",
|
||
|
Usage: schema.OpenAIUsage{
|
||
|
PromptTokens: usage.Prompt,
|
||
|
CompletionTokens: usage.Completion,
|
||
|
TotalTokens: usage.Prompt + usage.Completion,
|
||
|
},
|
||
|
}
|
||
|
log.Debug().Msgf("Sending goroutine: %s", s)
|
||
|
|
||
|
responses <- resp
|
||
|
return true
|
||
|
})
|
||
|
close(responses)
|
||
|
}
|
||
|
|
||
|
if len(config.PromptStrings) > 1 {
|
||
|
return nil, errors.New("cannot handle more than 1 `PromptStrings` when Streaming")
|
||
|
|
||
|
}
|
||
|
|
||
|
predInput := config.PromptStrings[0]
|
||
|
|
||
|
//A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||
|
templatedInput, err := ml.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, config.TemplateConfig.Completion, model.PromptTemplateData{
|
||
|
Input: predInput,
|
||
|
})
|
||
|
if err == nil {
|
||
|
predInput = templatedInput
|
||
|
log.Debug().Msgf("Template found, input modified to: %s", predInput)
|
||
|
}
|
||
|
|
||
|
log.Trace().Msg("StreamingCompletionGenerationOpenAIRequest :: About to create response channel")
|
||
|
|
||
|
responses := make(chan schema.OpenAIResponse)
|
||
|
|
||
|
log.Trace().Msg("StreamingCompletionGenerationOpenAIRequest :: About to start processor goroutine")
|
||
|
|
||
|
go processor(predInput, input, config, ml, responses)
|
||
|
|
||
|
log.Trace().Msg("StreamingCompletionGenerationOpenAIRequest :: DONE! successfully returning to caller!")
|
||
|
|
||
|
return responses, nil
|
||
|
}
|