LocalAI/core/backend/llm.go

package backend

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"sync"
	"time"
	"unicode/utf8"

	"github.com/go-skynet/LocalAI/core/services"
	"github.com/go-skynet/LocalAI/pkg/gallery"
	"github.com/go-skynet/LocalAI/pkg/grammar"
	"github.com/go-skynet/LocalAI/pkg/grpc"
	"github.com/go-skynet/LocalAI/pkg/model"
	"github.com/go-skynet/LocalAI/pkg/schema"
	"github.com/go-skynet/LocalAI/pkg/utils"
	"github.com/google/uuid"
	"github.com/rs/zerolog/log"
)

////////// TYPES //////////////

type (
	// LLMResponse is the outcome of one model prediction: the generated text
	// together with the token accounting collected while producing it.
	LLMResponse struct {
		Response string // should this be []byte?
		Usage    TokenUsage
	}

	// TokenUsage holds the prompt and completion counters reported for a
	// prediction.
	//
	// TODO: Test removing this and using the variant in pkg/schema someday?
	TokenUsage struct {
		Prompt     int
		Completion int
	}
)

// TemplateConfigBindingFn picks, out of a model configuration, the template
// field a request should use. It returns a pointer to that field so callers can
// both read the current value and overwrite it in place.
type TemplateConfigBindingFn func(*schema.Config) *string

// type LLMStreamProcessor func(s string, req *schema.OpenAIRequest, config *schema.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse)

/////// CONSTS ///////////

// Fallback name and description of the "no action" function, applied when the
// model configuration leaves the corresponding fields empty.
const (
	DEFAULT_NO_ACTION_NAME        = "answer"
	DEFAULT_NO_ACTION_DESCRIPTION = "use this action to answer without performing any action"
)

////// INFERENCE /////////

// ModelInference loads the model described by c and returns a closure that
// runs one prediction for prompt s (plus the optional images) every time it is
// called.
//
// The model is loaded eagerly, before the closure is returned: through
// loader.BackendLoader when c.Backend is set, through loader.GreedyLoader
// otherwise. When o.AutoloadGalleries is enabled and os.Stat reports that
// c.Model does not exist, the model is first installed from o.Galleries.
//
// The returned closure streams (PredictStream) when tokenCallback is non-nil
// or when the per-model "usage" feature flag is enabled; otherwise it issues a
// single blocking Predict call. While streaming, tokenCallback is invoked once
// per decoded rune; its boolean result is currently not acted upon. With the
// "usage" flag on, Usage.Prompt is taken from TokenizeString and
// Usage.Completion is incremented on every callback invocation.
//
// An error is returned if the gallery installation or the model load fails;
// prediction errors are returned by the closure.
func ModelInference(ctx context.Context, s string, images []string, loader *model.ModelLoader, c schema.Config, o *schema.StartupOptions, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
	modelFile := c.Model

	grpcOpts := gRPCModelOpts(c)

	opts := modelOpts(c, o, []model.Option{
		model.WithLoadGRPCLoadModelOpts(grpcOpts),
		model.WithThreads(uint32(c.Threads)), // some models uses this to allocate threads during startup
		model.WithAssetDir(o.AssetsDestination),
		model.WithModel(modelFile),
		model.WithContext(o.Context),
		model.WithExternalBackends(o.ExternalGRPCBackends, false),
	})

	if c.Backend != "" {
		opts = append(opts, model.WithBackendString(c.Backend))
	}

	// Check if the modelFile exists, if it doesn't try to load it from the gallery
	if o.AutoloadGalleries { // experimental
		if _, err := os.Stat(modelFile); os.IsNotExist(err) {
			utils.ResetDownloadTimers()
			// if we failed to load the model, we try to download it
			err := gallery.InstallModelFromGalleryByName(o.Galleries, modelFile, loader.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction)
			if err != nil {
				return nil, err
			}
		}
	}

	var inferenceModel *grpc.Client
	var err error
	if c.Backend == "" {
		inferenceModel, err = loader.GreedyLoader(opts...)
	} else {
		inferenceModel, err = loader.BackendLoader(opts...)
	}
	if err != nil {
		return nil, err
	}

	// in GRPC, the backend is supposed to answer to 1 single token if stream is not supported
	fn := func() (LLMResponse, error) {
		opts := gRPCPredictOpts(c, loader.ModelPath)
		opts.Prompt = s
		opts.Images = images

		tokenUsage := TokenUsage{}

		// Work on a per-call copy of the callback. Reassigning the captured
		// tokenCallback would stack one more counting wrapper on every call of
		// fn, and the older wrappers would hand the usage of previous calls to
		// the user callback.
		callback := tokenCallback

		// check the per-model feature flag for usage, since tokenCallback may have a cost.
		// Defaults to off as for now it is still experimental
		if c.FeatureFlag.Enabled("usage") {
			userTokenCallback := callback
			if userTokenCallback == nil {
				userTokenCallback = func(token string, usage TokenUsage) bool {
					return true
				}
			}

			promptInfo, pErr := inferenceModel.TokenizeString(ctx, opts)
			if pErr == nil && promptInfo.Length > 0 {
				tokenUsage.Prompt = int(promptInfo.Length)
			}

			callback = func(token string, usage TokenUsage) bool {
				tokenUsage.Completion++
				return userTokenCallback(token, tokenUsage)
			}
		}

		if callback == nil {
			// TODO: Is the chicken bit the only way to get here? is that acceptable?
			reply, err := inferenceModel.Predict(ctx, opts)
			if err != nil {
				return LLMResponse{}, err
			}
			return LLMResponse{
				Response: string(reply.Message),
				Usage:    tokenUsage,
			}, nil
		}

		var response strings.Builder
		var partialRune []byte
		err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) {
			partialRune = append(partialRune, chars...)

			// FullRune is false only while the buffer ends in the middle of a
			// multi-byte sequence, so we wait for more bytes exactly in that
			// case. Comparing the decoded rune with utf8.RuneError instead
			// would also match invalid bytes and a genuine U+FFFD, which would
			// stall the stream and drop everything that follows.
			for utf8.FullRune(partialRune) {
				r, size := utf8.DecodeRune(partialRune)
				token := string(r)

				callback(token, tokenUsage)
				response.WriteString(token)

				partialRune = partialRune[size:]
			}
		})
		// Bytes still buffered here belong to a sequence that was never
		// completed and are not part of the response.
		return LLMResponse{
			Response: response.String(),
			Usage:    tokenUsage,
		}, err
	}

	return fn, nil
}

// cutstrings caches the compiled form of every cut-string pattern seen so far,
// keyed by the pattern text. mu serializes access to the cache.
var (
	cutstrings = make(map[string]*regexp.Regexp)
	mu         sync.Mutex
)

// compiledCutstring returns the cached regexp for pattern, compiling and
// caching it on first use. Like regexp.MustCompile it panics on a malformed
// pattern.
func compiledCutstring(pattern string) *regexp.Regexp {
	mu.Lock()
	// Deferred so that a MustCompile panic on a malformed pattern cannot leave
	// the mutex locked and block every later caller.
	defer mu.Unlock()

	if reg, ok := cutstrings[pattern]; ok {
		return reg
	}
	reg := regexp.MustCompile(pattern)
	cutstrings[pattern] = reg
	return reg
}

// Finetune post-processes a raw prediction according to config and returns the
// cleaned text. In order, it:
//
//  1. prepends input when config.Echo is set;
//  2. deletes every match of each regular expression in config.Cutstrings;
//  3. for each entry of config.TrimSpace, strips it as a prefix and then trims
//     surrounding whitespace;
//  4. for each entry of config.TrimSuffix, strips it as a suffix and then
//     trims surrounding whitespace.
//
// It panics if an entry of config.Cutstrings is not a valid regular expression.
func Finetune(config schema.Config, input, prediction string) string {
	if config.Echo {
		prediction = input + prediction
	}

	for _, c := range config.Cutstrings {
		prediction = compiledCutstring(c).ReplaceAllString(prediction, "")
	}

	for _, c := range config.TrimSpace {
		prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c))
	}

	for _, c := range config.TrimSuffix {
		prediction = strings.TrimSpace(strings.TrimSuffix(prediction, c))
	}

	return prediction
}

////// CONFIG AND REQUEST HANDLING ///////////////

// ReadConfigFromFileAndCombineWithOpenAIRequest resolves the configuration for
// modelFile and merges the parameters of the incoming request into it.
//
// The configuration is looked up in cm first. If it is unknown there and a
// "<modelFile>.yaml" file exists under startupOptions.ModelPath, that file is
// loaded into cm and the lookup is retried. If no configuration is found, a
// default one is built and seeded with the context size, thread count, F16 and
// debug settings from startupOptions.
//
// After merging the request, a zero thread count is replaced by the startup
// value (or 4 when that is zero too), and debug is forced on when it was
// requested at startup.
//
// It returns the effective configuration and the request it was given. The
// only error reported is a failure to load the YAML file; that error is
// wrapped so callers can inspect it with errors.Is / errors.As.
func ReadConfigFromFileAndCombineWithOpenAIRequest(modelFile string, input *schema.OpenAIRequest, cm *services.ConfigLoader, startupOptions *schema.StartupOptions) (*schema.Config, *schema.OpenAIRequest, error) {
	// Load a config file if present after the model name
	modelConfig := filepath.Join(startupOptions.ModelPath, modelFile+".yaml")

	cfgExisting, exists := cm.GetConfig(modelFile)
	if !exists {
		if _, err := os.Stat(modelConfig); err == nil {
			if err := cm.LoadConfig(modelConfig); err != nil {
				return nil, nil, fmt.Errorf("failed loading model config (%s) %w", modelConfig, err)
			}
			cfgExisting, exists = cm.GetConfig(modelFile)
		}
	}

	var cfg *schema.Config
	if exists {
		cfg = &cfgExisting
	} else {
		// Nothing registered and nothing usable on disk: start from defaults.
		cfg = schema.DefaultConfig(modelFile)
		cfg.ContextSize = startupOptions.ContextSize
		cfg.Threads = startupOptions.Threads
		cfg.F16 = startupOptions.F16
		cfg.Debug = startupOptions.Debug
	}

	// Set the parameters for the language model prediction
	schema.UpdateConfigFromOpenAIRequest(cfg, input)

	// Don't allow 0 as setting
	if cfg.Threads == 0 {
		if startupOptions.Threads != 0 {
			cfg.Threads = startupOptions.Threads
		} else {
			cfg.Threads = 4
		}
	}

	// Enforce debug flag if passed from CLI
	if startupOptions.Debug {
		cfg.Debug = true
	}

	return cfg, input, nil
}

// ComputeChoices runs the model req.N times (once when N is zero) on predInput
// and lets cb turn every post-processed prediction into response choices.
//
// The images attached to all request messages are forwarded to the model.
// tokenCallback is handed to ModelInference unchanged. Each prediction is
// passed through Finetune before cb receives it together with a pointer to the
// slice of choices being built.
//
// It returns the accumulated choices and the summed token usage. On failure it
// returns the choices gathered so far, a zero TokenUsage and the error.
func ComputeChoices(
	req *schema.OpenAIRequest,
	predInput string,
	config *schema.Config,
	o *schema.StartupOptions,
	loader *model.ModelLoader,
	cb func(string, *[]schema.Choice),
	tokenCallback func(string, TokenUsage) bool) ([]schema.Choice, TokenUsage, error) {
	choices := []schema.Choice{}

	// number of completions to return; an unset value means a single one
	count := req.N
	if count == 0 {
		count = 1
	}

	images := []string{}
	for _, msg := range req.Messages {
		images = append(images, msg.StringImages...)
	}

	// get the model function to call for the result
	predict, err := ModelInference(req.Context, predInput, images, loader, *config, o, tokenCallback)
	if err != nil {
		return choices, TokenUsage{}, err
	}

	var total TokenUsage
	for done := 0; done < count; done++ {
		prediction, predErr := predict()
		if predErr != nil {
			return choices, TokenUsage{}, predErr
		}

		total.Prompt += prediction.Usage.Prompt
		total.Completion += prediction.Usage.Completion

		cb(Finetune(*config, predInput, prediction.Response), &choices)
	}

	return choices, total, nil
}

// prepareGenerationOpenAIRequest resolves the model configuration for a
// non-chat generation request and makes sure a prompt template is selected.
//
// bindingFn points at the template field relevant to the calling endpoint.
// When that field is empty and a "<model>.tmpl" file exists in the model path,
// the field is set to the model name. If the field is still empty afterwards
// an error is returned. A "json_object" response format sets the request
// grammar to grammar.JSONBNF.
//
// TODO: No functions???? Commonize with prepareChatGenerationOpenAIRequest below?
func prepareGenerationOpenAIRequest(bindingFn TemplateConfigBindingFn, modelName string, input *schema.OpenAIRequest, cl *services.ConfigLoader, ml *model.ModelLoader, startupOptions *schema.StartupOptions) (*schema.Config, error) {
	config, input, err := ReadConfigFromFileAndCombineWithOpenAIRequest(modelName, input, cl, startupOptions)
	if err != nil {
		return nil, fmt.Errorf("failed reading parameters from request:%w", err)
	}

	if input.ResponseFormat.Type == "json_object" {
		input.Grammar = grammar.JSONBNF
	}

	log.Debug().Msgf("Parameter Config: %+v", config)

	tmpl := bindingFn(config)

	// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
	if *tmpl == "" && ml.ExistsInModelPath(config.Model+".tmpl") {
		*tmpl = config.Model
	}

	if *tmpl != "" {
		return config, nil
	}
	return nil, errors.New("failed to find templateConfig")
}

////////// SPECIFIC REQUESTS //////////////
// TODO: For round one of the refactor, give each of the three primary text endpoints their own function?
// SEMITODO: During a merge, edit/completion were semi-combined - but remain nominally split
// Can clean up into a common form later if possible; easier if they are all here for now
// If they remain different, extract each of these named segments to a separate file

// prepareChatGenerationOpenAIRequest turns a chat request into everything
// needed to run inference: the effective model configuration, the final prompt
// text, and a flag telling whether function calling is in effect.
//
// Steps, in order:
//   - resolve the configuration and fill in the default "no action" function
//     name/description when the configuration leaves them empty;
//   - set the request grammar to grammar.JSONBNF for a "json_object" response
//     format;
//   - when the request carries functions and config.ShouldUseFunctions()
//     allows it, build config.Grammar from those functions (plus the "no
//     action" function unless disabled); otherwise use the request's
//     JSONFunctionGrammarObject, if any;
//   - render every message, either through the per-message chat template or
//     by prefixing the configured role string, and join them with newlines;
//   - apply a prompt template when one is selected. A template failure is only
//     logged at debug level and the untemplated prompt is used.
//
// The only error returned comes from reading the configuration.
func prepareChatGenerationOpenAIRequest(modelName string, input *schema.OpenAIRequest, cl *services.ConfigLoader, ml *model.ModelLoader, startupOptions *schema.StartupOptions) (*schema.Config, string, bool, error) {

	// IMPORTANT DEFS
	funcs := grammar.Functions{}

	// The Basic Beginning

	config, input, err := ReadConfigFromFileAndCombineWithOpenAIRequest(modelName, input, cl, startupOptions)
	if err != nil {
		return nil, "", false, fmt.Errorf("failed reading parameters from request:%w", err)
	}
	log.Debug().Msgf("Configuration read: %+v", config)

	// Special Input/Config Handling

	// Allow the user to set custom actions via config file
	// to be "embedded" in each model - but if they are missing, use defaults.
	if config.FunctionsConfig.NoActionFunctionName == "" {
		config.FunctionsConfig.NoActionFunctionName = DEFAULT_NO_ACTION_NAME
	}
	if config.FunctionsConfig.NoActionDescriptionName == "" {
		config.FunctionsConfig.NoActionDescriptionName = DEFAULT_NO_ACTION_DESCRIPTION
	}

	if input.ResponseFormat.Type == "json_object" {
		input.Grammar = grammar.JSONBNF
	}

	// Functions are only processed when the request supplies some AND the
	// configuration allows their use.
	processFunctions := len(input.Functions) > 0 && config.ShouldUseFunctions()

	if processFunctions {
		log.Debug().Msgf("Response needs to process functions")

		// Synthetic function the model can pick to answer with plain text
		// instead of calling one of the request's functions.
		noActionGrammar := grammar.Function{
			Name:        config.FunctionsConfig.NoActionFunctionName,
			Description: config.FunctionsConfig.NoActionDescriptionName,
			Parameters: map[string]interface{}{
				"properties": map[string]interface{}{
					"message": map[string]interface{}{
						"type":        "string",
						"description": "The message to reply the user with",
					}},
			},
		}

		// Append the no action function
		funcs = append(funcs, input.Functions...)
		if !config.FunctionsConfig.DisableNoAction {
			funcs = append(funcs, noActionGrammar)
		}

		// Force picking one of the functions by the request
		if config.FunctionToCall() != "" {
			funcs = funcs.Select(config.FunctionToCall())
		}

		// Update input grammar
		jsStruct := funcs.ToJSONStructure()
		config.Grammar = jsStruct.Grammar("")
	} else if input.JSONFunctionGrammarObject != nil {
		config.Grammar = input.JSONFunctionGrammarObject.Grammar("")
	}

	log.Debug().Msgf("Parameters: %+v", config)

	var predInput string

	// Becomes true once a "system" message from the request has been rendered
	// by the fallback (non chat-message-template) path below; it is forwarded
	// to the prompt template as SuppressSystemPrompt.
	suppressConfigSystemPrompt := false
	mess := []string{}
	for messageIndex, i := range input.Messages {
		var content string
		role := i.Role

		// if function call, we might want to customize the role so we can display better that the "assistant called a json action"
		// if an "assistant_function_call" role is defined, we use it, otherwise we use the role that is passed by in the request
		if i.FunctionCall != nil && i.Role == "assistant" {
			roleFn := "assistant_function_call"
			r := config.Roles[roleFn]
			if r != "" {
				role = roleFn
			}
		}
		// r is the configured text for the role; it is empty when the role has
		// no entry in config.Roles.
		r := config.Roles[role]
		contentExists := i.Content != nil && i.StringContent != ""
		// First attempt to populate content via a chat message specific template
		if config.TemplateConfig.ChatMessage != "" {
			chatMessageData := model.ChatMessageTemplateData{
				SystemPrompt: config.SystemPrompt,
				Role:         r,
				RoleName:     role,
				Content:      i.StringContent,
				MessageIndex: messageIndex,
			}
			templatedChatMessage, err := ml.EvaluateTemplateForChatMessage(config.TemplateConfig.ChatMessage, chatMessageData)
			if err != nil {
				// content stays empty, so the fallback rendering below is used.
				log.Error().Msgf("error processing message %+v using template \"%s\": %v. Skipping!", chatMessageData, config.TemplateConfig.ChatMessage, err)
			} else {
				if templatedChatMessage == "" {
					log.Warn().Msgf("template \"%s\" produced blank output for %+v. Skipping!", config.TemplateConfig.ChatMessage, chatMessageData)
					continue // TODO: This continue is here intentionally to skip over the line `mess = append(mess, content)` below, and to prevent the sprintf
				}
				log.Debug().Msgf("templated message for chat: %s", templatedChatMessage)
				content = templatedChatMessage
			}
		}
		// If this model doesn't have such a template, or if that template fails to return a value, template at the message level.
		if content == "" {
			if r != "" {
				// Role text configured: prefix it to the content and, separated
				// by a space, to the JSON-encoded function call.
				if contentExists {
					content = fmt.Sprint(r, i.StringContent)
				}
				if i.FunctionCall != nil {
					j, err := json.Marshal(i.FunctionCall)
					if err == nil {
						if contentExists {
							content += "\n" + fmt.Sprint(r, " ", string(j))
						} else {
							content = fmt.Sprint(r, " ", string(j))
						}
					}
				}
			} else {
				// No role text: emit the raw content and/or the JSON-encoded
				// function call without any prefix.
				if contentExists {
					content = fmt.Sprint(i.StringContent)
				}
				if i.FunctionCall != nil {
					j, err := json.Marshal(i.FunctionCall)
					if err == nil {
						if contentExists {
							content += "\n" + string(j)
						} else {
							content = string(j)
						}
					}
				}
			}
			// Special Handling: System. We care if it was printed at all, not the r branch, so check separately
			if contentExists && role == "system" {
				suppressConfigSystemPrompt = true
			}
		}

		mess = append(mess, content)
	}

	predInput = strings.Join(mess, "\n")
	log.Debug().Msgf("Prompt (before templating): %s", predInput)

	// Template selection: each later assignment overrides the earlier one, so
	// a configured Chat/Functions template wins over the "<model>.tmpl" file.
	templateFile := ""

	// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
	if ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) {
		templateFile = config.Model
	}

	if config.TemplateConfig.Chat != "" && !processFunctions {
		templateFile = config.TemplateConfig.Chat
	}

	if config.TemplateConfig.Functions != "" && processFunctions {
		templateFile = config.TemplateConfig.Functions
	}

	if templateFile != "" {
		templatedInput, err := ml.EvaluateTemplateForPrompt(model.ChatPromptTemplate, templateFile, model.PromptTemplateData{
			SystemPrompt:         config.SystemPrompt,
			SuppressSystemPrompt: suppressConfigSystemPrompt,
			Input:                predInput,
			Functions:            funcs,
		})
		if err == nil {
			predInput = templatedInput
			log.Debug().Msgf("Template found, input modified to: %s", predInput)
		} else {
			// Best effort: keep the untemplated prompt.
			log.Debug().Msgf("Template failed loading: %s", err.Error())
		}
	}

	log.Debug().Msgf("Prompt (after templating): %s", predInput)
	if processFunctions {
		log.Debug().Msgf("Grammar: %+v", config.Grammar)
	}

	return config, predInput, processFunctions, nil

}

// EditGenerationOpenAIRequest serves an "edit" request: every entry of
// config.InputStrings is rendered through the edit template together with the
// request instruction, run through the model, and returned as text choices.
//
// A template evaluation failure is not fatal; the raw input string is used
// instead. Token usage is summed over all inputs. An error is returned when the
// request cannot be prepared or when inference fails.
func EditGenerationOpenAIRequest(modelName string, input *schema.OpenAIRequest, cl *services.ConfigLoader, ml *model.ModelLoader, startupOptions *schema.StartupOptions) (*schema.OpenAIResponse, error) {
	id := uuid.New().String()
	created := int(time.Now().Unix())

	config, err := prepareGenerationOpenAIRequest(func(cfg *schema.Config) *string {
		return &cfg.TemplateConfig.Edit
	}, modelName, input, cl, ml, startupOptions)
	if err != nil {
		return nil, err
	}

	var choices []schema.Choice
	var usage TokenUsage

	// Every prediction becomes one plain text choice.
	collect := func(text string, out *[]schema.Choice) {
		*out = append(*out, schema.Choice{Text: text})
	}

	for _, prompt := range config.InputStrings {
		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
		templated, tmplErr := ml.EvaluateTemplateForPrompt(model.EditPromptTemplate, config.TemplateConfig.Edit, model.PromptTemplateData{
			Input:        prompt,
			Instruction:  input.Instruction,
			SystemPrompt: config.SystemPrompt,
		})
		if tmplErr == nil {
			prompt = templated
			log.Debug().Msgf("Template found, input modified to: %s", prompt)
		}

		partial, partialUsage, err := ComputeChoices(input, prompt, config, startupOptions, ml, collect, nil)
		if err != nil {
			return nil, err
		}

		usage.Prompt += partialUsage.Prompt
		usage.Completion += partialUsage.Completion

		choices = append(choices, partial...)
	}

	response := &schema.OpenAIResponse{
		ID:      id,
		Created: created,
		Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
		Choices: choices,
		Object:  "edit",
		Usage: schema.OpenAIUsage{
			PromptTokens:     usage.Prompt,
			CompletionTokens: usage.Completion,
			TotalTokens:      usage.Prompt + usage.Completion,
		},
	}
	return response, nil
}

// ChatGenerationOpenAIRequest serves a non-streaming chat completion.
//
// Without function processing every prediction becomes one assistant message
// with finish reason "stop". With function processing the prediction is parsed
// as the JSON object produced by the function grammar:
//
//   - if it names the configured "no action" function and carries a non-empty
//     string "message" argument, that message (after Finetune) is the reply;
//   - if it names the "no action" function without such a message, a second
//     inference is run with the grammar cleared and its output is the reply.
//     When that second inference fails the error is logged and no choice is
//     added;
//   - otherwise a "function_call" choice is returned, with the function name
//     copied to "name" and the arguments re-encoded as a JSON string.
//
// Predictions that are not valid JSON are logged and then handled as an empty
// object. An error is returned when the request cannot be prepared or when the
// main inference fails.
func ChatGenerationOpenAIRequest(modelName string, input *schema.OpenAIRequest, cl *services.ConfigLoader, ml *model.ModelLoader, startupOptions *schema.StartupOptions) (*schema.OpenAIResponse, error) {

	// DEFS
	id := uuid.New().String()
	created := int(time.Now().Unix())

	// Prepare
	config, predInput, processFunctions, err := prepareChatGenerationOpenAIRequest(modelName, input, cl, ml, startupOptions)
	if err != nil {
		return nil, err
	}

	result, tokenUsage, err := ComputeChoices(input, predInput, config, startupOptions, ml, func(s string, c *[]schema.Choice) {
		if !processFunctions {
			*c = append(*c, schema.Choice{FinishReason: "stop", Index: 0, Message: &schema.Message{Role: "assistant", Content: &s}})
			return
		}

		// As we have to change the result before processing, we can't stream the answer (yet?)
		ss := map[string]interface{}{}
		// This prevent newlines to break JSON parsing for clients
		s = utils.EscapeNewLines(s)
		if err := json.Unmarshal([]byte(s), &ss); err != nil {
			log.Error().Msgf("unable to parse function call JSON: %s", err.Error())
		}
		if ss == nil {
			// A JSON `null` resets the map to nil; writing to it below would panic.
			ss = map[string]interface{}{}
		}
		log.Debug().Msgf("Function return: %s %+v", s, ss)

		// The grammar defines the function name as "function", while OpenAI returns "name"
		funcName := ss["function"]
		// Similarly, while here arguments is a map[string]interface{}, OpenAI actually want a stringified object
		args := ss["arguments"] // arguments needs to be a string, but we return an object from the grammar result (TODO: fix)
		d, err := json.Marshal(args)
		if err != nil {
			log.Error().Msgf("unable to encode function call arguments: %s", err.Error())
		}

		ss["arguments"] = string(d)
		ss["name"] = funcName

		if funcName != config.FunctionsConfig.NoActionFunctionName {
			// a real function was selected: reply with the function call
			*c = append(*c, schema.Choice{
				FinishReason: "function_call",
				Message:      &schema.Message{Role: "assistant", FunctionCall: ss},
			})
			return
		}

		// if do nothing, reply with a message
		log.Debug().Msgf("nothing to do, computing a reply")

		// If there is a message that the LLM already sends as part of the JSON reply, use it.
		// Indexing a nil map is safe, so a non-object "arguments" simply yields no message.
		arguments, _ := args.(map[string]interface{})
		if message, ok := arguments["message"].(string); ok && message != "" {
			log.Debug().Msgf("Reply received from LLM: %s", message)
			message = Finetune(*config, predInput, message)
			log.Debug().Msgf("Reply received from LLM(finetuned): %s", message)

			*c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &message}})
			return
		}

		log.Debug().Msgf("No action received from LLM, without a message, computing a reply")
		// Otherwise ask the LLM to understand the JSON output and the context, and return a message
		// Note: This costs (in term of CPU) another computation
		config.Grammar = ""
		images := []string{}
		for _, m := range input.Messages {
			images = append(images, m.StringImages...)
		}
		predFunc, err := ModelInference(input.Context, predInput, images, ml, *config, startupOptions, nil)
		if err != nil {
			log.Error().Msgf("inference error: %s", err.Error())
			return
		}

		prediction, err := predFunc()
		if err != nil {
			log.Error().Msgf("inference error: %s", err.Error())
			return
		}

		fineTunedResponse := Finetune(*config, predInput, prediction.Response)
		*c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &fineTunedResponse}})
	}, nil)
	if err != nil {
		return nil, err
	}

	return &schema.OpenAIResponse{
		ID:      id,
		Created: created,
		Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
		Choices: result,
		Object:  "chat.completion",
		Usage: schema.OpenAIUsage{
			PromptTokens:     tokenUsage.Prompt,
			CompletionTokens: tokenUsage.Completion,
			TotalTokens:      tokenUsage.Prompt + tokenUsage.Completion,
		},
	}, nil

}

// CompletionGenerationOpenAIRequest serves a non-streaming text completion.
// Each entry of config.PromptStrings is rendered through the completion
// template, run through the model, and returned as "stop" choices whose Index
// is the position of the prompt they answer.
//
// A template evaluation failure is not fatal; the raw prompt is used instead.
// Token usage is summed over all prompts. An error is returned when the request
// cannot be prepared or when inference fails.
func CompletionGenerationOpenAIRequest(modelName string, input *schema.OpenAIRequest, cl *services.ConfigLoader, ml *model.ModelLoader, startupOptions *schema.StartupOptions) (*schema.OpenAIResponse, error) {
	// Prepare
	id := uuid.New().String()
	created := int(time.Now().Unix())

	config, err := prepareGenerationOpenAIRequest(func(cfg *schema.Config) *string {
		return &cfg.TemplateConfig.Completion
	}, modelName, input, cl, ml, startupOptions)
	if err != nil {
		return nil, err
	}

	var choices []schema.Choice
	var usage TokenUsage

	for idx, prompt := range config.PromptStrings {
		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
		templated, tmplErr := ml.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, config.TemplateConfig.Completion, model.PromptTemplateData{
			SystemPrompt: config.SystemPrompt,
			Input:        prompt,
		})
		if tmplErr == nil {
			prompt = templated
			log.Debug().Msgf("Template found, input modified to: %s", prompt)
		}

		// Tag every choice with the index of the prompt it belongs to.
		collect := func(text string, out *[]schema.Choice) {
			*out = append(*out, schema.Choice{Text: text, FinishReason: "stop", Index: idx})
		}

		partial, partialUsage, err := ComputeChoices(input, prompt, config, startupOptions, ml, collect, nil)
		if err != nil {
			return nil, err
		}

		usage.Prompt += partialUsage.Prompt
		usage.Completion += partialUsage.Completion

		choices = append(choices, partial...)
	}

	response := &schema.OpenAIResponse{
		ID:      id,
		Created: created,
		Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
		Choices: choices,
		Object:  "text_completion",
		Usage: schema.OpenAIUsage{
			PromptTokens:     usage.Prompt,
			CompletionTokens: usage.Completion,
			TotalTokens:      usage.Prompt + usage.Completion,
		},
	}
	return response, nil
}

// StreamingChatGenerationOpenAIRequest serves a streaming chat completion.
//
// After preparing the request it starts a goroutine and immediately returns an
// unbuffered channel. The goroutine first sends a chunk carrying the assistant
// role with empty content, then one "chat.completion.chunk" per token callback
// invocation, and finally closes the channel. Callers must drain the channel
// until it is closed; every send blocks until it is received.
//
// Only preparation errors are returned. An inference error happens after the
// channel has been handed out, so it is logged and the channel is closed.
//
// NOTE(review): sends are not tied to the request context, so a consumer that
// stops reading early leaves the goroutine blocked — confirm callers always
// drain the channel.
func StreamingChatGenerationOpenAIRequest(modelName string, input *schema.OpenAIRequest, cl *services.ConfigLoader, ml *model.ModelLoader, startupOptions *schema.StartupOptions) (chan schema.OpenAIResponse, error) {

	// DEFS
	emptyMessage := ""
	id := uuid.New().String()
	created := int(time.Now().Unix())

	// Prepare
	config, predInput, processFunctions, err := prepareChatGenerationOpenAIRequest(modelName, input, cl, ml, startupOptions)
	if err != nil {
		return nil, err
	}

	if processFunctions {
		// TODO: unused variable means I did something wrong. investigate once stable
		log.Debug().Msgf("StreamingChatGenerationOpenAIRequest with processFunctions=true for %s?", config.Name)
	}

	processor := func(s string, req *schema.OpenAIRequest, config *schema.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
		// The consumer relies on the close to know the stream is over, so it
		// must happen on every return path.
		defer close(responses)

		initialMessage := schema.OpenAIResponse{
			ID:      id,
			Created: created,
			Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
			Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: &emptyMessage}}},
			Object:  "chat.completion.chunk",
		}
		responses <- initialMessage

		// The choices callback is a no-op: everything is delivered through the
		// token callback while the model generates.
		_, _, err := ComputeChoices(req, s, config, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage TokenUsage) bool {
			resp := schema.OpenAIResponse{
				ID:      id,
				Created: created,
				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
				Choices: []schema.Choice{{Delta: &schema.Message{Content: &s}, Index: 0}},
				Object:  "chat.completion.chunk",
				Usage: schema.OpenAIUsage{
					PromptTokens:     usage.Prompt,
					CompletionTokens: usage.Completion,
					TotalTokens:      usage.Prompt + usage.Completion,
				},
			}

			responses <- resp
			return true
		})
		if err != nil {
			log.Error().Msgf("inference error: %s", err.Error())
		}
	}
	log.Trace().Msg("StreamingChatGenerationOpenAIRequest :: About to create response channel")

	responses := make(chan schema.OpenAIResponse)

	log.Trace().Msg("StreamingChatGenerationOpenAIRequest :: About to start processor goroutine")

	go processor(predInput, input, config, ml, responses)

	log.Trace().Msg("StreamingChatGenerationOpenAIRequest :: DONE! successfully returning to caller!")

	return responses, nil

}

// StreamingCompletionGenerationOpenAIRequest serves a streaming text
// completion.
//
// Exactly one prompt is supported. It is rendered through the completion
// template (the raw prompt is used if templating fails), then a goroutine is
// started and an unbuffered channel is returned immediately. The goroutine
// sends one "text_completion" response per token callback invocation and
// closes the channel when inference ends. Callers must drain the channel until
// it is closed; every send blocks until it is received.
//
// An error is returned when the request cannot be prepared or when the number
// of prompts is not exactly one. An inference error happens after the channel
// has been handed out, so it is logged and the channel is closed.
func StreamingCompletionGenerationOpenAIRequest(modelName string, input *schema.OpenAIRequest, cl *services.ConfigLoader, ml *model.ModelLoader, startupOptions *schema.StartupOptions) (chan schema.OpenAIResponse, error) {
	// DEFS
	id := uuid.New().String()
	created := int(time.Now().Unix())

	binding := func(config *schema.Config) *string {
		return &config.TemplateConfig.Completion
	}

	// Prepare

	config, err := prepareGenerationOpenAIRequest(binding, modelName, input, cl, ml, startupOptions)
	if err != nil {
		return nil, err
	}

	// Validate before indexing: an empty PromptStrings would otherwise panic
	// on PromptStrings[0] below.
	if len(config.PromptStrings) == 0 {
		return nil, errors.New("at least 1 `PromptStrings` is required when Streaming")
	}
	if len(config.PromptStrings) > 1 {
		return nil, errors.New("cannot handle more than 1 `PromptStrings` when Streaming")
	}

	processor := func(s string, req *schema.OpenAIRequest, config *schema.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
		// The consumer relies on the close to know the stream is over, so it
		// must happen on every return path.
		defer close(responses)

		// The choices callback is a no-op: everything is delivered through the
		// token callback while the model generates.
		_, _, err := ComputeChoices(req, s, config, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage TokenUsage) bool {
			resp := schema.OpenAIResponse{
				ID:      id,
				Created: created,
				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
				Choices: []schema.Choice{
					{
						Index: 0,
						Text:  s,
					},
				},
				Object: "text_completion",
				Usage: schema.OpenAIUsage{
					PromptTokens:     usage.Prompt,
					CompletionTokens: usage.Completion,
					TotalTokens:      usage.Prompt + usage.Completion,
				},
			}
			log.Debug().Msgf("Sending goroutine: %s", s)

			responses <- resp
			return true
		})
		if err != nil {
			log.Error().Msgf("inference error: %s", err.Error())
		}
	}

	predInput := config.PromptStrings[0]

	// A model can have a "file.bin.tmpl" file associated with a prompt template prefix.
	// SystemPrompt is passed so the template renders the same way as in the
	// non-streaming completion path.
	templatedInput, err := ml.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, config.TemplateConfig.Completion, model.PromptTemplateData{
		SystemPrompt: config.SystemPrompt,
		Input:        predInput,
	})
	if err == nil {
		predInput = templatedInput
		log.Debug().Msgf("Template found, input modified to: %s", predInput)
	}

	log.Trace().Msg("StreamingCompletionGenerationOpenAIRequest :: About to create response channel")

	responses := make(chan schema.OpenAIResponse)

	log.Trace().Msg("StreamingCompletionGenerationOpenAIRequest :: About to start processor goroutine")

	go processor(predInput, input, config, ml, responses)

	log.Trace().Msg("StreamingCompletionGenerationOpenAIRequest :: DONE! successfully returning to caller!")

	return responses, nil
}