package cli

import (
	"context"
	"fmt"
	"strings"
	"time"

	cliContext "github.com/go-skynet/LocalAI/core/cli/context"
	"github.com/go-skynet/LocalAI/core/config"
	"github.com/go-skynet/LocalAI/core/http"
	"github.com/go-skynet/LocalAI/core/p2p"
	"github.com/go-skynet/LocalAI/core/startup"
	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
)
// RunCMD holds all flags and environment-variable configuration for the
// `run` command, which starts the LocalAI API server. Tags follow the kong
// CLI convention: `env` lists accepted environment variables, `default`
// supports kong interpolation (e.g. ${basepath}), and `group` controls
// help output grouping.
type RunCMD struct {
	ModelArgs []string `arg:"" optional:"" name:"models" help:"Model configuration URLs to load"`

	ModelsPath                   string        `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
	BackendAssetsPath            string        `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
	ImagePath                    string        `env:"LOCALAI_IMAGE_PATH,IMAGE_PATH" type:"path" default:"/tmp/generated/images" help:"Location for images generated by backends (e.g. stablediffusion)" group:"storage"`
	AudioPath                    string        `env:"LOCALAI_AUDIO_PATH,AUDIO_PATH" type:"path" default:"/tmp/generated/audio" help:"Location for audio generated by backends (e.g. piper)" group:"storage"`
	UploadPath                   string        `env:"LOCALAI_UPLOAD_PATH,UPLOAD_PATH" type:"path" default:"/tmp/localai/upload" help:"Path to store uploads from files api" group:"storage"`
	ConfigPath                   string        `env:"LOCALAI_CONFIG_PATH,CONFIG_PATH" default:"/tmp/localai/config" group:"storage"`
	LocalaiConfigDir             string        `env:"LOCALAI_CONFIG_DIR" type:"path" default:"${basepath}/configuration" help:"Directory for dynamic loading of certain configuration files (currently api_keys.json and external_backends.json)" group:"storage"`
	LocalaiConfigDirPollInterval time.Duration `env:"LOCALAI_CONFIG_DIR_POLL_INTERVAL" help:"Typically the config path picks up changes automatically, but if your system has broken fsnotify events, set this to an interval to poll the LocalAI Config Dir (example: 1m)" group:"storage"`
	// The alias on this option is there to preserve functionality with the old `--config-file` parameter
	ModelsConfigFile string `env:"LOCALAI_MODELS_CONFIG_FILE,CONFIG_FILE" aliases:"config-file" help:"YAML file containing a list of model backend configs" group:"storage"`

	Galleries           string   `env:"LOCALAI_GALLERIES,GALLERIES" help:"JSON list of galleries" group:"models" default:"${galleries}"`
	AutoloadGalleries   bool     `env:"LOCALAI_AUTOLOAD_GALLERIES,AUTOLOAD_GALLERIES" group:"models"`
	RemoteLibrary       string   `env:"LOCALAI_REMOTE_LIBRARY,REMOTE_LIBRARY" default:"${remoteLibraryURL}" help:"A LocalAI remote library URL" group:"models"`
	PreloadModels       string   `env:"LOCALAI_PRELOAD_MODELS,PRELOAD_MODELS" help:"A List of models to apply in JSON at start" group:"models"`
	Models              []string `env:"LOCALAI_MODELS,MODELS" help:"A List of model configuration URLs to load" group:"models"`
	PreloadModelsConfig string   `env:"LOCALAI_PRELOAD_MODELS_CONFIG,PRELOAD_MODELS_CONFIG" help:"A List of models to apply at startup. Path to a YAML config file" group:"models"`

	F16         bool `name:"f16" env:"LOCALAI_F16,F16" help:"Enable GPU acceleration" group:"performance"`
	Threads     int  `env:"LOCALAI_THREADS,THREADS" short:"t" default:"4" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"`
	ContextSize int  `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" default:"512" help:"Default context size for models" group:"performance"`

	Address          string   `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
	CORS             bool     `env:"LOCALAI_CORS,CORS" help:"" group:"api"`
	CORSAllowOrigins string   `env:"LOCALAI_CORS_ALLOW_ORIGINS,CORS_ALLOW_ORIGINS" group:"api"`
	UploadLimit      int      `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
	APIKeys          []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
	DisableWebUI     bool     `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disable webui" group:"api"`

	Peer2Peer      bool   `env:"LOCALAI_P2P,P2P" name:"p2p" default:"false" help:"Enable P2P mode" group:"p2p"`
	Peer2PeerToken string `env:"LOCALAI_P2P_TOKEN,P2P_TOKEN" name:"p2ptoken" help:"Token for P2P mode (optional)" group:"p2p"`

	ParallelRequests     bool     `env:"LOCALAI_PARALLEL_REQUESTS,PARALLEL_REQUESTS" help:"Enable backends to handle multiple requests in parallel if they support it (e.g.: llama.cpp or vllm)" group:"backends"`
	SingleActiveBackend  bool     `env:"LOCALAI_SINGLE_ACTIVE_BACKEND,SINGLE_ACTIVE_BACKEND" help:"Allow only one backend to be run at a time" group:"backends"`
	PreloadBackendOnly   bool     `env:"LOCALAI_PRELOAD_BACKEND_ONLY,PRELOAD_BACKEND_ONLY" default:"false" help:"Do not launch the API services, only the preloaded models / backends are started (useful for multi-node setups)" group:"backends"`
	ExternalGRPCBackends []string `env:"LOCALAI_EXTERNAL_GRPC_BACKENDS,EXTERNAL_GRPC_BACKENDS" help:"A list of external grpc backends" group:"backends"`
	EnableWatchdogIdle   bool     `env:"LOCALAI_WATCHDOG_IDLE,WATCHDOG_IDLE" default:"false" help:"Enable watchdog for stopping backends that are idle longer than the watchdog-idle-timeout" group:"backends"`
	WatchdogIdleTimeout  string   `env:"LOCALAI_WATCHDOG_IDLE_TIMEOUT,WATCHDOG_IDLE_TIMEOUT" default:"15m" help:"Threshold beyond which an idle backend should be stopped" group:"backends"`
	EnableWatchdogBusy   bool     `env:"LOCALAI_WATCHDOG_BUSY,WATCHDOG_BUSY" default:"false" help:"Enable watchdog for stopping backends that are busy longer than the watchdog-busy-timeout" group:"backends"`
	WatchdogBusyTimeout  string   `env:"LOCALAI_WATCHDOG_BUSY_TIMEOUT,WATCHDOG_BUSY_TIMEOUT" default:"5m" help:"Threshold beyond which a busy backend should be stopped" group:"backends"`
}
func ( r * RunCMD ) Run ( ctx * cliContext . Context ) error {
2024-04-11 07:19:24 +00:00
opts := [ ] config . AppOption {
config . WithConfigFile ( r . ModelsConfigFile ) ,
config . WithJSONStringPreload ( r . PreloadModels ) ,
config . WithYAMLConfigPreload ( r . PreloadModelsConfig ) ,
config . WithModelPath ( r . ModelsPath ) ,
config . WithContextSize ( r . ContextSize ) ,
2024-04-20 08:43:37 +00:00
config . WithDebug ( zerolog . GlobalLevel ( ) <= zerolog . DebugLevel ) ,
2024-04-11 07:19:24 +00:00
config . WithImageDir ( r . ImagePath ) ,
config . WithAudioDir ( r . AudioPath ) ,
config . WithUploadDir ( r . UploadPath ) ,
config . WithConfigsDir ( r . ConfigPath ) ,
2024-04-18 03:21:55 +00:00
config . WithDynamicConfigDir ( r . LocalaiConfigDir ) ,
2024-04-20 00:31:15 +00:00
config . WithDynamicConfigDirPollInterval ( r . LocalaiConfigDirPollInterval ) ,
2024-04-11 07:19:24 +00:00
config . WithF16 ( r . F16 ) ,
config . WithStringGalleries ( r . Galleries ) ,
config . WithModelLibraryURL ( r . RemoteLibrary ) ,
config . WithCors ( r . CORS ) ,
config . WithCorsAllowOrigins ( r . CORSAllowOrigins ) ,
config . WithThreads ( r . Threads ) ,
config . WithBackendAssets ( ctx . BackendAssets ) ,
config . WithBackendAssetsOutput ( r . BackendAssetsPath ) ,
config . WithUploadLimitMB ( r . UploadLimit ) ,
config . WithApiKeys ( r . APIKeys ) ,
config . WithModelsURL ( append ( r . Models , r . ModelArgs ... ) ... ) ,
}
feat(llama.cpp): Totally decentralized, private, distributed, p2p inference (#2343)
* feat(llama.cpp): Enable decentralized, distributed inference
As https://github.com/mudler/LocalAI/pull/2324 introduced distributed inferencing thanks to
@rgerganov implementation in https://github.com/ggerganov/llama.cpp/pull/6829 in upstream llama.cpp, now
it is possible to distribute the workload to remote llama.cpp gRPC server.
This changeset now uses mudler/edgevpn to establish a secure, distributed network between the nodes using a shared token.
The token is generated automatically when starting the server with the `--p2p` flag, and can be used by starting the workers
with `local-ai worker p2p-llama-cpp-rpc` by passing the token via environment variable (TOKEN) or with args (--token).
As per how mudler/edgevpn works, a network is established between the server and the workers with dht and mdns discovery protocols,
the llama.cpp rpc server is automatically started and exposed to the underlying p2p network so the API server can connect on.
When the HTTP server is started, it will discover the workers in the network and automatically create the port-forwards to the service locally.
Then llama.cpp is configured to use the services.
This feature is behind the "p2p" GO_FLAGS
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* go mod tidy
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* ci: add p2p tag
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* better message
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---------
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-05-20 17:17:59 +00:00
if r . Peer2Peer || r . Peer2PeerToken != "" {
log . Info ( ) . Msg ( "P2P mode enabled" )
token := r . Peer2PeerToken
if token == "" {
// IF no token is provided, and p2p is enabled,
// we generate one and wait for the user to pick up the token (this is for interactive)
log . Info ( ) . Msg ( "No token provided, generating one" )
token = p2p . GenerateToken ( )
log . Info ( ) . Msg ( "Generated Token:" )
fmt . Println ( token )
log . Info ( ) . Msg ( "To use the token, you can run the following command in another node or terminal:" )
fmt . Printf ( "export TOKEN=\"%s\"\nlocal-ai worker p2p-llama-cpp-rpc\n" , token )
// Ask for user confirmation
log . Info ( ) . Msg ( "Press a button to proceed" )
var input string
fmt . Scanln ( & input )
}
log . Info ( ) . Msg ( "Starting P2P server discovery..." )
if err := p2p . LLamaCPPRPCServerDiscoverer ( context . Background ( ) , token ) ; err != nil {
return err
}
}
2024-04-11 07:19:24 +00:00
idleWatchDog := r . EnableWatchdogIdle
busyWatchDog := r . EnableWatchdogBusy
2024-05-02 19:14:10 +00:00
if r . DisableWebUI {
opts = append ( opts , config . DisableWebUI )
2024-04-11 07:19:24 +00:00
}
if idleWatchDog || busyWatchDog {
opts = append ( opts , config . EnableWatchDog )
if idleWatchDog {
opts = append ( opts , config . EnableWatchDogIdleCheck )
dur , err := time . ParseDuration ( r . WatchdogIdleTimeout )
if err != nil {
return err
}
opts = append ( opts , config . SetWatchDogIdleTimeout ( dur ) )
}
if busyWatchDog {
opts = append ( opts , config . EnableWatchDogBusyCheck )
dur , err := time . ParseDuration ( r . WatchdogBusyTimeout )
if err != nil {
return err
}
opts = append ( opts , config . SetWatchDogBusyTimeout ( dur ) )
}
}
if r . ParallelRequests {
opts = append ( opts , config . EnableParallelBackendRequests )
}
if r . SingleActiveBackend {
opts = append ( opts , config . EnableSingleBackend )
}
// split ":" to get backend name and the uri
for _ , v := range r . ExternalGRPCBackends {
backend := v [ : strings . IndexByte ( v , ':' ) ]
uri := v [ strings . IndexByte ( v , ':' ) + 1 : ]
opts = append ( opts , config . WithExternalBackend ( backend , uri ) )
}
if r . AutoloadGalleries {
opts = append ( opts , config . EnableGalleriesAutoload )
}
if r . PreloadBackendOnly {
2024-04-17 21:33:49 +00:00
_ , _ , _ , err := startup . Startup ( opts ... )
2024-04-11 07:19:24 +00:00
return err
}
2024-04-17 21:33:49 +00:00
cl , ml , options , err := startup . Startup ( opts ... )
2024-04-11 07:19:24 +00:00
if err != nil {
return fmt . Errorf ( "failed basic startup tasks with error %s" , err . Error ( ) )
}
2024-04-17 21:33:49 +00:00
appHTTP , err := http . App ( cl , ml , options )
2024-04-11 07:19:24 +00:00
if err != nil {
log . Error ( ) . Err ( err ) . Msg ( "error during HTTP App construction" )
return err
}
return appHTTP . Listen ( r . Address )
}