feat: cuda transformers (#1401)

* Use CUDA in transformers if available

TensorFlow probably needs a different check.
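
As an illustration of the note above: the PyTorch path can probe for a GPU with `torch.cuda.is_available()` (which is what the transformers backend change below uses), while a TensorFlow-based backend would need its own probe. A minimal sketch, where the TensorFlow branch is a hypothetical equivalent using `tf.config.list_physical_devices` and is not part of this change:

```python
# Sketch only: how a backend might decide whether to move a model to the GPU.
# The torch branch mirrors this commit; the TensorFlow branch is hypothetical.
def cuda_available(framework: str = "torch") -> bool:
    if framework == "torch":
        import torch
        return torch.cuda.is_available()
    if framework == "tensorflow":
        import tensorflow as tf
        return len(tf.config.list_physical_devices("GPU")) > 0
    return False
```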

Signed-off-by: Erich Schubert <kno10@users.noreply.github.com>

* feat: expose CUDA at top level

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* tests: add tests and create a workflow for py extra backends

* doc: update note on how to use core images

---------

Signed-off-by: Erich Schubert <kno10@users.noreply.github.com>
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Erich Schubert <kno10@users.noreply.github.com>
Commit 887b3dff04 (parent 3822bd2369)
Ettore Di Giacinto, 2023-12-08 15:45:04 +01:00, committed by GitHub
9 changed files with 163 additions and 11 deletions

.github/workflows/test-extra.yml (new file)

@@ -0,0 +1,75 @@
---
name: 'Tests extras backends'

on:
  pull_request:
  push:
    branches:
      - master
    tags:
      - '*'

concurrency:
  group: ci-tests-extra-${{ github.head_ref || github.ref }}-${{ github.repository }}
  cancel-in-progress: true

jobs:
  tests-linux:
    runs-on: ubuntu-latest
    steps:
      - name: Release space from worker
        run: |
          echo "Listing top largest packages"
          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
          head -n 30 <<< "${pkgs}"
          echo
          df -h
          echo
          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
          sudo rm -rf /usr/local/lib/android
          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
          sudo rm -rf /usr/share/dotnet
          sudo apt-get remove -y '^mono-.*' || true
          sudo apt-get remove -y '^ghc-.*' || true
          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
          sudo apt-get remove -y 'php.*' || true
          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
          sudo apt-get remove -y '^google-.*' || true
          sudo apt-get remove -y azure-cli || true
          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
          sudo apt-get remove -y '^gfortran-.*' || true
          sudo apt-get autoremove -y
          sudo apt-get clean
          echo
          echo "Listing top largest packages"
          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
          head -n 30 <<< "${pkgs}"
          echo
          sudo rm -rfv build || true
          df -h
      - name: Clone
        uses: actions/checkout@v4
        with:
          submodules: true
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
          sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
          gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
          sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
          sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
          sudo apt-get update && \
          sudo apt-get install -y conda
          sudo apt-get install -y ca-certificates cmake curl patch
          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
          sudo rm -rfv /usr/bin/conda || true
      - name: Test
        run: |
          PATH=$PATH:/opt/conda/bin make test-extra

Makefile

@@ -414,6 +414,11 @@ prepare-extra-conda-environments:
 	$(MAKE) -C backend/python/petals
 	$(MAKE) -C backend/python/exllama2
 
+prepare-test-extra:
+	$(MAKE) -C backend/python/transformers
+
+test-extra: prepare-test-extra
+	$(MAKE) -C backend/python/transformers test
 
 backend-assets/grpc:
 	mkdir -p backend-assets/grpc


@@ -16,7 +16,7 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
 		model.WithContext(o.Context),
 		model.WithModel(c.Model),
 		model.WithLoadGRPCLoadModelOpts(&proto.ModelOptions{
-			CUDA:          c.Diffusers.CUDA,
+			CUDA:          c.CUDA,
 			SchedulerType: c.Diffusers.SchedulerType,
 			PipelineType:  c.Diffusers.PipelineType,
 			CFGScale:      c.Diffusers.CFGScale,


@@ -46,6 +46,7 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
 		Seed:         int32(c.Seed),
 		NBatch:       int32(b),
 		NoMulMatQ:    c.NoMulMatQ,
+		CUDA:         c.CUDA, // diffusers, transformers
 		DraftModel:   c.DraftModel,
 		AudioPath:    c.VallE.AudioPath,
 		Quantization: c.Quantization,


@@ -46,6 +46,10 @@ type Config struct {
 	// Vall-e-x
 	VallE VallE `yaml:"vall-e"`
+
+	// CUDA
+	// Explicitly enable CUDA or not (some backends might need it)
+	CUDA bool `yaml:"cuda"`
 }
 
 type VallE struct {
@@ -67,7 +71,6 @@ type GRPC struct {
 type Diffusers struct {
 	PipelineType     string  `yaml:"pipeline_type"`
 	SchedulerType    string  `yaml:"scheduler_type"`
-	CUDA             bool    `yaml:"cuda"`
 	EnableParameters string  `yaml:"enable_parameters"` // A list of comma separated parameters to specify
 	CFGScale         float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale
 	IMG2IMG          bool    `yaml:"img2img"` // Image to Image Diffuser


@@ -31,7 +31,7 @@ class TestBackendServicer(unittest.TestCase):
         """
         This method tests if the server starts up successfully
         """
-        time.sleep(2)
+        time.sleep(10)
         try:
             self.setUp()
             with grpc.insecure_channel("localhost:50051") as channel:
@@ -48,11 +48,12 @@ class TestBackendServicer(unittest.TestCase):
         """
         This method tests if the model is loaded successfully
         """
+        time.sleep(10)
         try:
             self.setUp()
             with grpc.insecure_channel("localhost:50051") as channel:
                 stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="bert-base-nli-mean-tokens"))
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="bert-base-cased"))
                 self.assertTrue(response.success)
                 self.assertEqual(response.message, "Model loaded successfully")
         except Exception as err:
@@ -65,11 +66,13 @@ class TestBackendServicer(unittest.TestCase):
         """
        This method tests if the embeddings are generated successfully
         """
+        time.sleep(10)
        try:
             self.setUp()
             with grpc.insecure_channel("localhost:50051") as channel:
                 stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="bert-base-nli-mean-tokens"))
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="bert-base-cased"))
+                print(response.message)
                 self.assertTrue(response.success)
                 embedding_request = backend_pb2.PredictOptions(Embeddings="This is a test sentence.")
                 embedding_response = stub.Embedding(embedding_request)
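
For context, a minimal client sketch mirroring the test flow above. It assumes the backend gRPC server is already running on localhost:50051 (as in the tests) and uses the same generated stubs; the model name is simply the one the tests use:

```python
# Sketch of a client for the transformers backend, mirroring the test above.
import grpc
import backend_pb2
import backend_pb2_grpc

with grpc.insecure_channel("localhost:50051") as channel:
    stub = backend_pb2_grpc.BackendStub(channel)
    # Load an embeddings model, then request an embedding for a sentence.
    load = stub.LoadModel(backend_pb2.ModelOptions(Model="bert-base-cased"))
    print(load.message)
    reply = stub.Embedding(backend_pb2.PredictOptions(Embeddings="This is a test sentence."))
    print(len(reply.embeddings))  # number of values in the returned embedding
```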


@@ -14,14 +14,27 @@ import backend_pb2
 import backend_pb2_grpc
 
 import grpc
+import torch
-from transformers import AutoModel
+from transformers import AutoTokenizer, AutoModel
 
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 
 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
 
+def mean_pooling(model_output, attention_mask):
+    """
+    Mean pooling to get sentence embeddings. See:
+    https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1
+    """
+    token_embeddings = model_output[0]
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)  # Sum columns
+    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+    return sum_embeddings / sum_mask
+
 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
     """
@@ -56,9 +69,19 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         model_name = request.Model
         try:
             self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True)  # trust_remote_code is needed to use the encode method with embeddings models like jinai-v2
+            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+            if request.CUDA:
+                try:
+                    # TODO: also tensorflow, make configurable
+                    import torch.cuda
+                    if torch.cuda.is_available():
+                        print("Loading model", model_name, "to CUDA.", file=sys.stderr)
+                        self.model = self.model.to("cuda")
+                except Exception as err:
+                    print("Not using CUDA:", err, file=sys.stderr)
         except Exception as err:
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
         # Implement your logic here for the LoadModel service
         # Replace this with your desired response
         return backend_pb2.Result(message="Model loaded successfully", success=True)
@@ -74,10 +97,20 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         Returns:
             An EmbeddingResult object that contains the calculated embeddings.
         """
-        # Implement your logic here for the Embedding service
-        # Replace this with your desired response
+        # Tokenize input
+        max_length = 512
+        if request.Tokens != 0:
+            max_length = request.Tokens
+        encoded_input = self.tokenizer(request.Embeddings, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
+
+        # Create word embeddings
+        model_output = self.model(**encoded_input)
+
+        # Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence
+        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']).detach().numpy()
         print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
-        sentence_embeddings = self.model.encode(request.Embeddings)
+        print("Embeddings:", sentence_embeddings, file=sys.stderr)
         return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings)
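
As a clarifying aside (not part of the commit), here is a self-contained sketch of the tokenize → encode → mean-pool flow the new Embedding handler implements; `bert-base-cased` is just the model the tests above use, and any encoder-style transformers model works:

```python
# Standalone sketch of the mean-pooling flow used by the Embedding handler.
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModel.from_pretrained("bert-base-cased")

encoded = tokenizer("This is a test sentence.", padding=True, truncation=True,
                    max_length=512, return_tensors="pt")
with torch.no_grad():
    output = model(**encoded)

# Average the token embeddings, weighted by the attention mask, into one vector.
mask = encoded["attention_mask"].unsqueeze(-1).expand(output[0].size()).float()
sentence_embedding = (output[0] * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
print(sentence_embedding.shape)  # (1, hidden_size), e.g. (1, 768) for BERT base
```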


@@ -207,6 +207,9 @@ lora_adapter: "/path/to/lora/adapter"
 lora_base: "/path/to/lora/base"
 # Disable mulmatq (CUDA)
 no_mulmatq: true
+
+# Diffusers/transformers
+cuda: true
 ```
 
 ### Prompt templates
@@ -364,3 +367,31 @@ make GRPC_BACKENDS=backend-assets/grpc/llama-cpp build
 ```
 
 By default, all the backends are built.
+
+### Extra backends
+
+LocalAI can be extended with extra backends. The backends are implemented as `gRPC` services and can be written in any language. The container images built and published on [quay.io](https://quay.io/repository/go-skynet/local-ai?tab=tags) are split into core and extra variants. By default, images bring all the dependencies and backends supported by LocalAI (we call those `extra` images). The `-core` images instead bring only the strictly necessary dependencies to run LocalAI, without the extra backends.
+
+If you wish to build a custom container image with extra backends, you can use the core images and build only the backends you are interested in. For instance, to use the diffusers backend:
+
+```Dockerfile
+FROM quay.io/go-skynet/local-ai:master-ffmpeg-core
+RUN PATH=$PATH:/opt/conda/bin make -C backend/python/diffusers
+```
+
+Remember also to set the `EXTERNAL_GRPC_BACKENDS` environment variable (or `--external-grpc-backends` as a CLI flag) to point to the backends you are using (`EXTERNAL_GRPC_BACKENDS="backend_name:/path/to/backend"`), for example with diffusers:
+
+```Dockerfile
+FROM quay.io/go-skynet/local-ai:master-ffmpeg-core
+RUN PATH=$PATH:/opt/conda/bin make -C backend/python/diffusers
+ENV EXTERNAL_GRPC_BACKENDS="diffusers:/build/backend/python/diffusers/run.sh"
+```
+
+{{% notice note %}}
+You can specify remote external backends or paths to local files. The syntax is `backend-name:/path/to/backend` or `backend-name:host:port`.
+{{% /notice %}}


@@ -178,6 +178,7 @@ You can control LocalAI with command line arguments, to specify a binding addres
 | --watchdog-busy-timeout value | $WATCHDOG_BUSY_TIMEOUT | 5m | Watchdog timeout. This will restart the backend if it crashes. |
 | --watchdog-idle-timeout value | $WATCHDOG_IDLE_TIMEOUT | 15m | Watchdog idle timeout. This will restart the backend if it crashes. |
 | --preload-backend-only | $PRELOAD_BACKEND_ONLY | false | If set, the api is NOT launched, and only the preloaded models / backends are started. This is intended for multi-node setups. |
+| --external-grpc-backends | $EXTERNAL_GRPC_BACKENDS | none | Comma separated list of external gRPC backends to use. Format: `name:host:port` or `name:/path/to/file` |
 
 ### Container images