feat: cuda transformers (#1401)
* Use cuda in transformers if available

  tensorflow probably needs a different check.

  Signed-off-by: Erich Schubert <kno10@users.noreply.github.com>

* feat: expose CUDA at top level

  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* tests: add to tests and create workflow for py extra backends

* doc: update note on how to use core images

---------

Signed-off-by: Erich Schubert <kno10@users.noreply.github.com>
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Erich Schubert <kno10@users.noreply.github.com>
This commit is contained in:
parent
3822bd2369
commit
887b3dff04
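
Before the diffs: this change lets a model's YAML configuration request CUDA at the top level, and the flag is forwarded to the diffusers and transformers backends. A minimal sketch of such a config, based on the documentation updated below; the model name and backend value are illustrative:

```yaml
# Hypothetical model definition: the only point here is the top-level `cuda` flag
name: my-transformers-model   # illustrative name
backend: transformers
cuda: true                    # new top-level switch introduced by this commit
```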
.github/workflows/test-extra.yml  +75  (new file, vendored)
@@ -0,0 +1,75 @@
+---
+name: 'Tests extras backends'
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+    tags:
+      - '*'
+
+concurrency:
+  group: ci-tests-extra-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
+jobs:
+  tests-linux:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Release space from worker
+        run: |
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          df -h
+          echo
+          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+          sudo rm -rf /usr/local/lib/android
+          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+          sudo rm -rf /usr/share/dotnet
+          sudo apt-get remove -y '^mono-.*' || true
+          sudo apt-get remove -y '^ghc-.*' || true
+          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+          sudo apt-get remove -y 'php.*' || true
+          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+          sudo apt-get remove -y '^google-.*' || true
+          sudo apt-get remove -y azure-cli || true
+          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+          sudo apt-get remove -y '^gfortran-.*' || true
+          sudo apt-get autoremove -y
+          sudo apt-get clean
+          echo
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          sudo rm -rfv build || true
+          df -h
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+          sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+          gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+          sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
+          sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
+          sudo apt-get update && \
+          sudo apt-get install -y conda
+          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
+
+          sudo rm -rfv /usr/bin/conda || true
+
+      - name: Test
+        run: |
+          PATH=$PATH:/opt/conda/bin make test-extra
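The test step above should also be reproducible outside CI, assuming conda is installed under `/opt/conda` as in the workflow; a sketch:

```bash
# Prepare the transformers backend environment and run its tests,
# mirroring the "Test" step of the workflow above.
PATH=$PATH:/opt/conda/bin make test-extra
```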
Makefile  +5
@@ -414,6 +414,11 @@ prepare-extra-conda-environments:
 	$(MAKE) -C backend/python/petals
 	$(MAKE) -C backend/python/exllama2
 
+prepare-test-extra:
+	$(MAKE) -C backend/python/transformers
+
+test-extra: prepare-test-extra
+	$(MAKE) -C backend/python/transformers test
+
 backend-assets/grpc:
 	mkdir -p backend-assets/grpc
@@ -16,7 +16,7 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
 		model.WithContext(o.Context),
 		model.WithModel(c.Model),
 		model.WithLoadGRPCLoadModelOpts(&proto.ModelOptions{
-			CUDA:          c.Diffusers.CUDA,
+			CUDA:          c.CUDA,
 			SchedulerType: c.Diffusers.SchedulerType,
 			PipelineType:  c.Diffusers.PipelineType,
 			CFGScale:      c.Diffusers.CFGScale,
@@ -46,6 +46,7 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
 		Seed:         int32(c.Seed),
 		NBatch:       int32(b),
 		NoMulMatQ:    c.NoMulMatQ,
+		CUDA:         c.CUDA, // diffusers, transformers
 		DraftModel:   c.DraftModel,
 		AudioPath:    c.VallE.AudioPath,
 		Quantization: c.Quantization,
@@ -46,6 +46,10 @@ type Config struct {
 
 	// Vall-e-x
 	VallE VallE `yaml:"vall-e"`
+
+	// CUDA
+	// Explicitly enable CUDA or not (some backends might need it)
+	CUDA bool `yaml:"cuda"`
 }
 
 type VallE struct {
@@ -67,7 +71,6 @@ type GRPC struct {
 type Diffusers struct {
 	PipelineType     string  `yaml:"pipeline_type"`
 	SchedulerType    string  `yaml:"scheduler_type"`
-	CUDA             bool    `yaml:"cuda"`
 	EnableParameters string  `yaml:"enable_parameters"` // A list of comma separated parameters to specify
 	CFGScale         float32 `yaml:"cfg_scale"`          // Classifier-Free Guidance Scale
 	IMG2IMG          bool    `yaml:"img2img"`            // Image to Image Diffuser
@@ -31,7 +31,7 @@ class TestBackendServicer(unittest.TestCase):
         """
         This method tests if the server starts up successfully
         """
-        time.sleep(2)
+        time.sleep(10)
         try:
             self.setUp()
             with grpc.insecure_channel("localhost:50051") as channel:
@@ -48,11 +48,12 @@ class TestBackendServicer(unittest.TestCase):
         """
         This method tests if the model is loaded successfully
         """
+        time.sleep(10)
         try:
             self.setUp()
             with grpc.insecure_channel("localhost:50051") as channel:
                 stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="bert-base-nli-mean-tokens"))
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="bert-base-cased"))
                 self.assertTrue(response.success)
                 self.assertEqual(response.message, "Model loaded successfully")
         except Exception as err:
@@ -65,11 +66,13 @@ class TestBackendServicer(unittest.TestCase):
         """
         This method tests if the embeddings are generated successfully
        """
+        time.sleep(10)
         try:
             self.setUp()
             with grpc.insecure_channel("localhost:50051") as channel:
                 stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="bert-base-nli-mean-tokens"))
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="bert-base-cased"))
+                print(response.message)
                 self.assertTrue(response.success)
                 embedding_request = backend_pb2.PredictOptions(Embeddings="This is a test sentence.")
                 embedding_response = stub.Embedding(embedding_request)
@@ -14,14 +14,27 @@ import backend_pb2
 import backend_pb2_grpc
 
 import grpc
+import torch
 
-from transformers import AutoModel
+from transformers import AutoTokenizer, AutoModel
 
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 
 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
 
+
+def mean_pooling(model_output, attention_mask):
+    """
+    Mean pooling to get sentence embeddings. See:
+    https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1
+    """
+    token_embeddings = model_output[0]
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) # Sum columns
+    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+    return sum_embeddings / sum_mask
+
 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
     """
@@ -56,9 +69,19 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         model_name = request.Model
         try:
             self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True)  # trust_remote_code is needed to use the encode method with embeddings models like jinai-v2
+            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+            if request.CUDA:
+                try:
+                    # TODO: also tensorflow, make configurable
+                    import torch.cuda
+                    if torch.cuda.is_available():
+                        print("Loading model", model_name, "to CUDA.", file=sys.stderr)
+                        self.model = self.model.to("cuda")
+                except Exception as err:
+                    print("Not using CUDA:", err, file=sys.stderr)
         except Exception as err:
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
 
         # Implement your logic here for the LoadModel service
         # Replace this with your desired response
         return backend_pb2.Result(message="Model loaded successfully", success=True)
@@ -74,10 +97,20 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         Returns:
             An EmbeddingResult object that contains the calculated embeddings.
         """
-        # Implement your logic here for the Embedding service
-        # Replace this with your desired response
+
+        # Tokenize input
+        max_length = 512
+        if request.Tokens != 0:
+            max_length = request.Tokens
+        encoded_input = self.tokenizer(request.Embeddings, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
+
+        # Create word embeddings
+        model_output = self.model(**encoded_input)
+
+        # Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence
+        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']).detach().numpy()
         print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
-        sentence_embeddings = self.model.encode(request.Embeddings)
+        print("Embeddings:", sentence_embeddings, file=sys.stderr)
         return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings)
@@ -207,6 +207,9 @@ lora_adapter: "/path/to/lora/adapter"
 lora_base: "/path/to/lora/base"
 # Disable mulmatq (CUDA)
 no_mulmatq: true
+
+# Diffusers/transformers
+cuda: true
 ```
 
 ### Prompt templates
@@ -364,3 +367,31 @@ make GRPC_BACKENDS=backend-assets/grpc/llama-cpp build
 ```
 
 By default, all the backends are built.
+
+### Extra backends
+
+LocalAI can be extended with extra backends. The backends are implemented as `gRPC` services and can be written in any language. The container images that are built and published on [quay.io](https://quay.io/repository/go-skynet/local-ai?tab=tags) are split into core and extra images. By default, images bring all the dependencies and backends supported by LocalAI (we call these `extra` images). The `-core` images instead ship only the strictly necessary dependencies and a core set of backends.
+
+If you wish to build a custom container image with extra backends, you can start from the core images and build only the backends you are interested in. For instance, to use the diffusers backend:
+
+```Dockerfile
+FROM quay.io/go-skynet/local-ai:master-ffmpeg-core
+
+RUN PATH=$PATH:/opt/conda/bin make -C backend/python/diffusers
+```
+
+Remember also to set the `EXTERNAL_GRPC_BACKENDS` environment variable (or the `--external-grpc-backends` CLI flag) to point to the backends you are using (`EXTERNAL_GRPC_BACKENDS="backend_name:/path/to/backend"`), for example with diffusers:
+
+```Dockerfile
+FROM quay.io/go-skynet/local-ai:master-ffmpeg-core
+
+RUN PATH=$PATH:/opt/conda/bin make -C backend/python/diffusers
+
+ENV EXTERNAL_GRPC_BACKENDS="diffusers:/build/backend/python/diffusers/run.sh"
+```
+
+{{% notice note %}}
+
+You can specify remote external backends or paths to local files. The syntax is `backend-name:/path/to/backend` or `backend-name:host:port`.
+
+{{% /notice %}}
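For the remote form mentioned in the note above, a hedged example; the backend name, host, and port are placeholders:

```bash
# One backend from a local run script, one reached over the network
local-ai --external-grpc-backends "diffusers:/build/backend/python/diffusers/run.sh,mybackend:remote-host:50051"
```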
@@ -178,6 +178,7 @@ You can control LocalAI with command line arguments, to specify a binding addres
 | --watchdog-busy-timeout value | $WATCHDOG_BUSY_TIMEOUT | 5m | Watchdog timeout. This will restart the backend if it crashes. |
 | --watchdog-idle-timeout value | $WATCHDOG_IDLE_TIMEOUT | 15m | Watchdog idle timeout. This will restart the backend if it crashes. |
 | --preload-backend-only | $PRELOAD_BACKEND_ONLY | false | If set, the api is NOT launched, and only the preloaded models / backends are started. This is intended for multi-node setups. |
+| --external-grpc-backends | EXTERNAL_GRPC_BACKENDS | none | Comma separated list of external gRPC backends to use. Format: `name:host:port` or `name:/path/to/file` |
 
 ### Container images