From 8b40f475a31109cc6ecbdc0d14a0cee9e0303291 Mon Sep 17 00:00:00 2001
From: Nuullll
Date: Fri, 10 Nov 2023 11:06:26 +0800
Subject: [PATCH 1/4] Initial IPEX support

---
 modules/devices.py      | 11 +++++++++--
 modules/xpu_specific.py | 42 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 2 deletions(-)
 create mode 100644 modules/xpu_specific.py

diff --git a/modules/devices.py b/modules/devices.py
index 1d4eb5635..be599736c 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -3,7 +3,7 @@ import contextlib
 from functools import lru_cache

 import torch
-from modules import errors, shared
+from modules import errors, shared, xpu_specific

 if sys.platform == "darwin":
     from modules import mac_specific
@@ -30,6 +30,9 @@ def get_optimal_device_name():
     if has_mps():
         return "mps"

+    if xpu_specific.has_ipex:
+        return xpu_specific.get_xpu_device_string()
+
     return "cpu"


@@ -100,11 +103,15 @@ def autocast(disable=False):
     if dtype == torch.float32 or shared.cmd_opts.precision == "full":
         return contextlib.nullcontext()

+    if xpu_specific.has_xpu:
+        return torch.autocast("xpu")
+
     return torch.autocast("cuda")


 def without_autocast(disable=False):
-    return torch.autocast("cuda", enabled=False) if torch.is_autocast_enabled() and not disable else contextlib.nullcontext()
+    device_type = "xpu" if xpu_specific.has_xpu else "cuda"
+    return torch.autocast(device_type, enabled=False) if torch.is_autocast_enabled() and not disable else contextlib.nullcontext()


 class NansException(Exception):
diff --git a/modules/xpu_specific.py b/modules/xpu_specific.py
new file mode 100644
index 000000000..6417dd2d6
--- /dev/null
+++ b/modules/xpu_specific.py
@@ -0,0 +1,42 @@
+import contextlib
+from modules import shared
+from modules.sd_hijack_utils import CondFunc
+
+has_ipex = False
+try:
+    import torch
+    import intel_extension_for_pytorch as ipex
+    has_ipex = True
+except Exception:
+    pass
+
+def check_for_xpu():
+    if not has_ipex:
+        return False
+
+    return hasattr(torch, 'xpu') and torch.xpu.is_available()
+
+has_xpu = check_for_xpu()
+
+def get_xpu_device_string():
+    if shared.cmd_opts.device_id is not None:
+        return f"xpu:{shared.cmd_opts.device_id}"
+    return "xpu"
+
+def return_null_context(*args, **kwargs): # pylint: disable=unused-argument
+    return contextlib.nullcontext()
+
+if has_xpu:
+    CondFunc('torch.Generator',
+        lambda orig_func, device=None: torch.xpu.Generator(device),
+        lambda orig_func, device=None: device is not None and device != torch.device("cpu") and device != "cpu")
+
+    CondFunc('torch.nn.functional.layer_norm',
+        lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs:
+        orig_func(input.to(weight.data.dtype), normalized_shape, weight, *args, **kwargs),
+        lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs:
+        weight is not None and input.dtype != weight.data.dtype)
+
+    CondFunc('torch.nn.modules.GroupNorm.forward',
+        lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)),
+        lambda orig_func, self, input: input.dtype != self.weight.data.dtype)

From 7499148ad4dbd3444215c843d02453f68c459707 Mon Sep 17 00:00:00 2001
From: Nuullll
Date: Sat, 2 Dec 2023 14:00:46 +0800
Subject: [PATCH 2/4] Disable ipex autocast due to its bad perf

---
 modules/cmd_args.py     |  1 +
 modules/devices.py      | 20 +++++++++++++-------
 modules/xpu_specific.py | 28 ++++++++++++++++++----------
 webui-ipex-user.bat     | 19 +++++++++++++++++++
 4 files changed, 51 insertions(+), 17 deletions(-)
 create mode 100644 webui-ipex-user.bat

diff --git a/modules/cmd_args.py b/modules/cmd_args.py
index a9fb9bfa3..da93eb266 100644
--- a/modules/cmd_args.py
+++ b/modules/cmd_args.py
@@ -70,6 +70,7 @@ parser.add_argument("--opt-sdp-no-mem-attention", action='store_true', help="pre
 parser.add_argument("--disable-opt-split-attention", action='store_true', help="prefer no cross-attention layer optimization for automatic choice of optimization")
 parser.add_argument("--disable-nan-check", action='store_true', help="do not check if produced images/latent spaces have nans; useful for running without a checkpoint in CI")
 parser.add_argument("--use-cpu", nargs='+', help="use CPU as torch device for specified modules", default=[], type=str.lower)
+parser.add_argument("--use-ipex", action="store_true", help="use Intel XPU as torch device")
 parser.add_argument("--disable-model-loading-ram-optimization", action='store_true', help="disable an optimization that reduces RAM use when loading a model")
 parser.add_argument("--listen", action='store_true', help="launch gradio with 0.0.0.0 as server name, allowing to respond to network requests")
 parser.add_argument("--port", type=int, help="launch gradio with given server port, you need root/admin rights for ports < 1024, defaults to 7860 if available", default=None)
diff --git a/modules/devices.py b/modules/devices.py
index be599736c..37ecca784 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -3,11 +3,18 @@ import contextlib
 from functools import lru_cache

 import torch
-from modules import errors, shared, xpu_specific
+from modules import errors, shared

 if sys.platform == "darwin":
     from modules import mac_specific

+if shared.cmd_opts.use_ipex:
+    from modules import xpu_specific
+
+
+def has_xpu() -> bool:
+    return shared.cmd_opts.use_ipex and xpu_specific.has_xpu
+

 def has_mps() -> bool:
     if sys.platform != "darwin":
@@ -30,7 +37,7 @@ def get_optimal_device_name():
     if has_mps():
         return "mps"

-    if xpu_specific.has_ipex:
+    if has_xpu():
         return xpu_specific.get_xpu_device_string()

     return "cpu"
@@ -57,6 +64,9 @@ def torch_gc():
     if has_mps():
         mac_specific.torch_mps_gc()

+    if has_xpu():
+        xpu_specific.torch_xpu_gc()
+

 def enable_tf32():
     if torch.cuda.is_available():
@@ -103,15 +113,11 @@ def autocast(disable=False):
     if dtype == torch.float32 or shared.cmd_opts.precision == "full":
         return contextlib.nullcontext()

-    if xpu_specific.has_xpu:
-        return torch.autocast("xpu")
-
     return torch.autocast("cuda")


 def without_autocast(disable=False):
-    device_type = "xpu" if xpu_specific.has_xpu else "cuda"
-    return torch.autocast(device_type, enabled=False) if torch.is_autocast_enabled() and not disable else contextlib.nullcontext()
+    return torch.autocast("cuda", enabled=False) if torch.is_autocast_enabled() and not disable else contextlib.nullcontext()


 class NansException(Exception):
diff --git a/modules/xpu_specific.py b/modules/xpu_specific.py
index 6417dd2d6..2df68665a 100644
--- a/modules/xpu_specific.py
+++ b/modules/xpu_specific.py
@@ -1,4 +1,3 @@
-import contextlib
 from modules import shared
 from modules.sd_hijack_utils import CondFunc

@@ -10,33 +9,42 @@ try:
 except Exception:
     pass

+
 def check_for_xpu():
-    if not has_ipex:
-        return False
+    return has_ipex and hasattr(torch, 'xpu') and torch.xpu.is_available()

-    return hasattr(torch, 'xpu') and torch.xpu.is_available()
-
-has_xpu = check_for_xpu()

 def get_xpu_device_string():
     if shared.cmd_opts.device_id is not None:
         return f"xpu:{shared.cmd_opts.device_id}"
     return "xpu"

-def return_null_context(*args, **kwargs): # pylint: disable=unused-argument
-    return contextlib.nullcontext()
+
+def torch_xpu_gc():
+    with torch.xpu.device(get_xpu_device_string()):
+        torch.xpu.empty_cache()
+
+
+has_xpu = check_for_xpu()

 if has_xpu:
+    # W/A for https://github.com/intel/intel-extension-for-pytorch/issues/452: torch.Generator API doesn't support XPU device
     CondFunc('torch.Generator',
         lambda orig_func, device=None: torch.xpu.Generator(device),
-        lambda orig_func, device=None: device is not None and device != torch.device("cpu") and device != "cpu")
+        lambda orig_func, device=None: device is not None and device.type == "xpu")

+    # W/A for some OPs that could not handle different input dtypes
     CondFunc('torch.nn.functional.layer_norm',
         lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs:
         orig_func(input.to(weight.data.dtype), normalized_shape, weight, *args, **kwargs),
         lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs:
         weight is not None and input.dtype != weight.data.dtype)
-
     CondFunc('torch.nn.modules.GroupNorm.forward',
         lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)),
         lambda orig_func, self, input: input.dtype != self.weight.data.dtype)
+    CondFunc('torch.nn.modules.linear.Linear.forward',
+        lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)),
+        lambda orig_func, self, input: input.dtype != self.weight.data.dtype)
+    CondFunc('torch.nn.modules.conv.Conv2d.forward',
+        lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)),
+        lambda orig_func, self, input: input.dtype != self.weight.data.dtype)
diff --git a/webui-ipex-user.bat b/webui-ipex-user.bat
new file mode 100644
index 000000000..ab25a0400
--- /dev/null
+++ b/webui-ipex-user.bat
@@ -0,0 +1,19 @@
+@echo off
+
+set PYTHON=
+@REM The "Nuullll/intel-extension-for-pytorch" wheels were built from IPEX source for Intel Arc GPU: https://github.com/intel/intel-extension-for-pytorch/tree/xpu-main
+@REM This is NOT an Intel official release so please use it at your own risk!!
+@REM See https://github.com/Nuullll/intel-extension-for-pytorch/releases/tag/v2.0.110%2Bxpu-master%2Bdll-bundle for details.
+@REM
+@REM Strengths (over official IPEX 2.0.110 windows release):
+@REM - AOT build (for Arc GPU only) to eliminate JIT compilation overhead: https://github.com/intel/intel-extension-for-pytorch/issues/399
+@REM - Bundles minimal oneAPI 2023.2 dependencies into the python wheels, so users don't need to install oneAPI for the whole system.
+@REM - Provides a compatible torchvision wheel: https://github.com/intel/intel-extension-for-pytorch/issues/465
+@REM Limitation:
+@REM - Only works for python 3.10
+set "TORCH_COMMAND=pip install https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.0.110%%2Bxpu-master%%2Bdll-bundle/torch-2.0.0a0+gite9ebda2-cp310-cp310-win_amd64.whl https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.0.110%%2Bxpu-master%%2Bdll-bundle/torchvision-0.15.2a0+fa99a53-cp310-cp310-win_amd64.whl https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.0.110%%2Bxpu-master%%2Bdll-bundle/intel_extension_for_pytorch-2.0.110+gitc6ea20b-cp310-cp310-win_amd64.whl"
+set GIT=
+set VENV_DIR=
+set "COMMANDLINE_ARGS=--use-ipex --skip-torch-cuda-test --skip-version-check --opt-sdp-attention"
+
+call webui.bat

From 87cd07b3af74c447b02570bf3963ba83ade2e203 Mon Sep 17 00:00:00 2001
From: Nuullll
Date: Sat, 2 Dec 2023 15:54:25 +0800
Subject: [PATCH 3/4] Fix fp64

---
 modules/sd_samplers_timesteps_impl.py | 4 ++--
 modules/xpu_specific.py               | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/sd_samplers_timesteps_impl.py b/modules/sd_samplers_timesteps_impl.py
index a72daafd4..930a64af5 100644
--- a/modules/sd_samplers_timesteps_impl.py
+++ b/modules/sd_samplers_timesteps_impl.py
@@ -11,7 +11,7 @@ from modules.models.diffusion.uni_pc import uni_pc
 def ddim(model, x, timesteps, extra_args=None, callback=None, disable=None, eta=0.0):
     alphas_cumprod = model.inner_model.inner_model.alphas_cumprod
     alphas = alphas_cumprod[timesteps]
-    alphas_prev = alphas_cumprod[torch.nn.functional.pad(timesteps[:-1], pad=(1, 0))].to(torch.float64 if x.device.type != 'mps' else torch.float32)
+    alphas_prev = alphas_cumprod[torch.nn.functional.pad(timesteps[:-1], pad=(1, 0))].to(torch.float64 if x.device.type != 'mps' and x.device.type != 'xpu' else torch.float32)
     sqrt_one_minus_alphas = torch.sqrt(1 - alphas)
     sigmas = eta * np.sqrt((1 - alphas_prev.cpu().numpy()) / (1 - alphas.cpu()) * (1 - alphas.cpu() / alphas_prev.cpu().numpy()))

@@ -43,7 +43,7 @@ def ddim(model, x, timesteps, extra_args=None, callback=None, disable=None, eta=
 def plms(model, x, timesteps, extra_args=None, callback=None, disable=None):
     alphas_cumprod = model.inner_model.inner_model.alphas_cumprod
     alphas = alphas_cumprod[timesteps]
-    alphas_prev = alphas_cumprod[torch.nn.functional.pad(timesteps[:-1], pad=(1, 0))].to(torch.float64 if x.device.type != 'mps' else torch.float32)
+    alphas_prev = alphas_cumprod[torch.nn.functional.pad(timesteps[:-1], pad=(1, 0))].to(torch.float64 if x.device.type != 'mps' and x.device.type != 'xpu' else torch.float32)
     sqrt_one_minus_alphas = torch.sqrt(1 - alphas)
     extra_args = {} if extra_args is None else extra_args

diff --git a/modules/xpu_specific.py b/modules/xpu_specific.py
index 2df68665a..d933c7903 100644
--- a/modules/xpu_specific.py
+++ b/modules/xpu_specific.py
@@ -4,7 +4,7 @@ from modules.sd_hijack_utils import CondFunc
 has_ipex = False
 try:
     import torch
-    import intel_extension_for_pytorch as ipex
+    import intel_extension_for_pytorch as ipex # noqa: F401
     has_ipex = True
 except Exception:
     pass

From 96871e4f744471177d97e01c49f8587d7f67c125 Mon Sep 17 00:00:00 2001
From: Nuullll
Date: Sat, 2 Dec 2023 17:11:11 +0800
Subject: [PATCH 4/4] Remove webui-ipex-user.bat

---
 modules/launch_utils.py | 22 ++++++++++++++++++++++
 webui-ipex-user.bat     | 19 -------------------
 2 files changed, 22 insertions(+), 19 deletions(-)
 delete mode 100644 webui-ipex-user.bat

diff --git a/modules/launch_utils.py b/modules/launch_utils.py
index 264ec9ca6..586cdc7eb 100644
--- a/modules/launch_utils.py
+++ b/modules/launch_utils.py
@@ -310,6 +310,26 @@ def requirements_met(requirements_file):
 def prepare_environment():
     torch_index_url = os.environ.get('TORCH_INDEX_URL', "https://download.pytorch.org/whl/cu118")
     torch_command = os.environ.get('TORCH_COMMAND', f"pip install torch==2.0.1 torchvision==0.15.2 --extra-index-url {torch_index_url}")
+    if args.use_ipex:
+        if platform.system() == "Windows":
+            # The "Nuullll/intel-extension-for-pytorch" wheels were built from IPEX source for Intel Arc GPU: https://github.com/intel/intel-extension-for-pytorch/tree/xpu-main
+            # This is NOT an Intel official release so please use it at your own risk!!
+            # See https://github.com/Nuullll/intel-extension-for-pytorch/releases/tag/v2.0.110%2Bxpu-master%2Bdll-bundle for details.
+            #
+            # Strengths (over official IPEX 2.0.110 windows release):
+            # - AOT build (for Arc GPU only) to eliminate JIT compilation overhead: https://github.com/intel/intel-extension-for-pytorch/issues/399
+            # - Bundles minimal oneAPI 2023.2 dependencies into the python wheels, so users don't need to install oneAPI for the whole system.
+            # - Provides a compatible torchvision wheel: https://github.com/intel/intel-extension-for-pytorch/issues/465
+            # Limitation:
+            # - Only works for python 3.10
+            url_prefix = "https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.0.110%2Bxpu-master%2Bdll-bundle"
+            torch_command = os.environ.get('TORCH_COMMAND', f"pip install {url_prefix}/torch-2.0.0a0+gite9ebda2-cp310-cp310-win_amd64.whl {url_prefix}/torchvision-0.15.2a0+fa99a53-cp310-cp310-win_amd64.whl {url_prefix}/intel_extension_for_pytorch-2.0.110+gitc6ea20b-cp310-cp310-win_amd64.whl")
+        else:
+            # Using official IPEX release for linux since it's already an AOT build.
+            # However, users still have to install oneAPI toolkit and activate oneAPI environment manually.
+            # See https://intel.github.io/intel-extension-for-pytorch/index.html#installation for details.
+            torch_index_url = os.environ.get('TORCH_INDEX_URL', "https://pytorch-extension.intel.com/release-whl/stable/xpu/us/")
+            torch_command = os.environ.get('TORCH_COMMAND', f"pip install torch==2.0.0a0 intel-extension-for-pytorch==2.0.110+gitba7f6c1 --extra-index-url {torch_index_url}")
     requirements_file = os.environ.get('REQS_FILE', "requirements_versions.txt")

     xformers_package = os.environ.get('XFORMERS_PACKAGE', 'xformers==0.0.20')
@@ -352,6 +372,8 @@ def prepare_environment():
     run(f'"{python}" -m {torch_command}', "Installing torch and torchvision", "Couldn't install torch", live=True)
     startup_timer.record("install torch")

+    if args.use_ipex:
+        args.skip_torch_cuda_test = True
     if not args.skip_torch_cuda_test and not check_run_python("import torch; assert torch.cuda.is_available()"):
         raise RuntimeError(
             'Torch is not able to use GPU; '
diff --git a/webui-ipex-user.bat b/webui-ipex-user.bat
deleted file mode 100644
index ab25a0400..000000000
--- a/webui-ipex-user.bat
+++ /dev/null
@@ -1,19 +0,0 @@
-@echo off
-
-set PYTHON=
-@REM The "Nuullll/intel-extension-for-pytorch" wheels were built from IPEX source for Intel Arc GPU: https://github.com/intel/intel-extension-for-pytorch/tree/xpu-main
-@REM This is NOT an Intel official release so please use it at your own risk!!
-@REM See https://github.com/Nuullll/intel-extension-for-pytorch/releases/tag/v2.0.110%2Bxpu-master%2Bdll-bundle for details.
-@REM
-@REM Strengths (over official IPEX 2.0.110 windows release):
-@REM - AOT build (for Arc GPU only) to eliminate JIT compilation overhead: https://github.com/intel/intel-extension-for-pytorch/issues/399
-@REM - Bundles minimal oneAPI 2023.2 dependencies into the python wheels, so users don't need to install oneAPI for the whole system.
-@REM - Provides a compatible torchvision wheel: https://github.com/intel/intel-extension-for-pytorch/issues/465
-@REM Limitation:
-@REM - Only works for python 3.10
-set "TORCH_COMMAND=pip install https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.0.110%%2Bxpu-master%%2Bdll-bundle/torch-2.0.0a0+gite9ebda2-cp310-cp310-win_amd64.whl https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.0.110%%2Bxpu-master%%2Bdll-bundle/torchvision-0.15.2a0+fa99a53-cp310-cp310-win_amd64.whl https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.0.110%%2Bxpu-master%%2Bdll-bundle/intel_extension_for_pytorch-2.0.110+gitc6ea20b-cp310-cp310-win_amd64.whl"
-set GIT=
-set VENV_DIR=
-set "COMMANDLINE_ARGS=--use-ipex --skip-torch-cuda-test --skip-version-check --opt-sdp-attention"
-
-call webui.bat
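
Note on the mechanism: every workaround in this series is registered through modules.sd_hijack_utils.CondFunc, whose implementation is not part of these patches. The sketch below is a minimal, self-contained illustration of the semantics the patches rely on: CondFunc(path, sub_func, cond_func) patches the callable at `path` so that sub_func(orig_func, ...) handles a call whenever cond_func(orig_func, ...) is truthy, and the original callable handles it otherwise. The helper name cond_func_patch and the math.sqrt example are hypothetical, not code from the repository.

import math

def cond_func_patch(owner, attr_name, sub_func, cond_func):
    # Replace owner.attr_name with a wrapper that routes each call to
    # sub_func(orig_func, ...) when cond_func(orig_func, ...) is truthy
    # and falls through to the original callable otherwise.
    orig_func = getattr(owner, attr_name)

    def wrapper(*args, **kwargs):
        if cond_func(orig_func, *args, **kwargs):
            return sub_func(orig_func, *args, **kwargs)
        return orig_func(*args, **kwargs)

    setattr(owner, attr_name, wrapper)

# Mirrors the dtype workarounds above: fix up the argument only when it
# would otherwise mismatch, then delegate to the original function.
cond_func_patch(
    math, 'sqrt',
    lambda orig, x: orig(float(x)),      # coerce the input, then delegate
    lambda orig, x: isinstance(x, str),  # condition: patch only str inputs
)

print(math.sqrt("9"))  # 3.0, via the patched path
print(math.sqrt(16))   # 4.0, via the original function

This is the same shape as the layer_norm, GroupNorm, Linear, and Conv2d hooks in patch 2: the condition checks for a dtype mismatch against the module's weights, and the substitute casts the input before calling through.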