diff --git a/docs/tutorials/asr_tutorial.ipynb b/docs/tutorials/asr_tutorial.ipynb index ccbfed5d5..d8027b472 100644 --- a/docs/tutorials/asr_tutorial.ipynb +++ b/docs/tutorials/asr_tutorial.ipynb @@ -58,13 +58,13 @@ "from transformers import AutoModelForSpeechSeq2Seq\n", "\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", - "torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32\n", + "dtype = torch.float16 if torch.cuda.is_available() else torch.float32\n", "\n", "model_id = \"openai/whisper-large-v3\"\n", "\n", "model = AutoModelForSpeechSeq2Seq.from_pretrained(\n", " model_id,\n", - " torch_dtype=torch_dtype,\n", + " dtype=dtype,\n", " use_safetensors=True,\n", " low_cpu_mem_usage=True,\n", ")\n", diff --git a/docs/tutorials/baseline_vs_fora.png b/docs/tutorials/baseline_vs_fora.png new file mode 100644 index 000000000..06bd9c9f9 Binary files /dev/null and b/docs/tutorials/baseline_vs_fora.png differ diff --git a/docs/tutorials/deploying_sana_tutorial.ipynb b/docs/tutorials/deploying_sana_tutorial.ipynb index d2d9c8b05..c82a87ac9 100644 --- a/docs/tutorials/deploying_sana_tutorial.ipynb +++ b/docs/tutorials/deploying_sana_tutorial.ipynb @@ -370,7 +370,7 @@ "model_id = \"Efficient-Large-Model/Sana_600M_512px_diffusers\"\n", "\n", "# 2. Load the pre-trained model\n", - "pipe = SanaPipeline.from_pretrained(model_id, variant=\"fp16\", torch_dtype=torch.float16)\n", + "pipe = SanaPipeline.from_pretrained(model_id, variant=\"fp16\", dtype=torch.float16)\n", "pipe = pipe.to(device)\n", "\n", "# 3. Configure Pruna smash\n", @@ -506,9 +506,9 @@ "INFO - Detected diffusers model. Using DiffuserHandler with fixed seed.\n", "- The first element of the batch is passed as input.\n", "- The generated outputs are expected to have .images attribute.\n", - "Pipelines loaded with `dtype=torch.float16` cannot run with `cpu` device. It is not recommended to move them to `cpu` as running them will fail.
Please make sure to use an accelerator to run the pipeline in inference, due to the lack of support for`float16` operations on this device in PyTorch. Please, remove the `torch_dtype=torch.float16` argument, or use another device for inference.\n", - "Pipelines loaded with `dtype=torch.float16` cannot run with `cpu` device. It is not recommended to move them to `cpu` as running them will fail. Please make sure to use an accelerator to run the pipeline in inference, due to the lack of support for`float16` operations on this device in PyTorch. Please, remove the `torch_dtype=torch.float16` argument, or use another device for inference.\n", - "Pipelines loaded with `dtype=torch.float16` cannot run with `cpu` device. It is not recommended to move them to `cpu` as running them will fail. Please make sure to use an accelerator to run the pipeline in inference, due to the lack of support for`float16` operations on this device in PyTorch. Please, remove the `torch_dtype=torch.float16` argument, or use another device for inference.\n", + "Pipelines loaded with `dtype=torch.float16` cannot run with `cpu` device. It is not recommended to move them to `cpu` as running them will fail. Please make sure to use an accelerator to run the pipeline in inference, due to the lack of support for`float16` operations on this device in PyTorch. Please, remove the `dtype=torch.float16` argument, or use another device for inference.\n", + "Pipelines loaded with `dtype=torch.float16` cannot run with `cpu` device. It is not recommended to move them to `cpu` as running them will fail. Please make sure to use an accelerator to run the pipeline in inference, due to the lack of support for`float16` operations on this device in PyTorch. Please, remove the `dtype=torch.float16` argument, or use another device for inference.\n", + "Pipelines loaded with `dtype=torch.float16` cannot run with `cpu` device. It is not recommended to move them to `cpu` as running them will fail. 
Please make sure to use an accelerator to run the pipeline in inference, due to the lack of support for`float16` operations on this device in PyTorch. Please, remove the `dtype=torch.float16` argument, or use another device for inference.\n", "INFO - Evaluating stateful metrics.\n", "INFO - Evaluating isolated inference metrics.\n" ] @@ -545,9 +545,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "Pipelines loaded with `dtype=torch.float16` cannot run with `cpu` device. It is not recommended to move them to `cpu` as running them will fail. Please make sure to use an accelerator to run the pipeline in inference, due to the lack of support for`float16` operations on this device in PyTorch. Please, remove the `torch_dtype=torch.float16` argument, or use another device for inference.\n", - "Pipelines loaded with `dtype=torch.float16` cannot run with `cpu` device. It is not recommended to move them to `cpu` as running them will fail. Please make sure to use an accelerator to run the pipeline in inference, due to the lack of support for`float16` operations on this device in PyTorch. Please, remove the `torch_dtype=torch.float16` argument, or use another device for inference.\n", - "Pipelines loaded with `dtype=torch.float16` cannot run with `cpu` device. It is not recommended to move them to `cpu` as running them will fail. Please make sure to use an accelerator to run the pipeline in inference, due to the lack of support for`float16` operations on this device in PyTorch. Please, remove the `torch_dtype=torch.float16` argument, or use another device for inference.\n" + "Pipelines loaded with `dtype=torch.float16` cannot run with `cpu` device. It is not recommended to move them to `cpu` as running them will fail. Please make sure to use an accelerator to run the pipeline in inference, due to the lack of support for`float16` operations on this device in PyTorch. 
Please, remove the `dtype=torch.float16` argument, or use another device for inference.\n", + "Pipelines loaded with `dtype=torch.float16` cannot run with `cpu` device. It is not recommended to move them to `cpu` as running them will fail. Please make sure to use an accelerator to run the pipeline in inference, due to the lack of support for`float16` operations on this device in PyTorch. Please, remove the `dtype=torch.float16` argument, or use another device for inference.\n", + "Pipelines loaded with `dtype=torch.float16` cannot run with `cpu` device. It is not recommended to move them to `cpu` as running them will fail. Please make sure to use an accelerator to run the pipeline in inference, due to the lack of support for`float16` operations on this device in PyTorch. Please, remove the `dtype=torch.float16` argument, or use another device for inference.\n" ] }, { diff --git a/docs/tutorials/diffusion_quantization_acceleration.ipynb b/docs/tutorials/diffusion_quantization_acceleration.ipynb index 8f0c6928e..9a6671b6b 100644 --- a/docs/tutorials/diffusion_quantization_acceleration.ipynb +++ b/docs/tutorials/diffusion_quantization_acceleration.ipynb @@ -62,7 +62,7 @@ "import torch\n", "from diffusers import FluxPipeline\n", "\n", - "pipe = FluxPipeline.from_pretrained(\"black-forest-labs/FLUX.1-schnell\", torch_dtype=torch.bfloat16).to(\"cuda\")" + "pipe = FluxPipeline.from_pretrained(\"black-forest-labs/FLUX.1-schnell\", dtype=torch.bfloat16).to(\"cuda\")" ] }, { @@ -138,19 +138,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n", - "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n", - "\u001b[1;31mClick here for more info. \n", - "\u001b[1;31mView Jupyter log for further details."
- ] - } - ], + "outputs": [], "source": [ "# Define the prompt\n", "prompt = \"a smiling cat dancing on a table. Miyazaki style\"\n", @@ -167,7 +155,7 @@ ] }, { - "cell_type": "raw", + "cell_type": "markdown", "metadata": { "raw_mimetype": "text/restructuredtext", "vscode": { @@ -183,7 +171,7 @@ ], "metadata": { "kernelspec": { - "display_name": "pruna", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -197,7 +185,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.11" + "version": "3.12.11" } }, "nbformat": 4, diff --git a/docs/tutorials/flux2klein4b_tutorial copy.ipynb b/docs/tutorials/flux2klein4b_tutorial copy.ipynb new file mode 100644 index 000000000..5c2855e50 --- /dev/null +++ b/docs/tutorials/flux2klein4b_tutorial copy.ipynb @@ -0,0 +1,1017 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Optimize Flux2 Klein (4B) Image Generation" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "| Component | Details |\n", + "|-----------|---------|\n", + "| **Goal** | Demonstrate optimizing and evaluating Flux2 Klein 4B with FORA, quantization, and torch compile |\n", + "| **Model** | [black-forest-labs/FLUX.2-klein-base-4B](https://huggingface.co/black-forest-labs/FLUX.2-klein-base-4B) |\n", + "| **Optimization Algorithms** | cacher(fora), quantizer(torchao fp8), compiler(torch_compile) |\n", + "| **Evaluation** | Baseline vs optimized latency comparison |\n", + "\n", + "This tutorial demonstrates how to use **Pruna** to speed up image generation with Flux2 using a combination of three optimization techniques:\n", + "\n", + "1. **FORA (Fast Output Reuse Acceleration)** - Caches transformer block outputs and reuses them for subsequent diffusion steps\n", + "2. 
**TorchAO Quantization (FP8)** - Reduces memory bandwidth by using 8-bit floating point weights\n", + "3. **Torch Compile** - JIT compiles the model for optimized GPU execution\n", + "\n", + "Together, these optimizations can achieve **2-3x speedup** while maintaining image quality.\n", + "\n", + "## Prerequisites\n", + "\n", + "- NVIDIA GPU with CUDA support (compute capability ≥ 8.9 for FP8). Note that FP8 quantization is hardware-specific and requires modern GPUs such as H100.\n", + "- `pruna` library installed (`pip install pruna`)\n", + "- `diffusers` with Flux2 support\n", + "- HuggingFace account with access to Flux2 models\n", + "\n", + "## Getting Started\n", + "\n", + "To install the dependencies, run the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install pruna" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The device is set to the best available option to maximize the benefits of the optimization process." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m\u001b[33mwarning\u001b[39m\u001b[0m\u001b[1m:\u001b[0m \u001b[1mFailed to parse `\u001b[36m/root/sdiazlor/prunatree/pruna/pyproject.toml\u001b[39m` during settings discovery:\n", + " TOML parse error at line 76, column 17\n", + " |\n", + " 76 | exclude-newer = \"1 week\" # protection against compromised dependencies\n", + " | ^^^^^^^^\n", + " failed to parse year in date \"1 week\": failed to parse \"1 we\" as year (a four digit integer): invalid digit, expected 0-9 but got \n", + "\u001b[0m\n", + "\u001b[2mUsing Python 3.11.13 environment at: /root/sdiazlor/prunatree/.venv\u001b[0m\n", + "Name: transformers\n", + "Version: 5.1.0\n", + "Location: /root/sdiazlor/prunatree/.venv/lib/python3.11/site-packages\n", + "Requires: huggingface-hub, numpy, packaging, pyyaml, regex, safetensors, tokenizers, tqdm, typer-slim\n", + "Required-by: deepcache, flute-kernel, gliner, hqq, peft, trl\n" + ] + } + ], + "source": [ + "!uv pip show transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Load the Model\n", + "\n", + "Before optimizing the model, we load the Flux2-Klein-Base-4B pipeline. You need a HuggingFace account with access to the model; run the login cell below with your token. Do not use `enable_model_cpu_offload()` as it interferes with FORA's caching mechanism." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.\n" + ] + } + ], + "source": [ + "# Login to HuggingFace (required for Flux2 model access)\n", + "from huggingface_hub import login\n", + "\n", + "# Replace with your HuggingFace token\n", + "import os\n", + "\n", + "HF_TOKEN = os.getenv(\"HF_TOKEN\")\n", + "login(token=HF_TOKEN)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import copy\n", + "import gc\n", + "import matplotlib.pyplot as plt\n", + "from diffusers import Flux2KleinPipeline\n", + "from pruna import SmashConfig, smash\n", + "\n", + "# Check CUDA availability\n", + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'cuda'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "device" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We load the Flux2-Klein-Base-4B pipeline. This is a 4 billion parameter model optimized for high-quality image generation." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "google/gemma-4-12B-it" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e06194e72b084937a90fdbb2887f844c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "model_index.json: 0%| | 0.00/547 [00:00, ), got .\n", + "Expected types for text_encoder: (,), got .\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1f17f92a342540d6a7d6e07e8868bd28", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/8 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# ============================================================\n", + "# Visual Comparison\n", + "# ============================================================\n", + "fig, axes = plt.subplots(1, 2, figsize=(14, 7))\n", + "\n", + "axes[0].imshow(image_baseline)\n", + "axes[0].set_title(f\"Baseline\\n{avg_baseline:.2f}s\", fontsize=14)\n", + "axes[0].axis(\"off\")\n", + "\n", + "axes[1].imshow(image_fora)\n", + "axes[1].set_title(f\"Optimized (FORA + FP8 + Compile)\\n{avg_fora:.2f}s ({speedup:.2f}x faster)\", fontsize=14)\n", + "axes[1].axis(\"off\")\n", + "\n", + "plt.suptitle(f'Prompt: \"{prompt}\"', fontsize=12, y=1.02)\n", + "plt.tight_layout()\n", + "plt.savefig(\"baseline_vs_fora.png\", dpi=150, bbox_inches=\"tight\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. 
Cleanup" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "37727" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Free GPU memory\n", + "del smashed_model\n", + "torch.cuda.empty_cache()\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "In this tutorial, we demonstrated a workflow for optimizing and evaluating the Flux2 Klein 4B image generation model using Pruna. We defined a `SmashConfig` combining FORA (cacher), TorchAO FP8 quantization, and torch compile, applied it with `smash`, and compared baseline vs optimized latency. The results show that these optimizations can achieve significant speedup while maintaining image quality. You can adapt the configuration to your use case or reach out on [Discord](https://discord.gg/JFQmtFKCjd) for questions." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/tutorials/flux2klein4b_tutorial.ipynb b/docs/tutorials/flux2klein4b_tutorial.ipynb index 24018a1dd..26afb9b6b 100644 --- a/docs/tutorials/flux2klein4b_tutorial.ipynb +++ b/docs/tutorials/flux2klein4b_tutorial.ipynb @@ -1,446 +1,953 @@ { - "cells": [ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Optimize Flux2 Klein (4B) Image Generation" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "| Component | Details 
|\n", + "|-----------|---------|\n", + "| **Goal** | Demonstrate optimizing and evaluating Flux2 Klein 4B with FORA, quantization, and torch compile |\n", + "| **Model** | [black-forest-labs/FLUX.2-klein-base-4B](https://huggingface.co/black-forest-labs/FLUX.2-klein-base-4B) |\n", + "| **Optimization Algorithms** | cacher(fora), quantizer(torchao fp8), compiler(torch_compile) |\n", + "| **Evaluation** | Baseline vs optimized latency comparison |\n", + "\n", + "This tutorial demonstrates how to use **Pruna** to speed up image generation with Flux2 using a combination of three optimization techniques:\n", + "\n", + "1. **FORA (Fast Output Reuse Acceleration)** - Caches transformer block outputs and reuses them for subsequent diffusion steps\n", + "2. **TorchAO Quantization (FP8)** - Reduces memory bandwidth by using 8-bit floating point weights\n", + "3. **Torch Compile** - JIT compiles the model for optimized GPU execution\n", + "\n", + "Together, these optimizations can achieve **2-3x speedup** while maintaining image quality.\n", + "\n", + "## Prerequisites\n", + "\n", + "- NVIDIA GPU with CUDA support (compute capability ≥ 8.9 for FP8). 
Note that FP8 quantization is hardware-specific and requires modern GPUs such as H100.\n", + "- `pruna` library installed (`pip install pruna`)\n", + "- `diffusers` with Flux2 support\n", + "- HuggingFace account with access to Flux2 models\n", + "\n", + "## Getting Started\n", + "\n", + "To install the dependencies, run the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install pruna" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The device is set to the best available option to maximize the benefits of the optimization process." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Optimize Flux2 Klein (4B) Image Generation" + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m\u001b[33mwarning\u001b[39m\u001b[0m\u001b[1m:\u001b[0m \u001b[1mFailed to parse `\u001b[36m/root/sdiazlor/prunatree/pruna/pyproject.toml\u001b[39m` during settings discovery:\n", + " TOML parse error at line 76, column 17\n", + " |\n", + " 76 | exclude-newer = \"1 week\" # protection against compromised dependencies\n", + " | ^^^^^^^^\n", + " failed to parse year in date \"1 week\": failed to parse \"1 we\" as year (a four digit integer): invalid digit, expected 0-9 but got \n", + "\u001b[0m\n", + "\u001b[2mUsing Python 3.11.13 environment at: /root/sdiazlor/prunatree/.venv\u001b[0m\n", + "Name: transformers\n", + "Version: 5.1.0\n", + "Location: /root/sdiazlor/prunatree/.venv/lib/python3.11/site-packages\n", + "Requires: huggingface-hub, numpy, packaging, pyyaml, regex, safetensors, tokenizers, tqdm, typer-slim\n", + "Required-by: deepcache, 
flute-kernel, gliner, hqq, peft, trl\n" + ] + } + ], + "source": [ + "!uv pip show transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Load the Model\n", + "\n", + "Before optimizing the model, we load the Flux2-Klein-Base-4B pipeline. You need a HuggingFace account with access to the model; run the login cell below with your token. Do not use `enable_model_cpu_offload()` as it interferes with FORA's caching mechanism." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.\n" + ] + } + ], + "source": [ + "# Login to HuggingFace (required for Flux2 model access)\n", + "from huggingface_hub import login\n", + "\n", + "# Replace with your HuggingFace token\n", + "import os\n", + "\n", + "HF_TOKEN = os.getenv(\"HF_TOKEN\")\n", + "login(token=HF_TOKEN)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import copy\n", + "import gc\n", + "import matplotlib.pyplot as plt\n", + "from diffusers import Flux2KleinPipeline\n", + "from pruna import SmashConfig, smash\n", + "\n", + "# Check CUDA availability\n", + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'cuda'" ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "device" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We load the Flux2-Klein-Base-4B pipeline. This is a 4 billion parameter model optimized for high-quality image generation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading Flux2 pipeline...\n" + ] }, { - "cell_type": "raw", - "metadata": {}, - "source": [ - "\n", - " \"Open\n", - "" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e32e528c741441d0a6eb4015accb7605", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "model_index.json: 0%| | 0.00/422 [00:00" ] + }, + "metadata": {}, + "output_type": "display_data" } - ], - "metadata": { - "kernelspec": { - "display_name": "fluxnew", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.14" + ], + "source": [ + "# ============================================================\n", + "# Visual Comparison\n", + "# ============================================================\n", + "fig, axes = plt.subplots(1, 2, figsize=(14, 7))\n", + "\n", + "axes[0].imshow(image_baseline)\n", + "axes[0].set_title(f\"Baseline\\n{avg_baseline:.2f}s\", fontsize=14)\n", + "axes[0].axis(\"off\")\n", + "\n", + "axes[1].imshow(image_fora)\n", + "axes[1].set_title(f\"Optimized (FORA + FP8 + Compile)\\n{avg_fora:.2f}s ({speedup:.2f}x faster)\", fontsize=14)\n", + "axes[1].axis(\"off\")\n", + "\n", + "plt.suptitle(f'Prompt: \"{prompt}\"', fontsize=12, y=1.02)\n", + "plt.tight_layout()\n", + "plt.savefig(\"baseline_vs_fora.png\", dpi=150, bbox_inches=\"tight\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. 
Cleanup" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "37727" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" } + ], + "source": [ + "# Free GPU memory\n", + "del smashed_model\n", + "torch.cuda.empty_cache()\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "In this tutorial, we demonstrated a workflow for optimizing and evaluating the Flux2 Klein 4B image generation model using Pruna. We defined a `SmashConfig` combining FORA (cacher), TorchAO FP8 quantization, and torch compile, applied it with `smash`, and compared baseline vs optimized latency. The results show that these optimizations can achieve significant speedup while maintaining image quality. You can adapt the configuration to your use case or reach out on [Discord](https://discord.gg/JFQmtFKCjd) for questions." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 2 + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/docs/tutorials/image_generation.ipynb b/docs/tutorials/image_generation.ipynb index 46d12098c..3b794daa0 100644 --- a/docs/tutorials/image_generation.ipynb +++ b/docs/tutorials/image_generation.ipynb @@ -90,7 +90,7 @@ "\n", "pipe = DiffusionPipeline.from_pretrained(\n", " pretrained_model_name_or_path=\"stabilityai/stable-diffusion-xl-base-1.0\",\n", - " torch_dtype=torch.bfloat16,\n", + " dtype=torch.bfloat16,\n", ")\n", "pipe = pipe.to(device)" ] diff --git a/docs/tutorials/llm_quantization_compilation_acceleration.ipynb
b/docs/tutorials/llm_quantization_compilation_acceleration.ipynb index 7f9ba2aec..a8b250dee 100644 --- a/docs/tutorials/llm_quantization_compilation_acceleration.ipynb +++ b/docs/tutorials/llm_quantization_compilation_acceleration.ipynb @@ -68,7 +68,7 @@ "# We observed better performance with bfloat16 precision.\n", "model = AutoModelForCausalLM.from_pretrained(\n", " model_id,\n", - " torch_dtype=torch.bfloat16,\n", + " dtype=torch.bfloat16,\n", " low_cpu_mem_usage=True,\n", " device_map=\"cuda\",\n", ")\n", diff --git a/docs/tutorials/output.png b/docs/tutorials/output.png new file mode 100644 index 000000000..7ddbef6c9 Binary files /dev/null and b/docs/tutorials/output.png differ diff --git a/docs/tutorials/portable_compilation.ipynb b/docs/tutorials/portable_compilation.ipynb index 747d3714c..a1d7ab06e 100644 --- a/docs/tutorials/portable_compilation.ipynb +++ b/docs/tutorials/portable_compilation.ipynb @@ -73,7 +73,7 @@ "import torch\n", "from diffusers import StableDiffusionPipeline\n", "\n", - "pipe = StableDiffusionPipeline.from_pretrained(\"CompVis/stable-diffusion-v1-4\", torch_dtype=torch.float16)\n", + "pipe = StableDiffusionPipeline.from_pretrained(\"CompVis/stable-diffusion-v1-4\", dtype=torch.float16)\n", "pipe = pipe.to(\"cuda\")\n", "\n", "prompt = \"a photo of an astronaut riding a horse on mars\"" @@ -168,7 +168,7 @@ "\n", "from pruna import PrunaModel, SmashConfig, smash\n", "\n", - "pipe = PrunaModel.from_pretrained(\"smashed_model\", torch_dtype=torch.float16)\n", + "pipe = PrunaModel.from_pretrained(\"smashed_model\", dtype=torch.float16)\n", "prompt = \"a photo of an astronaut riding a horse on mars\"\n", "\n", "for _ in range(2):\n", diff --git a/docs/tutorials/recovery.ipynb b/docs/tutorials/recovery.ipynb index 6e2f3a16c..57a616404 100644 --- a/docs/tutorials/recovery.ipynb +++ b/docs/tutorials/recovery.ipynb @@ -54,7 +54,7 @@ "\n", "pipe = SanaPipeline.from_pretrained(\n", " \"Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers\",\n", - "
torch_dtype=torch.bfloat16,\n", + " dtype=torch.bfloat16,\n", ").to(\"cuda\")\n" ] }, diff --git a/docs/tutorials/ring_attn.ipynb b/docs/tutorials/ring_attn.ipynb index da37c4fc2..566a33922 100644 --- a/docs/tutorials/ring_attn.ipynb +++ b/docs/tutorials/ring_attn.ipynb @@ -46,7 +46,7 @@ "import torch\n", "from diffusers import FluxPipeline\n", "\n", - "pipe = FluxPipeline.from_pretrained(\"black-forest-labs/FLUX.1-dev\", torch_dtype=torch.bfloat16)\n", + "pipe = FluxPipeline.from_pretrained(\"black-forest-labs/FLUX.1-dev\", dtype=torch.bfloat16)\n", "pipe.to(\"cuda\")" ] }, diff --git a/docs/tutorials/sana_diffusers_int8.ipynb b/docs/tutorials/sana_diffusers_int8.ipynb index cc2620b3f..e322bea91 100644 --- a/docs/tutorials/sana_diffusers_int8.ipynb +++ b/docs/tutorials/sana_diffusers_int8.ipynb @@ -67,7 +67,7 @@ "model_id = \"Efficient-Large-Model/Sana_600M_512px_diffusers\"\n", "\n", "# Load the pre-trained model\n", - "pipe = SanaPipeline.from_pretrained(model_id, variant=\"fp16\", torch_dtype=torch.float16)\n", + "pipe = SanaPipeline.from_pretrained(model_id, variant=\"fp16\", dtype=torch.float16)\n", "pipe = pipe.to(\"cuda\")" ] }, diff --git a/docs/tutorials/sd_deepcache.ipynb b/docs/tutorials/sd_deepcache.ipynb index ef9fd78ca..bdfa41aac 100644 --- a/docs/tutorials/sd_deepcache.ipynb +++ b/docs/tutorials/sd_deepcache.ipynb @@ -60,7 +60,7 @@ "model_id = \"CompVis/stable-diffusion-v1-4\"\n", "\n", "# Load the pre-trained model\n", - "pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)\n", + "pipe = StableDiffusionPipeline.from_pretrained(model_id, dtype=torch.float16)\n", "pipe = pipe.to(\"cuda\")" ] }, diff --git a/docs/tutorials/target_modules_quanto.ipynb b/docs/tutorials/target_modules_quanto.ipynb index f0c9f2ff9..f1ea3d1f9 100644 --- a/docs/tutorials/target_modules_quanto.ipynb +++ b/docs/tutorials/target_modules_quanto.ipynb @@ -106,7 +106,7 @@ "model_id = \"black-forest-labs/FLUX.1-dev\"\n", "pipe = DiffusionPipeline.from_pretrained(\n", "
pretrained_model_name_or_path=model_id,\n", - " torch_dtype=torch.bfloat16,\n", + " dtype=torch.bfloat16,\n", ")\n", "pipe = pipe.to(device)" ] @@ -229,7 +229,7 @@ "source": [ "pipe = DiffusionPipeline.from_pretrained(\n", " pretrained_model_name_or_path=model_id,\n", - " torch_dtype=torch.bfloat16,\n", + " dtype=torch.bfloat16,\n", ")\n", "pipe = pipe.to(device)" ] diff --git a/docs/tutorials/video_generation.ipynb b/docs/tutorials/video_generation.ipynb index e79f64a4c..9da796115 100644 --- a/docs/tutorials/video_generation.ipynb +++ b/docs/tutorials/video_generation.ipynb @@ -90,9 +90,9 @@ "\n", "model_id = \"Wan-AI/Wan2.1-T2V-1.3B-Diffusers\"\n", "\n", - "vae = AutoencoderKLWan.from_pretrained(model_id, subfolder=\"vae\", torch_dtype=torch.float32)\n", + "vae = AutoencoderKLWan.from_pretrained(model_id, subfolder=\"vae\", dtype=torch.float32)\n", "\n", - "pipe = WanPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16).to(device)" + "pipe = WanPipeline.from_pretrained(model_id, vae=vae, dtype=torch.bfloat16).to(device)" ] }, { @@ -130,20 +130,6 @@ "export_to_video(output, \"base_video.mp4\", fps=15)" ] }, - { - "cell_type": "raw", - "metadata": { - "vscode": { - "languageId": "raw" - } - }, - "source": [ - "" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -239,20 +225,6 @@ "export_to_video(output, \"smashed_video.mp4\", fps=15)" ] }, - { - "cell_type": "raw", - "metadata": { - "vscode": { - "languageId": "raw" - } - }, - "source": [ - "" - ] - }, { "cell_type": "markdown", "metadata": {}, diff --git a/docs/user_manual/configure.rst b/docs/user_manual/configure.rst index 4bfb8a67c..be034f7ee 100644 --- a/docs/user_manual/configure.rst +++ b/docs/user_manual/configure.rst @@ -248,7 +248,7 @@ Underneath you can find the list of all the available datasets.
- Custom Collate Function - Collate Function Arguments * - Text Generation - - `WikiText `_, `SmolTalk `_, `SmolSmolTalk `_, `PubChem `_, `OpenAssistant `_, `C4 `_ + - `WikiText `_, `TinyWikiText `_, `SmolTalk `_, `SmolSmolTalk `_, `PubChem `_, `OpenAssistant `_, `C4 `_, `TinyIMDB `_, `C4 `_ - ``text_generation_collate`` - ``text: str`` * - Image Generation diff --git a/docs/user_manual/save_load.rst b/docs/user_manual/save_load.rst index 6a31b3b9d..c6c28d40d 100644 --- a/docs/user_manual/save_load.rst +++ b/docs/user_manual/save_load.rst @@ -166,7 +166,7 @@ So, when the base model was loaded with e.g. a specific precision: import torch from diffusers import DiffusionPipeline - base_model = DiffusionPipeline.from_pretrained("segmind/Segmind-Vega", torch_dtype=torch.float16) + base_model = DiffusionPipeline.from_pretrained("segmind/Segmind-Vega", dtype=torch.float16) You should also load the smashed model as follows: @@ -174,7 +174,7 @@ You should also load the smashed model as follows: from pruna import PrunaModel - loaded_model = PrunaModel.from_pretrained("PrunaAI/Segmind-Vega-smashed", torch_dtype=torch.float16) + loaded_model = PrunaModel.from_pretrained("PrunaAI/Segmind-Vega-smashed", dtype=torch.float16) Depending on the saving function of the algorithm combination not all keyword arguments are required for loading (e.g. some are set by the algorithm combination itself). In that case, we discard and log a warning about unused keyword arguments. 
diff --git a/docs/user_manual/smash.rst b/docs/user_manual/smash.rst index 3916bd0c3..e06bedde3 100644 --- a/docs/user_manual/smash.rst +++ b/docs/user_manual/smash.rst @@ -236,7 +236,7 @@ Example 3: Speech Recognition Optimization # Load the model model_id = "openai/whisper-tiny" - model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True).to("cuda") + model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, dtype=torch.float16, low_cpu_mem_usage=True).to("cuda") # Create and configure SmashConfig smash_config = SmashConfig(["c_whisper", "whisper_s2t"])