diff --git a/Analysis.md b/Analysis.md index dc891ac..de3eb37 100644 --- a/Analysis.md +++ b/Analysis.md @@ -1,11 +1,27 @@ ### Metrics -#### Stable Diffusion on Nvidia Geforce GTX 1070 GPU +#### Stable Diffusion UNet on Nvidia Geforce GTX 1070 GPU -Average inference time: 28.63s ± 3.09s +see `results/benchmarks/benchmark_results.csv` -Average CPU memory usage: 890.11MB ± 59.32MB +``` bash +INFO - 2025-09-01 16:26:26,203 - 3315694995.py - UNet Inference time: 0.10s +INFO - 2025-09-01 16:26:26,448 - 3315694995.py - UNet Inference time: 0.24s +INFO - 2025-09-01 16:26:26,711 - 3315694995.py - UNet Inference time: 0.26s +INFO - 2025-09-01 16:26:26,974 - 3315694995.py - UNet Inference time: 0.26s +INFO - 2025-09-01 16:26:27,238 - 3315694995.py - UNet Inference time: 0.26s +INFO - 2025-09-01 16:26:27,494 - 3315694995.py - UNet Inference time: 0.25s +INFO - 2025-09-01 16:26:27,760 - 3315694995.py - UNet Inference time: 0.27s +INFO - 2025-09-01 16:26:28,017 - 3315694995.py - UNet Inference time: 0.26s +INFO - 2025-09-01 16:26:28,285 - 3315694995.py - UNet Inference time: 0.27s +INFO - 2025-09-01 16:26:28,538 - 3315694995.py - UNet Inference time: 0.25s +INFO - 2025-09-01 16:26:28,538 - 3315694995.py - -Average GPU memory usage: 2486.43MB ± 0.00MB +Average inference time: 0.24s ± 0.05s -see `results/benchmarks/benchmark_results.csv` +Average CPU memory usage: 1009.82MB ± 0.03MB + +Average GPU memory usage: 3817.92MB ± 0.00MB +``` + +--- diff --git a/notebooks/baseline_generation.ipynb b/notebooks/baseline_generation.ipynb index 7d00d16..dcdc903 100644 --- a/notebooks/baseline_generation.ipynb +++ b/notebooks/baseline_generation.ipynb @@ -5,7 +5,9 @@ "id": "0", "metadata": {}, "source": [ - "Choosing to use a LoRA / Distilled model because its lighter, faster, lower VRAM, easier for experimentation, perfect for a baseline and for quantization/optimization later." 
+ "An important thing to keep in mind here is that a stable diffusion model is not a monolithic model but has different parts such as a UNet, VAE, text encoders etc. \n", + "\n", + "_Here (at least initially) we will only focus on and benchmark the UNet_ because it's the most compute-heavy. And this approach is simpler than trying to export to ONNX, apply PTQ, QAT etc on all the components." ] }, { @@ -22,8 +24,8 @@ "import statistics\n", "import psutil\n", "import pandas as pd \n", - "import matplotlib.pyplot as plt\n", - "from PIL import Image" + "from tinydiffusion.utils.logger import LoggerConfig #this works in VS Code because of the .env file\n", + "from tinydiffusion.utils.constants import PROMPT" ] }, { @@ -32,14 +34,24 @@ "id": "2", "metadata": {}, "outputs": [], + "source": [ + "LOGGER = LoggerConfig().logger" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3", + "metadata": {}, + "outputs": [], "source": [ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", - "print(f\"Using device: {device}\")" + "LOGGER.info(f\"Using device: {device}\")" ] }, { "cell_type": "markdown", - "id": "3", + "id": "4", "metadata": {}, "source": [ "Load a lightweight/distilled Stable Diffusion model (LoRA or small variant)" @@ -48,29 +60,31 @@ { "cell_type": "code", "execution_count": null, - "id": "4", + "id": "5", "metadata": {}, "outputs": [], "source": [ "ROOT_DIR = os.path.dirname(os.getcwd())\n", - "print(f\"Root directory: {ROOT_DIR}\")" + "LOGGER.info(f\"Root directory: {ROOT_DIR}\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "5", + "id": "6", "metadata": {}, "outputs": [], "source": [ + "from tinydiffusion.utils.constants import ModelType\n", + "\n", "# Example: \"stabilityai/stable-diffusion-2-base\" is smaller than SD 1.5 full\n", "model_cache_dir = os.path.join(ROOT_DIR, \"checkpoints\", \"stablediffusion\")\n", - "model_id = \"stabilityai/stable-diffusion-2-base\" # This is not LoRA checkpoint" + "model_id 
= ModelType.STABLE_DIFFUSION_2_BASE.value " ] }, { "cell_type": "markdown", - "id": "6", + "id": "7", "metadata": {}, "source": [ "Below we load the fp16 variant (as opposed to downloading the fp32 variant and then converting to fp16). [Ref](https://huggingface.co/docs/diffusers/en/using-diffusers/loading#:~:text=There%20are%20two%20important%20arguments%20for%20loading%20variants%3A)" @@ -79,7 +93,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7", + "id": "8", "metadata": {}, "outputs": [], "source": [ @@ -98,12 +112,13 @@ " cache_dir=model_cache_dir,\n", " torch_dtype=torch.float32 \n", " )\n", - "pipe = pipe.to(device)" + "pipe = pipe.to(device)\n", + "pipe.unet.eval()" ] }, { "cell_type": "markdown", - "id": "8", + "id": "9", "metadata": {}, "source": [ "[StableDiffusionPipeline.enable_attention_slicing()](https://huggingface.co/docs/diffusers/v0.3.0/en/api/pipelines/stable_diffusion#diffusers.StableDiffusionPipeline.enable_attention_slicing)" @@ -112,7 +127,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9", + "id": "10", "metadata": {}, "outputs": [], "source": [ @@ -122,7 +137,50 @@ }, { "cell_type": "markdown", - "id": "10", + "id": "11", + "metadata": {}, + "source": [ + "Create inputs of UNet since as noted at the beginning of the notebook, we intend to benchmark just the UNet. So we need to explicitly pass the inputs to the UNet and get just the UNet output.\n", + "\n", + "See [this](https://medium.com/@onkarmishra/stable-diffusion-explained-1f101284484d) for a quick reference about the stable diffusion architecture.\n", + "\n", + "Instead of running text encoder → U-Net denoising loop → VAE decode we:\n", + "\n", + "- Generate fake random latents (the \"noisy\" image at some timestep).\n", + "- Pick a timestep (e.g. 
50).\n", + "- Encode text prompt via `pipe.text_encoder` i.e., Stable Diffusion's text encoder (whatever it uses).\n", + "- Run just the U-Net forward pass.\n", + "\n", + "This means we don't actually generate a final image." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12", + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = 1\n", + "height = width = 64 # for 512x512 images\n", + "\n", + "# dummy latent image representation for 1 image\n", + "latents = torch.randn(\n", + " (batch_size, pipe.unet.config.in_channels, height, width),\n", + " device=device,\n", + " dtype=pipe.unet.dtype\n", + ")\n", + "timestep = torch.tensor([10], device=device, dtype=torch.int64) # arbitrary diffusion step\n", + "text_embeddings = pipe.text_encoder(\n", + " pipe.tokenizer(PROMPT, return_tensors=\"pt\").input_ids.to(device)\n", + ")[0]\n", + "\n", + "LOGGER.info(f\"Text embeddings shape: {text_embeddings.shape}\")" + ] + }, + { + "cell_type": "markdown", + "id": "13", "metadata": {}, "source": [ "Benchmarking" @@ -131,7 +189,7 @@ { "cell_type": "code", "execution_count": null, - "id": "11", + "id": "14", "metadata": {}, "outputs": [], "source": [ @@ -144,26 +202,27 @@ { "cell_type": "code", "execution_count": null, - "id": "12", + "id": "15", "metadata": {}, "outputs": [], "source": [ - "prompt = \"A whale falling through a starry sky beside a floating bowl of petunias, painted in a surreal \" \\\n", - " \"cosmic landscape, whimsical and dreamlike, detailed digital art.\"\n", + "prompt = PROMPT\n", "num_samples = 10\n", "\n", "process = psutil.Process(os.getpid())\n", "\n", - "GEN_IMG_SAVE_PATH = os.path.join(os.path.dirname(os.getcwd()), \"results\", \"generated_images\")\n", - "os.makedirs(GEN_IMG_SAVE_PATH, exist_ok=True)\n", + "#GEN_IMG_SAVE_PATH = os.path.join(os.path.dirname(os.getcwd()), \"results\", \"generated_images\")\n", + "#os.makedirs(GEN_IMG_SAVE_PATH, exist_ok=True)\n", "\n", - "print(f\"Generating images. 
Will be saved to: {GEN_IMG_SAVE_PATH}\")\n", + "#LOGGER.info(f\"Generating images. Will be saved to: {GEN_IMG_SAVE_PATH}\")\n", "\n", "results = []\n", "\n", "for i in range(num_samples):\n", " start_time = time.time()\n", - " image = pipe(prompt, guidance_scale=7.5, num_inference_steps=50).images[0]\n", + " with torch.no_grad():\n", + " #image = pipe(prompt, guidance_scale=7.5, num_inference_steps=50).images[0] # this is what we would typically do to generate the image\n", + " noise_pred = pipe.unet(latents, timestep, text_embeddings).sample\n", " end_time = time.time()\n", " inference_time.append(end_time - start_time)\n", "\n", @@ -178,17 +237,16 @@ " gpu_mem = 0 \n", " # Memory usage - END\n", "\n", - " print(f\"Saved sample_{i+1}.png | Inference time: {(end_time - start_time):.2f}s\", end=\"\\n\")\n", - " image.save(os.path.join(GEN_IMG_SAVE_PATH, f\"sample_{i}.png\"))\n", + " LOGGER.info(f\"UNet Inference time: {(end_time - start_time):.2f}s\")\n", "\n", - "print(f\"\\nAverage inference time: {statistics.mean(inference_time):.2f}s ± {statistics.stdev(inference_time):.2f}s\")\n", - "print(f\"\\nAverage CPU memory usage: {statistics.mean(cpu_mem_usage):.2f}MB ± {statistics.stdev(cpu_mem_usage):.2f}MB\")\n", + "LOGGER.info(f\"\\nAverage inference time: {statistics.mean(inference_time):.2f}s ± {statistics.stdev(inference_time):.2f}s\")\n", + "LOGGER.info(f\"\\nAverage CPU memory usage: {statistics.mean(cpu_mem_usage):.2f}MB ± {statistics.stdev(cpu_mem_usage):.2f}MB\")\n", "if device == \"cuda\":\n", - " print(f\"\\nAverage GPU memory usage: {statistics.mean(gpu_mem_usage):.2f}MB ± {statistics.stdev(gpu_mem_usage):.2f}MB\")\n", + " LOGGER.info(f\"\\nAverage GPU memory usage: {statistics.mean(gpu_mem_usage):.2f}MB ± {statistics.stdev(gpu_mem_usage):.2f}MB\")\n", "\n", "# store results\n", "results.append({\n", - " \"desc\": \"stable_diffusion_GPU\",\n", + " \"desc\": \"stable_diffusion_UNet_GPU\",\n", " \"avg_inference_time\": statistics.mean(inference_time),\n", " 
\"std_inference_time\": statistics.stdev(inference_time),\n", " \"avg_cpu_mem_usage\": statistics.mean(cpu_mem_usage),\n", @@ -200,7 +258,15 @@ }, { "cell_type": "markdown", - "id": "13", + "id": "16", + "metadata": {}, + "source": [ + "The inference time would be really small here because we're running only one denoising step of the UNet as opposed to say 50 denoising steps. " + ] + }, + { + "cell_type": "markdown", + "id": "17", "metadata": {}, "source": [ "Save benchmark details as CSV" @@ -209,7 +275,7 @@ { "cell_type": "code", "execution_count": null, - "id": "14", + "id": "18", "metadata": {}, "outputs": [], "source": [ @@ -220,20 +286,20 @@ { "cell_type": "code", "execution_count": null, - "id": "15", + "id": "19", "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(results)\n", "csv_path = os.path.join(BENCHMARK_SAVE_PATH, \"benchmark_results.csv\")\n", "df.to_csv(csv_path, index=False)\n", - "print(f\"Saved benchmark results to {csv_path}\")" + "LOGGER.info(f\"Saved benchmark results to {csv_path}\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "16", + "id": "20", "metadata": {}, "outputs": [], "source": [] diff --git a/results/benchmarks/benchmark_results.csv b/results/benchmarks/benchmark_results.csv index aef6978..49331d3 100644 --- a/results/benchmarks/benchmark_results.csv +++ b/results/benchmarks/benchmark_results.csv @@ -1,2 +1,2 @@ desc,avg_inference_time,std_inference_time,avg_cpu_mem_usage,std_cpu_mem_usage,avg_gpu_mem_usage,std_gpu_mem_usage -stable_diffusion_GPU,28.794894647598266,2.808588022429784,897.65859375,1.677354443989871,2486.42822265625,0.0 +stable_diffusion_UNet_GPU,0.24329500198364257,0.04984202499570953,1009.8203125,0.03146626555623586,3817.91748046875,0.0 diff --git a/tinydiffusion/src/onnx_export.py b/tinydiffusion/src/onnx_export.py new file mode 100644 index 0000000..a7f64f0 --- /dev/null +++ b/tinydiffusion/src/onnx_export.py @@ -0,0 +1,115 @@ +""" +This script exports the UNet model of the Stable 
Diffusion pipeline to ONNX format.
+"""
+
+import os
+import torch
+from diffusers import StableDiffusionPipeline
+
+from tinydiffusion.utils.constants import ModelType, PROMPT
+from tinydiffusion.utils.logger import LoggerConfig
+
+LOGGER = LoggerConfig().logger
+
+# Repo root, anchored to this file (not the CWD) so the script works from anywhere.
+ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
+
+# ONNX export save path
+onnx_path = os.path.join(ROOT_DIR, "checkpoints", "onnx", "unet.onnx")
+os.makedirs(os.path.dirname(onnx_path), exist_ok=True)
+
+# Model cache directory (create the directory itself, not just its parent)
+model_cache_dir = os.path.join(ROOT_DIR, "checkpoints", "stablediffusion")
+os.makedirs(model_cache_dir, exist_ok=True)
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+LOGGER.info(f"Using device: {device}")
+
+
+def load_sd_pipeline(model_id: str) -> StableDiffusionPipeline:
+    """
+    Load the Stable Diffusion pipeline on the module-level ``device``.
+
+    On CUDA the fp16 weight variant is loaded; on CPU the weights load as fp32.
+
+    Args:
+        model_id (str): The model ID from HuggingFace to load.
+
+    Returns:
+        StableDiffusionPipeline: The pipeline moved to ``device``, with attention
+            slicing enabled and the UNet in eval mode.
+    """
+
+    if device == "cuda":
+        dtype_kwargs = {"variant": "fp16", "torch_dtype": torch.float16}
+    else:
+        # for CPU use fp32 if available
+        dtype_kwargs = {"torch_dtype": torch.float32}
+
+    pipe = StableDiffusionPipeline.from_pretrained(
+        model_id, cache_dir=model_cache_dir, **dtype_kwargs
+    )
+    pipe = pipe.to(device)
+    pipe.enable_attention_slicing()
+    pipe.unet.eval()  # Set to eval because we want to do inference and not training
+
+    return pipe
+
+
+def export_onnx_model(model_id: str) -> None:
+    """
+    Export the UNet model of the Stable Diffusion pipeline to ONNX format.
+
+    Nothing is loaded or exported if the file at ``onnx_path`` already exists.
+
+    Args:
+        model_id (str): The model ID from HuggingFace to export.
+    """
+    # Check first: loading the pipeline is expensive and pointless when skipping.
+    if os.path.exists(onnx_path):
+        LOGGER.info(f"ONNX model already exists at {onnx_path}")
+        return
+
+    pipe = load_sd_pipeline(model_id)
+
+    batch_size = 1
+    height = width = 64  # for 512x512 images
+    latent_shape = (batch_size, pipe.unet.config.in_channels, height, width)
+    dummy_latents = torch.randn(latent_shape, device=device, dtype=pipe.unet.dtype)
+    # arbitrary diffusion step
+    dummy_timestep = torch.tensor([10], device=device, dtype=torch.int64)
+
+    # Pad/truncate to the tokenizer's max length so the traced text-embedding
+    # shape does not depend on how many tokens PROMPT happens to contain.
+    tokenized = pipe.tokenizer(
+        PROMPT,
+        padding="max_length",
+        max_length=pipe.tokenizer.model_max_length,
+        truncation=True,
+        return_tensors="pt",
+    ).input_ids.to(device)
+
+    # Inference only: avoid building an autograd graph for the encoder and trace.
+    with torch.no_grad():
+        text_embeddings = pipe.text_encoder(tokenized)[0]
+        torch.onnx.export(
+            pipe.unet,
+            (dummy_latents, dummy_timestep, text_embeddings),
+            onnx_path,
+            export_params=True,
+            opset_version=17,
+            input_names=["latents", "timestep", "text_embeddings"],
+            output_names=["output"],
+            dynamic_axes={
+                "latents": {0: "batch", 2: "height", 3: "width"},
+                "text_embeddings": {0: "batch", 1: "sequence"},
+                "output": {0: "batch", 2: "height", 3: "width"},
+            },
+        )
+    LOGGER.info(f"Saved ONNX model to {onnx_path}")
+
+
+if __name__ == "__main__":
+    model_id = ModelType.STABLE_DIFFUSION_2_BASE.value
+    export_onnx_model(model_id)
diff --git a/tinydiffusion/utils/constants.py b/tinydiffusion/utils/constants.py
new file mode 100644
index 0000000..985c0e2
--- /dev/null
+++ b/tinydiffusion/utils/constants.py
@@ -0,0 +1,17 @@
+"""Shared constants for the tinydiffusion notebooks and scripts."""
+
+from enum import Enum
+
+# Text prompt fed to the text encoder when benchmarking/exporting the UNet.
+PROMPT = (
+    "A whale falling through a starry sky beside a floating bowl of petunias, painted in a surreal "
+    "cosmic landscape, whimsical and dreamlike, detailed digital art."
+)
+
+
+class ModelType(Enum):
+    """HuggingFace model IDs that can be passed to ``from_pretrained``."""
+
+    STABLE_DIFFUSION_2_BASE = (
+        "stabilityai/stable-diffusion-2-base"  # This is not LoRA checkpoint
+    )