From c02cbbfa316057cb71e421ea17e2e59ff84ebc99 Mon Sep 17 00:00:00 2001
From: Divyendu Dutta <connect2divyendu@gmail.com>
Date: Mon, 1 Sep 2025 16:35:16 +0200
Subject: [PATCH 1/4] [FEAT] Modify benchmark to evaluate only UNet component
 of Stable Diffusion

Simplified the benchmarking pipeline to focus exclusively on the UNet module.
---
 notebooks/baseline_generation.ipynb | 132 +++++++++++++++++++++-------
 1 file changed, 98 insertions(+), 34 deletions(-)

diff --git a/notebooks/baseline_generation.ipynb b/notebooks/baseline_generation.ipynb
index 7d00d16..ee32272 100644
--- a/notebooks/baseline_generation.ipynb
+++ b/notebooks/baseline_generation.ipynb
@@ -5,7 +5,9 @@
    "id": "0",
    "metadata": {},
    "source": [
-    "Choosing to use a LoRA / Distilled model because its lighter, faster, lower VRAM, easier for experimentation, perfect for a baseline and for quantization/optimization later."
+    "An important thing to keep in mind here is that a stable diffusion model is not a monolithic model but has different parts such as a UNet, VAE, text encoders etc. \n",
+    "\n",
+    "_Here (at least initially) we will only focus on and benchmark the UNet_ because it is the most compute-heavy component. This approach is also simpler than trying to export to ONNX, apply PTQ, QAT, etc. on all the components."
    ]
   },
   {
@@ -22,8 +24,8 @@
     "import statistics\n",
     "import psutil\n",
     "import pandas as pd \n",
-    "import matplotlib.pyplot as plt\n",
-    "from PIL import Image"
+    "from tinydiffusion.utils.logger import LoggerConfig #this works in VS Code because of the .env file\n",
+    "from tinydiffusion.utils.constants import PROMPT"
    ]
   },
   {
@@ -32,14 +34,24 @@
    "id": "2",
    "metadata": {},
    "outputs": [],
+   "source": [
+    "LOGGER = LoggerConfig().logger"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3",
+   "metadata": {},
+   "outputs": [],
    "source": [
     "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
-    "print(f\"Using device: {device}\")"
+    "LOGGER.info(f\"Using device: {device}\")"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "3",
+   "id": "4",
    "metadata": {},
    "source": [
     "Load a lightweight/distilled Stable Diffusion model (LoRA or small variant)"
@@ -48,29 +60,31 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "4",
+   "id": "5",
    "metadata": {},
    "outputs": [],
    "source": [
     "ROOT_DIR = os.path.dirname(os.getcwd())\n",
-    "print(f\"Root directory: {ROOT_DIR}\")"
+    "LOGGER.info(f\"Root directory: {ROOT_DIR}\")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "5",
+   "id": "6",
    "metadata": {},
    "outputs": [],
    "source": [
+    "from tinydiffusion.utils.constants import ModelType\n",
+    "\n",
     "# Example: \"stabilityai/stable-diffusion-2-base\" is smaller than SD 1.5 full\n",
     "model_cache_dir = os.path.join(ROOT_DIR, \"checkpoints\", \"stablediffusion\")\n",
-    "model_id = \"stabilityai/stable-diffusion-2-base\"  # This is not LoRA checkpoint"
+    "model_id = ModelType.STABLE_DIFFUSION_2_BASE.value  "
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "6",
+   "id": "7",
    "metadata": {},
    "source": [
     "Below we load the fp16 variant (as opposed to downloading the fp32 variant and then converting to fp16). [Ref](https://huggingface.co/docs/diffusers/en/using-diffusers/loading#:~:text=There%20are%20two%20important%20arguments%20for%20loading%20variants%3A)"
@@ -79,7 +93,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "7",
+   "id": "8",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -98,12 +112,13 @@
     "        cache_dir=model_cache_dir,\n",
     "        torch_dtype=torch.float32 \n",
     "    )\n",
-    "pipe = pipe.to(device)"
+    "pipe = pipe.to(device)\n",
+    "pipe.unet.eval()"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "8",
+   "id": "9",
    "metadata": {},
    "source": [
     "[StableDiffusionPipeline.enable_attention_slicing()](https://huggingface.co/docs/diffusers/v0.3.0/en/api/pipelines/stable_diffusion#diffusers.StableDiffusionPipeline.enable_attention_slicing)"
@@ -112,7 +127,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "9",
+   "id": "10",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -122,7 +137,48 @@
   },
   {
    "cell_type": "markdown",
-   "id": "10",
+   "id": "11",
+   "metadata": {},
+   "source": [
+    "Create the inputs of the UNet: as noted at the beginning of the notebook, we intend to benchmark just the UNet, so we need to pass its inputs explicitly and take just the UNet output.\n",
+    "\n",
+    "See [this](https://medium.com/@onkarmishra/stable-diffusion-explained-1f101284484d) for a quick reference about the stable diffusion architecture.\n",
+    "\n",
+    "Instead of running text encoder → U-Net denoising loop → VAE decode we:\n",
+    "\n",
+    "- Generate fake random latents (the \"noisy\" image at some timestep).\n",
+    "- Pick a timestep (e.g. 10).\n",
+    "- Encode the text prompt via `pipe.text_encoder`, i.e. Stable Diffusion's text encoder (whatever it uses).\n",
+    "- Run just the U-Net forward pass.\n",
+    "\n",
+    "This means we don't actually generate a final image."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "12",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "batch_size = 1\n",
+    "height = width = 64\n",
+    "latents = torch.randn(\n",
+    "    (batch_size, pipe.unet.config.in_channels, height, width),\n",
+    "    device=device,\n",
+    "    dtype=pipe.unet.dtype\n",
+    ")\n",
+    "timestep = torch.tensor([10], device=device, dtype=torch.int64)  # arbitrary diffusion step\n",
+    "text_embeddings = pipe.text_encoder(\n",
+    "    pipe.tokenizer(PROMPT, return_tensors=\"pt\").input_ids.to(device)\n",
+    ")[0]\n",
+    "\n",
+    "LOGGER.info(f\"Text embeddings shape: {text_embeddings.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "13",
    "metadata": {},
    "source": [
     "Benchmarking"
@@ -131,7 +187,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "11",
+   "id": "14",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -144,26 +200,27 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "12",
+   "id": "15",
    "metadata": {},
    "outputs": [],
    "source": [
-    "prompt = \"A whale falling through a starry sky beside a floating bowl of petunias, painted in a surreal \" \\\n",
-    "         \"cosmic landscape, whimsical and dreamlike, detailed digital art.\"\n",
+    "prompt = PROMPT\n",
     "num_samples = 10\n",
     "\n",
     "process = psutil.Process(os.getpid())\n",
     "\n",
-    "GEN_IMG_SAVE_PATH = os.path.join(os.path.dirname(os.getcwd()), \"results\", \"generated_images\")\n",
-    "os.makedirs(GEN_IMG_SAVE_PATH, exist_ok=True)\n",
+    "#GEN_IMG_SAVE_PATH = os.path.join(os.path.dirname(os.getcwd()), \"results\", \"generated_images\")\n",
+    "#os.makedirs(GEN_IMG_SAVE_PATH, exist_ok=True)\n",
     "\n",
-    "print(f\"Generating images. Will be saved to: {GEN_IMG_SAVE_PATH}\")\n",
+    "#LOGGER.info(f\"Generating images. Will be saved to: {GEN_IMG_SAVE_PATH}\")\n",
     "\n",
     "results = []\n",
     "\n",
     "for i in range(num_samples):\n",
     "    start_time = time.time()\n",
-    "    image = pipe(prompt, guidance_scale=7.5, num_inference_steps=50).images[0]\n",
+    "    with torch.no_grad():  # UNet forward only; the full pipeline would be pipe(prompt, guidance_scale=7.5, num_inference_steps=50).images[0]\n",
+    "        noise_pred = pipe.unet(latents, timestep, text_embeddings).sample\n",
+    "    if device == \"cuda\": torch.cuda.synchronize()  # CUDA kernels run asynchronously; wait for them before stopping the timer\n",
     "    end_time = time.time()\n",
     "    inference_time.append(end_time - start_time)\n",
     "\n",
@@ -178,17 +235,16 @@
     "        gpu_mem = 0 \n",
     "    # Memory usage - END\n",
     "\n",
-    "    print(f\"Saved sample_{i+1}.png | Inference time: {(end_time - start_time):.2f}s\", end=\"\\n\")\n",
-    "    image.save(os.path.join(GEN_IMG_SAVE_PATH, f\"sample_{i}.png\"))\n",
+    "    LOGGER.info(f\"UNet Inference time: {(end_time - start_time):.2f}s\")\n",
     "\n",
-    "print(f\"\\nAverage inference time: {statistics.mean(inference_time):.2f}s ± {statistics.stdev(inference_time):.2f}s\")\n",
-    "print(f\"\\nAverage CPU memory usage: {statistics.mean(cpu_mem_usage):.2f}MB ± {statistics.stdev(cpu_mem_usage):.2f}MB\")\n",
+    "LOGGER.info(f\"\\nAverage inference time: {statistics.mean(inference_time):.2f}s ± {statistics.stdev(inference_time):.2f}s\")\n",
+    "LOGGER.info(f\"\\nAverage CPU memory usage: {statistics.mean(cpu_mem_usage):.2f}MB ± {statistics.stdev(cpu_mem_usage):.2f}MB\")\n",
     "if device == \"cuda\":\n",
-    "    print(f\"\\nAverage GPU memory usage: {statistics.mean(gpu_mem_usage):.2f}MB ± {statistics.stdev(gpu_mem_usage):.2f}MB\")\n",
+    "    LOGGER.info(f\"\\nAverage GPU memory usage: {statistics.mean(gpu_mem_usage):.2f}MB ± {statistics.stdev(gpu_mem_usage):.2f}MB\")\n",
     "\n",
     "# store results\n",
     "results.append({\n",
-    "    \"desc\": \"stable_diffusion_GPU\",\n",
+    "    \"desc\": \"stable_diffusion_UNet_GPU\",\n",
     "    \"avg_inference_time\": statistics.mean(inference_time),\n",
     "    \"std_inference_time\": statistics.stdev(inference_time),\n",
     "    \"avg_cpu_mem_usage\": statistics.mean(cpu_mem_usage),\n",
@@ -200,7 +256,15 @@
   },
   {
    "cell_type": "markdown",
-   "id": "13",
+   "id": "16",
+   "metadata": {},
+   "source": [
+    "The inference time would be really small here because we're running only one denoising step of the UNet as opposed to say 50 denoising steps. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "17",
    "metadata": {},
    "source": [
     "Save benchmark details as CSV"
@@ -209,7 +273,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "14",
+   "id": "18",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -220,20 +284,20 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "15",
+   "id": "19",
    "metadata": {},
    "outputs": [],
    "source": [
     "df = pd.DataFrame(results)\n",
     "csv_path = os.path.join(BENCHMARK_SAVE_PATH, \"benchmark_results.csv\")\n",
     "df.to_csv(csv_path, index=False)\n",
-    "print(f\"Saved benchmark results to {csv_path}\")"
+    "LOGGER.info(f\"Saved benchmark results to {csv_path}\")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "16",
+   "id": "20",
    "metadata": {},
    "outputs": [],
    "source": []

From 1880f0c33b0dfff82eb72d6945128e3999c82b00 Mon Sep 17 00:00:00 2001
From: Divyendu Dutta <connect2divyendu@gmail.com>
Date: Mon, 1 Sep 2025 16:36:42 +0200
Subject: [PATCH 2/4] [DOC][CHORE] Update benchmarking results for UNet
 component

Revised the benchmark results CSV and .md to reflect evaluations of only the UNet module.
---
 Analysis.md                              | 26 +++++++++++++++++++-----
 results/benchmarks/benchmark_results.csv |  2 +-
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/Analysis.md b/Analysis.md
index dc891ac..de3eb37 100644
--- a/Analysis.md
+++ b/Analysis.md
@@ -1,11 +1,27 @@
 ### Metrics
 
-#### Stable Diffusion on Nvidia Geforce GTX 1070 GPU
+#### Stable Diffusion UNet on Nvidia Geforce GTX 1070 GPU
 
-Average inference time: 28.63s ± 3.09s
+see `results/benchmarks/benchmark_results.csv`
 
-Average CPU memory usage: 890.11MB ± 59.32MB
+``` bash
+INFO - 2025-09-01 16:26:26,203 - 3315694995.py - UNet Inference time: 0.10s
+INFO - 2025-09-01 16:26:26,448 - 3315694995.py - UNet Inference time: 0.24s
+INFO - 2025-09-01 16:26:26,711 - 3315694995.py - UNet Inference time: 0.26s
+INFO - 2025-09-01 16:26:26,974 - 3315694995.py - UNet Inference time: 0.26s
+INFO - 2025-09-01 16:26:27,238 - 3315694995.py - UNet Inference time: 0.26s
+INFO - 2025-09-01 16:26:27,494 - 3315694995.py - UNet Inference time: 0.25s
+INFO - 2025-09-01 16:26:27,760 - 3315694995.py - UNet Inference time: 0.27s
+INFO - 2025-09-01 16:26:28,017 - 3315694995.py - UNet Inference time: 0.26s
+INFO - 2025-09-01 16:26:28,285 - 3315694995.py - UNet Inference time: 0.27s
+INFO - 2025-09-01 16:26:28,538 - 3315694995.py - UNet Inference time: 0.25s
+INFO - 2025-09-01 16:26:28,538 - 3315694995.py -
 
-Average GPU memory usage: 2486.43MB ± 0.00MB
+Average inference time: 0.24s ± 0.05s
 
-see `results/benchmarks/benchmark_results.csv`
+Average CPU memory usage: 1009.82MB ± 0.03MB
+
+Average GPU memory usage: 3817.92MB ± 0.00MB
+```
+
+---
diff --git a/results/benchmarks/benchmark_results.csv b/results/benchmarks/benchmark_results.csv
index aef6978..49331d3 100644
--- a/results/benchmarks/benchmark_results.csv
+++ b/results/benchmarks/benchmark_results.csv
@@ -1,2 +1,2 @@
 desc,avg_inference_time,std_inference_time,avg_cpu_mem_usage,std_cpu_mem_usage,avg_gpu_mem_usage,std_gpu_mem_usage
-stable_diffusion_GPU,28.794894647598266,2.808588022429784,897.65859375,1.677354443989871,2486.42822265625,0.0
+stable_diffusion_UNet_GPU,0.24329500198364257,0.04984202499570953,1009.8203125,0.03146626555623586,3817.91748046875,0.0

From 55fcbb2cb522e8f28ed5361d486e6c5d9d210e8e Mon Sep 17 00:00:00 2001
From: Divyendu Dutta <connect2divyendu@gmail.com>
Date: Mon, 1 Sep 2025 16:38:26 +0200
Subject: [PATCH 3/4] [REFACTOR] Move prompt and Hugging Face model ID to
 separate script

---
 tinydiffusion/utils/constants.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 tinydiffusion/utils/constants.py

diff --git a/tinydiffusion/utils/constants.py b/tinydiffusion/utils/constants.py
new file mode 100644
index 0000000..985c0e2
--- /dev/null
+++ b/tinydiffusion/utils/constants.py
@@ -0,0 +1,12 @@
+from enum import Enum
+
+PROMPT = (
+    "A whale falling through a starry sky beside a floating bowl of petunias, painted in a surreal "
+    "cosmic landscape, whimsical and dreamlike, detailed digital art."
+)
+
+
+class ModelType(Enum):
+    STABLE_DIFFUSION_2_BASE = (
+        "stabilityai/stable-diffusion-2-base"  # This is not a LoRA checkpoint
+    )

From 5dc434ff6d91f5244cf781630755582b65bee409 Mon Sep 17 00:00:00 2001
From: Divyendu Dutta <connect2divyendu@gmail.com>
Date: Mon, 1 Sep 2025 22:08:17 +0200
Subject: [PATCH 4/4] [FEAT] Add ONNX export script for UNet component of
 Stable Diffusion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implemented script to export the UNet module from Hugging Face’s Stable Diffusion model to ONNX format.
---
 notebooks/baseline_generation.ipynb |   4 +-
 tinydiffusion/src/onnx_export.py    | 115 ++++++++++++++++++++++++++++
 2 files changed, 118 insertions(+), 1 deletion(-)
 create mode 100644 tinydiffusion/src/onnx_export.py

diff --git a/notebooks/baseline_generation.ipynb b/notebooks/baseline_generation.ipynb
index ee32272..dcdc903 100644
--- a/notebooks/baseline_generation.ipynb
+++ b/notebooks/baseline_generation.ipynb
@@ -162,7 +162,9 @@
    "outputs": [],
    "source": [
     "batch_size = 1\n",
-    "height = width = 64\n",
+    "height = width = 64 # for 512x512 images\n",
+    "\n",
+    "# dummy latent image representation for 1 image\n",
     "latents = torch.randn(\n",
     "    (batch_size, pipe.unet.config.in_channels, height, width),\n",
     "    device=device,\n",
diff --git a/tinydiffusion/src/onnx_export.py b/tinydiffusion/src/onnx_export.py
new file mode 100644
index 0000000..a7f64f0
--- /dev/null
+++ b/tinydiffusion/src/onnx_export.py
@@ -0,0 +1,115 @@
+"""
+This script exports the UNet model of the Stable Diffusion pipeline to ONNX format.
+"""
+
+import os
+import torch
+from diffusers import StableDiffusionPipeline
+
+from tinydiffusion.utils.constants import ModelType, PROMPT
+from tinydiffusion.utils.logger import LoggerConfig
+
+LOGGER = LoggerConfig().logger
+
+# Repository root, resolved from this file so paths do not depend on the CWD
+ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
+
+# ONNX export save path
+onnx_path = os.path.join(ROOT_DIR, "checkpoints", "onnx", "unet.onnx")
+os.makedirs(os.path.dirname(onnx_path), exist_ok=True)
+
+# Model cache directory (create the directory itself, not only its parent)
+model_cache_dir = os.path.join(ROOT_DIR, "checkpoints", "stablediffusion")
+os.makedirs(model_cache_dir, exist_ok=True)
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+LOGGER.info(f"Using device: {device}")
+
+
+def load_sd_pipeline(model_id: str) -> StableDiffusionPipeline:
+    """
+    Build the Stable Diffusion pipeline for `model_id` on the module-level `device`.
+
+    On CUDA the fp16 weight variant is requested with float16 tensors; on any
+    other device the weights are loaded as float32. Attention slicing is
+    enabled and the UNet is switched to eval mode before returning.
+
+    Args:
+        model_id (str): The model ID from HuggingFace to load.
+
+    Returns:
+        StableDiffusionPipeline: The loaded Stable Diffusion pipeline.
+    """
+
+    if device == "cuda":
+        load_kwargs = {"variant": "fp16", "torch_dtype": torch.float16}
+    else:
+        load_kwargs = {"torch_dtype": torch.float32}
+
+    pipe = StableDiffusionPipeline.from_pretrained(
+        model_id, cache_dir=model_cache_dir, **load_kwargs
+    )
+    pipe = pipe.to(device)
+    pipe.enable_attention_slicing()
+    pipe.unet.eval()  # inference only, so disable training-mode behaviour
+
+    return pipe
+
+
+def export_onnx_model(model_id: str) -> None:
+    """
+    Export the UNet model of the Stable Diffusion pipeline to ONNX format.
+
+    Nothing is loaded or exported when `onnx_path` already exists. Batch size,
+    latent height/width and prompt length are exported as dynamic axes.
+
+    Args:
+        model_id (str): The model ID from HuggingFace to export.
+    """
+    if os.path.exists(onnx_path):
+        LOGGER.info(f"ONNX model already exists at {onnx_path}")
+        return  # skip loading the large pipeline when there is nothing to do
+
+    pipe = load_sd_pipeline(model_id)
+
+    batch_size = 1
+    height = width = 64  # latent size for 512x512 images
+
+    dummy_latents = torch.randn(
+        batch_size,
+        pipe.unet.config.in_channels,
+        height,
+        width,
+        device=device,
+        dtype=pipe.unet.dtype,
+    )
+    # arbitrary diffusion step
+    dummy_timestep = torch.tensor([10], device=device, dtype=torch.int64)
+
+    # Text embeddings are only a dummy input, so no autograd graph is needed
+    tokenized = pipe.tokenizer(PROMPT, return_tensors="pt").input_ids.to(device)
+    with torch.no_grad():
+        text_embeddings = pipe.text_encoder(tokenized)[0]
+
+    torch.onnx.export(
+        pipe.unet,
+        (dummy_latents, dummy_timestep, text_embeddings),
+        onnx_path,
+        export_params=True,
+        opset_version=17,
+        input_names=["latents", "timestep", "text_embeddings"],
+        output_names=["output"],
+        dynamic_axes={
+            "latents": {0: "batch", 2: "height", 3: "width"},
+            "text_embeddings": {0: "batch", 1: "sequence"},
+            "output": {0: "batch", 2: "height", 3: "width"},
+        },
+    )
+    LOGGER.info(f"Saved ONNX model to {onnx_path}")
+
+
+if __name__ == "__main__":
+    model_id = ModelType.STABLE_DIFFUSION_2_BASE.value
+    # Export the UNet of the selected checkpoint to ONNX
+    export_onnx_model(model_id)