diff --git a/Analysis.md b/Analysis.md
index de3eb37..49da254 100644
--- a/Analysis.md
+++ b/Analysis.md
@@ -1,27 +1,73 @@
 ### Metrics
 
-#### Stable Diffusion UNet on Nvidia Geforce GTX 1070 GPU
-
 see `results/benchmarks/benchmark_results.csv`
 
+- GPU: NVIDIA GeForce GTX 1070
+- CPU: Intel Core i7-8750H
+
+#### Stable Diffusion UNet on GPU
+
 ``` bash
-INFO - 2025-09-01 16:26:26,203 - 3315694995.py - UNet Inference time: 0.10s
-INFO - 2025-09-01 16:26:26,448 - 3315694995.py - UNet Inference time: 0.24s
-INFO - 2025-09-01 16:26:26,711 - 3315694995.py - UNet Inference time: 0.26s
-INFO - 2025-09-01 16:26:26,974 - 3315694995.py - UNet Inference time: 0.26s
-INFO - 2025-09-01 16:26:27,238 - 3315694995.py - UNet Inference time: 0.26s
-INFO - 2025-09-01 16:26:27,494 - 3315694995.py - UNet Inference time: 0.25s
-INFO - 2025-09-01 16:26:27,760 - 3315694995.py - UNet Inference time: 0.27s
-INFO - 2025-09-01 16:26:28,017 - 3315694995.py - UNet Inference time: 0.26s
-INFO - 2025-09-01 16:26:28,285 - 3315694995.py - UNet Inference time: 0.27s
-INFO - 2025-09-01 16:26:28,538 - 3315694995.py - UNet Inference time: 0.25s
-INFO - 2025-09-01 16:26:28,538 - 3315694995.py -
-
-Average inference time: 0.24s ± 0.05s
-
-Average CPU memory usage: 1009.82MB ± 0.03MB
-
-Average GPU memory usage: 3817.92MB ± 0.00MB
+INFO - UNet Inference time: 0.52s
+INFO - UNet Inference time: 0.15s
+INFO - UNet Inference time: 0.26s
+INFO - UNet Inference time: 0.26s
+INFO - UNet Inference time: 0.26s
+INFO - UNet Inference time: 0.26s
+INFO - UNet Inference time: 0.24s
+INFO - UNet Inference time: 0.28s
+INFO - UNet Inference time: 0.24s
+INFO - UNet Inference time: 0.27s
+
+Average inference time: 0.27s ± 0.09s
+
+Average CPU memory usage: 889.63MB ± 0.00MB
+
+Average GPU memory usage: 2511.14MB ± 0.00MB
 ```
 
 ---
+
+#### Stable Diffusion UNet via ONNX Runtime on GPU
+
+```bash
+INFO - ONNXRuntime Inference time: 1.82s
+INFO - ONNXRuntime Inference time: 1.59s
+INFO - ONNXRuntime Inference time: 1.59s
+INFO - ONNXRuntime Inference time: 1.60s
+INFO - ONNXRuntime Inference time: 1.59s
+INFO - ONNXRuntime Inference time: 1.59s
+INFO - ONNXRuntime Inference time: 1.58s
+INFO - ONNXRuntime Inference time: 1.58s
+INFO - ONNXRuntime Inference time: 1.60s
+INFO - ONNXRuntime Inference time: 1.58s
+
+Average inference time: 1.61s ± 0.07s
+
+Average CPU memory usage: 1390.87MB ± 0.03MB
+
+Average GPU memory usage: 1355.65MB ± 0.00MB
+
+```
+
+---
+
+#### Stable Diffusion UNet via ONNX Runtime on CPU
+
+```bash
+INFO - ONNXRuntime Inference time: 8.94s
+INFO - ONNXRuntime Inference time: 9.48s
+INFO - ONNXRuntime Inference time: 7.13s
+INFO - ONNXRuntime Inference time: 7.42s
+INFO - ONNXRuntime Inference time: 7.06s
+INFO - ONNXRuntime Inference time: 6.98s
+INFO - ONNXRuntime Inference time: 6.91s
+INFO - ONNXRuntime Inference time: 7.57s
+INFO - ONNXRuntime Inference time: 6.90s
+INFO - ONNXRuntime Inference time: 6.98s
+
+Average inference time: 7.54s ± 0.92s
+
+Average CPU memory usage: 4968.51MB ± 344.71MB
+
+```
diff --git a/notebooks/baseline_generation.ipynb b/notebooks/baseline_generation.ipynb
index dcdc903..1fd310e 100644
--- a/notebooks/baseline_generation.ipynb
+++ b/notebooks/baseline_generation.ipynb
@@ -160,6 +160,16 @@
    "id": "12",
    "metadata": {},
    "outputs": [],
+   "source": [
+    "LOGGER.info(f\"Input channels: {pipe.unet.config.in_channels}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "13",
+   "metadata": {},
+   "outputs": [],
    "source": [
     "batch_size = 1\n",
     "height = width = 64 # for 512x512 images\n",
@@ -180,7 +190,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "13",
+   "id": "14",
    "metadata": {},
    "source": [
     "Benchmarking"
@@ -189,7 +199,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "14",
+   "id": "15",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -202,7 +212,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "15",
+   "id": "16",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -258,7 +268,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "16",
+   "id": "17",
    "metadata": {},
    "source": [
     "The inference time would be really small here because we're running only one denoising step of the UNet as opposed to say 50 denoising steps. "
@@ -266,21 +276,10 @@
   },
   {
    "cell_type": "markdown",
-   "id": "17",
-   "metadata": {},
-   "source": [
-    "Save benchmark details as CSV"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
    "id": "18",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "BENCHMARK_SAVE_PATH = os.path.join(os.path.dirname(os.getcwd()), \"results\", \"benchmarks\")\n",
-    "os.makedirs(BENCHMARK_SAVE_PATH, exist_ok=True)"
+    "Save benchmark details as CSV"
    ]
   },
   {
@@ -290,10 +289,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = pd.DataFrame(results)\n",
-    "csv_path = os.path.join(BENCHMARK_SAVE_PATH, \"benchmark_results.csv\")\n",
-    "df.to_csv(csv_path, index=False)\n",
-    "LOGGER.info(f\"Saved benchmark results to {csv_path}\")"
+    "from tinydiffusion.utils.csv_utils import save_results_to_csv\n",
+    "\n",
+    "save_results_to_csv(results)"
    ]
   },
   {
diff --git a/notebooks/onnxruntime_generation.ipynb b/notebooks/onnxruntime_generation.ipynb
new file mode 100644
index 0000000..215feb5
--- /dev/null
+++ b/notebooks/onnxruntime_generation.ipynb
@@ -0,0 +1,346 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0",
+   "metadata": {},
+   "source": [
+    "Here we benchmark the UNet of the Stable Diffusion model, with inference run through ONNX Runtime on a GPU."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import time\n",
+    "import psutil\n",
+    "import statistics\n",
+    "import torch\n",
+    "import onnxruntime as ort\n",
+    "from tinydiffusion.utils.logger import LoggerConfig\n",
+    "from tinydiffusion.utils.constants import PROMPT\n",
+    "from transformers import CLIPTokenizer, CLIPTextModel"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "LOGGER = LoggerConfig().logger"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "LOGGER.info(f\"{ort.get_available_providers()}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4",
+   "metadata": {},
+   "source": [
+    "The output above shows that the TensorRT, CUDA and CPU execution providers are available for inference."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+    "# device = \"cpu\"\n",
+    "LOGGER.info(f\"Using device: {device}\") "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "UNET_ONNX_PATH = os.path.join(os.getcwd(), \"..\", \"checkpoints\", \"onnx\", \"unet.onnx\")\n",
+    "\n",
+    "if not os.path.exists(UNET_ONNX_PATH):\n",
+    "    LOGGER.warning(f\"ONNX model not found at {UNET_ONNX_PATH}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ROOT_DIR = os.path.dirname(os.getcwd())\n",
+    "LOGGER.info(f\"Root directory: {ROOT_DIR}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tinydiffusion.utils.constants import ModelType\n",
+    "\n",
+    "tokenizer_model_cache_dir = os.path.join(ROOT_DIR, \"checkpoints\", \"cliptokenizer\")\n",
+    "text_encoder_model_cache_dir = os.path.join(ROOT_DIR, \"checkpoints\", \"cliptextencoder\")\n",
+    "\n",
+    "tokenizer_model_id = ModelType.LAION_CLIP_VIT.value\n",
+    "text_encoder_model_id = ModelType.LAION_CLIP_VIT.value"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "available_providers = ort.get_available_providers()\n",
+    "\n",
+    "# CUDA\n",
+    "provider = \"CUDAExecutionProvider\" if \"CUDAExecutionProvider\" in available_providers else \"CPUExecutionProvider\"\n",
+    "\n",
+    "# CPU\n",
+    "#provider = \"CPUExecutionProvider\"\n",
+    "\n",
+    "# TensorRT\n",
+    "# provider = \"TensorrtExecutionProvider\"\n",
+    "\n",
+    "LOGGER.info(f\"Using ONNX Runtime provider: {provider}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "10",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "session = ort.InferenceSession(UNET_ONNX_PATH, providers=[provider])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "11",
+   "metadata": {},
+   "source": [
+    "Load text encoder & tokenizer from HuggingFace. This matches what HF's stable diffusion model uses."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "12",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizer = CLIPTokenizer.from_pretrained(tokenizer_model_id, cache_dir=tokenizer_model_cache_dir)\n",
+    "text_encoder = CLIPTextModel.from_pretrained(text_encoder_model_id, cache_dir=text_encoder_model_cache_dir).to(device)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "13",
+   "metadata": {},
+   "source": [
+    "Benchmarking"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "14",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Metrics\n",
+    "inference_time = []\n",
+    "cpu_mem_usage = []\n",
+    "gpu_mem_usage = []"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "15",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "batch_size = 1\n",
+    "height = width = 64 # for 512x512 images\n",
+    "\n",
+    "# dummy image latents\n",
+    "dummy_latents = torch.randn(\n",
+    "    batch_size, \n",
+    "    4,                        # UNet in_channels is 4 - see baseline_generation.ipynb\n",
+    "    height, \n",
+    "    width, \n",
+    "    device=device, \n",
+    "    dtype=torch.float16 if device==\"cuda\" else torch.float32\n",
+    ")\n",
+    "\n",
+    "# arbitrary timestep\n",
+    "dummy_timestep = torch.tensor([10], device=device, dtype=torch.int64)\n",
+    "\n",
+    "# prompt token embeddings\n",
+    "with torch.no_grad():\n",
+    "    input_ids = tokenizer(PROMPT, return_tensors=\"pt\").input_ids.to(device)\n",
+    "    text_embeddings = text_encoder(input_ids)[0]\n",
+    "\n",
+    "LOGGER.info(f\"Text embeddings shape: {text_embeddings.shape}\")\n",
+    "\n",
+    "# ensure text embeddings match with stable diffusion - see baseline_generation.ipynb\n",
+    "assert text_embeddings.shape == torch.Size([1, 34, 1024]), f\"Unexpected text embeddings shape: {text_embeddings.shape}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "16",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompt = PROMPT\n",
+    "num_samples = 10\n",
+    "\n",
+    "process = psutil.Process(os.getpid())\n",
+    "\n",
+    "results = []\n",
+    "\n",
+    "dummy_latents = dummy_latents.cpu().numpy().astype(\"float16\") # to match Stable Diffusion UNet's dtype\n",
+    "dummy_timestep = dummy_timestep.cpu().numpy().astype(\"int64\")\n",
+    "text_embeddings = text_embeddings.cpu().numpy().astype(\"float16\")\n",
+    "\n",
+    "for i in range(num_samples):\n",
+    "    start_time = time.time()\n",
+    "\n",
+    "    # run ONNX inference - dummy inputs and for 1 denoising step of the UNet\n",
+    "    outputs = session.run(\n",
+    "        None,\n",
+    "        {\n",
+    "            \"latents\": dummy_latents,\n",
+    "            \"timestep\": dummy_timestep,\n",
+    "            \"text_embeddings\": text_embeddings,\n",
+    "        }\n",
+    "    )\n",
+    "\n",
+    "    end_time = time.time()\n",
+    "    inference_time.append(end_time - start_time)\n",
+    "\n",
+    "    # Memory usage - START\n",
+    "    cpu_mem = process.memory_info().rss / (1024**2)  # MB\n",
+    "    cpu_mem_usage.append(cpu_mem)\n",
+    "\n",
+    "    if device == \"cuda\":\n",
+    "        gpu_mem = torch.cuda.memory_allocated(0) / (1024**2)  # MB\n",
+    "        gpu_mem_usage.append(gpu_mem)\n",
+    "    else:\n",
+    "        gpu_mem = 0 \n",
+    "    # Memory usage - END\n",
+    "\n",
+    "    LOGGER.info(f\"ONNXRuntime Inference time: {(end_time - start_time):.2f}s\")\n",
+    "\n",
+    "LOGGER.info(f\"\\nAverage inference time: {statistics.mean(inference_time):.2f}s ± {statistics.stdev(inference_time):.2f}s\")\n",
+    "LOGGER.info(f\"\\nAverage CPU memory usage: {statistics.mean(cpu_mem_usage):.2f}MB ± {statistics.stdev(cpu_mem_usage):.2f}MB\")\n",
+    "if device == \"cuda\":\n",
+    "    LOGGER.info(f\"\\nAverage GPU memory usage: {statistics.mean(gpu_mem_usage):.2f}MB ± {statistics.stdev(gpu_mem_usage):.2f}MB\")\n",
+    "\n",
+    "# store results\n",
+    "results.append({\n",
+    "    \"desc\": \"stable_diffusion_ONNX_UNet_GPU\",\n",
+    "    \"avg_inference_time\": statistics.mean(inference_time),\n",
+    "    \"std_inference_time\": statistics.stdev(inference_time),\n",
+    "    \"avg_cpu_mem_usage\": statistics.mean(cpu_mem_usage),\n",
+    "    \"std_cpu_mem_usage\": statistics.stdev(cpu_mem_usage),\n",
+    "    \"avg_gpu_mem_usage\": statistics.mean(gpu_mem_usage) if device == \"cuda\" else 0,\n",
+    "    \"std_gpu_mem_usage\": statistics.stdev(gpu_mem_usage) if device == \"cuda\" else 0,\n",
+    "})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "17",
+   "metadata": {},
+   "source": [
+    "Ooof, that's pretty bad compared to running the original Stable Diffusion UNet directly. That is probably because the PyTorch UNet has a bunch of optimizations that the exported ONNX model does not use (if they were even exported at all).\n",
+    "\n",
+    "**Change the execution provider and device, then rerun this notebook on CPU. Also try the TensorRT provider with CUDA.**"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "18",
+   "metadata": {},
+   "source": [
+    "See `Analysis.md`. ONNX Runtime with CPU provider has the worst latency."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "19",
+   "metadata": {},
+   "source": [
+    "Save benchmark details as CSV"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "20",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tinydiffusion.utils.csv_utils import save_results_to_csv\n",
+    "\n",
+    "save_results_to_csv(results)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "21",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "tiny-diffusion",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/results/benchmarks/benchmark_results.csv b/results/benchmarks/benchmark_results.csv
index 49331d3..a3872d4 100644
--- a/results/benchmarks/benchmark_results.csv
+++ b/results/benchmarks/benchmark_results.csv
@@ -1,2 +1,4 @@
 desc,avg_inference_time,std_inference_time,avg_cpu_mem_usage,std_cpu_mem_usage,avg_gpu_mem_usage,std_gpu_mem_usage
-stable_diffusion_UNet_GPU,0.24329500198364257,0.04984202499570953,1009.8203125,0.03146626555623586,3817.91748046875,0.0
+stable_diffusion_UNet_GPU,0.27318849563598635,0.09205775216375249,889.631640625,0.0037057941330098194,2511.1396484375,0.0
+stable_diffusion_ONNX_UNet_GPU,1.6123629570007325,0.07474299686629637,1390.87265625,0.03450889327807801,1355.65380859375,0.0
+stable_diffusion_ONNX_UNet_CPU,7.5368633508682255,0.9180746064937988,4968.514453125,344.70516150741446,0,0
diff --git a/tinydiffusion/utils/constants.py b/tinydiffusion/utils/constants.py
index 985c0e2..1507e8a 100644
--- a/tinydiffusion/utils/constants.py
+++ b/tinydiffusion/utils/constants.py
@@ -10,3 +10,7 @@ class ModelType(Enum):
     STABLE_DIFFUSION_2_BASE = (
         "stabilityai/stable-diffusion-2-base"  # This is not LoRA checkpoint
     )
+
+    LAION_CLIP_VIT = (
+        "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"  # Tokenizer and Text encoder
+    )
diff --git a/tinydiffusion/utils/csv_utils.py b/tinydiffusion/utils/csv_utils.py
new file mode 100644
index 0000000..211dcfe
--- /dev/null
+++ b/tinydiffusion/utils/csv_utils.py
@@ -0,0 +1,27 @@
+import os
+import pandas as pd
+
+from tinydiffusion.utils.logger import LoggerConfig
+
+LOGGER = LoggerConfig().logger
+
+BENCHMARK_SAVE_PATH = os.path.join(
+    os.path.dirname(os.getcwd()), "results", "benchmarks"  # <parent of cwd>/results/benchmarks; resolved once at import, so it depends on where the importing process was started
+)
+os.makedirs(BENCHMARK_SAVE_PATH, exist_ok=True)  # import-time side effect: creates the directory if it is missing
+
+
+def save_results_to_csv(results: list) -> None:
+    """
+    Save benchmark results to a CSV file.
+
+    Args:
+        results (list): A list of benchmark results to save.
+    """
+    df = pd.DataFrame(results)
+    csv_path = os.path.join(BENCHMARK_SAVE_PATH, "benchmark_results.csv")
+
+    file_exists = os.path.isfile(csv_path)  # checked before writing: decides whether a header row is needed
+
+    df.to_csv(csv_path, mode="a", header=not file_exists, index=False)  # append, never overwrite: repeated calls accumulate rows; header only when the file is new
+    LOGGER.info(f"Saved benchmark results to {csv_path}")