diff --git a/Analysis.md b/Analysis.md index de3eb37..49da254 100644 --- a/Analysis.md +++ b/Analysis.md @@ -1,27 +1,73 @@ ### Metrics -#### Stable Diffusion UNet on Nvidia Geforce GTX 1070 GPU - see `results/benchmarks/benchmark_results.csv` +GPU - Nvidia Geforce GTX 1070 GPU +CPU - Intel Core i7-8750H + +#### Stable Diffusion UNet on GPU + ``` bash -INFO - 2025-09-01 16:26:26,203 - 3315694995.py - UNet Inference time: 0.10s -INFO - 2025-09-01 16:26:26,448 - 3315694995.py - UNet Inference time: 0.24s -INFO - 2025-09-01 16:26:26,711 - 3315694995.py - UNet Inference time: 0.26s -INFO - 2025-09-01 16:26:26,974 - 3315694995.py - UNet Inference time: 0.26s -INFO - 2025-09-01 16:26:27,238 - 3315694995.py - UNet Inference time: 0.26s -INFO - 2025-09-01 16:26:27,494 - 3315694995.py - UNet Inference time: 0.25s -INFO - 2025-09-01 16:26:27,760 - 3315694995.py - UNet Inference time: 0.27s -INFO - 2025-09-01 16:26:28,017 - 3315694995.py - UNet Inference time: 0.26s -INFO - 2025-09-01 16:26:28,285 - 3315694995.py - UNet Inference time: 0.27s -INFO - 2025-09-01 16:26:28,538 - 3315694995.py - UNet Inference time: 0.25s -INFO - 2025-09-01 16:26:28,538 - 3315694995.py - - -Average inference time: 0.24s ± 0.05s - -Average CPU memory usage: 1009.82MB ± 0.03MB - -Average GPU memory usage: 3817.92MB ± 0.00MB +INFO - UNet Inference time: 0.52s +INFO - UNet Inference time: 0.15s +INFO - UNet Inference time: 0.26s +INFO - UNet Inference time: 0.26s +INFO - UNet Inference time: 0.26s +INFO - UNet Inference time: 0.26s +INFO - UNet Inference time: 0.24s +INFO - UNet Inference time: 0.28s +INFO - UNet Inference time: 0.24s +INFO - UNet Inference time: 0.27s + +Average inference time: 0.27s ± 0.09s + +Average CPU memory usage: 889.63MB ± 0.00MB + +Average GPU memory usage: 2511.14MB ± 0.00MB ``` --- + +#### Stable Diffusion UNet via ONNX Runtime on GPU + +```bash +INFO - ONNXRuntime Inference time: 1.82s +INFO - ONNXRuntime Inference time: 1.59s +INFO - ONNXRuntime Inference time: 1.59s 
+INFO - ONNXRuntime Inference time: 1.60s +INFO - ONNXRuntime Inference time: 1.59s +INFO - ONNXRuntime Inference time: 1.59s +INFO - ONNXRuntime Inference time: 1.58s +INFO - ONNXRuntime Inference time: 1.58s +INFO - ONNXRuntime Inference time: 1.60s +INFO - ONNXRuntime Inference time: 1.58s + +Average inference time: 1.61s ± 0.07s + +Average CPU memory usage: 1390.87MB ± 0.03MB + +Average GPU memory usage: 1355.65MB ± 0.00MB + +``` + +--- + +#### Stable Diffusion UNet via ONNX Runtime on CPU + +```bash +INFO - ONNXRuntime Inference time: 8.94s +INFO - ONNXRuntime Inference time: 9.48s +INFO - ONNXRuntime Inference time: 7.13s +INFO - ONNXRuntime Inference time: 7.42s +INFO - ONNXRuntime Inference time: 7.06s +INFO - ONNXRuntime Inference time: 6.98s +INFO - ONNXRuntime Inference time: 6.91s +INFO - ONNXRuntime Inference time: 7.57s +INFO - ONNXRuntime Inference time: 6.90s +INFO - ONNXRuntime Inference time: 6.98s + +Average inference time: 7.54s ± 0.92s + +Average CPU memory usage: 4968.51MB ± 344.71MB + +``` diff --git a/notebooks/baseline_generation.ipynb b/notebooks/baseline_generation.ipynb index dcdc903..1fd310e 100644 --- a/notebooks/baseline_generation.ipynb +++ b/notebooks/baseline_generation.ipynb @@ -160,6 +160,16 @@ "id": "12", "metadata": {}, "outputs": [], + "source": [ + "LOGGER.info(f\"Input channels: {pipe.unet.config.in_channels}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13", + "metadata": {}, + "outputs": [], "source": [ "batch_size = 1\n", "height = width = 64 # for 512x512 images\n", @@ -180,7 +190,7 @@ }, { "cell_type": "markdown", - "id": "13", + "id": "14", "metadata": {}, "source": [ "Benchmarking" @@ -189,7 +199,7 @@ { "cell_type": "code", "execution_count": null, - "id": "14", + "id": "15", "metadata": {}, "outputs": [], "source": [ @@ -202,7 +212,7 @@ { "cell_type": "code", "execution_count": null, - "id": "15", + "id": "16", "metadata": {}, "outputs": [], "source": [ @@ -258,7 +268,7 @@ }, { 
"cell_type": "markdown", - "id": "16", + "id": "17", "metadata": {}, "source": [ "The inference time would be really small here because we're running only one denoising step of the UNet as opposed to say 50 denoising steps. " @@ -266,21 +276,10 @@ }, { "cell_type": "markdown", - "id": "17", - "metadata": {}, - "source": [ - "Save benchmark details as CSV" - ] - }, - { - "cell_type": "code", - "execution_count": null, "id": "18", "metadata": {}, - "outputs": [], "source": [ - "BENCHMARK_SAVE_PATH = os.path.join(os.path.dirname(os.getcwd()), \"results\", \"benchmarks\")\n", - "os.makedirs(BENCHMARK_SAVE_PATH, exist_ok=True)" + "Save benchmark details as CSV" ] }, { @@ -290,10 +289,9 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.DataFrame(results)\n", - "csv_path = os.path.join(BENCHMARK_SAVE_PATH, \"benchmark_results.csv\")\n", - "df.to_csv(csv_path, index=False)\n", - "LOGGER.info(f\"Saved benchmark results to {csv_path}\")" + "from tinydiffusion.utils.csv_utils import save_results_to_csv\n", + "\n", + "save_results_to_csv(results)" ] }, { diff --git a/notebooks/onnxruntime_generation.ipynb b/notebooks/onnxruntime_generation.ipynb new file mode 100644 index 0000000..215feb5 --- /dev/null +++ b/notebooks/onnxruntime_generation.ipynb @@ -0,0 +1,346 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0", + "metadata": {}, + "source": [ + "Here we benchmark the UNet part of the stable diffusion model with inference done using Onnx runtime on a GPU." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import time\n", + "import psutil\n", + "import statistics\n", + "import torch\n", + "import onnxruntime as ort\n", + "from tinydiffusion.utils.logger import LoggerConfig\n", + "from tinydiffusion.utils.constants import PROMPT\n", + "from transformers import CLIPTokenizer, CLIPTextModel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2", + "metadata": {}, + "outputs": [], + "source": [ + "LOGGER = LoggerConfig().logger" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3", + "metadata": {}, + "outputs": [], + "source": [ + "LOGGER.info(f\"{ort.get_available_providers()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "4", + "metadata": {}, + "source": [ + "Above shows that we have tensorrt, CUDA and CPU runtimes to perform inference on" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5", + "metadata": {}, + "outputs": [], + "source": [ + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "# device = \"cpu\"\n", + "LOGGER.info(f\"Using device: {device}\") " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6", + "metadata": {}, + "outputs": [], + "source": [ + "UNET_ONNX_PATH = os.path.join(os.getcwd(), \"..\", \"checkpoints\", \"onnx\", \"unet.onnx\")\n", + "\n", + "if not os.path.exists(UNET_ONNX_PATH):\n", + " LOGGER.warning(f\"ONNX model not found at {UNET_ONNX_PATH}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7", + "metadata": {}, + "outputs": [], + "source": [ + "ROOT_DIR = os.path.dirname(os.getcwd())\n", + "LOGGER.info(f\"Root directory: {ROOT_DIR}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8", + "metadata": {}, + "outputs": [], + "source": [ + "from tinydiffusion.utils.constants import ModelType\n", + "\n", + "tokenizer_model_cache_dir = 
os.path.join(ROOT_DIR, \"checkpoints\", \"cliptokenizer\")\n", + "text_encoder_model_cache_dir = os.path.join(ROOT_DIR, \"checkpoints\", \"cliptextencoder\")\n", + "\n", + "tokenizer_model_id = ModelType.LAION_CLIP_VIT.value\n", + "text_encoder_model_id = ModelType.LAION_CLIP_VIT.value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9", + "metadata": {}, + "outputs": [], + "source": [ + "available_providers = ort.get_available_providers()\n", + "\n", + "# CUDA\n", + "provider = \"CUDAExecutionProvider\" if \"CUDAExecutionProvider\" in available_providers else \"CPUExecutionProvider\"\n", + "\n", + "# CPU\n", + "#provider = \"CPUExecutionProvider\"\n", + "\n", + "# TensorRT\n", + "# provider = \"TensorrtExecutionProvider\"\n", + "\n", + "LOGGER.info(f\"Using ONNX Runtime provider: {provider}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10", + "metadata": {}, + "outputs": [], + "source": [ + "session = ort.InferenceSession(UNET_ONNX_PATH, providers=[provider])" + ] + }, + { + "cell_type": "markdown", + "id": "11", + "metadata": {}, + "source": [ + "Load text encoder & tokenizer from HuggingFace. This matches what HF's stable diffusion model uses." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12", + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer = CLIPTokenizer.from_pretrained(tokenizer_model_id, cache_dir=tokenizer_model_cache_dir)\n", + "text_encoder = CLIPTextModel.from_pretrained(text_encoder_model_id, cache_dir=text_encoder_model_cache_dir).to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "13", + "metadata": {}, + "source": [ + "Benchmarking" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14", + "metadata": {}, + "outputs": [], + "source": [ + "# Metrics\n", + "inference_time = []\n", + "cpu_mem_usage = []\n", + "gpu_mem_usage = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15", + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = 1\n", + "height = width = 64 # for 512x512 images\n", + "\n", + "# dummy image latents\n", + "dummy_latents = torch.randn(\n", + " batch_size, \n", + " 4, # UNet in_channels is 4 - see baseline_generation.ipynb\n", + " height, \n", + " width, \n", + " device=device, \n", + " dtype=torch.float16 if device==\"cuda\" else torch.float32\n", + ")\n", + "\n", + "# arbitrary timestep\n", + "dummy_timestep = torch.tensor([10], device=device, dtype=torch.int64)\n", + "\n", + "# prompt token embeddings\n", + "with torch.no_grad():\n", + " input_ids = tokenizer(PROMPT, return_tensors=\"pt\").input_ids.to(device)\n", + " text_embeddings = text_encoder(input_ids)[0]\n", + "\n", + "LOGGER.info(f\"Text embeddings shape: {text_embeddings.shape}\")\n", + "\n", + "# ensure text embeddings match with stable diffusion - see baseline_generation.ipynb\n", + "assert text_embeddings.shape == torch.Size([1, 34, 1024]), f\"Unexpected text embeddings shape: {text_embeddings.shape}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16", + "metadata": {}, + "outputs": [], + "source": [ + "prompt = PROMPT\n", + "num_samples = 10\n", + "\n", + "process = 
psutil.Process(os.getpid())\n", + "\n", + "results = []\n", + "\n", + "dummy_latents = dummy_latents.cpu().numpy().astype(\"float16\") # to match Stable Diffusion UNet's dtype\n", + "dummy_timestep = dummy_timestep.cpu().numpy().astype(\"int64\")\n", + "text_embeddings = text_embeddings.cpu().numpy().astype(\"float16\")\n", + "\n", + "for i in range(num_samples):\n", + " start_time = time.time()\n", + "\n", + " # run ONNX inference - dummy inputs and for 1 denoising step of the UNet\n", + " outputs = session.run(\n", + " None,\n", + " {\n", + " \"latents\": dummy_latents,\n", + " \"timestep\": dummy_timestep,\n", + " \"text_embeddings\": text_embeddings,\n", + " }\n", + " )\n", + "\n", + " end_time = time.time()\n", + " inference_time.append(end_time - start_time)\n", + "\n", + " # Memory usage - START\n", + " cpu_mem = process.memory_info().rss / (1024**2) # MB\n", + " cpu_mem_usage.append(cpu_mem)\n", + "\n", + " if device == \"cuda\":\n", + " gpu_mem = torch.cuda.memory_allocated(0) / (1024**2) # MB\n", + " gpu_mem_usage.append(gpu_mem)\n", + " else:\n", + " gpu_mem = 0 \n", + " # Memory usage - END\n", + "\n", + " LOGGER.info(f\"ONNXRuntime Inference time: {(end_time - start_time):.2f}s\")\n", + "\n", + "LOGGER.info(f\"\\nAverage inference time: {statistics.mean(inference_time):.2f}s ± {statistics.stdev(inference_time):.2f}s\")\n", + "LOGGER.info(f\"\\nAverage CPU memory usage: {statistics.mean(cpu_mem_usage):.2f}MB ± {statistics.stdev(cpu_mem_usage):.2f}MB\")\n", + "if device == \"cuda\":\n", + " LOGGER.info(f\"\\nAverage GPU memory usage: {statistics.mean(gpu_mem_usage):.2f}MB ± {statistics.stdev(gpu_mem_usage):.2f}MB\")\n", + "\n", + "# store results\n", + "results.append({\n", + " \"desc\": \"stable_diffusion_ONNX_UNet_GPU\",\n", + " \"avg_inference_time\": statistics.mean(inference_time),\n", + " \"std_inference_time\": statistics.stdev(inference_time),\n", + " \"avg_cpu_mem_usage\": statistics.mean(cpu_mem_usage),\n", + " \"std_cpu_mem_usage\": 
statistics.stdev(cpu_mem_usage),\n", + " \"avg_gpu_mem_usage\": statistics.mean(gpu_mem_usage) if device == \"cuda\" else 0,\n", + " \"std_gpu_mem_usage\": statistics.stdev(gpu_mem_usage) if device == \"cuda\" else 0,\n", + "})" + ] + }, + { + "cell_type": "markdown", + "id": "17", + "metadata": {}, + "source": [ + "Ooof, that's pretty bad compared to running the original Stable Diffusion UNet directly. And that is probably because the PyTorch UNet has a bunch of optimizations which are not being used by the exported ONNX model (if it was even exported at all).\n", + "\n", + "**Change the execution provider and device and rerun this notebook on CPU. Also try the TensorRT provider + CUDA.**" + ] + }, + { + "cell_type": "markdown", + "id": "18", + "metadata": {}, + "source": [ + "See `Analysis.md`. ONNX Runtime with CPU provider has the worst latency." + ] + }, + { + "cell_type": "markdown", + "id": "19", + "metadata": {}, + "source": [ + "Save benchmark details as CSV" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20", + "metadata": {}, + "outputs": [], + "source": [ + "from tinydiffusion.utils.csv_utils import save_results_to_csv\n", + "\n", + "save_results_to_csv(results)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tiny-diffusion", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/results/benchmarks/benchmark_results.csv b/results/benchmarks/benchmark_results.csv index 49331d3..a3872d4 100644 --- a/results/benchmarks/benchmark_results.csv +++ b/results/benchmarks/benchmark_results.csv @@ -1,2 +1,4
def save_results_to_csv(results: list) -> None:
    """
    Append benchmark results to ``benchmark_results.csv``.

    The rows are appended to the CSV file located in ``BENCHMARK_SAVE_PATH``.
    The column header is written only when the file does not exist yet or
    exists but is still empty, so repeated calls keep extending one table
    with a single header row.

    Args:
        results (list): Benchmark records to save, in any form accepted by
            ``pd.DataFrame`` (the notebooks pass a list of dicts that share
            the same keys).

    Returns:
        None. When ``results`` holds no rows, a warning is logged and the
        CSV file is left untouched.
    """
    df = pd.DataFrame(results)

    # Writing an empty frame would create/extend the file without a usable
    # header, and later appends would then skip the header as well.
    if df.empty:
        LOGGER.warning("No benchmark results to save; CSV file left unchanged")
        return

    # Re-create the directory here too: it may have been removed after this
    # module was imported.
    os.makedirs(BENCHMARK_SAVE_PATH, exist_ok=True)
    csv_path = os.path.join(BENCHMARK_SAVE_PATH, "benchmark_results.csv")

    # A missing file and a zero-byte file both still need the header row.
    needs_header = (
        not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0
    )

    df.to_csv(csv_path, mode="a", header=needs_header, index=False)
    LOGGER.info(f"Saved benchmark results to {csv_path}")