From b35d68168cc69eda0dd870618bc71e372984fa5c Mon Sep 17 00:00:00 2001 From: kavi-99 Date: Sun, 6 Apr 2025 17:44:11 +0800 Subject: [PATCH 01/17] changes to azure, aws, cloudflare models --- .../multiple_models/aws.json | 7 +- .../multiple_models/azure.json | 9 +- .../multiple_models/cloudflare.json | 11 +- .../multiple_models/open_ai.json | 3 +- .../multiple_models/together_ai.json | 5 +- benchmarking/experiments/sample_config.json | 4 +- providers/aws_provider.py | 28 ++- providers/azure_provider.py | 186 +++++++++++------- providers/cloudflare_provider.py | 2 + .../providers/cloudflare_provider_test.py | 2 + 10 files changed, 162 insertions(+), 95 deletions(-) diff --git a/benchmarking/experiments/providers_streaming/multiple_models/aws.json b/benchmarking/experiments/providers_streaming/multiple_models/aws.json index 438cb711..11b5879a 100644 --- a/benchmarking/experiments/providers_streaming/multiple_models/aws.json +++ b/benchmarking/experiments/providers_streaming/multiple_models/aws.json @@ -3,12 +3,11 @@ "AWSBedrock" ], "models": [ - "common-model" + "meta-llama-3-8b-instruct", "mistral-23b-instruct-v0.1", "mistral-48b-instruct-v0.1", "meta-llama-3-70b-instruct", "mistral-124b-instruct-v0.1" ], - "num_requests": 100, + "num_requests": 1, "input_tokens": 10, "streaming": true, "max_output": 100, - "verbose": true, - "backend": true + "verbose": true } \ No newline at end of file diff --git a/benchmarking/experiments/providers_streaming/multiple_models/azure.json b/benchmarking/experiments/providers_streaming/multiple_models/azure.json index 53643191..f3d2d5ad 100644 --- a/benchmarking/experiments/providers_streaming/multiple_models/azure.json +++ b/benchmarking/experiments/providers_streaming/multiple_models/azure.json @@ -4,12 +4,13 @@ ], "models": [ "meta-llama-3.1-8b-instruct", - "meta-llama-3.1-70b-instruct" + "mistral-23b-instruct-v0.1", + "meta-llama-3.3-70b-instruct", + "meta-llama-3.1-405b-instruct" ], - "num_requests": 100, + "num_requests": 1, "input_tokens": 10, "streaming": true, "max_output": 100, - "verbose": true, - "backend": true + "verbose": true } \ No newline at end of file diff --git a/benchmarking/experiments/providers_streaming/multiple_models/cloudflare.json b/benchmarking/experiments/providers_streaming/multiple_models/cloudflare.json index 6894e9ae..2f5d4354 100644 --- a/benchmarking/experiments/providers_streaming/multiple_models/cloudflare.json +++ b/benchmarking/experiments/providers_streaming/multiple_models/cloudflare.json @@ -3,16 +3,13 @@ "Cloudflare" ], "models": [ - "google-gemma-2b-it", - "phi-2", - "meta-llama-3.2-3b-instruct", - "meta-llama-3.1-70b-instruct", - "mistral-7b-instruct-v0.1" + "meta-llama-2-7b", + "deepseek-ai-R1-32b", + "meta-llama-3.1-70b-instruct" ], "num_requests": 100, "input_tokens": 10, "streaming": true, "max_output": 100, - "verbose": true, - "backend": true + "verbose": true } \ No newline at end of file diff --git a/benchmarking/experiments/providers_streaming/multiple_models/open_ai.json b/benchmarking/experiments/providers_streaming/multiple_models/open_ai.json index 15925217..252dc718 100644 --- a/benchmarking/experiments/providers_streaming/multiple_models/open_ai.json +++ b/benchmarking/experiments/providers_streaming/multiple_models/open_ai.json @@ -11,6 +11,5 @@ "input_tokens": 10, "streaming": true, "max_output": 100, - "verbose": true, - "backend": true + "verbose": true } \ No newline at end of file diff --git a/benchmarking/experiments/providers_streaming/multiple_models/together_ai.json b/benchmarking/experiments/providers_streaming/multiple_models/together_ai.json index d66a61c8..97bc38a1 100644 --- a/benchmarking/experiments/providers_streaming/multiple_models/together_ai.json +++ b/benchmarking/experiments/providers_streaming/multiple_models/together_ai.json @@ -3,8 +3,6 @@ "TogetherAI" ], "models": [ - "google-gemma-2b-it", - "meta-llama-3.2-3b-instruct", "mistral-7b-instruct-v0.1", "meta-llama-3.1-70b-instruct", "meta-llama-3.1-405b-instruct" @@ -13,6 +11,5 @@ "input_tokens": 10, "streaming": true, "max_output": 100, - "verbose": true, - "backend": true + "verbose": true } \ No newline at end of file diff --git a/benchmarking/experiments/sample_config.json b/benchmarking/experiments/sample_config.json index 244f8f2d..e79f6667 100644 --- a/benchmarking/experiments/sample_config.json +++ b/benchmarking/experiments/sample_config.json @@ -3,9 +3,9 @@ "Azure" ], "models": [ - "common-model" + "mistral-23b-instruct-v0.1" ], - "num_requests": 3, + "num_requests": 1, "input_tokens": 10, "streaming": true, "max_output": 100, diff --git a/providers/aws_provider.py b/providers/aws_provider.py index d522d93b..9775b3ed 100644 --- a/providers/aws_provider.py +++ b/providers/aws_provider.py @@ -25,12 +25,20 @@ def __init__(self): # model names self.model_map = { "meta-llama-3-70b-instruct": "meta.llama3-70b-instruct-v1:0", + "meta-llama-3-8b-instruct": "meta.llama3-8b-instruct-v1:0", + "mistral-48b-instruct-v0.1": "mistral.mixtral-8x7b-instruct-v0:1", + "mistral-23b-instruct-v0.1": "mistral.mistral-small-2402-v1:0", + "mistral-124b-instruct-v0.1":"mistral.mistral-large-2402-v1:0", "common-model": "meta.llama3-70b-instruct-v1:0", + "common-model-small": "meta.llama3-8b-instruct-v1:0" } def get_model_name(self, model): return self.model_map.get(model, None) # or model + def get_model_provider(self, model_id): + return model_id[:model_id.find('.')] + def format_prompt(self, user_prompt): """ Combines the system prompt and user prompt into a single formatted prompt. @@ -92,13 +100,18 @@ def perform_inference_streaming( print("[INFO] Performing streaming inference...") model_id = self.get_model_name(model) + model_provider = self.get_model_provider(model_id) # Prepare the request payload formatted_prompt = self.format_prompt(prompt) + if model_provider == "meta": + max_tokens_config = 'max_gen_len' + else: + max_tokens_config = 'max_tokens' native_request = { "prompt": formatted_prompt, - "max_gen_len": max_output, + max_tokens_config: max_output, } request_body = json.dumps(native_request) @@ -122,14 +135,17 @@ def perform_inference_streaming( # print(f"[DEBUG] Failed to decode chunk: {e}") continue - if chunk["stop_reason"] == 'length': + if chunk.get("stop_reason") == "length" or chunk.get("outputs", [{}])[0].get("stop_reason") == "length": total_time = time.perf_counter() - start_time print(chunk) break - - if "generation" in chunk: - current_token = chunk["generation"] - + + if "outputs" in chunk or 'generation' in chunk: + if "outputs" in chunk : + current_token = chunk["outputs"][0]['text'] + if 'generation' in chunk: + current_token = chunk["generation"] + # print(current_token) # Calculate timing current_time = time.perf_counter() if first_token_time is None: diff --git a/providers/azure_provider.py b/providers/azure_provider.py index cfe82ce4..632f6731 100644 --- a/providers/azure_provider.py +++ b/providers/azure_provider.py @@ -5,18 +5,31 @@ from time import perf_counter as timer import re +from azure.ai.inference import ChatCompletionsClient +from azure.ai.inference.models import SystemMessage, UserMessage +from azure.core.credentials import AzureKeyCredential + class Azure(ProviderInterface): def __init__(self): """Initialize AzureProvider with required API information.""" super().__init__() + endpoint = "https://ai-kavifyp0693ai007107396570.services.ai.azure.com/models" + azure_api = os.environ.get("AZURE_API_KEY") + + self.client = ChatCompletionsClient( + endpoint=endpoint, + credential=AzureKeyCredential(azure_api), + ) # Map model names to Azure model IDs self.model_map = { # "mistral-7b-instruct-v0.1": "mistral-7b-instruct-v0.1", - "meta-llama-3.1-8b-instruct": "Meta-Llama-3-1-8B-Instruct-fyp", - "meta-llama-3.1-70b-instruct": "Meta-Llama-3-1-70B-Instruct-fyp", + "meta-llama-3.1-8b-instruct": "Meta-Llama-3.1-8B-Instruct", + "meta-llama-3.3-70b-instruct": "Llama-3.3-70B-Instruct", "mistral-large": "Mistral-Large-2411-yatcd", + "mistral-23b-instruct-v0.1": "Mistral-small", + "meta-llama-3.1-405b-instruct": "Meta-Llama-3.1-405B-Instruct", "common-model": "Mistral-Large-2411-yatcd", } @@ -25,7 +38,9 @@ def __init__(self): # "mistral-7b-instruct-v0.1": os.environ.get("MISTRAL_API_KEY"), "meta-llama-3.1-8b-instruct": os.environ.get("AZURE_LLAMA_8B_API"), "meta-llama-3.1-70b-instruct": os.environ.get("AZURE_LLAMA_3.1_70B_API"), + "meta-llama-3.1-405b-instruct": os.environ.get("AZURE_LLAMA_3.1_405B_API"), "mistral-large": os.environ.get("MISTRAL_LARGE_API"), + "mistral-small": os.environ.get("MISTRAL_SMALL_API"), "common-model": os.environ.get("MISTRAL_LARGE_API") } @@ -46,38 +61,48 @@ def perform_inference(self, model, prompt, max_output=100, verbosity=True): """Performs non-streaming inference request to Azure.""" try: model_id = self.get_model_name(model) - api_key = self.get_model_api_key(model) + # api_key = self.get_model_api_key(model) if model_id is None: print(f"Model {model} not available.") return None start_time = timer() - endpoint = f"https://{model_id}.eastus.models.ai.azure.com/chat/completions" - response = requests.post( - f"{endpoint}", - headers={ - "Authorization": f"Bearer {api_key}", - "Content-Type": "application/json", - }, - json={ - "messages": [ - {"role": "system", "content": self.system_prompt}, - {"role": "user", "content": prompt}, - ], - "max_tokens": max_output, - }, - timeout=500, + # endpoint = f"https://{model_id}.eastus.models.ai.azure.com/chat/completions" + # response = requests.post( + # f"{endpoint}", + # headers={ + # "Authorization": f"Bearer {api_key}", + # "Content-Type": "application/json", + # }, + # json={ + # "messages": [ + # {"role": "system", "content": self.system_prompt}, + # {"role": "user", "content": prompt}, + # ], + # "max_tokens": max_output, + # }, + # timeout=500, + # ) + + response = self.client.complete( + messages=[ + SystemMessage(content=self.system_prompt), + UserMessage(content=prompt) + ], + max_tokens=max_output, + model=model_id ) elapsed = timer() - start_time - if response.status_code != 200: - print(f"Error: {response.status_code} - {response.text}") - return None + # print(response) + # if response.status_code != 200: + # print(f"Error: {response.status_code} - {response.text}") + # return None # Parse and display response - inference = response.json() + # inference = response.json() self.log_metrics(model, "response_times", elapsed) if verbosity: - print(f"Response: {inference['choices'][0]['message']['content']}") - return inference + print(f"Response: {response['choices'][0]['message']['content']}") + return response except Exception as e: print(f"[ERROR] Inference failed for model '{model}': {e}") @@ -88,76 +113,105 @@ def perform_inference_streaming( ): """Performs streaming inference request to Azure.""" model_id = self.get_model_name(model) - api_key = self.get_model_api_key(model) + # api_key = self.get_model_api_key(model) + # api_key = os.environ.get("AZURE_API_KEY") if model_id is None: print(f"Model {model} not available.") return None inter_token_latencies = [] endpoint = f"https://{model_id}.eastus.models.ai.azure.com/chat/completions" + # endpoint = f"https://ai-kavifyp0693ai007107396570.services.ai.azure.com/models" + # print(model_id, endpoint) start_time = timer() try: - response = requests.post( - f"{endpoint}", - headers={ - "Authorization": f"Bearer {api_key}", - "Content-Type": "application/json", - }, - json={ - "messages": [ - # {"role": "system", "content": self.system_prompt + "\nThe number appended at the end is not important."}, - # {"role": "user", "content": prompt + " " + str(timer())}, - {"role": "system", "content": self.system_prompt}, - {"role": "user", "content": prompt}, - ], - "max_tokens": max_output, - "stream": True, - }, + # response = requests.post( + # f"{endpoint}", + # headers={ + # "Authorization": f"Bearer {api_key}", + # "Content-Type": "application/json", + # }, + # json={ + # # "model": model_id, + # "messages": [ + # # {"role": "system", "content": self.system_prompt + "\nThe number appended at the end is not important."}, + # # {"role": "user", "content": prompt + " " + str(timer())}, + # {"role": "system", "content": self.system_prompt}, + # {"role": "user", "content": prompt}, + # ], + # "max_tokens": max_output, + # "stream": True, + # }, + # stream=True, + # timeout=500, + # ) + response = self.client.complete( stream=True, - timeout=500, + messages=[ + SystemMessage(content=self.system_prompt), + UserMessage(content=prompt) + ], + max_tokens=max_output, + model=model_id ) first_token_time = None - for line in response.iter_lines(): - if line: - # print(line) + for update in response: + if update.choices: if first_token_time is None: - # print(line) first_token_time = timer() ttft = first_token_time - start_time prev_token_time = first_token_time if verbosity: print(f"##### Time to First Token (TTFT): {ttft:.4f} seconds\n") - - line_str = line.decode("utf-8").strip() - if line_str == "data: [DONE]": - # print(line_str) - # print("here") - total_time = timer() - start_time - break - - # Capture token timing time_to_next_token = timer() inter_token_latency = time_to_next_token - prev_token_time prev_token_time = time_to_next_token inter_token_latencies.append(inter_token_latency) - - # Display token if verbosity is enabled - match = re.search(r'"content"\s*:\s*"(.*?)"', line_str) - if match: - print(match.group(1), end="") - # if verbosity: - # if len(inter_token_latencies) < 20: - # print(line_str[19:].split('"')[5], end="") - # elif len(inter_token_latencies) == 20: - # print("...") + print(update.choices[0].delta.content or "", end="") + + total_time = timer() - start_time + # for line in response.iter_lines(): + # if line: + # # print(line) + # if first_token_time is None: + # # print(line) + # first_token_time = timer() + # ttft = first_token_time - start_time + # prev_token_time = first_token_time + # if verbosity: + # print(f"##### Time to First Token (TTFT): {ttft:.4f} seconds\n") + + # line_str = line.decode("utf-8").strip() + + # if line_str == "data: [DONE]": + # # print(line_str) + # # print("here") + # total_time = timer() - start_time + # break + + # # Capture token timing + # time_to_next_token = timer() + # inter_token_latency = time_to_next_token - prev_token_time + # prev_token_time = time_to_next_token + # inter_token_latencies.append(inter_token_latency) + + # # Display token if verbosity is enabled + # match = re.search(r'"content"\s*:\s*"(.*?)"', line_str) + # if match: + # print(match.group(1), end="") + # # if verbosity: + # # if len(inter_token_latencies) < 20: + # # print(line_str[19:].split('"')[5], end="") + # # elif len(inter_token_latencies) == 20: + # # print("...") # Calculate total metrics if verbosity: print(f"\nTotal Response Time: {total_time:.4f} seconds") - print(len(inter_token_latencies)) + # print(inter_token_latencies) # Log metrics avg_tbt = sum(inter_token_latencies) / len(inter_token_latencies) diff --git a/providers/cloudflare_provider.py b/providers/cloudflare_provider.py index 46e57ccf..3a4288b5 100644 --- a/providers/cloudflare_provider.py +++ b/providers/cloudflare_provider.py @@ -34,6 +34,8 @@ def __init__(self): "meta-llama-3.2-3b-instruct": "@cf/meta/llama-3.2-3b-instruct", "mistral-7b-instruct-v0.1": "@cf/mistral/mistral-7b-instruct-v0.1", "meta-llama-3.1-70b-instruct": "@cf/meta/llama-3.1-70b-instruct", + "meta-llama-2-7b":"@cf/meta/llama-2-7b-chat-fp16", + "deepseek-ai-R1-32b": "@cf/deepseek-ai/deepseek-r1-distill-qwen-32b", "common-model": "@cf/meta/llama-3.3-70b-instruct-fp8-fast", } diff --git a/test_files/providers/cloudflare_provider_test.py b/test_files/providers/cloudflare_provider_test.py index 1454e023..17d43237 100644 --- a/test_files/providers/cloudflare_provider_test.py +++ b/test_files/providers/cloudflare_provider_test.py @@ -28,6 +28,8 @@ def test_cloudflare_provider_initialization(setup_cloudflare_provider): "meta-llama-3.2-3b-instruct": "@cf/meta/llama-3.2-3b-instruct", "mistral-7b-instruct-v0.1": "@cf/mistral/mistral-7b-instruct-v0.1", "meta-llama-3.1-70b-instruct": "@cf/meta/llama-3.1-70b-instruct", + "meta-llama-2-7b":"@cf/meta/llama-2-7b-chat-fp16", + "deepseek-ai-R1-32b": "@cf/deepseek-ai/deepseek-r1-distill-qwen-32b", "common-model": "@cf/meta/llama-3.3-70b-instruct-fp8-fast", } From 6a58e71846e33f27faaf0dabb8f15bbc59c524b9 Mon Sep 17 00:00:00 2001 From: kavi-99 Date: Sun, 6 Apr 2025 18:33:24 +0800 Subject: [PATCH 02/17] update requirements --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4e1e54db..a8818d7b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,4 +18,5 @@ groq==0.13.0 google-generativeai==0.8.3 fastapi==0.115.6 uvicorn==0.32.1 -pytest-asyncio==0.25.0 \ No newline at end of file +pytest-asyncio==0.25.0 +azure-ai-inference \ No newline at end of file From 1afe79208b43d1664173bcabd6588a78a91c66da Mon Sep 17 00:00:00 2001 From: kavi-99 Date: Sun, 6 Apr 2025 18:45:20 +0800 Subject: [PATCH 03/17] fix flake --- providers/aws_provider.py | 2 +- providers/azure_provider.py | 6 +++--- providers/cloudflare_provider.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/providers/aws_provider.py b/providers/aws_provider.py index 9775b3ed..9b759755 100644 --- a/providers/aws_provider.py +++ b/providers/aws_provider.py @@ -28,7 +28,7 @@ def __init__(self): "meta-llama-3-8b-instruct": "meta.llama3-8b-instruct-v1:0", "mistral-48b-instruct-v0.1": "mistral.mixtral-8x7b-instruct-v0:1", "mistral-23b-instruct-v0.1": "mistral.mistral-small-2402-v1:0", - "mistral-124b-instruct-v0.1":"mistral.mistral-large-2402-v1:0", + "mistral-124b-instruct-v0.1": "mistral.mistral-large-2402-v1:0", "common-model": "meta.llama3-70b-instruct-v1:0", "common-model-small": "meta.llama3-8b-instruct-v1:0" } diff --git a/providers/azure_provider.py b/providers/azure_provider.py index 632f6731..fcdaaed4 100644 --- a/providers/azure_provider.py +++ b/providers/azure_provider.py @@ -1,9 +1,9 @@ import os -import requests +# import requests import numpy as np from providers.base_provider import ProviderInterface from time import perf_counter as timer -import re +# import re from azure.ai.inference import ChatCompletionsClient from azure.ai.inference.models import SystemMessage, UserMessage @@ -120,7 +120,7 @@ def perform_inference_streaming( return None inter_token_latencies = [] - endpoint = f"https://{model_id}.eastus.models.ai.azure.com/chat/completions" + # endpoint = f"https://{model_id}.eastus.models.ai.azure.com/chat/completions" # endpoint = f"https://ai-kavifyp0693ai007107396570.services.ai.azure.com/models" # print(model_id, endpoint) start_time = timer() diff --git a/providers/cloudflare_provider.py b/providers/cloudflare_provider.py index 3a4288b5..e42d2b7f 100644 --- a/providers/cloudflare_provider.py +++ b/providers/cloudflare_provider.py @@ -34,7 +34,7 @@ def __init__(self): "meta-llama-3.2-3b-instruct": "@cf/meta/llama-3.2-3b-instruct", "mistral-7b-instruct-v0.1": "@cf/mistral/mistral-7b-instruct-v0.1", "meta-llama-3.1-70b-instruct": "@cf/meta/llama-3.1-70b-instruct", - "meta-llama-2-7b":"@cf/meta/llama-2-7b-chat-fp16", + "meta-llama-2-7b": "@cf/meta/llama-2-7b-chat-fp16", "deepseek-ai-R1-32b": "@cf/deepseek-ai/deepseek-r1-distill-qwen-32b", "common-model": "@cf/meta/llama-3.3-70b-instruct-fp8-fast", } From f387abe5bf240446a054d51d654153b61773f458 Mon Sep 17 00:00:00 2001 From: kavi-99 Date: Sun, 6 Apr 2025 19:01:48 +0800 Subject: [PATCH 04/17] fix unit tests --- benchmarking/benchmark_main.py | 6 +++--- benchmarking/dynamo_bench.py | 6 +++--- test_files/main_test.py | 1 + 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/benchmarking/benchmark_main.py b/benchmarking/benchmark_main.py index c7a09c9a..4afe715e 100644 --- a/benchmarking/benchmark_main.py +++ b/benchmarking/benchmark_main.py @@ -156,9 +156,9 @@ def run(self): if self.verbosity: print(f"Request {i + 1}/{self.num_requests}") - if i % 20 == 0: - # print("[DEBUG] Sleeping for 2 mins to bypass rate limit...") - time.sleep(120) + # if i % 20 == 0: + # # print("[DEBUG] Sleeping for 2 mins to bypass rate limit...") + # time.sleep(120) if self.streaming: if provider_name == "vLLM": diff --git a/benchmarking/dynamo_bench.py b/benchmarking/dynamo_bench.py index 53dffb4c..1a3b1995 100644 --- a/benchmarking/dynamo_bench.py +++ b/benchmarking/dynamo_bench.py @@ -234,9 +234,9 @@ def run(self): if self.verbosity: print(f"Request {i + 1}/{self.num_requests}") - if i % 20 == 0: - # print("[DEBUG] Sleeping for 2 mins to bypass rate limit...") - time.sleep(120) + # if i % 20 == 0: + # # print("[DEBUG] Sleeping for 2 mins to bypass rate limit...") + # time.sleep(120) if self.streaming: if provider_name == "vLLM": diff --git a/test_files/main_test.py b/test_files/main_test.py index 47d7f29d..c3d0be52 100644 --- a/test_files/main_test.py +++ b/test_files/main_test.py @@ -38,6 +38,7 @@ def setUp(self): "AWS_BEDROCK_ACCESS_KEY_ID": "aws_test_id", "AWS_BEDROCK_SECRET_ACCESS_KEY": "aws_test_key", "AWS_BEDROCK_REGION": "us-east-1", + "AZURE_API": "test_azure_api", }, ) From f86ecc71bfb4b551540fb882e3d0595d6f78879c Mon Sep 17 00:00:00 2001 From: kavi-99 Date: Sun, 6 Apr 2025 19:07:34 +0800 Subject: [PATCH 05/17] fix unit tests --- test_files/main_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_files/main_test.py b/test_files/main_test.py index c3d0be52..9bb34a39 100644 --- a/test_files/main_test.py +++ b/test_files/main_test.py @@ -38,7 +38,7 @@ def setUp(self): "AWS_BEDROCK_ACCESS_KEY_ID": "aws_test_id", "AWS_BEDROCK_SECRET_ACCESS_KEY": "aws_test_key", "AWS_BEDROCK_REGION": "us-east-1", - "AZURE_API": "test_azure_api", + "AZURE_API_KEY": "test_azure_api", }, ) From daf1714902adacfee8f96f716d5dc411fd31546b Mon Sep 17 00:00:00 2001 From: kavi-99 Date: Sun, 6 Apr 2025 19:09:50 +0800 Subject: [PATCH 06/17] fix flake --- benchmarking/benchmark_main.py | 6 +++--- benchmarking/dynamo_bench.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarking/benchmark_main.py b/benchmarking/benchmark_main.py index 4afe715e..c7a09c9a 100644 --- a/benchmarking/benchmark_main.py +++ b/benchmarking/benchmark_main.py @@ -156,9 +156,9 @@ def run(self): if self.verbosity: print(f"Request {i + 1}/{self.num_requests}") - # if i % 20 == 0: - # # print("[DEBUG] Sleeping for 2 mins to bypass rate limit...") - # time.sleep(120) + if i % 20 == 0: + # print("[DEBUG] Sleeping for 2 mins to bypass rate limit...") + time.sleep(120) if self.streaming: if provider_name == "vLLM": diff --git a/benchmarking/dynamo_bench.py b/benchmarking/dynamo_bench.py index 1a3b1995..53dffb4c 100644 --- a/benchmarking/dynamo_bench.py +++ b/benchmarking/dynamo_bench.py @@ -234,9 +234,9 @@ def run(self): if self.verbosity: print(f"Request {i + 1}/{self.num_requests}") - # if i % 20 == 0: - # # print("[DEBUG] Sleeping for 2 mins to bypass rate limit...") - # time.sleep(120) + if i % 20 == 0: + # print("[DEBUG] Sleeping for 2 mins to bypass rate limit...") + time.sleep(120) if self.streaming: if provider_name == "vLLM": From 014da9c9c70ca1ab508854997355230e5e1706f3 Mon Sep 17 00:00:00 2001 From: kavi-99 Date: Mon, 7 Apr 2025 15:03:58 +0800 Subject: [PATCH 07/17] increase delay --- benchmarking/dynamo_bench.py | 2 +- benchmarking/experiments/compare_providers_streaming.json | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarking/dynamo_bench.py b/benchmarking/dynamo_bench.py index 53dffb4c..a477f2d5 100644 --- a/benchmarking/dynamo_bench.py +++ b/benchmarking/dynamo_bench.py @@ -236,7 +236,7 @@ def run(self): if i % 20 == 0: # print("[DEBUG] Sleeping for 2 mins to bypass rate limit...") - time.sleep(120) + time.sleep(600) if self.streaming: if provider_name == "vLLM": diff --git a/benchmarking/experiments/compare_providers_streaming.json b/benchmarking/experiments/compare_providers_streaming.json index 9f90ccec..6c91d007 100644 --- a/benchmarking/experiments/compare_providers_streaming.json +++ b/benchmarking/experiments/compare_providers_streaming.json @@ -1,12 +1,12 @@ { "providers": [ + "Hyperbolic", + "Groq", "Anthropic", "AWSBedrock", "Azure", "Cloudflare", "Google", - "Groq", - "Hyperbolic", "OpenAI", "PerplexityAI", "TogetherAI" From cfc3f31e488b201b008084954857a573b93a2240 Mon Sep 17 00:00:00 2001 From: kavi-99 Date: Mon, 7 Apr 2025 15:10:51 +0800 Subject: [PATCH 08/17] update workflow w azure api --- .github/workflows/compare_providers_streaming.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/compare_providers_streaming.yml b/.github/workflows/compare_providers_streaming.yml index 21798c80..e22bca05 100644 --- a/.github/workflows/compare_providers_streaming.yml +++ b/.github/workflows/compare_providers_streaming.yml @@ -30,6 +30,7 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_REGION: ${{ secrets.AWS_REGION }} MISTRAL_LARGE_API: ${{ secrets.MISTRAL_LARGE_API }} + AZURE_API_KEY: ${{ secrects.AZURE_API_KEY }} steps: - name: Check out repository From 4d41d957371fcfe9ba200d40183f616c5e953ceb Mon Sep 17 00:00:00 2001 From: kavi-99 Date: Mon, 7 Apr 2025 15:11:54 +0800 Subject: [PATCH 09/17] update workflow w azure api --- .github/workflows/compare_providers_streaming.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/compare_providers_streaming.yml b/.github/workflows/compare_providers_streaming.yml index e22bca05..24e8cfd3 100644 --- a/.github/workflows/compare_providers_streaming.yml +++ b/.github/workflows/compare_providers_streaming.yml @@ -30,7 +30,7 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_REGION: ${{ secrets.AWS_REGION }} MISTRAL_LARGE_API: ${{ secrets.MISTRAL_LARGE_API }} - AZURE_API_KEY: ${{ secrects.AZURE_API_KEY }} + AZURE_API_KEY: ${{ secrets.AZURE_API_KEY }} steps: - name: Check out repository From a78628dc215968b7aec9bdfca5c08998cbdb4f5e Mon Sep 17 00:00:00 2001 From: kavi-99 Date: Mon, 7 Apr 2025 16:04:56 +0800 Subject: [PATCH 10/17] update delay time --- benchmarking/dynamo_bench.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarking/dynamo_bench.py b/benchmarking/dynamo_bench.py index a477f2d5..6cf02baa 100644 --- a/benchmarking/dynamo_bench.py +++ b/benchmarking/dynamo_bench.py @@ -236,7 +236,7 @@ def run(self): if i % 20 == 0: # print("[DEBUG] Sleeping for 2 mins to bypass rate limit...") - time.sleep(600) + time.sleep(400) if self.streaming: if provider_name == "vLLM": From 2d013a32e5e1d6201a44c8ba93a9562096c9661e Mon Sep 17 00:00:00 2001 From: kavi-99 Date: Mon, 7 Apr 2025 18:40:04 +0800 Subject: [PATCH 11/17] add delay --- benchmarking/dynamo_bench.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarking/dynamo_bench.py b/benchmarking/dynamo_bench.py index 6cf02baa..fdd47ff1 100644 --- a/benchmarking/dynamo_bench.py +++ b/benchmarking/dynamo_bench.py @@ -236,7 +236,7 @@ def run(self): if i % 20 == 0: # print("[DEBUG] Sleeping for 2 mins to bypass rate limit...") - time.sleep(400) + time.sleep(300) if self.streaming: if provider_name == "vLLM": From f3a60eeabd5bc1e248964a0077e871f23794c30d Mon Sep 17 00:00:00 2001 From: kavi-99 Date: Mon, 7 Apr 2025 19:03:16 +0800 Subject: [PATCH 12/17] more frequent delay --- benchmarking/dynamo_bench.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarking/dynamo_bench.py b/benchmarking/dynamo_bench.py index fdd47ff1..c8e1a78e 100644 --- a/benchmarking/dynamo_bench.py +++ b/benchmarking/dynamo_bench.py @@ -234,9 +234,10 @@ def run(self): if self.verbosity: print(f"Request {i + 1}/{self.num_requests}") - if i % 20 == 0: + if i % 5 == 0 and (provider_name == "Groq" or provider_name == "Hyperbolic"): # print("[DEBUG] Sleeping for 2 mins to bypass rate limit...") - time.sleep(300) + time.sleep(100) + print("finished sleeping") if self.streaming: if provider_name == "vLLM": From 587f40cdb07d5b9d0555ea880c13de5a99721a22 Mon Sep 17 00:00:00 2001 From: kavi-99 Date: Mon, 7 Apr 2025 22:15:08 +0800 Subject: [PATCH 13/17] more frequent delay --- benchmarking/dynamo_bench.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarking/dynamo_bench.py b/benchmarking/dynamo_bench.py index c8e1a78e..9d30a6dd 100644 --- a/benchmarking/dynamo_bench.py +++ b/benchmarking/dynamo_bench.py @@ -234,7 +234,7 @@ def run(self): if self.verbosity: print(f"Request {i + 1}/{self.num_requests}") - if i % 5 == 0 and (provider_name == "Groq" or provider_name == "Hyperbolic"): + if i % 2 == 0 and (provider_name == "Groq" or provider_name == "Hyperbolic"): # print("[DEBUG] Sleeping for 2 mins to bypass rate limit...") time.sleep(100) print("finished sleeping") From df0cc74496d4a29647cf3c1bfa6ce76aa2de4779 Mon Sep 17 00:00:00 2001 From: kavi-99 Date: Tue, 8 Apr 2025 09:07:44 +0800 Subject: [PATCH 14/17] update timeout --- .../compare_providers_streaming.json | 2 +- providers/azure_provider.py | 70 ------------------- providers/base_provider.py | 4 +- test_files/providers/base_provider_test.py | 4 +- 4 files changed, 5 insertions(+), 75 deletions(-) diff --git a/benchmarking/experiments/compare_providers_streaming.json b/benchmarking/experiments/compare_providers_streaming.json index 6c91d007..72474a59 100644 --- a/benchmarking/experiments/compare_providers_streaming.json +++ b/benchmarking/experiments/compare_providers_streaming.json @@ -1,7 +1,7 @@ { "providers": [ - "Hyperbolic", "Groq", + "Hyperbolic", "Anthropic", "AWSBedrock", "Azure", diff --git a/providers/azure_provider.py b/providers/azure_provider.py index fcdaaed4..14255627 100644 --- a/providers/azure_provider.py +++ b/providers/azure_provider.py @@ -66,22 +66,6 @@ def perform_inference(self, model, prompt, max_output=100, verbosity=True): print(f"Model {model} not available.") return None start_time = timer() - # endpoint = f"https://{model_id}.eastus.models.ai.azure.com/chat/completions" - # response = requests.post( - # f"{endpoint}", - # headers={ - # "Authorization": f"Bearer {api_key}", - # "Content-Type": "application/json", - # }, - # json={ - # "messages": [ - # {"role": "system", "content": self.system_prompt}, - # {"role": "user", "content": prompt}, - # ], - # "max_tokens": max_output, - # }, - # timeout=500, - # ) response = self.client.complete( messages=[ @@ -125,26 +109,6 @@ def perform_inference_streaming( # print(model_id, endpoint) start_time = timer() try: - # response = requests.post( - # f"{endpoint}", - # headers={ - # "Authorization": f"Bearer {api_key}", - # "Content-Type": "application/json", - # }, - # json={ - # # "model": model_id, - # "messages": [ - # # {"role": "system", "content": self.system_prompt + "\nThe number appended at the end is not important."}, - # # {"role": "user", "content": prompt + " " + str(timer())}, - # {"role": "system", "content": self.system_prompt}, - # {"role": "user", "content": prompt}, - # ], - # "max_tokens": max_output, - # "stream": True, - # }, - # stream=True, - # timeout=500, - # ) response = self.client.complete( stream=True, messages=[ @@ -172,40 +136,6 @@ def perform_inference_streaming( print(update.choices[0].delta.content or "", end="") total_time = timer() - start_time - # for line in response.iter_lines(): - # if line: - # # print(line) - # if first_token_time is None: - # # print(line) - # first_token_time = timer() - # ttft = first_token_time - start_time - # prev_token_time = first_token_time - # if verbosity: - # print(f"##### Time to First Token (TTFT): {ttft:.4f} seconds\n") - - # line_str = line.decode("utf-8").strip() - - # if line_str == "data: [DONE]": - # # print(line_str) - # # print("here") - # total_time = timer() - start_time - # break - - # # Capture token timing - # time_to_next_token = timer() - # inter_token_latency = time_to_next_token - prev_token_time - # prev_token_time = time_to_next_token - # inter_token_latencies.append(inter_token_latency) - - # # Display token if verbosity is enabled - # match = re.search(r'"content"\s*:\s*"(.*?)"', line_str) - # if match: - # print(match.group(1), end="") - # # if verbosity: - # # if len(inter_token_latencies) < 20: - # # print(line_str[19:].split('"')[5], end="") - # # elif len(inter_token_latencies) == 20: - # # print("...") # Calculate total metrics diff --git a/providers/base_provider.py b/providers/base_provider.py index 88ed152a..6d6ddf4a 100644 --- a/providers/base_provider.py +++ b/providers/base_provider.py @@ -34,7 +34,7 @@ def perform_inference(self, model, prompt, max_output=100, verbosity=True): {"role": "user", "content": prompt}, ], max_tokens=max_output, - timeout=(1, 2) + timeout=(10, 100) ) elapsed = timer() - start self.log_metrics(model, "response_times", elapsed) @@ -65,7 +65,7 @@ def perform_inference_streaming( ], stream=True, max_tokens=max_output, - timeout=(1, 2) + timeout=(10, 100) ) for chunk in response: diff --git a/test_files/providers/base_provider_test.py b/test_files/providers/base_provider_test.py index d2eccd26..f08f503f 100644 --- a/test_files/providers/base_provider_test.py +++ b/test_files/providers/base_provider_test.py @@ -37,7 +37,7 @@ def test_perform_inference( {"role": "user", "content": "What is the test prompt?"}, ], max_tokens=100, - timeout=(1,2) + timeout=(10,100) ) mock_log_metrics.assert_called_with("test-model", "response_times", 1.0) @@ -86,7 +86,7 @@ def test_perform_inference_streaming( ], stream=True, max_tokens=100, - timeout=(1,2) + timeout=(10,100) ) avg_tbt = sum([0.5, 0.5]) / len([0.5, 0.5]) From 90e8d158471e6733f94dffacdaef3d6c40921dc1 Mon Sep 17 00:00:00 2001 From: kavi-99 Date: Mon, 21 Apr 2025 17:21:25 +0800 Subject: [PATCH 15/17] update experiment --- benchmarking/dynamo_bench.py | 4 ++-- benchmarking/experiments/compare_providers_streaming.json | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarking/dynamo_bench.py b/benchmarking/dynamo_bench.py index 9d30a6dd..8f605fa5 100644 --- a/benchmarking/dynamo_bench.py +++ b/benchmarking/dynamo_bench.py @@ -234,9 +234,9 @@ def run(self): if self.verbosity: print(f"Request {i + 1}/{self.num_requests}") - if i % 2 == 0 and (provider_name == "Groq" or provider_name == "Hyperbolic"): + if i % 10 == 0: # print("[DEBUG] Sleeping for 2 mins to bypass rate limit...") - time.sleep(100) + time.sleep(120) print("finished sleeping") if self.streaming: diff --git a/benchmarking/experiments/compare_providers_streaming.json b/benchmarking/experiments/compare_providers_streaming.json index 72474a59..179de4bd 100644 --- a/benchmarking/experiments/compare_providers_streaming.json +++ b/benchmarking/experiments/compare_providers_streaming.json @@ -2,14 +2,14 @@ "providers": [ "Groq", "Hyperbolic", - "Anthropic", "AWSBedrock", "Azure", "Cloudflare", - "Google", "OpenAI", - "PerplexityAI", - "TogetherAI" + "TogetherAI", + "Anthropic", + "Google", + "PerplexityAI" ], "models": [ "common-model" From dc85748aef78684cf538019fe0764be69a7630cd Mon Sep 17 00:00:00 2001 From: kavi-99 Date: Mon, 21 Apr 2025 17:35:10 +0800 Subject: [PATCH 16/17] update azure common model --- providers/azure_provider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/providers/azure_provider.py b/providers/azure_provider.py index 14255627..ae9cec12 100644 --- a/providers/azure_provider.py +++ b/providers/azure_provider.py @@ -30,7 +30,7 @@ def __init__(self): "mistral-large": "Mistral-Large-2411-yatcd", "mistral-23b-instruct-v0.1": "Mistral-small", "meta-llama-3.1-405b-instruct": "Meta-Llama-3.1-405B-Instruct", - "common-model": "Mistral-Large-2411-yatcd", + "common-model": "Llama-3.3-70B-Instruct", } # Define API keys for each model From 8c46a08a7df2f32638096075098b91bc43e32fa1 Mon Sep 17 00:00:00 2001 From: kavi-99 Date: Tue, 3 Jun 2025 15:45:21 +0800 Subject: [PATCH 17/17] added input and output experiments --- .gitignore | 5 +- benchmarking/benchmark_main.py | 186 ++++++++++++------ .../compare_providers_streaming.json | 5 +- .../single_model/groq.json | 7 +- benchmarking/experiments/sample_config.json | 8 +- main.py | 45 +++-- providers/aws_provider.py | 54 +++-- providers/azure_provider.py | 33 ++-- providers/base_provider.py | 27 +-- providers/provider_interface.py | 30 ++- providers/together_ai_provider.py | 3 +- utils/prompt_generator.py | 19 +- 12 files changed, 278 insertions(+), 144 deletions(-) diff --git a/.gitignore b/.gitignore index 8c56d88f..e0ffce2b 100644 --- a/.gitignore +++ b/.gitignore @@ -8,7 +8,7 @@ dynamodb/docker/docker/dynamodb/shared-local-instance.db .coverage benchmark_graph/* .pytest_cache/* - +*.ipynb # Testing and coverage coverage.xml @@ -17,6 +17,9 @@ htmlcov # Notebooks notebooks/.ipynb_checkpoints/ +.ipynb_checkpoints/ +exp_dir/ +experiments/ azure-test.py \ No newline at end of file diff --git a/benchmarking/benchmark_main.py b/benchmarking/benchmark_main.py index c7a09c9a..c6a27872 100644 --- a/benchmarking/benchmark_main.py +++ b/benchmarking/benchmark_main.py @@ -4,6 +4,9 @@ import time from datetime import datetime from matplotlib.ticker import LogLocator, FormatStrFormatter +import random +from utils.db_utils import save_flattened_metrics_to_csv + class Benchmark: """ @@ -25,11 +28,12 @@ def __init__( providers, num_requests, models, - max_output, - prompt, + outputs, + prompts, streaming=False, verbosity=False, - vllm_ip=None + vllm_ip=None, + exp_dir=None ): """ Initializes the Benchmark instance with provided parameters. @@ -46,11 +50,15 @@ def __init__( self.providers = providers self.num_requests = num_requests self.models = models - self.prompt = prompt + self.prompts = prompts self.streaming = streaming - self.max_output = max_output + self.outputs = outputs self.verbosity = verbosity self.vllm_ip = vllm_ip + self.exp_dir = exp_dir + # New parameters for retry mechanism + self.max_retries = 3 + self.base_timeout = 10 base_dir = "streaming" if streaming else "end_to_end" @@ -77,33 +85,40 @@ def plot_metrics(self, metric, filename_suffix): for provider in self.providers: provider_name = provider.__class__.__name__ - for model, latencies in provider.metrics[metric].items(): - # Convert to milliseconds and sort for CDF - latencies_sorted = np.sort(latencies) * 1000 - cdf = np.arange(1, len(latencies_sorted) + 1) / len(latencies_sorted) - model_name = provider.get_model_name(model) + save_flattened_metrics_to_csv(provider, metric, f"{self.exp_dir}/{metric}_logs.csv") + print(provider.metrics[metric].items()) + for model, input_dict in provider.metrics[metric].items(): + for input_size, output_dict in input_dict.items(): + for max_output, latencies in output_dict.items(): + # Convert to milliseconds and sort for CDF + print(model, input_size, len(latencies)) + latencies_sorted = np.sort(latencies) * 1000 + cdf = np.arange(1, len(latencies_sorted) + 1) / len(latencies_sorted) + model_name = provider.get_model_name(model) - if provider_name.lower() == "vllm": - plt.plot( - latencies_sorted, - cdf, - marker="o", - linestyle="-", - markersize=6, # Slightly larger marker size - color="black", # Black color for the marker - label=f"{provider_name} - {model_name}", - linewidth=2, # Bold line - ) - else: - plt.plot( - latencies_sorted, - cdf, - marker="o", - linestyle="-", - markersize=5, - label=f"{provider_name} - {model_name}", - ) - + if provider_name.lower() == "vllm": + plt.plot( + latencies_sorted, + cdf, + marker="o", + linestyle="-", + markersize=6, # Slightly larger marker size + color="black", # Black color for the marker + # label=f"{provider_name} - {model_name}", + label=f"{provider_name}", + linewidth=2, # Bold line + ) + else: + plt.plot( + latencies_sorted, + cdf, + marker="o", + linestyle="-", + markersize=5, + # label=f"{provider_name} - {model_name}", + label=f"{provider_name} - {max_output}", + ) + plt.xlabel("Latency (ms)", fontsize=12) plt.ylabel("Portion of requests", fontsize=12) plt.grid(True) @@ -147,38 +162,94 @@ def run(self): for provider in self.providers: provider_name = provider.__class__.__name__ # logging.debug(f"{provider_name}") - print(f"{provider_name}") + # print(f"{provider_name}") for model in self.models: model_name = provider.get_model_name(model) - print(f"Model: {model_name}\nPrompt: {self.prompt}") + # print(f"Model: {model_name}") + + for prompt in self.prompts: + # print(f"Prompt: {prompt}") + for max_output in self.outputs: + for i in range(self.num_requests): + if self.verbosity: + print(f"Request {i + 1}/{self.num_requests}") + print(f"{provider_name}" + f" Model: {model_name}") - for i in range(self.num_requests): - if self.verbosity: - print(f"Request {i + 1}/{self.num_requests}") + # if ((i+1) % 10) == 0: + # # print("[DEBUG] Sleeping for 2 mins to bypass rate limit...") + # time.sleep(120) - if i % 20 == 0: - # print("[DEBUG] Sleeping for 2 mins to bypass rate limit...") - time.sleep(120) + if self.streaming: + if provider_name == "vLLM": + provider.perform_inference_streaming( + model, prompt, self.vllm_ip, max_output, self.verbosity + ) + else: + provider.perform_inference_streaming( + model, prompt, max_output, self.verbosity + ) + else: + if provider_name == "vLLM": + provider.perform_inference( + model, prompt, self.vllm_ip, max_output, self.verbosity + ) + else: + provider.perform_inference( + model, prompt, max_output, self.verbosity + ) - if self.streaming: - if provider_name == "vLLM": - provider.perform_inference_streaming( - model, self.prompt, self.vllm_ip, self.max_output, self.verbosity - ) - else: - provider.perform_inference_streaming( - model, self.prompt, self.max_output, self.verbosity - ) - else: - if provider_name == "vLLM": - provider.perform_inference( - model, self.prompt, self.vllm_ip, self.max_output, self.verbosity - ) - else: - provider.perform_inference( - model, self.prompt, self.max_output, self.verbosity - ) + # # Simple retry mechanism + + # success = False + # attempts = 0 + + # while not success and attempts < self.max_retries: + # try: + # # Calculate timeout with exponential backoff + # timeout = self.base_timeout * (2 ** attempts) + # print(timeout) + + # if self.streaming: + # if provider_name == "vLLM": + # provider.perform_inference_streaming( + # model, self.prompt, self.vllm_ip, + # self.max_output, self.verbosity, timeout=timeout + # ) + # else: + # provider.perform_inference_streaming( + # model, self.prompt, timeout, self.max_output, + # self.verbosity + # ) + # else: + # if provider_name == "vLLM": + # provider.perform_inference( + # model, self.prompt, self.vllm_ip, + # self.max_output, self.verbosity, timeout=timeout + # ) + # else: + # provider.perform_inference( + # model, self.prompt, self.max_output, + # self.verbosity, timeout=timeout + # ) + + # # If we get here, the request was successful + # success = True + + # except Exception as e: + # attempts += 1 + # if self.verbosity: + # print(f"Request failed (attempt {attempts}/{self.max_retries}): {str(e)}") + + # # Add some jitter to avoid request storms on retry + # if attempts < self.max_retries: + # jitter = random.uniform(0.5, 1.5) + # wait_time = min(30, (2 ** attempts) * jitter) + # time.sleep(wait_time) + + # # Record failed request if all attempts failed + # if not success: + # print(f"Request {i+1} failed after {self.max_retries} attempts") if not self.streaming: self.plot_metrics("response_times", "response_times") else: @@ -188,3 +259,4 @@ def run(self): self.plot_metrics("timebetweentokens", "timebetweentokens") self.plot_metrics("timebetweentokens_median", "timebetweentokens_median") self.plot_metrics("timebetweentokens_p95", "timebetweentokens_p95") + self.plot_metrics("totaltokens", "totaltokens") diff --git a/benchmarking/experiments/compare_providers_streaming.json b/benchmarking/experiments/compare_providers_streaming.json index 179de4bd..f38ce34e 100644 --- a/benchmarking/experiments/compare_providers_streaming.json +++ b/benchmarking/experiments/compare_providers_streaming.json @@ -14,10 +14,9 @@ "models": [ "common-model" ], - "num_requests": 100, + "num_requests": 1, "input_tokens": 10, "streaming": true, "max_output": 100, - "verbose": true, - "backend": true + "verbose": true } \ No newline at end of file diff --git a/benchmarking/experiments/providers_streaming/single_model/groq.json b/benchmarking/experiments/providers_streaming/single_model/groq.json index 8c3f28f7..276f96e2 100644 --- a/benchmarking/experiments/providers_streaming/single_model/groq.json +++ b/benchmarking/experiments/providers_streaming/single_model/groq.json @@ -3,12 +3,11 @@ "Groq" ], "models": [ - "google-gemma-7b-it" + "common-model" ], - "num_requests": 100, + "num_requests": 3, "input_tokens": 10, "streaming": true, "max_output": 100, - "verbose": true, - "backend": true + "verbose": true } \ No newline at end of file diff --git a/benchmarking/experiments/sample_config.json b/benchmarking/experiments/sample_config.json index e79f6667..20b7c41c 100644 --- a/benchmarking/experiments/sample_config.json +++ b/benchmarking/experiments/sample_config.json @@ -3,11 +3,11 @@ "Azure" ], "models": [ - "mistral-23b-instruct-v0.1" + "common-model-small" ], - "num_requests": 1, - "input_tokens": 10, + "num_requests": 10, + "input_tokens": [100], "streaming": true, - "max_output": 100, + "max_output": [10000], "verbose": true } \ No newline at end of file diff --git a/main.py b/main.py index f69cc95a..c7caa3f8 100644 --- a/main.py +++ b/main.py @@ -17,6 +17,7 @@ vLLM ) from utils.prompt_generator import get_prompt +from utils.db_utils import create_experiment_folder, save_config # Load environment variables load_dotenv() @@ -41,7 +42,7 @@ input_sizes = [10, 100, 1000, 10000, 100000] # Define possible max output tokens -OUTPUT_SIZE_UPPER_LIMIT = 5000 +OUTPUT_SIZE_UPPER_LIMIT = 100000 OUTPUT_SIZE_LOWER_LIMIT = 100 @@ -149,15 +150,17 @@ def validate_selected_models(selected_models, common_models, selected_providers) # Main function to run the benchmark def run_benchmark(config, vllm_ip=None): + exp_id, exp_dir = create_experiment_folder() + save_config(config, exp_dir) """Runs the benchmark based on the given configuration.""" providers = config.get("providers", []) num_requests = config.get("num_requests", 1) models = config.get("models", []) - input_tokens = config.get("input_tokens", 10) - # input_tokens = config.get("input_tokens", [10]) + # input_tokens = config.get("input_tokens", 10) + input_tokens = config.get("input_tokens", [10]) streaming = config.get("streaming", False) - max_output = config.get("max_output", 100) - # max_output = config.get("max_output", [100]) + # max_output = config.get("max_output", 100) + outputs = config.get("max_output", [100]) verbose = config.get("verbose", False) backend = config.get("backend", False) # Select Benchmark class based on backend flag @@ -193,30 +196,36 @@ def run_benchmark(config, vllm_ip=None): print(f"Selected Models: {valid_models}") # handling input tokens - if input_tokens not in input_sizes: - print(f"Please enter an input token from the following choices: {input_sizes}") - return - - prompt = get_prompt(input_tokens) + for input_token_size in input_tokens: + if input_token_size not in input_sizes: + print(f"Please enter an input token from the following choices: {input_sizes}") + return + + prompts = [] + for input_token_size in input_tokens: + prompt = get_prompt(input_token_size) + prompts.append(prompt) # print(f"Prompt: {prompt}") - if max_output < OUTPUT_SIZE_LOWER_LIMIT or max_output > OUTPUT_SIZE_UPPER_LIMIT: - print( - f"Please enter an output token length between \ - {OUTPUT_SIZE_LOWER_LIMIT} and {OUTPUT_SIZE_UPPER_LIMIT}." - ) - return + for max_output in outputs: + if max_output < OUTPUT_SIZE_LOWER_LIMIT or max_output > OUTPUT_SIZE_UPPER_LIMIT: + print( + f"Please enter an output token length between \ + {OUTPUT_SIZE_LOWER_LIMIT} and {OUTPUT_SIZE_UPPER_LIMIT}." + ) + return print("\nRunning benchmark...") benchmark = Benchmark( selected_providers, num_requests, valid_models, - max_output, - prompt=prompt, + outputs, + prompts=prompts, streaming=streaming, verbosity=verbose, vllm_ip=vllm_ip, + exp_dir=exp_dir ) benchmark.run() diff --git a/providers/aws_provider.py b/providers/aws_provider.py index 9b759755..e4e3b42b 100644 --- a/providers/aws_provider.py +++ b/providers/aws_provider.py @@ -5,6 +5,7 @@ import numpy as np from dotenv import load_dotenv from providers.provider_interface import ProviderInterface +import math class AWSBedrock(ProviderInterface): @@ -30,7 +31,7 @@ def __init__(self): "mistral-23b-instruct-v0.1": "mistral.mistral-small-2402-v1:0", "mistral-124b-instruct-v0.1": "mistral.mistral-large-2402-v1:0", "common-model": "meta.llama3-70b-instruct-v1:0", - "common-model-small": "meta.llama3-8b-instruct-v1:0" + "common-model-small": "meta.llama3-1-8b-instruct-v1:0" } def get_model_name(self, model): @@ -45,7 +46,7 @@ def format_prompt(self, user_prompt): """ return f""" <|begin_of_text|><|start_header_id|>system<|end_header_id|> - {self.system_prompt} + You are an obedient assistant. <|start_header_id|>user<|end_header_id|> {user_prompt} <|eot_id|> @@ -61,6 +62,7 @@ def perform_inference(self, model, prompt, max_output=100, verbosity=True): model_id = self.get_model_name(model) formatted_prompt = self.format_prompt(prompt) print(formatted_prompt) + print(max_output) # Prepare the request payload native_request = { "prompt": formatted_prompt, @@ -113,11 +115,14 @@ def perform_inference_streaming( "prompt": formatted_prompt, max_tokens_config: max_output, } + print(max_output) request_body = json.dumps(native_request) inter_token_latencies = [] first_token_time = None ttft = None + total_tokens = 0 + print(prompt) start_time = time.perf_counter() try: streaming_response = self.bedrock_client.invoke_model_with_response_stream( @@ -127,6 +132,7 @@ def perform_inference_streaming( # Process the streaming response for event in streaming_response["body"]: if event: + # print(event) try: # print(f"[DEBUG] {event}") chunk = json.loads(event["chunk"]["bytes"].decode("utf-8")) @@ -137,7 +143,9 @@ def perform_inference_streaming( if chunk.get("stop_reason") == "length" or chunk.get("outputs", [{}])[0].get("stop_reason") == "length": total_time = time.perf_counter() - start_time - print(chunk) + # print(chunk) + total_tokens = chunk['generation_token_count'] + print(f"Total tokens (real): {chunk['generation_token_count']}") break if "outputs" in chunk or 'generation' in chunk: @@ -162,11 +170,14 @@ def perform_inference_streaming( inter_token_latency = time_to_next_token - prev_token_time prev_token_time = time_to_next_token inter_token_latencies.append(inter_token_latency) - if verbosity: - if len(inter_token_latencies) < 20: - print(current_token, end="") # Print the token - elif len(inter_token_latencies) == 21: - print("...") + print(current_token, end=" | ") + + # if verbosity: + # if len(inter_token_latencies) < 20: + # print(current_token, end="") + # elif len(inter_token_latencies) == 21: + # print("...") + # Measure total response time total_time = time.perf_counter() - start_time @@ -176,16 +187,17 @@ def perform_inference_streaming( avg_tbt = sum(inter_token_latencies) / len(inter_token_latencies) median = np.percentile(inter_token_latencies, 50) p95 = np.percentile(inter_token_latencies, 95) - print("[INFO] avg_tbt - ", avg_tbt, median, p95) - self.log_metrics(model, "timetofirsttoken", ttft) - self.log_metrics(model, "response_times", total_time) - self.log_metrics(model, "timebetweentokens", avg_tbt) + # print("[INFO] avg_tbt - ", avg_tbt, median, p95, len(inter_token_latencies)) + # print(prompt, 10 ** math.ceil(math.log10(10 ** math.ceil(math.log10(len(prompt.split(" "))))))) + self.log_metrics(model_name=model, input_size=10 ** math.ceil(math.log10(len(prompt.split(" ")))), max_output=max_output, metric="timetofirsttoken", value=ttft) + self.log_metrics(model, 10 ** math.ceil(math.log10(len(prompt.split(" ")))), max_output, "response_times", total_time) + self.log_metrics(model, 10 ** math.ceil(math.log10(len(prompt.split(" ")))), max_output, "timebetweentokens", avg_tbt) # print(median, p95) - self.log_metrics(model, "timebetweentokens_median", median) - self.log_metrics(model, "timebetweentokens_p95", p95) - self.log_metrics(model, "totaltokens", len(inter_token_latencies) + 1) + self.log_metrics(model, 10 ** math.ceil(math.log10(len(prompt.split(" ")))), max_output, "timebetweentokens_median", median) + self.log_metrics(model, 10 ** math.ceil(math.log10(len(prompt.split(" ")))), max_output, "timebetweentokens_p95", p95) + self.log_metrics(model, 10 ** math.ceil(math.log10(len(prompt.split(" ")))), max_output, "totaltokens", total_tokens) self.log_metrics( - model, "tps", (len(inter_token_latencies) + 1) / total_time + model, 10 ** math.ceil(math.log10(len(prompt.split(" ")))), max_output, "tps", (len(inter_token_latencies) + 1) / total_time ) return total_time, inter_token_latencies @@ -199,14 +211,14 @@ def perform_inference_streaming( if __name__ == "__main__": aws_bedrock = AWSBedrock() model = "common-model" - prompt = "Tell me a story." + prompt = "Write a standalone narrative of exactly 100 000 tokens: a complete, self-contained fantasy saga with clear chapters, rising action, characters, dialogue, and resolution. Do not add any extra explanation—produce only the story text until you reach 100 000 tokens exact." # Single-prompt inference - generated_text, total_time = aws_bedrock.perform_inference( - model=model, prompt=prompt, max_output=100, verbosity=True - ) + # generated_text, total_time = aws_bedrock.perform_inference( + # model=model, prompt=prompt, max_output=100, verbosity=True + # ) # Streaming inference total_time, inter_token_latencies = aws_bedrock.perform_inference_streaming( - model=model, prompt=prompt, max_output=10, verbosity=True + model=model, prompt=prompt, max_output=10000, verbosity=True ) diff --git a/providers/azure_provider.py b/providers/azure_provider.py index ae9cec12..ac37bf07 100644 --- a/providers/azure_provider.py +++ b/providers/azure_provider.py @@ -4,6 +4,7 @@ from providers.base_provider import ProviderInterface from time import perf_counter as timer # import re +import math from azure.ai.inference import ChatCompletionsClient from azure.ai.inference.models import SystemMessage, UserMessage @@ -31,6 +32,7 @@ def __init__(self): "mistral-23b-instruct-v0.1": "Mistral-small", "meta-llama-3.1-405b-instruct": "Meta-Llama-3.1-405B-Instruct", "common-model": "Llama-3.3-70B-Instruct", + "common-model-small": "Meta-Llama-3.1-8B-Instruct" } # Define API keys for each model @@ -107,6 +109,7 @@ def perform_inference_streaming( # endpoint = f"https://{model_id}.eastus.models.ai.azure.com/chat/completions" # endpoint = f"https://ai-kavifyp0693ai007107396570.services.ai.azure.com/models" # print(model_id, endpoint) + total_time = 0 start_time = timer() try: response = self.client.complete( @@ -116,7 +119,8 @@ def perform_inference_streaming( UserMessage(content=prompt) ], max_tokens=max_output, - model=model_id + model=model_id, + temperature=1.5 # try 0.7 also ) first_token_time = None @@ -129,16 +133,23 @@ def perform_inference_streaming( if verbosity: print(f"##### Time to First Token (TTFT): {ttft:.4f} seconds\n") + if update.choices[0].finish_reason: + total_time = timer() - start_time + print() + print(update) + time_to_next_token = timer() inter_token_latency = time_to_next_token - prev_token_time prev_token_time = time_to_next_token inter_token_latencies.append(inter_token_latency) - print(update.choices[0].delta.content or "", end="") - - total_time = timer() - start_time + current_token = update.choices[0].delta.content + print(current_token, end=" ") + # if len(inter_token_latencies) < 20: + # print(current_token, end="") + # elif len(inter_token_latencies) == 21: + # print("...") # Calculate total metrics - if verbosity: print(f"\nTotal Response Time: {total_time:.4f} seconds") # print(inter_token_latencies) @@ -146,16 +157,16 @@ def perform_inference_streaming( # Log metrics avg_tbt = sum(inter_token_latencies) / len(inter_token_latencies) print(f"{avg_tbt:.4f}, {len(inter_token_latencies)}") - self.log_metrics(model, "timetofirsttoken", ttft) - self.log_metrics(model, "response_times", total_time) - self.log_metrics(model, "timebetweentokens", avg_tbt) + self.log_metrics(model, 10 ** math.ceil(math.log10(len(prompt.split(" ")))), max_output, "timetofirsttoken", ttft) + self.log_metrics(model, 10 ** math.ceil(math.log10(len(prompt.split(" ")))), max_output, "response_times", total_time) + self.log_metrics(model, 10 ** math.ceil(math.log10(len(prompt.split(" ")))), max_output, "timebetweentokens", avg_tbt) self.log_metrics( - model, "timebetweentokens_median", np.median(inter_token_latencies) + model, 10 ** math.ceil(math.log10(len(prompt.split(" ")))), max_output, "timebetweentokens_median", np.median(inter_token_latencies) ) self.log_metrics( - model, "timebetweentokens_p95", np.percentile(inter_token_latencies, 95) + model, 10 ** math.ceil(math.log10(len(prompt.split(" ")))), max_output, "timebetweentokens_p95", np.percentile(inter_token_latencies, 95) ) - self.log_metrics(model, "totaltokens", len(inter_token_latencies) + 1) + self.log_metrics(model, 10 ** math.ceil(math.log10(len(prompt.split(" ")))), max_output, "totaltokens", len(inter_token_latencies) + 1) except Exception as e: print(f"[ERROR] Streaming inference failed for model '{model}': {e}") diff --git a/providers/base_provider.py b/providers/base_provider.py index 6d6ddf4a..9daf79e2 100644 --- a/providers/base_provider.py +++ b/providers/base_provider.py @@ -1,8 +1,9 @@ # base_provider.py for chat completions api from timeit import default_timer as timer +import time import numpy as np from providers.provider_interface import ProviderInterface - +import math class BaseProvider(ProviderInterface): def __init__(self, api_key, client_class, base_url=None): @@ -20,7 +21,7 @@ def __init__(self, api_key, client_class, base_url=None): def get_model_name(self, model): return self.model_map.get(model, None) - def perform_inference(self, model, prompt, max_output=100, verbosity=True): + def perform_inference(self, model, prompt, timeout, max_output=100, verbosity=True): try: model_id = self.get_model_name(model) @@ -34,7 +35,7 @@ def perform_inference(self, model, prompt, max_output=100, verbosity=True): {"role": "user", "content": prompt}, ], max_tokens=max_output, - timeout=(10, 100) + # timeout=(10, 100) ) elapsed = timer() - start self.log_metrics(model, "response_times", elapsed) @@ -49,13 +50,13 @@ def perform_inference(self, model, prompt, max_output=100, verbosity=True): def perform_inference_streaming( self, model, prompt, max_output=100, verbosity=True ): + print(model, prompt, max_output, verbosity) try: model_id = self.get_model_name(model) if model_id is None: raise ValueError(f"Model {model} not available for provider.") first_token_time = None inter_token_latencies = [] - start = timer() response = self.client.chat.completions.create( model=model_id, @@ -65,9 +66,11 @@ def perform_inference_streaming( ], stream=True, max_tokens=max_output, - timeout=(10, 100) + # timeout=(10, 100) ) + # print(response) + for chunk in response: if first_token_time is None: first_token_time = timer() @@ -100,15 +103,15 @@ def perform_inference_streaming( print( f"\nNumber of output tokens/chunks: {len(inter_token_latencies) + 1}, Avg TBT: {avg_tbt:.4f}, Time to First Token (TTFT): {ttft:.4f} seconds, Total Response Time: {elapsed:.4f} seconds" ) - self.log_metrics(model, "timetofirsttoken", ttft) - self.log_metrics(model, "response_times", elapsed) - self.log_metrics(model, "timebetweentokens", avg_tbt) + self.log_metrics(model, 10 ** math.ceil(math.log10(len(prompt.split(" ")))), max_output, "timetofirsttoken", ttft) + self.log_metrics(model, 10 ** math.ceil(math.log10(len(prompt.split(" ")))), max_output, "response_times", elapsed) + self.log_metrics(model, 10 ** math.ceil(math.log10(len(prompt.split(" ")))), max_output, "timebetweentokens", avg_tbt) median = np.percentile(inter_token_latencies, 50) p95 = np.percentile(inter_token_latencies, 95) - self.log_metrics(model, "timebetweentokens_median", median) - self.log_metrics(model, "timebetweentokens_p95", p95) - self.log_metrics(model, "totaltokens", len(inter_token_latencies) + 1) - self.log_metrics(model, "tps", (len(inter_token_latencies) + 1) / elapsed) + self.log_metrics(model, 10 ** math.ceil(math.log10(len(prompt.split(" ")))), max_output, "timebetweentokens_median", median) + self.log_metrics(model, 10 ** math.ceil(math.log10(len(prompt.split(" ")))), max_output, "timebetweentokens_p95", p95) + self.log_metrics(model, 10 ** math.ceil(math.log10(len(prompt.split(" ")))), max_output, "totaltokens", len(inter_token_latencies) + 1) + self.log_metrics(model, 10 ** math.ceil(math.log10(len(prompt.split(" ")))), max_output, "tps", (len(inter_token_latencies) + 1) / elapsed) except Exception as e: print(f"[ERROR] Streaming inference failed for model '{model}': {e}") diff --git a/providers/provider_interface.py b/providers/provider_interface.py index 8e3351ff..51cddc89 100644 --- a/providers/provider_interface.py +++ b/providers/provider_interface.py @@ -9,9 +9,9 @@ def __init__(self): Initializes the Provider with the necessary API key and client. """ # experiment constants - self.min_tokens = 10000 + self.min_tokens = 100000 self.system_prompt = ( - f"Please provide a detailed response of MORE THAN {self.min_tokens} words" + f"Please provide a detailed response of STRICTLY MORE THAN {self.min_tokens} words" ) # metrics @@ -25,16 +25,34 @@ def __init__(self): "timebetweentokens_p95": {}, } - def log_metrics(self, model_name, metric, value): + # def log_metrics(self, model_name, input_size, metric, value): + # """ + # Logs metrics + # """ + # if metric not in self.metrics: + # raise ValueError(f"Metric type '{metric}' is not defined.") + # if model_name not in self.metrics[metric]: + # self.metrics[metric][model_name] = [] + + # self.metrics[metric][model_name].append(value) + + def log_metrics(self, model_name, input_size, max_output, metric, value): """ - Logs metrics + Logs metrics in a nested structure: metrics[metric][model_name][input_size] -> list of values """ if metric not in self.metrics: raise ValueError(f"Metric type '{metric}' is not defined.") + if model_name not in self.metrics[metric]: - self.metrics[metric][model_name] = [] + self.metrics[metric][model_name] = {} + + if input_size not in self.metrics[metric][model_name]: + self.metrics[metric][model_name][input_size] = {} + + if max_output not in self.metrics[metric][model_name][input_size]: + self.metrics[metric][model_name][input_size][max_output] = [] - self.metrics[metric][model_name].append(value) + self.metrics[metric][model_name][input_size][max_output].append(value) @abstractmethod def perform_inference(self, model, prompt): diff --git a/providers/together_ai_provider.py b/providers/together_ai_provider.py index f06f405e..44d52e55 100644 --- a/providers/together_ai_provider.py +++ b/providers/together_ai_provider.py @@ -18,5 +18,6 @@ def __init__(self): "mistral-7b-instruct-v0.1": "mistralai/Mistral-7B-Instruct-v0.1", "meta-llama-3.1-70b-instruct": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", "meta-llama-3.1-405b-instruct": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", - "common-model": "meta-llama/Llama-3.3-70B-Instruct-Turbo" + "common-model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", + "common-model-small": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo" } diff --git a/utils/prompt_generator.py b/utils/prompt_generator.py index 6b736559..6de4b37b 100644 --- a/utils/prompt_generator.py +++ b/utils/prompt_generator.py @@ -1,9 +1,16 @@ -PROMPT_100_TOKENS = "Tell me a long story based on the following story description: \ -In a future world where emotions are regulated by technology, \ -a girl discovers a hidden garden that awakens real feelings. \ -She embarks on a journey to understand these new emotions, challenging the societal \ -norms and the oppressive tech that controls them. Alongside a group of outcasts, \ -she fights to bring authentic emotions back to a world that has forgotten how to feel." +# PROMPT_100_TOKENS = "Tell me a long story based on the following story description: \ +# In a future world where emotions are regulated by technology, \ +# a girl discovers a hidden garden that awakens real feelings. \ +# She embarks on a journey to understand these new emotions, challenging the societal \ +# norms and the oppressive tech that controls them. Alongside a group of outcasts, \ +# she fights to bring authentic emotions back to a world that has forgotten how to feel." + +PROMPT_100_TOKENS = """Compose a science-fiction saga divided into 200 numbered sections. +• Each section must be at least 500 words (so 200×500 = 100 000 words). +• Begin each section with “Section 1: ” through “Section 200: <title>.” +• Do not include any explanation or instructions—only the story text. +Keep going until all 200 sections are written. +""" PROMPT_1000_TOKENS = "In a world where technology has surpassed all imaginable limits, the line between human and machine became indistinct. The year was 2147, and Earth was no longer the sole hub of human civilization. Colonies sprawled across Mars, Titan, and even Europa, each developing its distinct identity shaped by the unique challenges of their environments. Amidst this age of prosperity and interstellar expansion, there lay a hidden truth only known to a few – a secret that could unravel the fabric of society. \ \