From 725a304fc824de332c7c12584607ecd32f9c1b04 Mon Sep 17 00:00:00 2001 From: pyjeebz Date: Thu, 25 Jun 2026 16:34:15 -0400 Subject: [PATCH 1/3] feat: add prescale run launch-readiness load test Replace the old API-driven commands (predict/detect/recommend/status/ agent/config) with a single self-contained command: prescale run . Ramp virtual users against a URL with an httpx + asyncio engine, capture latency percentiles and error kinds per level, detect the error-onset point (error rate or p95 latency wall), and print a plain-English readiness verdict with a v0 bottleneck hint. Includes a basic safety gate (--i-own-this + confirm for non-local hosts). Zero new dependencies. Co-Authored-By: Claude Opus 4.8 --- cli/src/prescale_cli/__init__.py | 2 +- cli/src/prescale_cli/commands/agent.py | 146 --------- cli/src/prescale_cli/commands/config.py | 221 ------------- cli/src/prescale_cli/commands/detect.py | 193 ------------ cli/src/prescale_cli/commands/predict.py | 190 ------------ cli/src/prescale_cli/commands/recommend.py | 182 ----------- cli/src/prescale_cli/commands/run.py | 196 ++++++++++++ cli/src/prescale_cli/commands/status.py | 145 --------- cli/src/prescale_cli/loadtest.py | 241 +++++++++++++++ cli/src/prescale_cli/main.py | 66 +--- cli/tests/test_cli.py | 344 ++------------------- cli/tests/test_loadtest.py | 82 +++++ 12 files changed, 559 insertions(+), 1449 deletions(-) delete mode 100644 cli/src/prescale_cli/commands/agent.py delete mode 100644 cli/src/prescale_cli/commands/config.py delete mode 100644 cli/src/prescale_cli/commands/detect.py delete mode 100644 cli/src/prescale_cli/commands/predict.py delete mode 100644 cli/src/prescale_cli/commands/recommend.py create mode 100644 cli/src/prescale_cli/commands/run.py delete mode 100644 cli/src/prescale_cli/commands/status.py create mode 100644 cli/src/prescale_cli/loadtest.py create mode 100644 cli/tests/test_loadtest.py diff --git a/cli/src/prescale_cli/__init__.py b/cli/src/prescale_cli/__init__.py 
index d201ec7..20f07f2 100644 --- a/cli/src/prescale_cli/__init__.py +++ b/cli/src/prescale_cli/__init__.py @@ -1,3 +1,3 @@ -"""Prescale CLI - Predictive Infrastructure Intelligence Platform.""" +"""PreScale - launch-readiness load testing for solo/indie devs.""" __version__ = "0.1.0" diff --git a/cli/src/prescale_cli/commands/agent.py b/cli/src/prescale_cli/commands/agent.py deleted file mode 100644 index c8e8c87..0000000 --- a/cli/src/prescale_cli/commands/agent.py +++ /dev/null @@ -1,146 +0,0 @@ -"""Agent management commands for Prescale CLI.""" - -import click -from rich.console import Console -from rich.table import Table - -console = Console() - - -@click.group() -def agent(): - """Manage Prescale metrics collection agents.""" - pass - - -@agent.command("status") -@click.pass_context -def agent_status(ctx: click.Context): - """Check agent status and metrics sources.""" - import psutil - import platform - - console.print("[bold]Prescale Agent Status[/bold]\n") - - # System info - table = Table(title="System Information", show_header=False) - table.add_column("Property", style="cyan") - table.add_column("Value") - - table.add_row("Platform", platform.platform()) - table.add_row("Python", platform.python_version()) - table.add_row("CPU Cores", str(psutil.cpu_count())) - table.add_row("CPU Usage", f"{psutil.cpu_percent():.1f}%") - - mem = psutil.virtual_memory() - table.add_row("Memory Total", f"{mem.total / 1024**3:.1f} GB") - table.add_row("Memory Used", f"{mem.percent:.1f}%") - - console.print(table) - - # Available sources - try: - from prescale_agent.sources import list_sources - sources = list_sources() - console.print(f"\n[bold]Available Source Types:[/bold] {', '.join(sources)}") - except ImportError: - console.print("\n[yellow]prescale-agent not installed. 
Install with:[/yellow]") - console.print(" pip install prescale-agent") - - -@agent.command("sources") -def agent_sources(): - """List available metrics source types.""" - try: - from prescale_agent.sources import list_sources - - console.print("[bold]Available Metrics Sources:[/bold]\n") - - table = Table(show_header=True) - table.add_column("Type", style="cyan") - table.add_column("Description") - table.add_column("Install Extra", style="dim") - - source_info = { - "system": ("Local system metrics via psutil", "included"), - "prometheus": ("Prometheus server scraping", "included"), - "datadog": ("Datadog API", "pip install prescale-agent[datadog]"), - "cloudwatch": ("AWS CloudWatch", "pip install prescale-agent[aws]"), - "azure_monitor": ("Azure Monitor", "pip install prescale-agent[azure]"), - "gcp_monitoring": ("Google Cloud Monitoring", "pip install prescale-agent[gcp]"), - } - - registered = list_sources() - - for source_type, (desc, install) in source_info.items(): - status = "[green]+" if source_type in registered else "[red]x" - table.add_row(f"{status} {source_type}", desc, install) - - console.print(table) - console.print("\n[dim]+ = available, x = missing dependencies[/dim]") - - except ImportError: - console.print("[yellow]prescale-agent not installed. 
Install with:[/yellow]") - console.print(" pip install prescale-agent") - - -@agent.command("init") -@click.option("--output", "-o", type=click.Path(), default="prescale-agent.yaml", help="Output file path") -def agent_init(output: str): - """Generate a sample agent configuration file.""" - try: - from prescale_agent.cli_v2 import init as agent_init_cmd - # Invoke the agent's init command - import subprocess - import sys - result = subprocess.run( - [sys.executable, "-m", "prescale_agent.cli_v2", "init", "-o", output], - capture_output=True, - text=True, - ) - console.print(result.stdout) - if result.stderr: - console.print(result.stderr) - except ImportError: - # Fallback: write config directly - sample_config = """# Prescale Agent Configuration -endpoint: - url: http://localhost:8000 - api_key: ${PRESCALE_API_KEY} - -sources: - - name: local-system - type: system - enabled: true - interval: 15 - -batch_size: 100 -flush_interval: 10 -log_level: INFO -""" - with open(output, "w") as f: - f.write(sample_config) - - console.print(f"[green]+ Created config file: {output}[/green]") - console.print("\nRun the agent with:") - console.print(f" [cyan]prescale-agent run -c {output}[/cyan]") - - -@agent.command("test") -@click.option("--config", "-c", "config_path", help="Path to config file") -def agent_test(config_path: str | None): - """Test agent collectors without sending data.""" - try: - import subprocess - import sys - - cmd = [sys.executable, "-m", "prescale_agent.cli_v2", "test"] - if config_path: - cmd.extend(["-c", config_path]) - - result = subprocess.run(cmd, capture_output=False) - - except Exception as e: - console.print(f"[red]Error: {e}[/red]") - console.print("\n[yellow]Make sure prescale-agent is installed:[/yellow]") - console.print(" pip install prescale-agent") diff --git a/cli/src/prescale_cli/commands/config.py b/cli/src/prescale_cli/commands/config.py deleted file mode 100644 index 895f813..0000000 --- a/cli/src/prescale_cli/commands/config.py +++ 
/dev/null @@ -1,221 +0,0 @@ -"""Configuration commands for Prescale CLI.""" - -import json -import os -from pathlib import Path - -import click -import yaml -from rich.console import Console -from rich.table import Table - -console = Console() - -CONFIG_DIR = Path.home() / ".prescale" -CONFIG_FILE = CONFIG_DIR / "config.yaml" - - -@click.group() -def config() -> None: - """Manage Prescale CLI configuration. - - Configure endpoints, credentials, and preferences for the Prescale CLI. - """ - pass - - -@config.command() -@click.pass_context -def show(ctx: click.Context) -> None: - """Show current configuration. - - Displays all configured settings including endpoint and preferences. - """ - output_format = ctx.obj.get("output", "table") - - config_data = _load_config() - - # Add runtime config - config_data["runtime"] = { - "endpoint": ctx.obj.get("endpoint", "http://localhost:8000"), - "api_key_set": bool(ctx.obj.get("api_key")), - } - - if output_format == "json": - # Mask API key - if "api_key" in config_data: - config_data["api_key"] = "***" if config_data["api_key"] else None - console.print(json.dumps(config_data, indent=2)) - elif output_format == "yaml": - if "api_key" in config_data: - config_data["api_key"] = "***" if config_data["api_key"] else None - console.print(yaml.dump(config_data, default_flow_style=False)) - else: - _display_config(config_data) - - -@config.command() -@click.argument("key") -@click.argument("value") -def set(key: str, value: str) -> None: - """Set a configuration value. - - Available keys: - - \b - endpoint - Prescale API endpoint URL - api_key - API key for authentication - output - Default output format (table, json, yaml) - - Examples: - - \b - prescale config set endpoint http://prescale.example.com:8000 - prescale config set output json - """ - valid_keys = ["endpoint", "api_key", "output"] - - if key not in valid_keys: - console.print(f"[red]Error:[/red] Invalid key '{key}'. 
Valid keys: {', '.join(valid_keys)}") - raise SystemExit(1) - - if key == "output" and value not in ["table", "json", "yaml"]: - console.print(f"[red]Error:[/red] Invalid output format. Must be: table, json, or yaml") - raise SystemExit(1) - - config_data = _load_config() - config_data[key] = value - _save_config(config_data) - - display_value = "***" if key == "api_key" else value - console.print(f"[green]✓[/green] Set {key} = {display_value}") - - -@config.command() -@click.argument("key") -def unset(key: str) -> None: - """Remove a configuration value. - - Example: - prescale config unset api_key - """ - config_data = _load_config() - - if key in config_data: - del config_data[key] - _save_config(config_data) - console.print(f"[green]✓[/green] Removed {key}") - else: - console.print(f"[yellow]![/yellow] Key '{key}' not found in configuration") - - -@config.command() -def init() -> None: - """Initialize configuration interactively. - - Guides you through setting up the Prescale CLI configuration. 
- """ - console.print("[bold]Prescale CLI Configuration[/bold]") - console.print() - - # Endpoint - default_endpoint = "http://localhost:8000" - endpoint = click.prompt( - "Prescale API endpoint", - default=default_endpoint, - ) - - # API Key - api_key = click.prompt( - "API key (leave empty for none)", - default="", - hide_input=True, - show_default=False, - ) - - # Output format - output = click.prompt( - "Default output format", - type=click.Choice(["table", "json", "yaml"]), - default="table", - ) - - config_data = { - "endpoint": endpoint, - "output": output, - } - - if api_key: - config_data["api_key"] = api_key - - _save_config(config_data) - - console.print() - console.print(f"[green]✓[/green] Configuration saved to {CONFIG_FILE}") - console.print() - console.print("[bold]Test your connection:[/bold]") - console.print(" prescale status") - - -@config.command() -def path() -> None: - """Show configuration file path.""" - console.print(f"Configuration file: {CONFIG_FILE}") - console.print(f"Exists: {'Yes' if CONFIG_FILE.exists() else 'No'}") - - -def _load_config() -> dict: - """Load configuration from file.""" - if CONFIG_FILE.exists(): - with open(CONFIG_FILE) as f: - return yaml.safe_load(f) or {} - return {} - - -def _save_config(config_data: dict) -> None: - """Save configuration to file.""" - CONFIG_DIR.mkdir(parents=True, exist_ok=True) - with open(CONFIG_FILE, "w") as f: - yaml.dump(config_data, f, default_flow_style=False) - # Set restrictive permissions for API key security - os.chmod(CONFIG_FILE, 0o600) - - -def _display_config(config_data: dict) -> None: - """Display configuration in table format.""" - console.print() - console.print(f"[bold]Configuration File:[/bold] {CONFIG_FILE}") - console.print() - - table = Table(show_header=True, header_style="bold magenta") - table.add_column("Setting") - table.add_column("Value") - table.add_column("Source") - - # File config - file_config = {k: v for k, v in config_data.items() if k != "runtime"} - 
runtime = config_data.get("runtime", {}) - - # Endpoint - file_endpoint = file_config.get("endpoint", "") - runtime_endpoint = runtime.get("endpoint", "http://localhost:8000") - effective_endpoint = file_endpoint or runtime_endpoint - source = "config file" if file_endpoint else "default/env" - table.add_row("endpoint", effective_endpoint, source) - - # API Key - api_key = file_config.get("api_key", "") - api_key_display = "***" if api_key else "(not set)" - api_key_source = "config file" if api_key else "env: PRESCALE_API_KEY" if runtime.get("api_key_set") else "(not set)" - table.add_row("api_key", api_key_display, api_key_source) - - # Output - output = file_config.get("output", "table") - table.add_row("output", output, "config file" if "output" in file_config else "default") - - console.print(table) - - console.print() - console.print("[bold]Environment Variables:[/bold]") - console.print(" PRESCALE_ENDPOINT - Override API endpoint") - console.print(" PRESCALE_API_KEY - API key for authentication") diff --git a/cli/src/prescale_cli/commands/detect.py b/cli/src/prescale_cli/commands/detect.py deleted file mode 100644 index 6405cce..0000000 --- a/cli/src/prescale_cli/commands/detect.py +++ /dev/null @@ -1,193 +0,0 @@ -"""Anomaly detection commands for Prescale CLI.""" - -import json -from datetime import datetime, timedelta, timezone -import random - -import click -import httpx -from rich.console import Console -from rich.table import Table -from rich.panel import Panel - -console = Console() - - -def generate_sample_metrics(lookback_hours: int = 1): - """Generate sample metrics data for anomaly detection.""" - now = datetime.now(timezone.utc) - points = [] - - # Generate realistic time-series with possible anomaly - base_cpu = 0.45 - for i in range(lookback_hours * 12): # 5-min intervals - timestamp = now - timedelta(minutes=(lookback_hours * 60) - (i * 5)) - # Normal variation with occasional spike - noise = random.uniform(-0.05, 0.05) - value = base_cpu + 
noise - - # Add a spike at 70% through the data - if 0.68 <= i / (lookback_hours * 12) <= 0.72: - value = random.uniform(0.85, 0.95) - - points.append({ - "timestamp": timestamp.strftime("%Y-%m-%dT%H:%M:%SZ"), - "value": round(value, 3) - }) - - return points - - -@click.command() -@click.option("--deployment", "-d", default=None, help="Deployment name (optional, for labeling)") -@click.option("--namespace", "-n", default=None, help="Kubernetes namespace (optional, for labeling)") -@click.option("--lookback", "-l", default=1, type=int, help="Hours of data to analyze") -@click.option("--sensitivity", "-s", default="medium", - type=click.Choice(["low", "medium", "high"]), - help="Detection sensitivity (low=3σ, medium=2.5σ, high=2σ)") -@click.option("--metric", "-m", default="cpu_utilization", - type=click.Choice(["cpu_utilization", "memory_utilization", "request_latency", "error_rate"]), - help="Metric to analyze") -@click.pass_context -def detect(ctx: click.Context, deployment: str | None, namespace: str | None, lookback: int, - sensitivity: str, metric: str) -> None: - """Detect anomalies in resource metrics. - - Analyzes metrics to identify unusual patterns that may indicate issues. 
- - \b - Examples: - prescale detect # Quick detect with defaults - prescale detect -d my-app -n production # Specific deployment - prescale detect --sensitivity high # More sensitive detection - prescale detect -m memory_utilization # Check memory instead - """ - endpoint = ctx.obj["endpoint"] - api_key = ctx.obj["api_key"] - output_format = ctx.obj["output"] - - # Map sensitivity to threshold sigma - threshold_map = {"low": 3.0, "medium": 2.5, "high": 2.0} - threshold_sigma = threshold_map[sensitivity] - - # Generate sample metrics data - metrics_data = generate_sample_metrics(lookback) - - with console.status(f"[bold blue]Analyzing {metric} for anomalies..."): - try: - headers = {"Content-Type": "application/json"} - if api_key: - headers["Authorization"] = f"Bearer {api_key}" - - # Build request matching the actual API schema - request_body = { - "metrics": { - metric: metrics_data - }, - "threshold_sigma": threshold_sigma - } - - response = httpx.post( - f"{endpoint}/detect", - json=request_body, - headers=headers, - timeout=30.0, - ) - response.raise_for_status() - data = response.json() - except httpx.HTTPStatusError as e: - console.print(f"[red]Error:[/red] API returned {e.response.status_code}: {e.response.text}") - raise SystemExit(1) - except httpx.HTTPError as e: - console.print(f"[red]Error:[/red] Failed to connect to Prescale: {e}") - raise SystemExit(1) - - if output_format == "json": - console.print(json.dumps(data, indent=2)) - elif output_format == "yaml": - import yaml - console.print(yaml.dump(data, default_flow_style=False)) - else: - _display_anomalies(data, deployment, namespace, metric, sensitivity) - - -def _display_anomalies(data: dict, deployment: str | None, namespace: str | None, metric: str, sensitivity: str) -> None: - """Display anomaly detection results.""" - console.print() - - anomalies = data.get("anomalies", []) - summary = data.get("summary", {}) - status = summary.get("status", "unknown") - - status_color = {"healthy": 
"green", "attention": "yellow", "warning": "red", "critical": "bold red"}.get(status, "white") - - info_lines = ["[bold]Anomaly Detection Results[/bold]"] - if deployment: - info_lines.append(f"Deployment: [cyan]{deployment}[/cyan]") - if namespace: - info_lines.append(f"Namespace: [cyan]{namespace}[/cyan]") - info_lines.extend([ - f"Metric: [cyan]{metric}[/cyan]", - f"Sensitivity: [cyan]{sensitivity}[/cyan]", - f"Data Points: [cyan]{data.get('data_points_analyzed', 0)}[/cyan]", - ]) - - console.print(Panel( - "\n".join(info_lines) + "\n" - f"Status: [{status_color}]{status.upper()}[/{status_color}]", - title="🔍 Prescale Anomaly Detection", - border_style=status_color.split()[-1] if " " in status_color else status_color, - )) - console.print() - - if anomalies: - table = Table(show_header=True, header_style="bold magenta") - table.add_column("Time", style="dim") - table.add_column("Metric") - table.add_column("Severity", justify="center") - table.add_column("Value", justify="right") - table.add_column("Expected", justify="right") - table.add_column("Score", justify="right") - - for anomaly in anomalies: - timestamp = anomaly.get("timestamp", "") - if timestamp: - try: - dt = datetime.fromisoformat(timestamp.replace("Z", "+00:00")) - timestamp = dt.strftime("%H:%M:%S") - except ValueError: - pass - - severity = anomaly.get("severity", "medium") - severity_style = { - "low": "yellow", - "medium": "orange1", - "high": "red", - "critical": "bold red", - }.get(severity, "white") - - table.add_row( - timestamp, - anomaly.get("metric", "unknown"), - f"[{severity_style}]{severity.upper()}[/{severity_style}]", - f"{anomaly.get('value', 0):.2%}", - f"{anomaly.get('expected_value', 0):.2%}", - f"{anomaly.get('anomaly_score', 0):.2f}σ", - ) - - console.print(table) - console.print() - - # Summary by severity - by_severity = summary.get("by_severity", {}) - if by_severity: - console.print("[bold]Summary:[/bold]") - for sev, count in by_severity.items(): - console.print(f" • 
{sev.capitalize()}: {count}") - else: - console.print("[green]✓ All metrics are within normal ranges[/green]") - - # Show anomaly rate - rate = summary.get("anomaly_rate", 0) - console.print() - console.print(f"[bold]Anomaly Rate:[/bold] {rate:.1%}") diff --git a/cli/src/prescale_cli/commands/predict.py b/cli/src/prescale_cli/commands/predict.py deleted file mode 100644 index 33fb38b..0000000 --- a/cli/src/prescale_cli/commands/predict.py +++ /dev/null @@ -1,190 +0,0 @@ -"""Prediction commands for Prescale CLI.""" - -import json -from datetime import datetime - -import click -import httpx -from rich.console import Console -from rich.table import Table -from rich.panel import Panel - -console = Console() - - -@click.group() -def predict() -> None: - """Generate resource predictions. - - Use these commands to forecast CPU, memory, and other resource usage - for your Kubernetes deployments. - - \b - Examples: - prescale predict cpu # Quick CPU prediction - prescale predict memory -d my-app # Memory for specific app - """ - pass - - -@predict.command() -@click.option("--deployment", "-d", default=None, help="Deployment name (optional)") -@click.option("--namespace", "-n", default="default", help="Kubernetes namespace") -@click.option("--periods", "-p", default=12, type=int, help="Number of prediction periods") -@click.option("--model", "-m", default="baseline", - type=click.Choice(["baseline", "prophet", "xgboost"]), - help="Prediction model to use") -@click.pass_context -def cpu(ctx: click.Context, deployment: str | None, namespace: str, periods: int, model: str) -> None: - """Predict CPU usage for a deployment. 
- - \b - Examples: - prescale predict cpu # Quick prediction - prescale predict cpu -d my-app -p 24 # 24 periods ahead - prescale predict cpu --model prophet # Use Prophet model - """ - _run_prediction(ctx, "cpu_utilization", deployment, namespace, periods, model) - - -@predict.command() -@click.option("--deployment", "-d", default=None, help="Deployment name (optional)") -@click.option("--namespace", "-n", default="default", help="Kubernetes namespace") -@click.option("--periods", "-p", default=12, type=int, help="Number of prediction periods") -@click.option("--model", "-m", default="baseline", - type=click.Choice(["baseline", "prophet", "xgboost"]), - help="Prediction model to use") -@click.pass_context -def memory(ctx: click.Context, deployment: str | None, namespace: str, periods: int, model: str) -> None: - """Predict memory usage for a deployment. - - \b - Examples: - prescale predict memory # Quick prediction - prescale predict memory -d my-app -p 48 # 48 periods ahead - """ - _run_prediction(ctx, "memory_utilization", deployment, namespace, periods, model) - - -def _run_prediction(ctx: click.Context, metric: str, deployment: str | None, - namespace: str, periods: int, model: str) -> None: - """Run prediction request against API.""" - endpoint = ctx.obj["endpoint"] - api_key = ctx.obj["api_key"] - output_format = ctx.obj["output"] - - metric_display = metric.replace("_", " ").title() - - with console.status(f"[bold blue]Fetching {metric_display} predictions..."): - try: - headers = {"Content-Type": "application/json"} - if api_key: - headers["Authorization"] = f"Bearer {api_key}" - - # Build request matching the actual API schema - request_body = { - "metric": metric, - "periods": periods, - "model": model - } - - response = httpx.post( - f"{endpoint}/predict", - json=request_body, - headers=headers, - timeout=30.0, - ) - response.raise_for_status() - data = response.json() - except httpx.HTTPStatusError as e: - if e.response.status_code == 503: - 
console.print(Panel( - "[yellow]ML models not loaded[/yellow]\n\n" - "The prediction service requires trained models.\n" - "Run [cyan]python ml/train.py[/cyan] to train models.", - title="⚠️ Models Not Ready", - border_style="yellow" - )) - else: - console.print(f"[red]Error:[/red] API returned {e.response.status_code}: {e.response.text}") - raise SystemExit(1) - except httpx.HTTPError as e: - console.print(f"[red]Error:[/red] Failed to connect to Prescale: {e}") - raise SystemExit(1) - - if output_format == "json": - console.print(json.dumps(data, indent=2)) - elif output_format == "yaml": - import yaml - console.print(yaml.dump(data, default_flow_style=False)) - else: - _display_prediction_table(data, metric_display, deployment or "cluster", namespace, model) - - -def _display_prediction_table(data: dict, metric_type: str, deployment: str | None, namespace: str, model: str) -> None: - """Display prediction data in a formatted table.""" - console.print() - - info_lines = [f"[bold]{metric_type} Prediction[/bold]"] - if deployment: - info_lines.append(f"Deployment: [cyan]{deployment}[/cyan]") - info_lines.append(f"Namespace: [cyan]{namespace}[/cyan]") - info_lines.append(f"Model: [cyan]{model}[/cyan]") - - console.print(Panel( - "\n".join(info_lines), - title="🔮 Prescale Prediction", - border_style="blue", - )) - console.print() - - predictions = data.get("predictions", []) - if predictions: - table = Table(show_header=True, header_style="bold magenta") - table.add_column("Time", style="dim") - table.add_column("Predicted", justify="right") - table.add_column("Lower Bound", justify="right", style="yellow") - table.add_column("Upper Bound", justify="right", style="yellow") - table.add_column("Confidence", justify="right") - - for pred in predictions[:10]: # Show first 10 - timestamp = pred.get("timestamp", "") - if timestamp: - try: - dt = datetime.fromisoformat(timestamp.replace("Z", "+00:00")) - timestamp = dt.strftime("%Y-%m-%d %H:%M") - except ValueError: - pass 
- - value = pred.get('value') or 0 - lower = pred.get('lower_bound') or value * 0.9 - upper = pred.get('upper_bound') or value * 1.1 - confidence = pred.get('confidence') or 0.8 - - table.add_row( - timestamp, - f"{value:.2%}", - f"{lower:.2%}", - f"{upper:.2%}", - f"{confidence * 100:.1f}%", - ) - - console.print(table) - - if len(predictions) > 10: - console.print(f"\n[dim]... and {len(predictions) - 10} more predictions[/dim]") - else: - console.print("[yellow]No predictions available[/yellow]") - - # Show model info - model_info = data.get("model_info", {}) - summary = data.get("summary", {}) - if model_info or summary: - console.print() - console.print(f"[bold]Model:[/bold] {model_info.get('name', model)} (Accuracy: {model_info.get('accuracy', 'N/A')})") - if summary: - console.print() - console.print("[bold]Summary:[/bold]") - console.print(f" Peak predicted: [red]{summary.get('peak', 0):.2f}[/red]") - console.print(f" Average: [blue]{summary.get('average', 0):.2f}[/blue]") - console.print(f" Trend: {summary.get('trend', 'stable')}") diff --git a/cli/src/prescale_cli/commands/recommend.py b/cli/src/prescale_cli/commands/recommend.py deleted file mode 100644 index b65dd2a..0000000 --- a/cli/src/prescale_cli/commands/recommend.py +++ /dev/null @@ -1,182 +0,0 @@ -"""Scaling recommendation commands for Prescale CLI.""" - -import json -import random - -import click -import httpx -from rich.console import Console -from rich.table import Table -from rich.panel import Panel - -console = Console() - - -@click.command() -@click.option("--deployment", "-d", required=True, help="Deployment name") -@click.option("--namespace", "-n", default="default", help="Kubernetes namespace") -@click.option("--replicas", "-r", default=2, type=int, help="Current replica count") -@click.option("--cpu-request", default="100m", help="Current CPU request") -@click.option("--memory-request", default="256Mi", help="Current memory request") -@click.option("--target-utilization", "-t", 
default=0.7, type=float, help="Target utilization (0.0-1.0)") -@click.option("--cost-optimize", is_flag=True, help="Prioritize cost optimization") -@click.option("--performance", is_flag=True, help="Prioritize performance") -@click.pass_context -def recommend(ctx: click.Context, deployment: str, namespace: str, replicas: int, - cpu_request: str, memory_request: str, target_utilization: float, - cost_optimize: bool, performance: bool) -> None: - """Get scaling recommendations for a deployment. - - Analyzes current resource usage and provides intelligent scaling recommendations. - - \b - Examples: - prescale recommend # Quick recommend with defaults - prescale recommend -d my-app -n prod # Specific deployment - prescale recommend --cost-optimize # Prioritize savings - prescale recommend --performance # Prioritize performance - prescale recommend -r 3 -t 0.6 # Custom config - """ - endpoint = ctx.obj["endpoint"] - api_key = ctx.obj["api_key"] - output_format = ctx.obj["output"] - - # Simulate predicted utilization (in real scenario, this comes from /predict) - predicted_util = random.uniform(0.75, 0.95) # Simulated high utilization - - with console.status("[bold blue]Generating scaling recommendations..."): - try: - headers = {"Content-Type": "application/json"} - if api_key: - headers["Authorization"] = f"Bearer {api_key}" - - # Build request matching the actual API schema - request_body = { - "workload": deployment, - "namespace": namespace, - "current_state": { - "replicas": replicas, - "cpu_request": cpu_request, - "memory_request": memory_request, - "cpu_limit": "500m", - "memory_limit": "512Mi" - }, - "predictions": [ - {"timestamp": "2026-02-01T06:00:00Z", "value": predicted_util}, - {"timestamp": "2026-02-01T06:05:00Z", "value": predicted_util + 0.02}, - {"timestamp": "2026-02-01T06:10:00Z", "value": predicted_util + 0.05} - ], - "target_utilization": target_utilization - } - - response = httpx.post( - f"{endpoint}/recommend", - json=request_body, - 
headers=headers, - timeout=30.0, - ) - response.raise_for_status() - data = response.json() - except httpx.HTTPStatusError as e: - console.print(f"[red]Error:[/red] API returned {e.response.status_code}: {e.response.text}") - raise SystemExit(1) - except httpx.HTTPError as e: - console.print(f"[red]Error:[/red] Failed to connect to Prescale: {e}") - raise SystemExit(1) - - if output_format == "json": - console.print(json.dumps(data, indent=2)) - elif output_format == "yaml": - import yaml - console.print(yaml.dump(data, default_flow_style=False)) - else: - strategy = "cost" if cost_optimize else "performance" if performance else "balanced" - _display_recommendations(data, deployment, namespace, strategy, replicas, cpu_request, memory_request) - - -def _display_recommendations(data: dict, deployment: str, namespace: str, strategy: str, - current_replicas: int, current_cpu: str, current_memory: str) -> None: - """Display scaling recommendations.""" - console.print() - - strategy_emoji = {"balanced": "⚖️", "cost": "💰", "performance": "🚀"}.get(strategy, "⚖️") - - recommendations = data.get("recommendations", []) - rec = recommendations[0] if recommendations else {} - actions = rec.get("actions", []) - - info_lines = [ - "[bold]Scaling Recommendations[/bold]", - f"Deployment: [cyan]{deployment}[/cyan]", - f"Namespace: [cyan]{namespace}[/cyan]", - f"Strategy: {strategy_emoji} {strategy.capitalize()}" - ] - - console.print(Panel( - "\n".join(info_lines), - title="📊 Prescale Recommendations", - border_style="blue", - )) - console.print() - - if actions: - # Actions table - table = Table(show_header=True, header_style="bold magenta", title="Recommended Actions") - table.add_column("Action", style="bold") - table.add_column("Target") - table.add_column("Confidence", justify="right") - table.add_column("Reason") - - for action in actions: - action_type = action.get("action", "unknown") - action_color = { - "scale_out": "green", - "scale_in": "yellow", - "scale_up": "green", - 
"scale_down": "yellow", - "no_action": "dim" - }.get(action_type, "white") - - target = "" - if action.get("target_replicas"): - target = f"{current_replicas} → {action['target_replicas']} replicas" - elif action.get("target_cpu_request"): - target = f"CPU: {action['target_cpu_request']}" - elif action.get("target_memory_request"): - target = f"Memory: {action['target_memory_request']}" - else: - target = "-" - - confidence = action.get("confidence", 0) - conf_color = "green" if confidence >= 0.8 else "yellow" if confidence >= 0.6 else "red" - - table.add_row( - f"[{action_color}]{action_type.upper().replace('_', ' ')}[/{action_color}]", - target, - f"[{conf_color}]{confidence:.0%}[/{conf_color}]", - action.get("reason", "")[:60] + "..." if len(action.get("reason", "")) > 60 else action.get("reason", ""), - ) - - console.print(table) - - # Predicted utilization - predicted = rec.get("predicted_utilization") - if predicted: - console.print() - util_color = "green" if predicted < 0.7 else "yellow" if predicted < 0.85 else "red" - console.print(f"[bold]Predicted Utilization:[/bold] [{util_color}]{predicted:.1%}[/{util_color}]") - - # Apply commands - console.print() - console.print("[bold]To apply scaling:[/bold]") - for action in actions: - if action.get("target_replicas"): - console.print(f" [cyan]kubectl scale deployment/{deployment} -n {namespace} --replicas={action['target_replicas']}[/cyan]") - else: - console.print("[green]✓ Current configuration is optimal - no changes recommended[/green]") - - # Cooldown status - metadata = data.get("metadata", {}) - if metadata.get("cooldown_active"): - console.print() - console.print("[yellow]⏳ Cooldown active - recommendations may be limited[/yellow]") diff --git a/cli/src/prescale_cli/commands/run.py b/cli/src/prescale_cli/commands/run.py new file mode 100644 index 0000000..cbcc61c --- /dev/null +++ b/cli/src/prescale_cli/commands/run.py @@ -0,0 +1,196 @@ +"""`prescale run` - zero-config launch-readiness load test for one 
URL. + +Ramps virtual users against a URL and reports, in plain English, what breaks +first and at what traffic level - before you launch. +""" + +from __future__ import annotations + +import asyncio +import json +from urllib.parse import urlparse + +import click +from rich.console import Console +from rich.panel import Panel +from rich.table import Table + +from prescale_cli.loadtest import ( + LoadError, + RunReport, + analyze, + default_levels, + run_loadtest, +) + +console = Console() + +_LOCAL_HOSTS = {"localhost", "127.0.0.1", "0.0.0.0", "::1"} + + +@click.command() +@click.argument("url") +@click.option("--max-users", "-u", default=200, type=int, + help="Peak virtual users to ramp to.") +@click.option("--stage-seconds", "-s", default=5.0, type=float, + help="Seconds to hold each load level.") +@click.option("--latency-wall", default=2.0, type=float, + help="p95 latency (seconds) treated as the failure threshold.") +@click.option("--error-threshold", default=0.02, type=float, + help="Error rate (0-1) treated as the failure threshold.") +@click.option("--method", "-m", default="GET", + help="HTTP method to fire.") +@click.option("--timeout", default=10.0, type=float, + help="Per-request timeout in seconds.") +@click.option("--i-own-this", "yes", is_flag=True, + help="Skip the confirmation prompt for non-local targets.") +@click.option("--json", "as_json", is_flag=True, + help="Emit the raw report as JSON.") +def run(url: str, max_users: int, stage_seconds: float, latency_wall: float, + error_threshold: float, method: str, timeout: float, yes: bool, + as_json: bool) -> None: + """Load test URL and report what breaks first. + + \b + Examples: + prescale run http://localhost:8000 + prescale run https://staging.myapp.com -u 500 + prescale run https://staging.myapp.com --i-own-this --json + """ + parsed = urlparse(url) + if not parsed.scheme or not parsed.netloc: + console.print(f"[red]Error:[/red] '{url}' doesn't look like a URL " + "(expected e.g. 
http://localhost:8000).") + raise SystemExit(1) + + host = (parsed.hostname or "").lower() + is_local = host in _LOCAL_HOSTS + if not is_local and not yes and not as_json: + console.print(Panel( + f"You're about to send real traffic to [bold]{host}[/bold] " + f"(up to {max_users} concurrent users).\n" + "Point this at a staging/preview URL you own — not production.", + title="⚠️ Heads up", border_style="yellow", + )) + if not click.confirm("Proceed?", default=False): + raise SystemExit(0) + + levels = default_levels(max_users) + + if not as_json: + console.print(f"\n[bold]PreScale[/bold] — load testing [cyan]{url}[/cyan]") + + def render_progress(status): + def cb(users: int) -> None: + status.update(f"[bold blue]Ramping load — {users} virtual users…") + return cb + + try: + if as_json: + stages, warning = asyncio.run(run_loadtest( + url, levels=levels, stage_seconds=stage_seconds, + method=method, timeout=timeout, + )) + else: + with console.status("[bold blue]Warming up…") as status: + stages, warning = asyncio.run(run_loadtest( + url, levels=levels, stage_seconds=stage_seconds, + method=method, timeout=timeout, progress_cb=render_progress(status), + )) + except LoadError as exc: + console.print(f"[red]Error:[/red] {exc}") + raise SystemExit(1) + + report = analyze(stages, latency_wall=latency_wall, error_threshold=error_threshold) + + if as_json: + console.print(json.dumps(_report_to_dict(report, warning), indent=2)) + return + + _render(report, warning) + + +def _report_to_dict(report: RunReport, warning: str | None) -> dict: + return { + "survives_users": report.survives_users, + "max_tested": report.max_tested, + "onset_users": report.onset_users, + "onset_reason": report.onset_reason, + "bottleneck": report.bottleneck, + "latency_wall": report.latency_wall, + "warning": warning, + "stages": [ + { + "users": s.users, + "rps": round(s.rps, 1), + "p50_ms": round(s.pct(0.50) * 1000), + "p95_ms": round(s.pct(0.95) * 1000), + "p99_ms": round(s.pct(0.99) * 1000), + 
"error_rate": round(s.error_rate, 4), + "errors": s.errors, + "total": s.total, + } + for s in report.stages + ], + } + + +def _render(report: RunReport, warning: str | None) -> None: + console.print() + if warning: + console.print(f"[yellow]⚠ {warning}[/yellow]\n") + + table = Table(show_header=True, header_style="bold magenta", title="Load ramp") + table.add_column("Users", justify="right") + table.add_column("Req/s", justify="right") + table.add_column("p50", justify="right") + table.add_column("p95", justify="right") + table.add_column("p99", justify="right") + table.add_column("Errors", justify="right") + + for stage in report.stages: + is_onset = stage.users == report.onset_users + err = stage.error_rate + err_color = "red" if err >= 0.02 else "yellow" if err > 0 else "green" + row_style = "bold red" if is_onset else None + table.add_row( + str(stage.users), + f"{stage.rps:.0f}", + _ms(stage.pct(0.50)), + _ms(stage.pct(0.95)), + _ms(stage.pct(0.99)), + f"[{err_color}]{err:.0%}[/{err_color}]", + style=row_style, + ) + console.print(table) + console.print() + + if report.onset_users is None: + emoji, color = "✅", "green" + headline = (f"Held up through {report.max_tested} concurrent users " + "(the most we tested).") + else: + emoji = "🟢" if report.survives_users >= report.max_tested else "⚠️" + color = "yellow" + if report.survives_users == 0: + emoji, color = "🛑", "red" + headline = f"Survives ~{report.survives_users} concurrent users." 
+ + lines = [f"[bold]Scale readiness:[/bold] {emoji} {headline}"] + if report.onset_users is not None: + if report.onset_reason == "latency": + lines.append(f"Latency wall p95 crosses {report.latency_wall:g}s " + f"at ~{report.onset_users} users.") + else: + lines.append(f"First failure errors climb at ~{report.onset_users} users.") + if report.bottleneck: + lines.append(f"Likely cause {report.bottleneck}") + + console.print(Panel("\n".join(lines), title="📈 Readiness report", + border_style=color)) + + +def _ms(seconds: float) -> str: + if seconds <= 0: + return "-" + return f"{seconds * 1000:.0f}ms" diff --git a/cli/src/prescale_cli/commands/status.py b/cli/src/prescale_cli/commands/status.py deleted file mode 100644 index 2c4ea32..0000000 --- a/cli/src/prescale_cli/commands/status.py +++ /dev/null @@ -1,145 +0,0 @@ -"""Status command for Prescale CLI.""" - -import json - -import click -import httpx -from rich.console import Console -from rich.table import Table -from rich.panel import Panel - -console = Console() - - -@click.command() -@click.pass_context -def status(ctx: click.Context) -> None: - """Check Prescale service status. - - Displays the health and status of connected Prescale services. 
- - Example: - prescale status - """ - endpoint = ctx.obj["endpoint"] - api_key = ctx.obj["api_key"] - output_format = ctx.obj["output"] - - results = {} - - # Check inference service - with console.status("[bold blue]Checking Prescale services..."): - # Health check - try: - headers = {"Authorization": f"Bearer {api_key}"} if api_key else {} - response = httpx.get( - f"{endpoint}/health", - headers=headers, - timeout=10.0, - ) - results["inference"] = { - "status": "healthy" if response.status_code == 200 else "unhealthy", - "response_time_ms": response.elapsed.total_seconds() * 1000, - "details": response.json() if response.status_code == 200 else {}, - } - except httpx.HTTPError as e: - results["inference"] = { - "status": "unreachable", - "error": str(e), - } - - # Get metrics endpoint - try: - response = httpx.get( - f"{endpoint}/metrics", - headers=headers, - timeout=10.0, - ) - results["metrics"] = { - "status": "available" if response.status_code == 200 else "unavailable", - } - except httpx.HTTPError: - results["metrics"] = {"status": "unavailable"} - - if output_format == "json": - console.print(json.dumps(results, indent=2)) - elif output_format == "yaml": - import yaml - console.print(yaml.dump(results, default_flow_style=False)) - else: - _display_status(results, endpoint) - - -def _display_status(results: dict, endpoint: str) -> None: - """Display service status in a formatted view.""" - console.print() - - inference = results.get("inference", {}) - status = inference.get("status", "unknown") - status_color = { - "healthy": "green", - "unhealthy": "yellow", - "unreachable": "red", - }.get(status, "white") - status_emoji = { - "healthy": "✓", - "unhealthy": "⚠", - "unreachable": "✗", - }.get(status, "?") - - console.print(Panel( - f"[bold]Prescale Service Status[/bold]\n" - f"Endpoint: [cyan]{endpoint}[/cyan]", - title="🌟 Prescale", - border_style=status_color, - )) - console.print() - - # Services table - table = Table(show_header=True, 
header_style="bold magenta") - table.add_column("Service") - table.add_column("Status", justify="center") - table.add_column("Response Time", justify="right") - table.add_column("Details") - - # Inference service - response_time = inference.get("response_time_ms", 0) - response_time_str = f"{response_time:.0f}ms" if response_time else "N/A" - details = inference.get("details", {}) - version = details.get("version", "unknown") - - table.add_row( - "Inference API", - f"[{status_color}]{status_emoji} {status.upper()}[/{status_color}]", - response_time_str, - f"v{version}" if status == "healthy" else inference.get("error", ""), - ) - - # Metrics endpoint - metrics = results.get("metrics", {}) - metrics_status = metrics.get("status", "unknown") - metrics_color = "green" if metrics_status == "available" else "red" - metrics_emoji = "✓" if metrics_status == "available" else "✗" - - table.add_row( - "Metrics", - f"[{metrics_color}]{metrics_emoji} {metrics_status.upper()}[/{metrics_color}]", - "", - "Prometheus format", - ) - - console.print(table) - - # Model information - if status == "healthy" and "models" in inference.get("details", {}): - console.print() - console.print("[bold]Loaded Models:[/bold]") - for model in inference["details"]["models"]: - console.print(f" • {model}") - - # Quick actions - console.print() - console.print("[bold]Quick Actions:[/bold]") - console.print(" [dim]prescale predict cpu -d [/dim] - Get CPU predictions") - console.print(" [dim]prescale detect -d [/dim] - Detect anomalies") - console.print(" [dim]prescale recommend -d [/dim] - Get recommendations") diff --git a/cli/src/prescale_cli/loadtest.py b/cli/src/prescale_cli/loadtest.py new file mode 100644 index 0000000..7048500 --- /dev/null +++ b/cli/src/prescale_cli/loadtest.py @@ -0,0 +1,241 @@ +"""Self-contained launch-readiness load engine for `prescale run`. 
+ +Ramps virtual users against a single URL, captures latency/error signals at +each level, and finds the point where the target starts to fail. Pure-Python +on top of httpx + asyncio so it needs no external load tool and no server. +""" + +from __future__ import annotations + +import asyncio +import time +from dataclasses import dataclass, field + +import httpx + + +class LoadError(Exception): + """Raised when the target can't be reached at all (nothing to test).""" + + +# Concurrency levels we step through. Filtered against --max-users at runtime. +_LADDER = [1, 2, 5, 10, 20, 30, 50, 75, 100, 150, 200, 300, 500, 750, 1000] + + +def default_levels(max_users: int) -> list[int]: + """Build the ramp ladder, capped at and ending exactly at max_users.""" + levels = [u for u in _LADDER if u < max_users] + levels.append(max_users) + return levels + + +def percentile(sorted_vals: list[float], p: float) -> float: + """Linear-interpolated percentile. `p` in [0, 1], `sorted_vals` ascending.""" + if not sorted_vals: + return 0.0 + if len(sorted_vals) == 1: + return sorted_vals[0] + k = (len(sorted_vals) - 1) * p + lo = int(k) + hi = min(lo + 1, len(sorted_vals) - 1) + return sorted_vals[lo] + (sorted_vals[hi] - sorted_vals[lo]) * (k - lo) + + +@dataclass +class StageResult: + """Aggregated outcome of holding `users` concurrent VUs for `duration`s.""" + + users: int + duration: float + total: int = 0 + errors: int = 0 + latencies: list[float] = field(default_factory=list) # successful responses, seconds + status_counts: dict[int, int] = field(default_factory=dict) + error_kinds: dict[str, int] = field(default_factory=dict) + + @property + def rps(self) -> float: + return self.total / self.duration if self.duration else 0.0 + + @property + def error_rate(self) -> float: + return self.errors / self.total if self.total else 0.0 + + def pct(self, p: float) -> float: + return percentile(sorted(self.latencies), p) + + +@dataclass +class RunReport: + stages: list[StageResult] + 
survives_users: int + max_tested: int + onset_users: int | None = None + onset_reason: str | None = None # "errors" | "latency" + bottleneck: str | None = None + latency_wall: float = 2.0 + + +class _Sink: + """Per-stage accumulator. A 5xx/429 is a failure; everything else with a + status code counts toward latency.""" + + def __init__(self) -> None: + self.total = 0 + self.errors = 0 + self.latencies: list[float] = [] + self.status_counts: dict[int, int] = {} + self.error_kinds: dict[str, int] = {} + + def _kind(self, k: str) -> None: + self.error_kinds[k] = self.error_kinds.get(k, 0) + 1 + + def record_response(self, status: int, latency: float) -> None: + self.total += 1 + self.status_counts[status] = self.status_counts.get(status, 0) + 1 + if status >= 500: + self.errors += 1 + self._kind("5xx") + elif status == 429: + self.errors += 1 + self._kind("rate limited (429)") + else: + self.latencies.append(latency) + + def record_error(self, kind: str) -> None: + self.total += 1 + self.errors += 1 + self._kind(kind) + + def to_stage(self, users: int, duration: float) -> StageResult: + return StageResult( + users=users, + duration=duration, + total=self.total, + errors=self.errors, + latencies=self.latencies, + status_counts=self.status_counts, + error_kinds=self.error_kinds, + ) + + +async def _worker(client: httpx.AsyncClient, url: str, method: str, + deadline: float, sink: _Sink) -> None: + """Closed-loop VU: fire requests back-to-back until the stage deadline.""" + loop = asyncio.get_running_loop() + while loop.time() < deadline: + start = time.perf_counter() + try: + resp = await client.request(method, url) + sink.record_response(resp.status_code, time.perf_counter() - start) + except httpx.TimeoutException: + sink.record_error("timeout") + except httpx.ConnectError: + sink.record_error("connection refused") + except httpx.HTTPError: + sink.record_error("network") + + +async def _run_stage(client: httpx.AsyncClient, url: str, method: str, + users: int, duration: 
float) -> StageResult: + sink = _Sink() + deadline = asyncio.get_running_loop().time() + duration + await asyncio.gather( + *(_worker(client, url, method, deadline, sink) for _ in range(users)) + ) + return sink.to_stage(users, duration) + + +async def run_loadtest( + url: str, + *, + levels: list[int], + stage_seconds: float, + method: str = "GET", + timeout: float = 10.0, + hard_stop_rate: float = 0.5, + progress_cb=None, +) -> tuple[list[StageResult], str | None]: + """Preflight the URL, then ramp through `levels`. Stops early once a stage + is more than `hard_stop_rate` failed. Returns (stages, warning).""" + max_conns = max(levels) + 50 + limits = httpx.Limits(max_connections=max_conns, max_keepalive_connections=max_conns) + warning: str | None = None + + async with httpx.AsyncClient( + timeout=timeout, limits=limits, follow_redirects=True + ) as client: + try: + preflight = await client.request(method, url) + except httpx.HTTPError as exc: + raise LoadError(f"Couldn't reach {url}: {exc}") from exc + if preflight.status_code >= 400: + warning = ( + f"First request returned HTTP {preflight.status_code} — " + "results may reflect a broken endpoint, not a load limit." + ) + + stages: list[StageResult] = [] + for users in levels: + if progress_cb: + progress_cb(users) + stage = await _run_stage(client, url, method, users, stage_seconds) + stages.append(stage) + if stage.error_rate >= hard_stop_rate: + break + + return stages, warning + + +def _bottleneck_hint(onset: StageResult, reason: str, latency_wall: float) -> str: + if reason == "latency": + return ( + f"No errors yet, but p95 crosses {latency_wall:g}s — users feel it " + "before anything 500s. Often a slow query, N+1, or missing cache." 
+ ) + kind = max(onset.error_kinds, key=onset.error_kinds.get) if onset.error_kinds else "" + hints = { + "5xx": "Server returned 5xx under load — likely an unhandled overload " + "(DB connection pool, worker queue, or an uncaught error path).", + "rate limited (429)": "A rate limiter kicked in (429). Fine if intentional; " + "otherwise it'll throttle real users during a spike.", + "timeout": "Requests started timing out — the server is saturated or a " + "downstream dependency is blocking.", + "connection refused": "Connections were refused — you hit a connection or " + "worker ceiling (or the process fell over).", + "network": "Network-level errors under load — connection handling is the wall.", + } + return hints.get(kind, "The target started failing under load.") + + +def analyze(stages: list[StageResult], *, latency_wall: float, + error_threshold: float) -> RunReport: + """Find the first level that crosses the error or latency threshold.""" + onset: StageResult | None = None + reason: str | None = None + for stage in stages: + if stage.total and stage.error_rate >= error_threshold: + onset, reason = stage, "errors" + break + if stage.latencies and stage.pct(0.95) >= latency_wall: + onset, reason = stage, "latency" + break + + max_tested = stages[-1].users if stages else 0 + if onset is None: + survives = max_tested + bottleneck = None + else: + idx = stages.index(onset) + survives = stages[idx - 1].users if idx > 0 else 0 + bottleneck = _bottleneck_hint(onset, reason, latency_wall) + + return RunReport( + stages=stages, + survives_users=survives, + max_tested=max_tested, + onset_users=onset.users if onset else None, + onset_reason=reason, + bottleneck=bottleneck, + latency_wall=latency_wall, + ) diff --git a/cli/src/prescale_cli/main.py b/cli/src/prescale_cli/main.py index cdfef3f..e8ec52a 100644 --- a/cli/src/prescale_cli/main.py +++ b/cli/src/prescale_cli/main.py @@ -1,68 +1,28 @@ -"""Main CLI entry point for Prescale.""" +"""Main CLI entry point for 
PreScale.""" import click -from rich.console import Console from prescale_cli import __version__ -from prescale_cli.commands import predict, detect, recommend, status, config, agent - -console = Console() +from prescale_cli.commands import run @click.group() @click.version_option(version=__version__, prog_name="prescale") -@click.option( - "--endpoint", - "-e", - envvar="PRESCALE_ENDPOINT", - default="http://104.155.137.61", - help="Prescale API endpoint URL", -) -@click.option( - "--api-key", - envvar="PRESCALE_API_KEY", - default=None, - help="API key for authentication", -) -@click.option( - "--output", - "-o", - type=click.Choice(["table", "json", "yaml"]), - default="table", - help="Output format", -) -@click.pass_context -def cli(ctx: click.Context, endpoint: str, api_key: str | None, output: str) -> None: - """Prescale CLI - Predictive Infrastructure Intelligence Platform. - - ML-powered resource forecasting and anomaly detection for Kubernetes. - - \b - Quick Commands (all have sensible defaults): - prescale status # Check service health - prescale detect # Detect anomalies - prescale recommend # Get scaling recommendations - prescale predict cpu # CPU predictions (needs models) - +def cli() -> None: + """PreScale - launch-readiness load testing for solo/indie devs. + + Point `prescale run` at a URL and it ramps traffic until something gives, + then tells you in plain English what breaks first and at what load - + before your users find out. 
+ \b - Examples with options: - prescale detect -d my-app -n prod --sensitivity high - prescale recommend -d my-app --cost-optimize - prescale predict cpu -d my-app -p 24 + Quick start: + prescale run http://localhost:8000 + prescale run https://staging.myapp.com -u 500 --i-own-this """ - ctx.ensure_object(dict) - ctx.obj["endpoint"] = endpoint - ctx.obj["api_key"] = api_key - ctx.obj["output"] = output -# Register commands -cli.add_command(predict.predict) -cli.add_command(detect.detect) -cli.add_command(recommend.recommend) -cli.add_command(status.status) -cli.add_command(config.config) -cli.add_command(agent.agent) +cli.add_command(run.run) def main() -> None: diff --git a/cli/tests/test_cli.py b/cli/tests/test_cli.py index b582ed9..ce37698 100644 --- a/cli/tests/test_cli.py +++ b/cli/tests/test_cli.py @@ -1,333 +1,41 @@ -""" -Prescale CLI Tests +"""Tests for the PreScale CLI surface.""" -Unit tests for the Prescale CLI commands. -""" - -import json -from datetime import datetime, timezone -from unittest.mock import patch, MagicMock - -import pytest from click.testing import CliRunner - -class TestCLIImport: - """Tests for CLI module imports.""" - - def test_import_main(self): - """Test that main CLI can be imported.""" - from prescale_cli.main import cli, main - assert cli is not None - assert main is not None - - def test_import_commands(self): - """Test that all commands can be imported.""" - from prescale_cli.commands import predict, detect, recommend, status, config, agent - assert predict is not None - assert detect is not None - assert recommend is not None - assert status is not None - assert config is not None - assert agent is not None - - -class TestCLIVersion: - """Tests for CLI version.""" - - def test_version_import(self): - """Test that version can be imported.""" - from prescale_cli import __version__ - assert __version__ is not None - assert isinstance(__version__, str) - - def test_version_format(self): - """Test that version follows semver 
format.""" - from prescale_cli import __version__ - parts = __version__.split(".") - assert len(parts) >= 2 # At least major.minor - - -class TestCLIGroup: - """Tests for the main CLI group.""" - - def test_cli_help(self): - """Test CLI help output.""" - from prescale_cli.main import cli - - runner = CliRunner() - result = runner.invoke(cli, ["--help"]) - - assert result.exit_code == 0 - assert "Prescale CLI" in result.output - assert "status" in result.output - assert "detect" in result.output - assert "recommend" in result.output - assert "predict" in result.output - - def test_cli_version_option(self): - """Test --version option.""" - from prescale_cli.main import cli - - runner = CliRunner() - result = runner.invoke(cli, ["--version"]) - - assert result.exit_code == 0 - assert "prescale" in result.output.lower() - - -class TestStatusCommand: - """Tests for the status command.""" - - def test_status_help(self): - """Test status command help.""" - from prescale_cli.main import cli - - runner = CliRunner() - result = runner.invoke(cli, ["status", "--help"]) - - assert result.exit_code == 0 - assert "status" in result.output.lower() - - @patch("prescale_cli.commands.status.httpx.get") - def test_status_healthy(self, mock_get): - """Test status command with healthy service.""" - from prescale_cli.main import cli - - # Mock healthy response - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.json.return_value = { - "status": "healthy", - "models_loaded": 3, - "uptime": "24h" - } - mock_response.elapsed.total_seconds.return_value = 0.05 - mock_get.return_value = mock_response - - runner = CliRunner() - result = runner.invoke(cli, ["status"]) - - # Should attempt to check health - assert mock_get.called - - @patch("prescale_cli.commands.status.httpx.get") - def test_status_json_output(self, mock_get): - """Test status command with JSON output.""" - from prescale_cli.main import cli - - mock_response = MagicMock() - mock_response.status_code 
= 200 - mock_response.json.return_value = {"status": "healthy"} - mock_response.elapsed.total_seconds.return_value = 0.05 - mock_get.return_value = mock_response - - runner = CliRunner() - result = runner.invoke(cli, ["--output", "json", "status"]) - - # JSON output should contain valid JSON - # (even if parsing fails due to test environment) - assert result.exit_code == 0 - - -class TestDetectCommand: - """Tests for the detect command.""" - - def test_detect_help(self): - """Test detect command help.""" - from prescale_cli.main import cli - - runner = CliRunner() - result = runner.invoke(cli, ["detect", "--help"]) - - assert result.exit_code == 0 - assert "detect" in result.output.lower() - assert "sensitivity" in result.output - - def test_detect_sensitivity_options(self): - """Test that sensitivity options are validated.""" - from prescale_cli.main import cli - - runner = CliRunner() - - # Valid sensitivities should be accepted (help shows options) - result = runner.invoke(cli, ["detect", "--help"]) - assert "low" in result.output - assert "medium" in result.output - assert "high" in result.output - - def test_detect_metric_options(self): - """Test that metric options are validated.""" - from prescale_cli.main import cli - - runner = CliRunner() - result = runner.invoke(cli, ["detect", "--help"]) - - assert "cpu_utilization" in result.output - assert "memory_utilization" in result.output - - -class TestRecommendCommand: - """Tests for the recommend command.""" - - def test_recommend_help(self): - """Test recommend command help.""" - from prescale_cli.main import cli - - runner = CliRunner() - result = runner.invoke(cli, ["recommend", "--help"]) - - assert result.exit_code == 0 - assert "recommend" in result.output.lower() - - -class TestPredictCommand: - """Tests for the predict command.""" - - def test_predict_help(self): - """Test predict command help.""" - from prescale_cli.main import cli - - runner = CliRunner() - result = runner.invoke(cli, ["predict", 
"--help"]) - - assert result.exit_code == 0 - assert "predict" in result.output.lower() - - -class TestConfigCommand: - """Tests for the config command.""" - - def test_config_help(self): - """Test config command help.""" - from prescale_cli.main import cli - - runner = CliRunner() - result = runner.invoke(cli, ["config", "--help"]) - - assert result.exit_code == 0 - - -class TestAgentCommand: - """Tests for the agent command.""" - - def test_agent_help(self): - """Test agent command help.""" - from prescale_cli.main import cli - - runner = CliRunner() - result = runner.invoke(cli, ["agent", "--help"]) - - assert result.exit_code == 0 - - -class TestOutputFormats: - """Tests for different output formats.""" - - def test_table_format_default(self): - """Test that table is the default output format.""" - from prescale_cli.main import cli - - runner = CliRunner() - result = runner.invoke(cli, ["--help"]) - - assert "table" in result.output - - def test_json_format_option(self): - """Test JSON output format option.""" - from prescale_cli.main import cli - - runner = CliRunner() - result = runner.invoke(cli, ["--help"]) - - assert "json" in result.output - - def test_yaml_format_option(self): - """Test YAML output format option.""" - from prescale_cli.main import cli - - runner = CliRunner() - result = runner.invoke(cli, ["--help"]) - - assert "yaml" in result.output - - -class TestEndpointOptions: - """Tests for endpoint configuration.""" - - def test_endpoint_option(self): - """Test --endpoint option is available.""" - from prescale_cli.main import cli - - runner = CliRunner() - result = runner.invoke(cli, ["--help"]) - - assert "--endpoint" in result.output or "-e" in result.output - - def test_api_key_option(self): - """Test --api-key option is available.""" - from prescale_cli.main import cli - - runner = CliRunner() - result = runner.invoke(cli, ["--help"]) - - assert "--api-key" in result.output +from prescale_cli.main import cli, main -class 
TestSampleDataGeneration: - """Tests for sample data generation utilities.""" +def test_imports(): + assert cli is not None + assert main is not None - def test_generate_sample_metrics(self): - """Test sample metrics generation.""" - from prescale_cli.commands.detect import generate_sample_metrics - - metrics = generate_sample_metrics(lookback_hours=1) - - assert len(metrics) > 0 - assert "timestamp" in metrics[0] - assert "value" in metrics[0] - def test_generate_sample_metrics_values(self): - """Test that generated values are in reasonable range.""" - from prescale_cli.commands.detect import generate_sample_metrics - - metrics = generate_sample_metrics(lookback_hours=2) - - for point in metrics: - # CPU values should be between 0 and 1 - assert 0 <= point["value"] <= 1 +def test_version_string(): + from prescale_cli import __version__ + assert isinstance(__version__, str) + assert len(__version__.split(".")) >= 2 -# Fixtures -@pytest.fixture -def cli_runner(): - """Create a CLI test runner.""" - return CliRunner() +def test_version_option(): + result = CliRunner().invoke(cli, ["--version"]) + assert result.exit_code == 0 + assert "prescale" in result.output.lower() -@pytest.fixture -def mock_healthy_response(): - """Create a mock healthy API response.""" - mock = MagicMock() - mock.status_code = 200 - mock.json.return_value = { - "status": "healthy", - "version": "0.4.0", - "models_loaded": 3 - } - mock.elapsed.total_seconds.return_value = 0.042 - return mock +def test_group_help_lists_run(): + result = CliRunner().invoke(cli, ["--help"]) + assert result.exit_code == 0 + assert "run" in result.output + assert "load" in result.output.lower() -@pytest.fixture -def mock_unhealthy_response(): - """Create a mock unhealthy API response.""" - mock = MagicMock() - mock.status_code = 503 - mock.json.return_value = {"error": "Service unavailable"} - mock.elapsed.total_seconds.return_value = 5.0 - return mock +def test_run_help_has_key_options(): + result = 
CliRunner().invoke(cli, ["run", "--help"]) + assert result.exit_code == 0 + assert "--max-users" in result.output + assert "--i-own-this" in result.output -if __name__ == "__main__": - pytest.main([__file__, "-v"]) +def test_run_rejects_non_url(): + result = CliRunner().invoke(cli, ["run", "not-a-url"]) + assert result.exit_code == 1 diff --git a/cli/tests/test_loadtest.py b/cli/tests/test_loadtest.py new file mode 100644 index 0000000..7ec2808 --- /dev/null +++ b/cli/tests/test_loadtest.py @@ -0,0 +1,82 @@ +"""Tests for the pure logic of the prescale run load engine.""" + +from prescale_cli.loadtest import ( + StageResult, + analyze, + default_levels, + percentile, +) + + +def _stage(users, total, errors, latency, kind="5xx"): + """Build a StageResult with `total` successful requests at `latency` seconds + plus `errors` failures of `kind`.""" + s = StageResult(users=users, duration=5.0) + s.total = total + s.errors = errors + s.latencies = [latency] * (total - errors) + if errors: + s.error_kinds = {kind: errors} + return s + + +def test_percentile_basic(): + vals = [1.0, 2.0, 3.0, 4.0, 5.0] + assert percentile(vals, 0.0) == 1.0 + assert percentile(vals, 1.0) == 5.0 + assert percentile(vals, 0.5) == 3.0 + + +def test_percentile_edges(): + assert percentile([], 0.95) == 0.0 + assert percentile([7.0], 0.95) == 7.0 + + +def test_default_levels_caps_and_ends_at_max(): + levels = default_levels(50) + assert levels[-1] == 50 + assert all(u <= 50 for u in levels) + assert levels == sorted(levels) + + +def test_default_levels_custom_max_appended(): + assert default_levels(42)[-1] == 42 + + +def test_analyze_detects_error_onset(): + stages = [ + _stage(10, 1000, 0, 0.05), + _stage(50, 1000, 0, 0.08), + _stage(100, 1000, 200, 0.10), # 20% errors -> onset + ] + report = analyze(stages, latency_wall=2.0, error_threshold=0.02) + assert report.onset_users == 100 + assert report.onset_reason == "errors" + assert report.survives_users == 50 + assert "5xx" in report.bottleneck + + 
+def test_analyze_detects_latency_wall(): + stages = [ + _stage(10, 1000, 0, 0.05), + _stage(50, 1000, 0, 3.0), # p95 = 3s > 2s wall -> onset + ] + report = analyze(stages, latency_wall=2.0, error_threshold=0.02) + assert report.onset_users == 50 + assert report.onset_reason == "latency" + assert report.survives_users == 10 + + +def test_analyze_survives_everything(): + stages = [_stage(10, 1000, 0, 0.05), _stage(100, 1000, 0, 0.06)] + report = analyze(stages, latency_wall=2.0, error_threshold=0.02) + assert report.onset_users is None + assert report.survives_users == 100 + assert report.bottleneck is None + + +def test_analyze_fails_immediately(): + stages = [_stage(10, 1000, 500, 0.05)] + report = analyze(stages, latency_wall=2.0, error_threshold=0.02) + assert report.onset_users == 10 + assert report.survives_users == 0 From 051cae1eea8d3f96b41fccf2284109dffa477380 Mon Sep 17 00:00:00 2001 From: pyjeebz Date: Thu, 25 Jun 2026 16:34:43 -0400 Subject: [PATCH 2/3] chore: strip legacy platform and depivot packaging/CI Remove the multi-cloud / Kubernetes platform now that PreScale is a single-purpose load-test CLI: ml/ (forecasting + dashboard), infra/, charts/, agent/, landing-page/, examples/, scripts/, docs/, and all GCP/Helm/Render/CloudBuild configs, plus stray artifacts. Depivot packaging and CI to the load-test thesis: slim CI to lint + test the CLI across Py 3.10-3.12, reduce release.yml to a PyPI trusted publish, and update both pyproject files (name, description, keywords, deps). 
Co-Authored-By: Claude Opus 4.8 --- .env.example | 45 - .github/workflows/ci.yml | 273 +- .github/workflows/helm-pages.yml | 64 - .github/workflows/publish-agent.yml | 54 - .github/workflows/release.yml | 197 +- Makefile | 116 - agent/README.md | 107 - agent/agent.py | 12 - agent/pyproject.toml | 85 - agent/pytest.ini | 10 - agent/src/prescale_agent/__init__.py | 3 - agent/src/prescale_agent/agent.py | 245 - agent/src/prescale_agent/cli.py | 384 - agent/src/prescale_agent/client.py | 199 - agent/src/prescale_agent/config.py | 189 - agent/src/prescale_agent/sources/__init__.py | 15 - .../prescale_agent/sources/azure_monitor.py | 190 - agent/src/prescale_agent/sources/base.py | 156 - .../src/prescale_agent/sources/cloudwatch.py | 209 - agent/src/prescale_agent/sources/datadog.py | 204 - .../prescale_agent/sources/gcp_monitoring.py | 259 - .../src/prescale_agent/sources/prometheus.py | 206 - agent/src/prescale_agent/sources/registry.py | 111 - agent/src/prescale_agent/sources/system.py | 228 - agent/tests/__init__.py | 1 - agent/tests/test_agent.py | 465 -- charts/prescale/Chart.yaml | 40 - charts/prescale/README.md | 143 - charts/prescale/templates/_helpers.tpl | 90 - .../cost-intelligence-deployment.yaml | 68 - .../templates/cost-intelligence-service.yaml | 23 - .../templates/inference-deployment.yaml | 82 - charts/prescale/templates/inference-hpa.yaml | 33 - .../prescale/templates/inference-service.yaml | 23 - charts/prescale/templates/ingress.yaml | 42 - charts/prescale/templates/pdb.yaml | 20 - charts/prescale/templates/pvc.yaml | 17 - charts/prescale/templates/serviceaccount.yaml | 12 - charts/prescale/templates/servicemonitor.yaml | 25 - charts/prescale/values.yaml | 249 - cli/pyproject.toml | 12 +- cloudbuild-cost.yaml | 29 - cloudbuild.yaml | 22 - commits.txt | 90 - docker-compose.yml | 83 - docs/INTEGRATION.md | 775 -- docs/QUICKSTART.md | 288 - docs/architecture/ARCHITECTURE.md | 698 -- docs/cloudwatch_testing_guide.md | 93 - docs/deployment_to_gcp_vm.md | 
122 - docs/gcp_agent_guide.md | 75 - docs/gcp_full_setup_guide.md | 110 - docs/gcp_testing_guide.md | 90 - docs/phase5-design.md | 510 -- docs/real_world_testing.md | 108 - docs/research_infra_chat.md | 186 - docs/user_journey_gcp_locust.md | 186 - docs/user_scenarios.md | 103 - examples/README.md | 99 - examples/demo-environment/data/.gitkeep | 6 - .../2025-12-31_100users_20mins_exceptions.csv | 1 - .../2025-12-31_100users_20mins_failures.csv | 7 - .../data/2025-12-31_100users_20mins_stats.csv | 11 - .../data/2025-12-31_100users_report.html | 859 --- .../kubernetes/locust/base/configmap.yaml | 15 - .../kubernetes/locust/base/deployment.yaml | 116 - .../kubernetes/locust/base/kustomization.yaml | 21 - .../kubernetes/locust/base/namespace.yaml | 7 - .../locust/base/scripts/locustfile.py | 291 - .../kubernetes/locust/base/service.yaml | 26 - .../saleor/base/api-deployment.yaml | 134 - .../kubernetes/saleor/base/api-service.yaml | 24 - .../kubernetes/saleor/base/configmap.yaml | 57 - .../saleor/base/dashboard-deployment.yaml | 99 - .../saleor/base/dashboard-service.yaml | 23 - .../kubernetes/saleor/base/hpa.yaml | 101 - .../kubernetes/saleor/base/ingress.yaml | 112 - .../kubernetes/saleor/base/kustomization.yaml | 36 - .../kubernetes/saleor/base/namespace.yaml | 10 - .../kubernetes/saleor/base/pdb.yaml | 34 - .../kubernetes/saleor/base/secrets.yaml | 54 - .../saleor/base/serviceaccounts.yaml | 36 - .../saleor/base/worker-deployment.yaml | 185 - .../saleor/jobs/migration-jobs.yaml | 216 - .../saleor/overlays/dev/kustomization.yaml | 82 - examples/demo-environment/loadtest/Dockerfile | 27 - examples/demo-environment/loadtest/README.md | 184 - .../loadtest/locustfiles/common/__init__.py | 20 - .../locustfiles/common/graphql_client.py | 308 - .../loadtest/locustfiles/common/utils.py | 132 - .../loadtest/locustfiles/locustfile.py | 15 - .../loadtest/locustfiles/personas/__init__.py | 7 - .../loadtest/locustfiles/personas/admin.py | 203 - 
.../loadtest/locustfiles/personas/browser.py | 148 - .../loadtest/locustfiles/personas/buyer.py | 256 - .../loadtest/locustfiles/personas/searcher.py | 162 - .../loadtest/locustfiles/ramp_pattern.py | 17 - .../loadtest/locustfiles/read_heavy.py | 20 - .../loadtest/locustfiles/shapes/__init__.py | 15 - .../locustfiles/shapes/load_shapes.py | 213 - .../loadtest/locustfiles/spike_pattern.py | 20 - .../loadtest/locustfiles/wave_pattern.py | 18 - .../loadtest/locustfiles/write_heavy.py | 27 - .../loadtest/probe_metrics.py | 81 - .../loadtest/requirements.txt | 3 - .../demo-environment/loadtest/run-tests.ps1 | 114 - infra/DEPLOYMENT.md | 398 - infra/deploy.ps1 | 220 - .../dashboards/prescale-inference.json | 1098 --- infra/kubernetes/keda/kustomization.yaml | 12 - .../keda/scaledobject-prescale.yaml | 77 - .../kubernetes/keda/scaledobject-saleor.yaml | 108 - infra/kubernetes/keda/trigger-auth.yaml | 25 - .../monitoring/grafana-dashboards.yaml | 223 - infra/kubernetes/monitoring/grafana.yaml | 127 - .../kubernetes/monitoring/pod-monitoring.yaml | 31 - .../monitoring/values-prometheus.yaml | 162 - infra/kubernetes/prescale-cost/configmap.yaml | 19 - .../kubernetes/prescale-cost/deployment.yaml | 81 - .../prescale-inference/configmap.yaml | 42 - .../prescale-inference/deployment.yaml | 109 - infra/kubernetes/prescale-inference/hpa.yaml | 49 - .../prescale-inference/kustomization.yaml | 23 - .../prescale-inference/namespace.yaml | 7 - .../prescale-inference/pod-monitoring.yaml | 21 - .../prescale-inference/prometheus-rules.yaml | 174 - .../prescale-inference/service.yaml | 36 - .../prescale-inference/serviceaccount.yaml | 8 - infra/prometheus/prometheus.yml | 24 - infra/terraform/environments/dev/main.tf | 271 - .../environments/dev/terraform.tfvars.example | 18 - infra/terraform/environments/dev/tfplan | Bin 23133 -> 0 bytes infra/terraform/environments/dev/variables.tf | 50 - infra/terraform/modules/cloudsql/main.tf | 292 - infra/terraform/modules/gke/main.tf | 225 - 
infra/terraform/modules/iam/main.tf | 162 - infra/terraform/modules/networking/main.tf | 220 - infra/terraform/modules/redis/main.tf | 185 - infra/terraform/modules/storage/main.tf | 185 - landing-page/.gitignore | 1 - landing-page/ATTRIBUTIONS.md | 3 - landing-page/app/App.tsx | 32 - landing-page/app/components/Architecture.tsx | 128 - landing-page/app/components/AsciiViewer.tsx | 366 - landing-page/app/components/BentoGrid.tsx | 260 - landing-page/app/components/CTA.tsx | 67 - landing-page/app/components/CodeExample.tsx | 103 - landing-page/app/components/Features.tsx | 95 - landing-page/app/components/Footer.tsx | 108 - landing-page/app/components/Hero.tsx | 136 - landing-page/app/components/Installation.tsx | 143 - landing-page/app/components/LandingPage.tsx | 19 - landing-page/app/components/Layout.tsx | 16 - landing-page/app/components/Navigation.tsx | 485 -- landing-page/app/components/PreScaleLogo.tsx | 250 - landing-page/app/components/ProjectBadges.tsx | 60 - landing-page/app/components/Stats.tsx | 43 - landing-page/app/components/ThemeProvider.tsx | 46 - .../app/components/docs/ApiReferencePage.tsx | 365 - .../app/components/docs/ConfigurationPage.tsx | 326 - landing-page/app/components/docs/DocsHome.tsx | 189 - .../app/components/docs/QuickstartPage.tsx | 243 - .../components/figma/ImageWithFallback.tsx | 27 - .../app/components/hooks/useInView.tsx | 36 - .../integrations/IntegrationPage.tsx | 461 -- .../integrations/IntegrationsHome.tsx | 135 - .../app/components/landing/HowItWorks.tsx | 175 - landing-page/app/components/ui/accordion.tsx | 66 - .../app/components/ui/alert-dialog.tsx | 157 - landing-page/app/components/ui/alert.tsx | 66 - .../app/components/ui/aspect-ratio.tsx | 11 - landing-page/app/components/ui/avatar.tsx | 53 - landing-page/app/components/ui/badge.tsx | 46 - landing-page/app/components/ui/breadcrumb.tsx | 109 - landing-page/app/components/ui/button.tsx | 58 - landing-page/app/components/ui/calendar.tsx | 75 - 
landing-page/app/components/ui/card.tsx | 92 - landing-page/app/components/ui/carousel.tsx | 241 - landing-page/app/components/ui/chart.tsx | 353 - landing-page/app/components/ui/checkbox.tsx | 32 - .../app/components/ui/collapsible.tsx | 33 - landing-page/app/components/ui/command.tsx | 177 - .../app/components/ui/context-menu.tsx | 252 - landing-page/app/components/ui/dialog.tsx | 135 - landing-page/app/components/ui/drawer.tsx | 132 - .../app/components/ui/dropdown-menu.tsx | 257 - landing-page/app/components/ui/form.tsx | 168 - landing-page/app/components/ui/hover-card.tsx | 44 - landing-page/app/components/ui/input-otp.tsx | 77 - landing-page/app/components/ui/input.tsx | 21 - landing-page/app/components/ui/label.tsx | 24 - landing-page/app/components/ui/menubar.tsx | 276 - .../app/components/ui/navigation-menu.tsx | 168 - landing-page/app/components/ui/pagination.tsx | 127 - landing-page/app/components/ui/popover.tsx | 48 - landing-page/app/components/ui/progress.tsx | 31 - .../app/components/ui/radio-group.tsx | 45 - landing-page/app/components/ui/resizable.tsx | 56 - .../app/components/ui/scroll-area.tsx | 58 - landing-page/app/components/ui/select.tsx | 189 - landing-page/app/components/ui/separator.tsx | 28 - landing-page/app/components/ui/sheet.tsx | 139 - landing-page/app/components/ui/sidebar.tsx | 726 -- landing-page/app/components/ui/skeleton.tsx | 13 - landing-page/app/components/ui/slider.tsx | 63 - landing-page/app/components/ui/sonner.tsx | 25 - landing-page/app/components/ui/switch.tsx | 31 - landing-page/app/components/ui/table.tsx | 116 - landing-page/app/components/ui/tabs.tsx | 66 - landing-page/app/components/ui/textarea.tsx | 18 - .../app/components/ui/toggle-group.tsx | 73 - landing-page/app/components/ui/toggle.tsx | 47 - landing-page/app/components/ui/tooltip.tsx | 61 - landing-page/app/components/ui/use-mobile.ts | 21 - landing-page/app/components/ui/utils.ts | 6 - landing-page/app/main.tsx | 13 - landing-page/index.html | 24 - 
landing-page/package-lock.json | 6403 ----------------- landing-page/package.json | 100 - landing-page/postcss.config.mjs | 15 - landing-page/styles/fonts.css | 2 - landing-page/styles/index.css | 78 - landing-page/styles/tailwind.css | 4 - landing-page/styles/theme.css | 192 - landing-page/vercel.json | 8 - landing-page/vite.config.ts | 22 - locust-exporter/app.py | 98 - locust-exporter/requirements.txt | 4 - ml/__init__.py | 7 - .../data_summary_20251231_172304.json | 15 - .../data_summary_20251231_174808.json | 17 - .../data_summary_20251231_184446.json | 17 - .../data_summary_20251231_184526.json | 17 - .../data_summary_20251231_184618.json | 17 - ml/artifacts/metrics_20251231_172304.json | 28 - ml/artifacts/metrics_20251231_174808.json | 20 - ml/artifacts/metrics_20251231_184446.json | 20 - ml/artifacts/metrics_20251231_184526.json | 20 - ml/artifacts/metrics_20251231_184618.json | 28 - ml/config.py | 100 - ml/cost_intelligence/Dockerfile | 47 - ml/cost_intelligence/__init__.py | 3 - ml/cost_intelligence/app.py | 335 - ml/cost_intelligence/config.py | 84 - ml/cost_intelligence/cost_calculator.py | 274 - ml/cost_intelligence/efficiency.py | 243 - ml/cost_intelligence/forecaster.py | 180 - ml/cost_intelligence/models.py | 245 - ml/cost_intelligence/requirements.txt | 13 - ml/cost_intelligence/savings_analyzer.py | 258 - ml/fetch_real_data.py | 158 - ml/inference/Dockerfile | 59 - ml/inference/__init__.py | 18 - ml/inference/anomaly_detector.py | 247 - ml/inference/app.py | 884 --- ml/inference/config.py | 170 - ml/inference/db.py | 459 -- ml/inference/metrics.py | 281 - ml/inference/model_manager.py | 626 -- ml/inference/models.py | 306 - ml/inference/predictor.py | 190 - ml/inference/recommender.py | 293 - ml/inference/requirements.txt | 22 - ml/inference/retrain_scheduler.py | 485 -- ml/inference/static/assets/index-BELjdxMU.js | 55 - ml/inference/static/assets/index-CIB54x0g.css | 1 - ml/inference/static/index.html | 17 - ml/inference/storage/__init__.py | 9 
- ml/inference/storage/sqlite_backend.py | 606 -- ml/inference/tests/__init__.py | 0 ml/inference/tests/locustfile.py | 118 - ml/inference/tests/requirements-test.txt | 4 - ml/inference/tests/test_api.py | 313 - ml/inference/web/README.md | 86 - ml/inference/web/index.html | 20 - ml/inference/web/package-lock.json | 2088 ------ ml/inference/web/package.json | 28 - ml/inference/web/postcss.config.js | 6 - ml/inference/web/src/App.tsx | 205 - ml/inference/web/src/components/MiniChart.tsx | 35 - .../web/src/context/DeploymentsContext.tsx | 76 - ml/inference/web/src/context/ThemeContext.tsx | 37 - ml/inference/web/src/hooks/useAgents.ts | 30 - ml/inference/web/src/main.tsx | 19 - ml/inference/web/src/services/api.ts | 141 - ml/inference/web/src/styles/index.css | 93 - ml/inference/web/src/views/AgentInstall.tsx | 70 - ml/inference/web/src/views/Agents.tsx | 63 - ml/inference/web/src/views/Anomalies.tsx | 19 - ml/inference/web/src/views/Dashboard.tsx | 297 - ml/inference/web/src/views/Deployments.tsx | 111 - ml/inference/web/src/views/Predictions.tsx | 19 - ml/inference/web/tailwind.config.js | 19 - ml/inference/web/test-results/.last-run.json | 4 - ml/inference/web/tsconfig.json | 37 - ml/inference/web/tsconfig.node.json | 13 - ml/inference/web/vite.config.ts | 30 - ml/models/__init__.py | 7 - .../anomaly_detector/1.0.0/metadata.json | 12 - ml/models/anomaly_detector/1.0.0/model.pkl | Bin 1284611 -> 0 bytes ml/models/baseline.py | 244 - ml/models/cpu_forecaster/1.0.0/metadata.json | 14 - ml/models/cpu_forecaster/1.0.0/model.pkl | Bin 257612 -> 0 bytes .../memory_forecaster/1.0.0/metadata.json | 14 - ml/models/memory_forecaster/1.0.0/model.pkl | Bin 265838 -> 0 bytes ml/models/prophet_model.py | 385 - ml/models/xgboost_anomaly.py | 386 - ml/pipeline/__init__.py | 6 - ml/pipeline/cloudwatch_fetcher.py | 208 - ml/pipeline/data_fetcher.py | 360 - ml/pipeline/feature_engineering.py | 273 - ml/pytest.ini | 10 - ml/requirements.txt | 22 - ml/scripts/create_demo_models.py | 
260 - ml/tests/__init__.py | 1 - ml/tests/test_models.py | 402 -- ml/train.py | 552 -- ml/training/requirements.txt | 5 - ml/training/train_models.py | 318 - prescale-agent-gcp.yaml | 19 - prescale-agent.yaml | 89 - prescale-agent.yaml.example | 89 - pyproject.toml | 40 +- render.yaml | 15 - scripts/check_image_modules.py | 14 - scripts/setup.ps1 | 104 - scripts/setup.sh | 114 - test_retrain.ps1 | 28 - 328 files changed, 53 insertions(+), 48992 deletions(-) delete mode 100644 .env.example delete mode 100644 .github/workflows/helm-pages.yml delete mode 100644 .github/workflows/publish-agent.yml delete mode 100644 Makefile delete mode 100644 agent/README.md delete mode 100644 agent/agent.py delete mode 100644 agent/pyproject.toml delete mode 100644 agent/pytest.ini delete mode 100644 agent/src/prescale_agent/__init__.py delete mode 100644 agent/src/prescale_agent/agent.py delete mode 100644 agent/src/prescale_agent/cli.py delete mode 100644 agent/src/prescale_agent/client.py delete mode 100644 agent/src/prescale_agent/config.py delete mode 100644 agent/src/prescale_agent/sources/__init__.py delete mode 100644 agent/src/prescale_agent/sources/azure_monitor.py delete mode 100644 agent/src/prescale_agent/sources/base.py delete mode 100644 agent/src/prescale_agent/sources/cloudwatch.py delete mode 100644 agent/src/prescale_agent/sources/datadog.py delete mode 100644 agent/src/prescale_agent/sources/gcp_monitoring.py delete mode 100644 agent/src/prescale_agent/sources/prometheus.py delete mode 100644 agent/src/prescale_agent/sources/registry.py delete mode 100644 agent/src/prescale_agent/sources/system.py delete mode 100644 agent/tests/__init__.py delete mode 100644 agent/tests/test_agent.py delete mode 100644 charts/prescale/Chart.yaml delete mode 100644 charts/prescale/README.md delete mode 100644 charts/prescale/templates/_helpers.tpl delete mode 100644 charts/prescale/templates/cost-intelligence-deployment.yaml delete mode 100644 
charts/prescale/templates/cost-intelligence-service.yaml delete mode 100644 charts/prescale/templates/inference-deployment.yaml delete mode 100644 charts/prescale/templates/inference-hpa.yaml delete mode 100644 charts/prescale/templates/inference-service.yaml delete mode 100644 charts/prescale/templates/ingress.yaml delete mode 100644 charts/prescale/templates/pdb.yaml delete mode 100644 charts/prescale/templates/pvc.yaml delete mode 100644 charts/prescale/templates/serviceaccount.yaml delete mode 100644 charts/prescale/templates/servicemonitor.yaml delete mode 100644 charts/prescale/values.yaml delete mode 100644 cloudbuild-cost.yaml delete mode 100644 cloudbuild.yaml delete mode 100644 commits.txt delete mode 100644 docker-compose.yml delete mode 100644 docs/INTEGRATION.md delete mode 100644 docs/QUICKSTART.md delete mode 100644 docs/architecture/ARCHITECTURE.md delete mode 100644 docs/cloudwatch_testing_guide.md delete mode 100644 docs/deployment_to_gcp_vm.md delete mode 100644 docs/gcp_agent_guide.md delete mode 100644 docs/gcp_full_setup_guide.md delete mode 100644 docs/gcp_testing_guide.md delete mode 100644 docs/phase5-design.md delete mode 100644 docs/real_world_testing.md delete mode 100644 docs/research_infra_chat.md delete mode 100644 docs/user_journey_gcp_locust.md delete mode 100644 docs/user_scenarios.md delete mode 100644 examples/README.md delete mode 100644 examples/demo-environment/data/.gitkeep delete mode 100644 examples/demo-environment/data/2025-12-31_100users_20mins_exceptions.csv delete mode 100644 examples/demo-environment/data/2025-12-31_100users_20mins_failures.csv delete mode 100644 examples/demo-environment/data/2025-12-31_100users_20mins_stats.csv delete mode 100644 examples/demo-environment/data/2025-12-31_100users_report.html delete mode 100644 examples/demo-environment/kubernetes/locust/base/configmap.yaml delete mode 100644 examples/demo-environment/kubernetes/locust/base/deployment.yaml delete mode 100644 
examples/demo-environment/kubernetes/locust/base/kustomization.yaml delete mode 100644 examples/demo-environment/kubernetes/locust/base/namespace.yaml delete mode 100644 examples/demo-environment/kubernetes/locust/base/scripts/locustfile.py delete mode 100644 examples/demo-environment/kubernetes/locust/base/service.yaml delete mode 100644 examples/demo-environment/kubernetes/saleor/base/api-deployment.yaml delete mode 100644 examples/demo-environment/kubernetes/saleor/base/api-service.yaml delete mode 100644 examples/demo-environment/kubernetes/saleor/base/configmap.yaml delete mode 100644 examples/demo-environment/kubernetes/saleor/base/dashboard-deployment.yaml delete mode 100644 examples/demo-environment/kubernetes/saleor/base/dashboard-service.yaml delete mode 100644 examples/demo-environment/kubernetes/saleor/base/hpa.yaml delete mode 100644 examples/demo-environment/kubernetes/saleor/base/ingress.yaml delete mode 100644 examples/demo-environment/kubernetes/saleor/base/kustomization.yaml delete mode 100644 examples/demo-environment/kubernetes/saleor/base/namespace.yaml delete mode 100644 examples/demo-environment/kubernetes/saleor/base/pdb.yaml delete mode 100644 examples/demo-environment/kubernetes/saleor/base/secrets.yaml delete mode 100644 examples/demo-environment/kubernetes/saleor/base/serviceaccounts.yaml delete mode 100644 examples/demo-environment/kubernetes/saleor/base/worker-deployment.yaml delete mode 100644 examples/demo-environment/kubernetes/saleor/jobs/migration-jobs.yaml delete mode 100644 examples/demo-environment/kubernetes/saleor/overlays/dev/kustomization.yaml delete mode 100644 examples/demo-environment/loadtest/Dockerfile delete mode 100644 examples/demo-environment/loadtest/README.md delete mode 100644 examples/demo-environment/loadtest/locustfiles/common/__init__.py delete mode 100644 examples/demo-environment/loadtest/locustfiles/common/graphql_client.py delete mode 100644 examples/demo-environment/loadtest/locustfiles/common/utils.py 
delete mode 100644 examples/demo-environment/loadtest/locustfiles/locustfile.py delete mode 100644 examples/demo-environment/loadtest/locustfiles/personas/__init__.py delete mode 100644 examples/demo-environment/loadtest/locustfiles/personas/admin.py delete mode 100644 examples/demo-environment/loadtest/locustfiles/personas/browser.py delete mode 100644 examples/demo-environment/loadtest/locustfiles/personas/buyer.py delete mode 100644 examples/demo-environment/loadtest/locustfiles/personas/searcher.py delete mode 100644 examples/demo-environment/loadtest/locustfiles/ramp_pattern.py delete mode 100644 examples/demo-environment/loadtest/locustfiles/read_heavy.py delete mode 100644 examples/demo-environment/loadtest/locustfiles/shapes/__init__.py delete mode 100644 examples/demo-environment/loadtest/locustfiles/shapes/load_shapes.py delete mode 100644 examples/demo-environment/loadtest/locustfiles/spike_pattern.py delete mode 100644 examples/demo-environment/loadtest/locustfiles/wave_pattern.py delete mode 100644 examples/demo-environment/loadtest/locustfiles/write_heavy.py delete mode 100644 examples/demo-environment/loadtest/probe_metrics.py delete mode 100644 examples/demo-environment/loadtest/requirements.txt delete mode 100644 examples/demo-environment/loadtest/run-tests.ps1 delete mode 100644 infra/DEPLOYMENT.md delete mode 100644 infra/deploy.ps1 delete mode 100644 infra/grafana/dashboards/prescale-inference.json delete mode 100644 infra/kubernetes/keda/kustomization.yaml delete mode 100644 infra/kubernetes/keda/scaledobject-prescale.yaml delete mode 100644 infra/kubernetes/keda/scaledobject-saleor.yaml delete mode 100644 infra/kubernetes/keda/trigger-auth.yaml delete mode 100644 infra/kubernetes/monitoring/grafana-dashboards.yaml delete mode 100644 infra/kubernetes/monitoring/grafana.yaml delete mode 100644 infra/kubernetes/monitoring/pod-monitoring.yaml delete mode 100644 infra/kubernetes/monitoring/values-prometheus.yaml delete mode 100644 
infra/kubernetes/prescale-cost/configmap.yaml delete mode 100644 infra/kubernetes/prescale-cost/deployment.yaml delete mode 100644 infra/kubernetes/prescale-inference/configmap.yaml delete mode 100644 infra/kubernetes/prescale-inference/deployment.yaml delete mode 100644 infra/kubernetes/prescale-inference/hpa.yaml delete mode 100644 infra/kubernetes/prescale-inference/kustomization.yaml delete mode 100644 infra/kubernetes/prescale-inference/namespace.yaml delete mode 100644 infra/kubernetes/prescale-inference/pod-monitoring.yaml delete mode 100644 infra/kubernetes/prescale-inference/prometheus-rules.yaml delete mode 100644 infra/kubernetes/prescale-inference/service.yaml delete mode 100644 infra/kubernetes/prescale-inference/serviceaccount.yaml delete mode 100644 infra/prometheus/prometheus.yml delete mode 100644 infra/terraform/environments/dev/main.tf delete mode 100644 infra/terraform/environments/dev/terraform.tfvars.example delete mode 100644 infra/terraform/environments/dev/tfplan delete mode 100644 infra/terraform/environments/dev/variables.tf delete mode 100644 infra/terraform/modules/cloudsql/main.tf delete mode 100644 infra/terraform/modules/gke/main.tf delete mode 100644 infra/terraform/modules/iam/main.tf delete mode 100644 infra/terraform/modules/networking/main.tf delete mode 100644 infra/terraform/modules/redis/main.tf delete mode 100644 infra/terraform/modules/storage/main.tf delete mode 100644 landing-page/.gitignore delete mode 100644 landing-page/ATTRIBUTIONS.md delete mode 100644 landing-page/app/App.tsx delete mode 100644 landing-page/app/components/Architecture.tsx delete mode 100644 landing-page/app/components/AsciiViewer.tsx delete mode 100644 landing-page/app/components/BentoGrid.tsx delete mode 100644 landing-page/app/components/CTA.tsx delete mode 100644 landing-page/app/components/CodeExample.tsx delete mode 100644 landing-page/app/components/Features.tsx delete mode 100644 landing-page/app/components/Footer.tsx delete mode 100644 
landing-page/app/components/Hero.tsx delete mode 100644 landing-page/app/components/Installation.tsx delete mode 100644 landing-page/app/components/LandingPage.tsx delete mode 100644 landing-page/app/components/Layout.tsx delete mode 100644 landing-page/app/components/Navigation.tsx delete mode 100644 landing-page/app/components/PreScaleLogo.tsx delete mode 100644 landing-page/app/components/ProjectBadges.tsx delete mode 100644 landing-page/app/components/Stats.tsx delete mode 100644 landing-page/app/components/ThemeProvider.tsx delete mode 100644 landing-page/app/components/docs/ApiReferencePage.tsx delete mode 100644 landing-page/app/components/docs/ConfigurationPage.tsx delete mode 100644 landing-page/app/components/docs/DocsHome.tsx delete mode 100644 landing-page/app/components/docs/QuickstartPage.tsx delete mode 100644 landing-page/app/components/figma/ImageWithFallback.tsx delete mode 100644 landing-page/app/components/hooks/useInView.tsx delete mode 100644 landing-page/app/components/integrations/IntegrationPage.tsx delete mode 100644 landing-page/app/components/integrations/IntegrationsHome.tsx delete mode 100644 landing-page/app/components/landing/HowItWorks.tsx delete mode 100644 landing-page/app/components/ui/accordion.tsx delete mode 100644 landing-page/app/components/ui/alert-dialog.tsx delete mode 100644 landing-page/app/components/ui/alert.tsx delete mode 100644 landing-page/app/components/ui/aspect-ratio.tsx delete mode 100644 landing-page/app/components/ui/avatar.tsx delete mode 100644 landing-page/app/components/ui/badge.tsx delete mode 100644 landing-page/app/components/ui/breadcrumb.tsx delete mode 100644 landing-page/app/components/ui/button.tsx delete mode 100644 landing-page/app/components/ui/calendar.tsx delete mode 100644 landing-page/app/components/ui/card.tsx delete mode 100644 landing-page/app/components/ui/carousel.tsx delete mode 100644 landing-page/app/components/ui/chart.tsx delete mode 100644 
landing-page/app/components/ui/checkbox.tsx delete mode 100644 landing-page/app/components/ui/collapsible.tsx delete mode 100644 landing-page/app/components/ui/command.tsx delete mode 100644 landing-page/app/components/ui/context-menu.tsx delete mode 100644 landing-page/app/components/ui/dialog.tsx delete mode 100644 landing-page/app/components/ui/drawer.tsx delete mode 100644 landing-page/app/components/ui/dropdown-menu.tsx delete mode 100644 landing-page/app/components/ui/form.tsx delete mode 100644 landing-page/app/components/ui/hover-card.tsx delete mode 100644 landing-page/app/components/ui/input-otp.tsx delete mode 100644 landing-page/app/components/ui/input.tsx delete mode 100644 landing-page/app/components/ui/label.tsx delete mode 100644 landing-page/app/components/ui/menubar.tsx delete mode 100644 landing-page/app/components/ui/navigation-menu.tsx delete mode 100644 landing-page/app/components/ui/pagination.tsx delete mode 100644 landing-page/app/components/ui/popover.tsx delete mode 100644 landing-page/app/components/ui/progress.tsx delete mode 100644 landing-page/app/components/ui/radio-group.tsx delete mode 100644 landing-page/app/components/ui/resizable.tsx delete mode 100644 landing-page/app/components/ui/scroll-area.tsx delete mode 100644 landing-page/app/components/ui/select.tsx delete mode 100644 landing-page/app/components/ui/separator.tsx delete mode 100644 landing-page/app/components/ui/sheet.tsx delete mode 100644 landing-page/app/components/ui/sidebar.tsx delete mode 100644 landing-page/app/components/ui/skeleton.tsx delete mode 100644 landing-page/app/components/ui/slider.tsx delete mode 100644 landing-page/app/components/ui/sonner.tsx delete mode 100644 landing-page/app/components/ui/switch.tsx delete mode 100644 landing-page/app/components/ui/table.tsx delete mode 100644 landing-page/app/components/ui/tabs.tsx delete mode 100644 landing-page/app/components/ui/textarea.tsx delete mode 100644 landing-page/app/components/ui/toggle-group.tsx 
delete mode 100644 landing-page/app/components/ui/toggle.tsx delete mode 100644 landing-page/app/components/ui/tooltip.tsx delete mode 100644 landing-page/app/components/ui/use-mobile.ts delete mode 100644 landing-page/app/components/ui/utils.ts delete mode 100644 landing-page/app/main.tsx delete mode 100644 landing-page/index.html delete mode 100644 landing-page/package-lock.json delete mode 100644 landing-page/package.json delete mode 100644 landing-page/postcss.config.mjs delete mode 100644 landing-page/styles/fonts.css delete mode 100644 landing-page/styles/index.css delete mode 100644 landing-page/styles/tailwind.css delete mode 100644 landing-page/styles/theme.css delete mode 100644 landing-page/vercel.json delete mode 100644 landing-page/vite.config.ts delete mode 100644 locust-exporter/app.py delete mode 100644 locust-exporter/requirements.txt delete mode 100644 ml/__init__.py delete mode 100644 ml/artifacts/data_summary_20251231_172304.json delete mode 100644 ml/artifacts/data_summary_20251231_174808.json delete mode 100644 ml/artifacts/data_summary_20251231_184446.json delete mode 100644 ml/artifacts/data_summary_20251231_184526.json delete mode 100644 ml/artifacts/data_summary_20251231_184618.json delete mode 100644 ml/artifacts/metrics_20251231_172304.json delete mode 100644 ml/artifacts/metrics_20251231_174808.json delete mode 100644 ml/artifacts/metrics_20251231_184446.json delete mode 100644 ml/artifacts/metrics_20251231_184526.json delete mode 100644 ml/artifacts/metrics_20251231_184618.json delete mode 100644 ml/config.py delete mode 100644 ml/cost_intelligence/Dockerfile delete mode 100644 ml/cost_intelligence/__init__.py delete mode 100644 ml/cost_intelligence/app.py delete mode 100644 ml/cost_intelligence/config.py delete mode 100644 ml/cost_intelligence/cost_calculator.py delete mode 100644 ml/cost_intelligence/efficiency.py delete mode 100644 ml/cost_intelligence/forecaster.py delete mode 100644 ml/cost_intelligence/models.py delete mode 
100644 ml/cost_intelligence/requirements.txt delete mode 100644 ml/cost_intelligence/savings_analyzer.py delete mode 100644 ml/fetch_real_data.py delete mode 100644 ml/inference/Dockerfile delete mode 100644 ml/inference/__init__.py delete mode 100644 ml/inference/anomaly_detector.py delete mode 100644 ml/inference/app.py delete mode 100644 ml/inference/config.py delete mode 100644 ml/inference/db.py delete mode 100644 ml/inference/metrics.py delete mode 100644 ml/inference/model_manager.py delete mode 100644 ml/inference/models.py delete mode 100644 ml/inference/predictor.py delete mode 100644 ml/inference/recommender.py delete mode 100644 ml/inference/requirements.txt delete mode 100644 ml/inference/retrain_scheduler.py delete mode 100644 ml/inference/static/assets/index-BELjdxMU.js delete mode 100644 ml/inference/static/assets/index-CIB54x0g.css delete mode 100644 ml/inference/static/index.html delete mode 100644 ml/inference/storage/__init__.py delete mode 100644 ml/inference/storage/sqlite_backend.py delete mode 100644 ml/inference/tests/__init__.py delete mode 100644 ml/inference/tests/locustfile.py delete mode 100644 ml/inference/tests/requirements-test.txt delete mode 100644 ml/inference/tests/test_api.py delete mode 100644 ml/inference/web/README.md delete mode 100644 ml/inference/web/index.html delete mode 100644 ml/inference/web/package-lock.json delete mode 100644 ml/inference/web/package.json delete mode 100644 ml/inference/web/postcss.config.js delete mode 100644 ml/inference/web/src/App.tsx delete mode 100644 ml/inference/web/src/components/MiniChart.tsx delete mode 100644 ml/inference/web/src/context/DeploymentsContext.tsx delete mode 100644 ml/inference/web/src/context/ThemeContext.tsx delete mode 100644 ml/inference/web/src/hooks/useAgents.ts delete mode 100644 ml/inference/web/src/main.tsx delete mode 100644 ml/inference/web/src/services/api.ts delete mode 100644 ml/inference/web/src/styles/index.css delete mode 100644 
ml/inference/web/src/views/AgentInstall.tsx delete mode 100644 ml/inference/web/src/views/Agents.tsx delete mode 100644 ml/inference/web/src/views/Anomalies.tsx delete mode 100644 ml/inference/web/src/views/Dashboard.tsx delete mode 100644 ml/inference/web/src/views/Deployments.tsx delete mode 100644 ml/inference/web/src/views/Predictions.tsx delete mode 100644 ml/inference/web/tailwind.config.js delete mode 100644 ml/inference/web/test-results/.last-run.json delete mode 100644 ml/inference/web/tsconfig.json delete mode 100644 ml/inference/web/tsconfig.node.json delete mode 100644 ml/inference/web/vite.config.ts delete mode 100644 ml/models/__init__.py delete mode 100644 ml/models/anomaly_detector/1.0.0/metadata.json delete mode 100644 ml/models/anomaly_detector/1.0.0/model.pkl delete mode 100644 ml/models/baseline.py delete mode 100644 ml/models/cpu_forecaster/1.0.0/metadata.json delete mode 100644 ml/models/cpu_forecaster/1.0.0/model.pkl delete mode 100644 ml/models/memory_forecaster/1.0.0/metadata.json delete mode 100644 ml/models/memory_forecaster/1.0.0/model.pkl delete mode 100644 ml/models/prophet_model.py delete mode 100644 ml/models/xgboost_anomaly.py delete mode 100644 ml/pipeline/__init__.py delete mode 100644 ml/pipeline/cloudwatch_fetcher.py delete mode 100644 ml/pipeline/data_fetcher.py delete mode 100644 ml/pipeline/feature_engineering.py delete mode 100644 ml/pytest.ini delete mode 100644 ml/requirements.txt delete mode 100644 ml/scripts/create_demo_models.py delete mode 100644 ml/tests/__init__.py delete mode 100644 ml/tests/test_models.py delete mode 100644 ml/train.py delete mode 100644 ml/training/requirements.txt delete mode 100644 ml/training/train_models.py delete mode 100644 prescale-agent-gcp.yaml delete mode 100644 prescale-agent.yaml delete mode 100644 prescale-agent.yaml.example delete mode 100644 render.yaml delete mode 100644 scripts/check_image_modules.py delete mode 100644 scripts/setup.ps1 delete mode 100644 scripts/setup.sh delete 
mode 100644 test_retrain.ps1 diff --git a/.env.example b/.env.example deleted file mode 100644 index 11c0b37..0000000 --- a/.env.example +++ /dev/null @@ -1,45 +0,0 @@ -# Helios Configuration -# Copy this file to .env and fill in your values - -# ============================================================================= -# GCP Configuration -# ============================================================================= -GCP_PROJECT_ID=your-gcp-project-id -GCP_REGION=us-central1 -GCP_ZONE=us-central1-a - -# ============================================================================= -# Container Registry -# ============================================================================= -# Container registry path (gcr.io, us-docker.pkg.dev, etc.) -CONTAINER_REGISTRY=gcr.io/${GCP_PROJECT_ID} - -# ============================================================================= -# GCS Buckets -# ============================================================================= -MODELS_GCS_BUCKET=helios-models-${GCP_PROJECT_ID} -BILLING_DATASET=billing_export - -# ============================================================================= -# Kubernetes -# ============================================================================= -GKE_CLUSTER_NAME=helios-dev-gke -HELIOS_NAMESPACE=helios - -# ============================================================================= -# Service Versions -# ============================================================================= -HELIOS_INFERENCE_VERSION=0.2.0 -HELIOS_COST_VERSION=0.1.0 - -# ============================================================================= -# Monitoring -# ============================================================================= -PROMETHEUS_URL=http://prometheus-server.monitoring.svc.cluster.local -GRAFANA_URL=http://grafana.monitoring.svc.cluster.local - -# ============================================================================= -# Environment -# 
============================================================================= -ENVIRONMENT=development -LOG_LEVEL=info diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 78bdee6..90c5955 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,276 +2,31 @@ name: CI on: push: - branches: [main, develop] + branches: [main] pull_request: branches: [main] -env: - PYTHON_VERSION: "3.11" - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }} - jobs: - lint: - name: Lint & Type Check + test: + name: Lint & Test CLI runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 + - uses: actions/setup-python@v5 with: - python-version: ${{ env.PYTHON_VERSION }} + python-version: ${{ matrix.python-version }} - - name: Install dependencies + - name: Install run: | python -m pip install --upgrade pip - pip install ruff mypy - - - name: Run Ruff linter - run: ruff check ml/ --select=E9,F63,F7,F82 --ignore=E501 - # Only check for critical errors (syntax, undefined names) - # Full linting will be enforced incrementally - - - name: Run Ruff formatter check - run: ruff format --check ml/ - continue-on-error: true # Formatting being standardized - - - name: Run MyPy type checker - run: mypy ml/ --ignore-missing-imports - continue-on-error: true # Type hints are being added incrementally - - test-ml: - name: Test ML Models - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: ${{ env.PYTHON_VERSION }} - - - name: Cache pip dependencies - uses: actions/cache@v4 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-ml-${{ hashFiles('ml/requirements.txt') }} - restore-keys: | - ${{ runner.os }}-pip-ml- - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r ml/requirements.txt - pip install 
pytest pytest-cov - - - name: Run ML tests - run: | - cd ml - pytest tests/ -v --cov=models --cov-report=xml - continue-on-error: true - - - name: Upload coverage - uses: codecov/codecov-action@v4 - with: - file: ./ml/coverage.xml - flags: ml - fail_ci_if_error: false - - test-agent: - name: Test Agent - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: ${{ env.PYTHON_VERSION }} - - - name: Cache pip dependencies - uses: actions/cache@v4 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-agent-${{ hashFiles('agent/pyproject.toml') }} - restore-keys: | - ${{ runner.os }}-pip-agent- - - - name: Install agent - run: | - cd agent - pip install -e ".[all]" - pip install pytest pytest-cov pytest-asyncio - - - name: Run agent tests - run: | - cd agent - pytest tests/ -v --cov=src/prescale_agent --cov-report=xml - continue-on-error: true - - - name: Upload coverage - uses: codecov/codecov-action@v4 - with: - file: ./agent/coverage.xml - flags: agent - fail_ci_if_error: false - - test-cli: - name: Test CLI - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: ${{ env.PYTHON_VERSION }} - - - name: Cache pip dependencies - uses: actions/cache@v4 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-cli-${{ hashFiles('cli/pyproject.toml') }} - restore-keys: | - ${{ runner.os }}-pip-cli- - - - name: Install CLI - run: | - cd cli - pip install -e . 
- pip install pytest pytest-cov - - - name: Run CLI tests - run: | - cd cli - pytest tests/ -v --cov=src/prescale_cli --cov-report=xml - continue-on-error: true - - - name: Upload coverage - uses: codecov/codecov-action@v4 - with: - file: ./cli/coverage.xml - flags: cli - fail_ci_if_error: false - - build-inference: - name: Build Inference Service - runs-on: ubuntu-latest - needs: [lint, test-ml] - permissions: - contents: read - packages: write - steps: - - uses: actions/checkout@v4 + pip install -e "cli[dev]" ruff - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Log in to Container Registry - if: github.event_name != 'pull_request' - uses: docker/login-action@v3 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/inference - tags: | - type=ref,event=branch - type=ref,event=pr - type=sha,prefix= - type=raw,value=latest,enable={{is_default_branch}} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./ml - file: ./ml/inference/Dockerfile - push: ${{ github.event_name != 'pull_request' }} - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max - - build-cost-intelligence: - name: Build Cost Intelligence Service - runs-on: ubuntu-latest - needs: [lint, test-ml] - permissions: - contents: read - packages: write - steps: - - uses: actions/checkout@v4 + - name: Ruff + run: ruff check cli/src - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Log in to Container Registry - if: github.event_name != 'pull_request' - uses: docker/login-action@v3 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata - id: meta - uses: 
docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/cost-intelligence - tags: | - type=ref,event=branch - type=ref,event=pr - type=sha,prefix= - type=raw,value=latest,enable={{is_default_branch}} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./ml - file: ./ml/cost_intelligence/Dockerfile - push: ${{ github.event_name != 'pull_request' }} - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max - - helm-lint: - name: Lint Helm Charts - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - name: Set up Helm - uses: azure/setup-helm@v4 - with: - version: v3.14.0 - - - name: Lint Helm charts - run: | - if [ -d "charts" ]; then - helm lint charts/prescale - else - echo "Charts directory not found, skipping" - fi - - security-scan: - name: Security Scan - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - name: Run Trivy vulnerability scanner - uses: aquasecurity/trivy-action@master - with: - scan-type: 'fs' - scan-ref: '.' 
- format: 'table' - severity: 'CRITICAL,HIGH' - exit-code: '0' # Don't fail the build, just report - continue-on-error: true # Security scan is advisory + - name: Tests + run: pytest cli/tests -q diff --git a/.github/workflows/helm-pages.yml b/.github/workflows/helm-pages.yml deleted file mode 100644 index ddf141d..0000000 --- a/.github/workflows/helm-pages.yml +++ /dev/null @@ -1,64 +0,0 @@ -name: Helm Chart Pages - -on: - release: - types: [published] - workflow_dispatch: - -permissions: - contents: read - pages: write - id-token: write - -concurrency: - group: "pages" - cancel-in-progress: false - -jobs: - build: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Set up Helm - uses: azure/setup-helm@v4 - with: - version: v3.14.0 - - - name: Download chart dependencies - run: | - if [ -d "charts/prescale" ]; then - helm dependency update charts/prescale - fi - - - name: Package Helm charts - run: | - mkdir -p .helm-repo - if [ -d "charts/prescale" ]; then - helm package charts/prescale -d .helm-repo - fi - - - name: Generate Helm repo index - run: | - helm repo index .helm-repo --url https://pyjeebz.github.io/prescale - - - name: Setup Pages - uses: actions/configure-pages@v4 - - - name: Upload artifact - uses: actions/upload-pages-artifact@v3 - with: - path: '.helm-repo' - - deploy: - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} - runs-on: ubuntu-latest - needs: build - steps: - - name: Deploy to GitHub Pages - id: deployment - uses: actions/deploy-pages@v4 diff --git a/.github/workflows/publish-agent.yml b/.github/workflows/publish-agent.yml deleted file mode 100644 index 6c42b9d..0000000 --- a/.github/workflows/publish-agent.yml +++ /dev/null @@ -1,54 +0,0 @@ -name: Publish Agent to PyPI - -on: - release: - types: [published] - workflow_dispatch: - inputs: - version: - description: 'Version to publish (leave empty to use pyproject.toml version)' - required: false - -jobs: - 
build: - name: Build distribution - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - - name: Install build tools - run: python -m pip install --upgrade pip build - - - name: Build package - run: python -m build - working-directory: ./agent - - - name: Upload distribution artifacts - uses: actions/upload-artifact@v4 - with: - name: python-package-distributions - path: agent/dist/ - - publish-pypi: - name: Publish to PyPI - needs: build - runs-on: ubuntu-latest - environment: - name: prescale-agent - url: https://pypi.org/p/prescale-agent - permissions: - id-token: write # Required for trusted publishing - steps: - - name: Download distribution artifacts - uses: actions/download-artifact@v4 - with: - name: python-package-distributions - path: dist/ - - - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 9eb716e..cc01818 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -5,206 +5,43 @@ on: tags: - 'v*' -env: - PYTHON_VERSION: "3.11" - REGISTRY: ghcr.io - IMAGE_NAME: pyjeebz/prescale - jobs: - release-docker: - name: Release Docker Images + pypi: + name: Publish CLI to PyPI runs-on: ubuntu-latest permissions: + id-token: write # trusted publishing, no token needed contents: read - packages: write - strategy: - matrix: - service: - - name: inference - dockerfile: ./ml/inference/Dockerfile - context: ./ml - - name: cost-intelligence - dockerfile: ./ml/cost_intelligence/Dockerfile - context: ./ml - steps: - - uses: actions/checkout@v4 - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Log in to Container Registry - uses: docker/login-action@v3 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ 
secrets.GITHUB_TOKEN }} - - - name: Extract version from tag - id: version - run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ${{ matrix.service.context }} - file: ${{ matrix.service.dockerfile }} - platforms: linux/amd64,linux/arm64 - push: true - tags: | - ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/${{ matrix.service.name }}:${{ steps.version.outputs.VERSION }} - ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/${{ matrix.service.name }}:latest - cache-from: type=gha - cache-to: type=gha,mode=max - - release-helm: - name: Release Helm Chart - runs-on: ubuntu-latest - needs: release-docker - permissions: - contents: write steps: - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Configure Git - run: | - git config user.name "$GITHUB_ACTOR" - git config user.email "$GITHUB_ACTOR@users.noreply.github.com" - - - name: Set up Helm - uses: azure/setup-helm@v4 + - uses: actions/setup-python@v5 with: - version: v3.14.0 - - - name: Extract version from tag - id: version - run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT + python-version: "3.11" - - name: Download chart dependencies + - name: Build + working-directory: cli run: | - if [ -d "charts/prescale" ]; then - helm dependency update charts/prescale - fi - - - name: Package Helm chart - run: | - if [ -d "charts/prescale" ]; then - helm package charts/prescale --version ${{ steps.version.outputs.VERSION }} --app-version ${{ steps.version.outputs.VERSION }} - fi - - - name: Upload Helm chart as release asset - uses: softprops/action-gh-release@v1 - with: - files: prescale-*.tgz - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + python -m pip install --upgrade pip build + python -m build - release-pypi: - name: Release to PyPI - runs-on: ubuntu-latest - continue-on-error: true # Don't block release if PyPI fails - permissions: - id-token: write - contents: read - strategy: - matrix: - package: - - name: 
prescale-platform-agent - path: agent - - name: prescale-cli - path: cli - steps: - - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 with: - python-version: ${{ env.PYTHON_VERSION }} - - - name: Install build tools - run: | - python -m pip install --upgrade pip - pip install build twine hatchling - - - name: Build ${{ matrix.package.name }} - working-directory: ${{ matrix.package.path }} - run: python -m build - - - name: Publish ${{ matrix.package.name }} to PyPI - working-directory: ${{ matrix.package.path }} - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} - run: | - twine upload dist/* --skip-existing || echo "PyPI upload failed for ${{ matrix.package.name }}, continuing..." + packages-dir: cli/dist - create-release: + github-release: name: Create GitHub Release runs-on: ubuntu-latest - needs: [release-docker, release-helm] + needs: pypi permissions: contents: write steps: - uses: actions/checkout@v4 - - name: Extract version from tag - id: version - run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT - - - name: Generate changelog - id: changelog - run: | - # Get commits since last tag - LAST_TAG=$(git describe --tags --abbrev=0 HEAD^ 2>/dev/null || echo "") - if [ -n "$LAST_TAG" ]; then - CHANGELOG=$(git log $LAST_TAG..HEAD --pretty=format:"- %s (%h)" --no-merges) - else - CHANGELOG=$(git log --pretty=format:"- %s (%h)" --no-merges -20) - fi - echo "CHANGELOG<<EOF" >> $GITHUB_OUTPUT - echo "$CHANGELOG" >> $GITHUB_OUTPUT - echo "EOF" >> $GITHUB_OUTPUT - - - name: Create Release - uses: softprops/action-gh-release@v1 + - name: Create release + uses: softprops/action-gh-release@v2 with: - name: Prescale v${{ steps.version.outputs.VERSION }} - body: | - ## 🚀 Prescale v${{ steps.version.outputs.VERSION }} - - ### Installation - - **Helm:** - ```bash - helm repo add prescale https://pyjeebz.github.io/prescale - helm 
install prescale prescale/prescale --version ${{ steps.version.outputs.VERSION }} - ``` - - **Docker:** - ```bash - docker pull ghcr.io/${{ env.IMAGE_NAME }}/inference:${{ steps.version.outputs.VERSION }} - docker pull ghcr.io/${{ env.IMAGE_NAME }}/cost-intelligence:${{ steps.version.outputs.VERSION }} - ``` - - **CLI:** - ```bash - pip install prescale-cli==${{ steps.version.outputs.VERSION }} - ``` - - ### Changes - - ${{ steps.changelog.outputs.CHANGELOG }} - - ### Docker Images - - - `ghcr.io/${{ env.IMAGE_NAME }}/inference:${{ steps.version.outputs.VERSION }}` - - `ghcr.io/${{ env.IMAGE_NAME }}/cost-intelligence:${{ steps.version.outputs.VERSION }}` - - draft: false - prerelease: ${{ contains(github.ref, 'alpha') || contains(github.ref, 'beta') || contains(github.ref, 'rc') }} + generate_release_notes: true env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/Makefile b/Makefile deleted file mode 100644 index 75358b9..0000000 --- a/Makefile +++ /dev/null @@ -1,116 +0,0 @@ -# Prescale - Makefile for common operations -# Run 'make help' to see available commands - -.PHONY: help install dev up down build test clean lint - -# Default target -help: - @echo "Prescale Development Commands" - @echo "============================" - @echo "" - @echo "Local Development:" - @echo " make install - Install Python dependencies" - @echo " make dev - Start inference service locally (no Docker)" - @echo " make up - Start all services with Docker Compose" - @echo " make down - Stop Docker Compose services" - @echo " make build - Build Docker images" - @echo "" - @echo "Testing:" - @echo " make test - Run all tests" - @echo " make test-api - Test API endpoints" - @echo " make lint - Run linters" - @echo "" - @echo "Kubernetes:" - @echo " make k8s-apply - Apply Kubernetes manifests" - @echo " make k8s-delete - Delete Kubernetes resources" - @echo "" - @echo "Utilities:" - @echo " make clean - Remove build artifacts" - @echo " make agent - Run Prescale agent locally" - -# 
============================================================================ -# Local Development -# ============================================================================ - -install: - python -m pip install --upgrade pip - pip install -e ./agent[dev] - pip install -r ml/inference/requirements.txt - -dev: - cd ml && python -m uvicorn inference.app:app --host 0.0.0.0 --port 8080 --reload - -agent: - cd agent && python -m prescale_agent run --config ../prescale-agent.yaml - -# ============================================================================ -# Docker Compose -# ============================================================================ - -up: - docker compose up -d inference - -up-all: - docker compose --profile monitoring up -d - -down: - docker compose down - -build: - docker compose build - -logs: - docker compose logs -f inference - -# ============================================================================ -# Testing -# ============================================================================ - -test: - pytest ml/tests/ agent/tests/ -v - -test-api: - @echo "Testing health endpoint..." - curl -s http://localhost:8080/health | python -m json.tool - @echo "" - @echo "Testing ready endpoint..." - curl -s http://localhost:8080/ready | python -m json.tool - @echo "" - @echo "Testing predict endpoint..." 
- curl -s -X POST http://localhost:8080/predict \ - -H "Content-Type: application/json" \ - -d '{"deployment": "test", "namespace": "default", "metric": "cpu_utilization"}' | python -m json.tool - -lint: - ruff check ml/ agent/ - ruff format --check ml/ agent/ - -format: - ruff format ml/ agent/ - -# ============================================================================ -# Kubernetes -# ============================================================================ - -k8s-apply: - kubectl apply -f infra/kubernetes/prescale-inference/ - -k8s-delete: - kubectl delete -f infra/kubernetes/prescale-inference/ - -k8s-logs: - kubectl logs -n prescale -l app.kubernetes.io/name=prescale-inference -f - -k8s-status: - kubectl get pods,svc,deploy -n prescale - -# ============================================================================ -# Cleanup -# ============================================================================ - -clean: - find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true - find . -type d -name ".pytest_cache" -exec rm -rf {} + 2>/dev/null || true - find . -type d -name "*.egg-info" -exec rm -rf {} + 2>/dev/null || true - find . -type f -name "*.pyc" -delete 2>/dev/null || true - rm -rf .ruff_cache build dist diff --git a/agent/README.md b/agent/README.md deleted file mode 100644 index 7620260..0000000 --- a/agent/README.md +++ /dev/null @@ -1,107 +0,0 @@ -# Prescale Agent - -Metrics collection agent for [Prescale](https://github.com/pyjeebz/prescale) - Predictive Infrastructure Intelligence Platform. 
- -## Installation - -```bash -# Base installation (system metrics + Prometheus) -pip install prescale-agent - -# With specific backends -pip install prescale-agent[gcp] # + GCP Cloud Monitoring -pip install prescale-agent[aws] # + AWS CloudWatch -pip install prescale-agent[azure] # + Azure Monitor -pip install prescale-agent[datadog] # + Datadog -pip install prescale-agent[all] # All backends -``` - -## Quick Start - -```bash -# Generate configuration file -prescale-agent init - -# List available metric sources -prescale-agent sources - -# Test configured sources -prescale-agent test - -# Run the agent -prescale-agent run --config prescale-agent.yaml -``` - -## Configuration - -Create a `prescale-agent.yaml` file: - -```yaml -agent: - collection_interval: 60 - log_level: INFO - -sources: - # System metrics (always available) - - type: system - enabled: true - config: - collect_cpu: true - collect_memory: true - - # GCP Cloud Monitoring - - type: gcp-monitoring - enabled: true - config: - project_id: your-gcp-project - metrics: - - kubernetes.io/container/cpu/limit_utilization - - kubernetes.io/container/memory/limit_utilization - - # Prometheus - - type: prometheus - enabled: false - config: - url: http://prometheus:9090 - queries: - - name: cpu_usage - query: rate(container_cpu_usage_seconds_total[5m]) - -prescale: - endpoint: http://prescale-inference:8080 -``` - -## Supported Sources - -| Source | Description | Extra Install | -|--------|-------------|---------------| -| `system` | Local CPU, memory, disk via psutil | Built-in | -| `prometheus` | Query Prometheus server | Built-in | -| `gcp-monitoring` | GCP Cloud Monitoring | `[gcp]` | -| `cloudwatch` | AWS CloudWatch | `[aws]` | -| `azure-monitor` | Azure Monitor | `[azure]` | -| `datadog` | Datadog API | `[datadog]` | - -## CLI Commands - -```bash -prescale-agent init # Generate config file -prescale-agent run # Start collecting metrics -prescale-agent run --once # Single collection (testing) -prescale-agent 
run --deployment my-deployment # Associate with deployment -prescale-agent sources # List available sources -prescale-agent test # Test source connections -prescale-agent status # Show agent status -``` - -## Environment Variables - -| Variable | Description | -|----------|-------------| -| `PRESCALE_CONFIG_FILE` | Path to config file (default: `./prescale-agent.yaml`) | -| `PRESCALE_ENDPOINT` | Prescale inference endpoint | -| `PRESCALE_API_KEY` | API key for authentication | - -## License - -Apache 2.0 - See [LICENSE](../LICENSE) diff --git a/agent/agent.py b/agent/agent.py deleted file mode 100644 index cd01b96..0000000 --- a/agent/agent.py +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env python -"""Launcher for the packaged Prescale agent. - -This small script mirrors the previous simple-entrypoint used during -local testing. It imports the package CLI and invokes it so running -`python agent/agent.py` works from the workspace root. -""" -from prescale_agent.cli import main - - -if __name__ == "__main__": - main() diff --git a/agent/pyproject.toml b/agent/pyproject.toml deleted file mode 100644 index 2eb53c1..0000000 --- a/agent/pyproject.toml +++ /dev/null @@ -1,85 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0", "wheel"] -build-backend = "setuptools.build_meta" - -[project] -name = "prescale-agent" -version = "0.2.2" -description = "Metrics collection agent for Prescale - Predictive Infrastructure Intelligence Platform" -readme = "README.md" -license = {text = "Apache-2.0"} -requires-python = ">=3.9" -authors = [ - {name = "Prescale Platform", email = "maintainers@prescale.dev"} -] -keywords = [ - "kubernetes", - "metrics", - "monitoring", - "prometheus", - "observability" -] -classifiers = [ - "Development Status :: 4 - Beta", - "Environment :: Console", - "Intended Audience :: Developers", - "Intended Audience :: System Administrators", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - "Programming 
Language :: Python :: 3", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Topic :: System :: Monitoring", -] - -dependencies = [ - "httpx>=0.24.0", - "click>=8.0.0", - "rich>=13.0.0", - "pyyaml>=6.0", - "psutil>=5.9.0", -] - -[project.optional-dependencies] -prometheus = [ - "prometheus-client>=0.17.0", -] -datadog = [ - "datadog-api-client>=2.0.0", -] -aws = [ - "boto3>=1.26.0", -] -azure = [ - "azure-identity>=1.12.0", - "azure-mgmt-monitor>=6.0.0", -] -gcp = [ - "google-cloud-monitoring>=2.15.0", -] -kubernetes = [ - "kubernetes>=28.0.0", -] -all = [ - "prescale-agent[prometheus,datadog,aws,azure,gcp,kubernetes]", -] -dev = [ - "pytest>=7.0.0", - "pytest-asyncio>=0.21.0", - "ruff>=0.1.0", -] - -[project.scripts] -prescale-agent = "prescale_agent.cli:main" - -[project.urls] -Homepage = "https://github.com/pyjeebz/prescale" -Repository = "https://github.com/pyjeebz/prescale" - -[tool.setuptools.packages.find] -where = ["src"] - -[tool.setuptools.package-dir] -"" = "src" diff --git a/agent/pytest.ini b/agent/pytest.ini deleted file mode 100644 index 4a6926f..0000000 --- a/agent/pytest.ini +++ /dev/null @@ -1,10 +0,0 @@ -[pytest] -testpaths = tests -python_files = test_*.py -python_classes = Test* -python_functions = test_* -addopts = -v --tb=short -asyncio_mode = auto -filterwarnings = - ignore::DeprecationWarning - ignore::FutureWarning diff --git a/agent/src/prescale_agent/__init__.py b/agent/src/prescale_agent/__init__.py deleted file mode 100644 index ea41a12..0000000 --- a/agent/src/prescale_agent/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""Prescale Agent - Metrics Collection for Predictive Infrastructure Intelligence.""" - -__version__ = "0.2.0" diff --git a/agent/src/prescale_agent/agent.py b/agent/src/prescale_agent/agent.py deleted file mode 100644 index 910b9ff..0000000 --- a/agent/src/prescale_agent/agent.py +++ /dev/null @@ 
-1,245 +0,0 @@ -"""Prescale Agent - Main agent runner with unified metrics sources.""" - -import asyncio -import logging -import signal -from datetime import datetime, timezone -from typing import Optional - -from .client import PrescaleClient -from .sources import MetricsSource, MetricSample, SourceRegistry -from .config import AgentConfig - -logger = logging.getLogger(__name__) - - -class Agent: - """ - Main Prescale metrics collection agent. - - Uses a unified source interface to collect metrics from any backend: - - system: Local host metrics via psutil - - prometheus: Prometheus server - - datadog: Datadog API - - cloudwatch: AWS CloudWatch - - azure_monitor: Azure Monitor - - gcp_monitoring: Google Cloud Monitoring - """ - - def __init__(self, config: AgentConfig): - self.config = config - self.sources: list[MetricsSource] = [] - self.client: Optional[PrescaleClient] = None - self._running = False - self._paused = False - self._interval_override: Optional[int] = None # Server-controlled interval - self._metrics_buffer: list[MetricSample] = [] - self._last_flush = datetime.now(timezone.utc) - - async def setup(self): - """Initialize sources and client.""" - # Setup Prescale client - self.client = PrescaleClient( - endpoint=self.config.endpoint.url, - api_key=self.config.endpoint.api_key, - timeout=self.config.endpoint.timeout, - retry_attempts=self.config.endpoint.retry_attempts, - retry_delay=self.config.endpoint.retry_delay, - ) - - # Setup sources from config - for source_config in self.config.sources: - if not source_config.enabled: - continue - - source = SourceRegistry.create(source_config) - if source is None: - logger.warning(f"Unknown source type: {source_config.type}") - continue - - # Initialize the source - try: - if await source.initialize(): - self.sources.append(source) - logger.info(f"Initialized source: {source.name} (type: {source_config.type})") - else: - logger.warning(f"Failed to initialize source: {source.name}") - except Exception as e: 
- logger.error(f"Error initializing source {source.name}: {e}") - - logger.info(f"Agent initialized with {len(self.sources)} sources") - logger.info(f"Available source types: {SourceRegistry.list_types()}") - - async def run(self): - """Run the agent main loop.""" - self._running = True - logger.info("Starting Prescale Agent...") - - # Setup signal handlers - loop = asyncio.get_running_loop() - for sig in (signal.SIGINT, signal.SIGTERM): - try: - loop.add_signal_handler(sig, lambda: asyncio.create_task(self.stop())) - except NotImplementedError: - # Windows doesn't support add_signal_handler - pass - - # Check Prescale API health - if self.client: - healthy = await self.client.check_health() - if healthy: - logger.info(f"Connected to Prescale at {self.config.endpoint.url}") - else: - logger.warning(f"Could not connect to Prescale at {self.config.endpoint.url}") - - # Start source collection tasks - tasks = [] - for source in self.sources: - if source.is_enabled(): - task = asyncio.create_task(self._run_source(source)) - tasks.append(task) - - # Start flush task - flush_task = asyncio.create_task(self._flush_loop()) - tasks.append(flush_task) - - # Wait for all tasks - try: - await asyncio.gather(*tasks) - except asyncio.CancelledError: - logger.info("Agent tasks cancelled") - - async def stop(self): - """Stop the agent gracefully.""" - logger.info("Stopping Prescale Agent...") - self._running = False - - # Flush remaining metrics - await self._flush_metrics() - - # Close sources - for source in self.sources: - try: - await source.close() - except Exception as e: - logger.warning(f"Error closing source {source.name}: {e}") - - # Close client - if self.client: - await self.client.close() - - async def _run_source(self, source: MetricsSource): - """Run a source collection loop.""" - while self._running: - # Check if paused - if self._paused: - logger.debug(f"Agent paused, skipping collection from {source.name}") - await asyncio.sleep(5) # Check pause state every 5s - 
continue - - try: - result = await source.collect() - - if result.success: - # Convert source metrics to buffer format - self._metrics_buffer.extend(result.metrics) - logger.debug( - f"Collected {len(result.metrics)} metrics from {source.name} " - f"in {result.duration_ms:.1f}ms" - ) - else: - logger.warning(f"Source {source.name} error: {result.error}") - - except Exception as e: - logger.error(f"Error running source {source.name}: {e}") - - # Use server-controlled interval if available, else source default - interval = self._interval_override or source.config.interval - await asyncio.sleep(interval) - - async def _flush_loop(self): - """Periodically flush metrics to Prescale.""" - while self._running: - await asyncio.sleep(self.config.flush_interval) - await self._flush_metrics() - - async def _flush_metrics(self): - """Flush buffered metrics to Prescale.""" - if not self._metrics_buffer: - return - - if not self.client: - logger.warning("No Prescale client configured, discarding metrics") - self._metrics_buffer.clear() - return - - # Get metrics to send - metrics_to_send = self._metrics_buffer[:self.config.batch_size] - self._metrics_buffer = self._metrics_buffer[self.config.batch_size:] - - # Send metrics - result = await self.client.send_metrics(metrics_to_send) - - if result is not None: - logger.info(f"Sent {len(metrics_to_send)} metrics to Prescale") - - # Apply control commands from server - commands = result.get("commands") - if commands: - self._apply_commands(commands) - else: - # Re-add failed metrics to buffer (at the front) - self._metrics_buffer = metrics_to_send + self._metrics_buffer - # Trim buffer if too large - max_buffer = self.config.batch_size * 10 - if len(self._metrics_buffer) > max_buffer: - dropped = len(self._metrics_buffer) - max_buffer - self._metrics_buffer = self._metrics_buffer[:max_buffer] - logger.warning(f"Buffer full, dropped {dropped} oldest metrics") - - def _apply_commands(self, commands: dict): - """Apply control commands 
received from the server.""" - if "paused" in commands: - new_paused = bool(commands["paused"]) - if new_paused != self._paused: - self._paused = new_paused - state = "PAUSED" if new_paused else "RESUMED" - logger.info(f"Agent {state} by server command") - - if "collection_interval" in commands: - new_interval = int(commands["collection_interval"]) - if new_interval != self._interval_override: - old = self._interval_override or "default" - self._interval_override = new_interval - logger.info(f"Collection interval changed: {old} -> {new_interval}s") - - async def collect_once(self) -> list[MetricSample]: - """Run all sources once and return metrics.""" - all_metrics = [] - - for source in self.sources: - if source.is_enabled(): - result = await source.collect() - if result.success: - all_metrics.extend(result.metrics) - - return all_metrics - - async def health_check(self) -> dict: - """Check health of all sources and client.""" - status = { - "sources": {}, - "client": False, - "metrics_buffered": len(self._metrics_buffer), - } - - for source in self.sources: - try: - status["sources"][source.name] = await source.health_check() - except Exception: - status["sources"][source.name] = False - - if self.client: - status["client"] = await self.client.check_health() - - return status diff --git a/agent/src/prescale_agent/cli.py b/agent/src/prescale_agent/cli.py deleted file mode 100644 index b035ff8..0000000 --- a/agent/src/prescale_agent/cli.py +++ /dev/null @@ -1,384 +0,0 @@ -"""Prescale Agent CLI - Unified metrics collection.""" - -import asyncio -import logging -import sys -from pathlib import Path -from typing import Optional - -import click -from rich.console import Console -from rich.table import Table -from rich.panel import Panel - -from . 
import __version__ -from .agent import Agent -from .config import load_config, AgentConfig -from .sources import SourceRegistry, list_sources - -console = Console() - - -def setup_logging(level: str): - """Configure logging.""" - logging.basicConfig( - level=getattr(logging, level.upper()), - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", - handlers=[logging.StreamHandler()], - ) - - -@click.group() -@click.version_option(version=__version__, prog_name="prescale-agent") -def main(): - """Prescale Agent - Unified metrics collection for predictive infrastructure intelligence.""" - pass - - -@main.command() -@click.option("--config", "-c", "config_path", help="Path to config file") -@click.option("--endpoint", "-e", envvar="PRESCALE_ENDPOINT", help="Prescale API endpoint") -@click.option("--deployment", "-d", envvar="PRESCALE_DEPLOYMENT", help="Deployment ID to send metrics to") -@click.option("--api-key", envvar="PRESCALE_API_KEY", help="Prescale API key") -@click.option("--interval", default=15, help="Collection interval in seconds") -@click.option("--log-level", default="INFO", help="Log level") -@click.option("--once", is_flag=True, help="Collect once and exit") -def run( - config_path: Optional[str], - endpoint: Optional[str], - deployment: Optional[str], - api_key: Optional[str], - interval: int, - log_level: str, - once: bool, -): - """Run the Prescale metrics collection agent.""" - setup_logging(log_level) - - # Load config - config = load_config(config_path) - - # Override with CLI options - if endpoint: - config.endpoint.url = endpoint - if api_key: - config.endpoint.api_key = api_key - if deployment: - # Add deployment label to all sources - for source in config.sources: - source.labels["deployment"] = deployment - if interval: - for source in config.sources: - source.interval = interval - - config.log_level = log_level - - # Create and run agent - agent = Agent(config) - - async def _run(): - await agent.setup() - - if once: - metrics = 
await agent.collect_once() - _display_metrics(metrics) - else: - source_names = [s.name for s in agent.sources] - console.print(Panel( - f"[bold green]Prescale Agent v{__version__}[/bold green]\n" - f"Endpoint: {config.endpoint.url}\n" - f"Sources: {', '.join(source_names)}\n" - f"Interval: {interval}s", - title="Starting", - )) - - try: - await agent.run() - except KeyboardInterrupt: - console.print("\n[yellow]Shutting down...[/yellow]") - await agent.stop() - - asyncio.run(_run()) - - -def _display_metrics(metrics: list): - """Display collected metrics in a table.""" - if not metrics: - console.print("[yellow]No metrics collected[/yellow]") - return - - table = Table(title=f"Collected {len(metrics)} Metrics", show_lines=True) - table.add_column("Name", style="cyan") - table.add_column("Value", justify="right") - table.add_column("Source", style="green") - table.add_column("Labels", style="dim") - - for metric in metrics[:50]: # Show first 50 - labels_str = ", ".join(f"{k}={v}" for k, v in list(metric.labels.items())[:3]) - if len(metric.labels) > 3: - labels_str += f" (+{len(metric.labels) - 3})" - - # Format value - if metric.value >= 1_000_000_000: - value_str = f"{metric.value / 1_000_000_000:.2f}G" - elif metric.value >= 1_000_000: - value_str = f"{metric.value / 1_000_000:.2f}M" - elif metric.value >= 1_000: - value_str = f"{metric.value / 1_000:.2f}K" - elif metric.value < 1: - value_str = f"{metric.value * 100:.1f}%" - else: - value_str = f"{metric.value:.4f}" - - table.add_row( - metric.name, - value_str, - metric.source, - labels_str, - ) - - console.print(table) - - if len(metrics) > 50: - console.print(f"[dim]... 
and {len(metrics) - 50} more metrics[/dim]") - - -@main.command() -@click.option("--config", "-c", "config_path", help="Path to config file") -def test(config_path: Optional[str]): - """Test configured sources without sending data.""" - config = load_config(config_path) - - console.print("[bold]Testing metrics sources...[/bold]\n") - console.print(f"Available source types: {', '.join(list_sources())}\n") - - async def _test(): - from .sources import SourceRegistry - - for source_config in config.sources: - console.print(f"[cyan]{source_config.type.upper()} Source ({source_config.name}):[/cyan]") - - source = SourceRegistry.create(source_config) - if source is None: - console.print(f" [red]x Unknown source type: {source_config.type}[/red]") - continue - - # Initialize - try: - if not await source.initialize(): - console.print(" [red]x Failed to initialize[/red]") - continue - except Exception as e: - console.print(f" [red]x Init error: {e}[/red]") - continue - - # Health check - healthy = await source.health_check() - if healthy: - console.print(" [green]+ Health check passed[/green]") - else: - console.print(" [yellow]! Health check failed (may still work)[/yellow]") - - # Collect - try: - result = await source.collect() - - if result.success: - console.print(f" [green]+ Collected {len(result.metrics)} metrics in {result.duration_ms:.1f}ms[/green]") - - # Show sample metrics - if result.metrics: - for m in result.metrics[:3]: - if m.value < 1: - console.print(f" - {m.name}: {m.value*100:.1f}%") - else: - console.print(f" - {m.name}: {m.value:.2f}") - if len(result.metrics) > 3: - console.print(f" ... 
and {len(result.metrics) - 3} more") - else: - console.print(f" [red]x Collection failed: {result.error}[/red]") - except Exception as e: - console.print(f" [red]x Collection error: {e}[/red]") - finally: - await source.close() - - console.print() - - # Test Prescale connection - console.print(f"[cyan]Prescale API ({config.endpoint.url}):[/cyan]") - from .client import PrescaleClient - - client = PrescaleClient( - endpoint=config.endpoint.url, - api_key=config.endpoint.api_key, - ) - try: - healthy = await client.check_health() - if healthy: - console.print(" [green]+ Connected successfully[/green]") - else: - console.print(" [red]x Could not connect[/red]") - finally: - await client.close() - - asyncio.run(_test()) - - -@main.command() -def sources(): - """List available metrics source types.""" - console.print("[bold]Available Metrics Sources:[/bold]\n") - - table = Table(show_header=True) - table.add_column("Type", style="cyan") - table.add_column("Description") - table.add_column("Required Credentials", style="dim") - - source_info = { - "system": ("Local system metrics via psutil", "None"), - "prometheus": ("Prometheus server scraping", "endpoint"), - "datadog": ("Datadog API", "api_key, app_key"), - "cloudwatch": ("AWS CloudWatch", "aws_access_key_id, aws_secret_access_key, region"), - "azure_monitor": ("Azure Monitor", "tenant_id, client_id, client_secret, subscription_id"), - "gcp_monitoring": ("Google Cloud Monitoring", "project_id, credentials_file (optional)"), - } - - registered = list_sources() - - for source_type, (desc, creds) in source_info.items(): - status = "[green]+" if source_type in registered else "[red]x" - table.add_row(f"{status} {source_type}", desc, creds) - - console.print(table) - console.print("\n[dim]+ = registered and available, x = not registered (missing dependencies)[/dim]") - - -@main.command() -@click.option("--output", "-o", type=click.Path(), help="Output file path") -def init(output: Optional[str]): - """Generate a sample 
configuration file.""" - sample_config = """# Prescale Agent Configuration -# Unified metrics collection from any monitoring backend - -# Prescale API endpoint -endpoint: - url: http://localhost:8000 - api_key: ${PRESCALE_API_KEY} # Or set via environment - timeout: 30 - retry_attempts: 3 - -# Metrics sources - add any combination of backends -sources: - # Local system metrics (always available) - - name: local-system - type: system - enabled: true - interval: 15 - options: - collect_cpu: true - collect_memory: true - collect_disk: true - collect_network: true - - # Prometheus (if you have a Prometheus server) - # - name: prometheus - # type: prometheus - # enabled: false - # endpoint: http://prometheus:9090 - # interval: 15 - # queries: - # - 'sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (namespace, pod)' - # - 'sum(container_memory_usage_bytes{container!=""}) by (namespace, pod)' - - # Datadog (requires API key) - # - name: datadog - # type: datadog - # enabled: false - # api_key: ${DATADOG_API_KEY} - # credentials: - # app_key: ${DATADOG_APP_KEY} - # options: - # site: us1 # us1, us3, us5, eu1, ap1 - # metrics: - # - system.cpu.user - # - system.mem.used - - # AWS CloudWatch (requires AWS credentials) - # - name: cloudwatch - # type: cloudwatch - # enabled: false - # credentials: - # aws_access_key_id: ${AWS_ACCESS_KEY_ID} - # aws_secret_access_key: ${AWS_SECRET_ACCESS_KEY} - # region: us-east-1 - # metrics: - # - AWS/EC2/CPUUtilization - # - AWS/RDS/DatabaseConnections - - # Azure Monitor (requires service principal) - # - name: azure-monitor - # type: azure_monitor - # enabled: false - # credentials: - # tenant_id: ${AZURE_TENANT_ID} - # client_id: ${AZURE_CLIENT_ID} - # client_secret: ${AZURE_CLIENT_SECRET} - # subscription_id: ${AZURE_SUBSCRIPTION_ID} - - # Google Cloud Monitoring (requires service account) - # - name: gcp-monitoring - # type: gcp_monitoring - # enabled: false - # credentials: - # project_id: ${GCP_PROJECT_ID} - # 
credentials_file: /path/to/service-account.json - -# Agent settings -batch_size: 100 -flush_interval: 10 -log_level: INFO -""" - - output_path = output or "prescale-agent.yaml" - - with open(output_path, "w") as f: - f.write(sample_config) - - console.print(f"[green]+ Created config file: {output_path}[/green]") - console.print("\nEdit the file to enable your monitoring sources, then run:") - console.print(f" [cyan]prescale-agent run -c {output_path}[/cyan]") - - -@main.command() -def status(): - """Show agent status and system info.""" - import psutil - import platform - - console.print(Panel( - f"[bold]Prescale Agent v{__version__}[/bold]", - title="Status", - )) - - # System info - table = Table(title="System Information", show_header=False) - table.add_column("Property", style="cyan") - table.add_column("Value") - - table.add_row("Platform", platform.platform()) - table.add_row("Python", platform.python_version()) - table.add_row("CPU Cores", str(psutil.cpu_count())) - table.add_row("CPU Usage", f"{psutil.cpu_percent():.1f}%") - - mem = psutil.virtual_memory() - table.add_row("Memory Total", f"{mem.total / 1024**3:.1f} GB") - table.add_row("Memory Used", f"{mem.percent:.1f}%") - - console.print(table) - - # Available sources - console.print(f"\n[bold]Available Source Types:[/bold] {', '.join(list_sources())}") - - -if __name__ == "__main__": - main() diff --git a/agent/src/prescale_agent/client.py b/agent/src/prescale_agent/client.py deleted file mode 100644 index 6876b04..0000000 --- a/agent/src/prescale_agent/client.py +++ /dev/null @@ -1,199 +0,0 @@ -"""Prescale API client for sending metrics.""" - -import asyncio -import logging -from datetime import datetime -from typing import Optional, TYPE_CHECKING - -import httpx - -if TYPE_CHECKING: - from .sources.base import MetricSample - -logger = logging.getLogger(__name__) - - -class PrescaleClient: - """Client for sending metrics to Prescale API.""" - - def __init__( - self, - endpoint: str = 
"http://localhost:8000", - api_key: Optional[str] = None, - timeout: int = 30, - retry_attempts: int = 3, - retry_delay: float = 1.0, - ): - self.endpoint = endpoint.rstrip("/") - self.api_key = api_key - self.timeout = timeout - self.retry_attempts = retry_attempts - self.retry_delay = retry_delay - self._client: Optional[httpx.AsyncClient] = None - - def _get_headers(self) -> dict: - """Get request headers.""" - headers = { - "Content-Type": "application/json", - "User-Agent": "prescale-agent/0.1.0", - } - if self.api_key: - headers["Authorization"] = f"Bearer {self.api_key}" - headers["X-API-Key"] = self.api_key - return headers - - async def _get_client(self) -> httpx.AsyncClient: - """Get or create HTTP client.""" - if self._client is None or self._client.is_closed: - self._client = httpx.AsyncClient( - timeout=self.timeout, - headers=self._get_headers(), - ) - return self._client - - async def send_metrics(self, metrics: list) -> Optional[dict]: - """Send metrics to Prescale API. - - Args: - metrics: List of metric samples to send - - Returns: - Response dict with 'received' and optional 'commands', or None on failure - """ - if not metrics: - return {"received": 0} - - payload = { - "metrics": [m.to_dict() for m in metrics], - "agent_version": "0.1.0", - "sent_at": datetime.utcnow().isoformat(), - } - - client = await self._get_client() - - for attempt in range(self.retry_attempts): - try: - response = await client.post( - f"{self.endpoint}/api/v1/ingest", - json=payload, - ) - - if response.status_code == 200: - logger.debug(f"Successfully sent {len(metrics)} metrics") - return response.json() - elif response.status_code == 401: - logger.error("Authentication failed - check API key") - return None - elif response.status_code == 429: - # Rate limited - retry_after = float(response.headers.get("Retry-After", self.retry_delay * 2)) - logger.warning(f"Rate limited, waiting {retry_after}s") - await asyncio.sleep(retry_after) - else: - logger.warning(f"Failed to 
send metrics: {response.status_code}") - - except httpx.TimeoutException: - logger.warning(f"Timeout sending metrics (attempt {attempt + 1})") - except httpx.HTTPError as e: - logger.warning(f"HTTP error sending metrics: {e}") - - if attempt < self.retry_attempts - 1: - await asyncio.sleep(self.retry_delay * (attempt + 1)) - - logger.error(f"Failed to send {len(metrics)} metrics after {self.retry_attempts} attempts") - return None - - async def check_health(self) -> bool: - """Check if Prescale API is healthy.""" - try: - client = await self._get_client() - response = await client.get(f"{self.endpoint}/health") - return response.status_code == 200 - except Exception: - return False - - async def get_predictions( - self, - deployment: str, - namespace: str = "default", - metric: str = "cpu_utilization", - horizon_hours: int = 24, - ) -> Optional[dict]: - """Get predictions from Prescale API. - - Args: - deployment: Deployment name - namespace: Kubernetes namespace - metric: Metric type to predict - horizon_hours: Prediction horizon - - Returns: - Prediction response or None if failed - """ - try: - client = await self._get_client() - response = await client.post( - f"{self.endpoint}/predict", - json={ - "metric": metric, - "deployment": deployment, - "namespace": namespace, - "horizon_hours": horizon_hours, - }, - ) - - if response.status_code == 200: - return response.json() - else: - logger.warning(f"Failed to get predictions: {response.status_code}") - return None - - except Exception as e: - logger.error(f"Error getting predictions: {e}") - return None - - async def detect_anomalies( - self, - deployment: str, - namespace: str = "default", - metrics: Optional[list[str]] = None, - ) -> Optional[dict]: - """Detect anomalies using Prescale API. 
- - Args: - deployment: Deployment name - namespace: Kubernetes namespace - metrics: List of metric types to analyze - - Returns: - Anomaly detection response or None if failed - """ - try: - client = await self._get_client() - payload = { - "deployment": deployment, - "namespace": namespace, - } - if metrics: - payload["metrics"] = metrics - - response = await client.post( - f"{self.endpoint}/detect", - json=payload, - ) - - if response.status_code == 200: - return response.json() - else: - logger.warning(f"Failed to detect anomalies: {response.status_code}") - return None - - except Exception as e: - logger.error(f"Error detecting anomalies: {e}") - return None - - async def close(self): - """Close the HTTP client.""" - if self._client: - await self._client.aclose() - self._client = None diff --git a/agent/src/prescale_agent/config.py b/agent/src/prescale_agent/config.py deleted file mode 100644 index 4382bb5..0000000 --- a/agent/src/prescale_agent/config.py +++ /dev/null @@ -1,189 +0,0 @@ -"""Configuration management for Prescale Agent.""" - -import os -from dataclasses import dataclass, field -from pathlib import Path -from typing import Any, Optional - -import yaml - -from .sources import SourceConfig - - -@dataclass -class PrescaleEndpoint: - """Prescale API endpoint configuration.""" - - url: str = "http://104.155.137.61" - api_key: Optional[str] = None - timeout: int = 30 - retry_attempts: int = 3 - retry_delay: float = 1.0 - - -@dataclass -class AgentConfig: - """Main agent configuration.""" - - # Prescale endpoint - endpoint: PrescaleEndpoint = field(default_factory=PrescaleEndpoint) - - # Metrics sources (unified) - sources: list[SourceConfig] = field(default_factory=list) - - # Agent settings - batch_size: int = 100 - flush_interval: int = 10 # seconds - log_level: str = "INFO" - - @classmethod - def from_file(cls, path: str | Path) -> "AgentConfig": - """Load configuration from YAML file.""" - with open(path) as f: - data = yaml.safe_load(f) - return 
cls.from_dict(data or {}) - - @classmethod - def from_dict(cls, data: dict) -> "AgentConfig": - """Create config from dictionary.""" - config = cls() - - # Endpoint - if "endpoint" in data: - ep = data["endpoint"] - config.endpoint = PrescaleEndpoint( - url=ep.get("url", config.endpoint.url), - api_key=ep.get("api_key") or os.environ.get("PRESCALE_API_KEY"), - timeout=ep.get("timeout", config.endpoint.timeout), - retry_attempts=ep.get("retry_attempts", config.endpoint.retry_attempts), - retry_delay=ep.get("retry_delay", config.endpoint.retry_delay), - ) - - # Override API key from env - if not config.endpoint.api_key: - config.endpoint.api_key = os.environ.get("PRESCALE_API_KEY") - - # Override URL from env - if os.environ.get("PRESCALE_ENDPOINT"): - config.endpoint.url = os.environ["PRESCALE_ENDPOINT"] - - # Parse sources - config.sources = [] - for source_data in data.get("sources", []): - # Ensure list types - metrics = source_data.get("metrics", []) - if isinstance(metrics, str): - metrics = [metrics] - - queries = source_data.get("queries", []) - if isinstance(queries, str): - queries = [queries] - - # Merge top-level project_id into credentials - credentials = source_data.get("credentials", {}).copy() - if "project_id" in source_data: - credentials["project_id"] = source_data["project_id"] - - source_config = SourceConfig( - name=source_data.get("name", source_data.get("type", "unknown")), - type=source_data.get("type"), - enabled=source_data.get("enabled", True), - interval=source_data.get("interval", 15), - endpoint=source_data.get("endpoint"), - api_key=source_data.get("api_key") or os.environ.get(f"{source_data.get('type', '').upper()}_API_KEY"), - credentials=credentials, - queries=queries, - metrics=metrics, - namespaces=source_data.get("namespaces", []), - labels=source_data.get("labels", {}), - options=source_data.get("options", {}), - ) - config.sources.append(source_config) - - # Default to system source if no sources configured - if not 
config.sources: - config.sources.append(SourceConfig( - name="local-system", - type="system", - enabled=True, - interval=15, - options={ - "collect_cpu": True, - "collect_memory": True, - "collect_disk": True, - "collect_network": True, - }, - )) - - # Agent settings - config.batch_size = data.get("batch_size", config.batch_size) - config.flush_interval = data.get("flush_interval", config.flush_interval) - config.log_level = data.get("log_level", config.log_level) - - return config - - @classmethod - def from_env(cls) -> "AgentConfig": - """Create config from environment variables.""" - config = cls() - - # Endpoint - config.endpoint.url = os.environ.get("PRESCALE_ENDPOINT", config.endpoint.url) - config.endpoint.api_key = os.environ.get("PRESCALE_API_KEY") - - # Default system source - config.sources = [ - SourceConfig( - name="local-system", - type="system", - enabled=True, - interval=15, - ) - ] - - # Add Prometheus if configured - prom_url = os.environ.get("PROMETHEUS_URL") - if prom_url: - config.sources.append(SourceConfig( - name="prometheus", - type="prometheus", - enabled=True, - endpoint=prom_url, - interval=15, - )) - - # Add Datadog if configured - dd_api_key = os.environ.get("DATADOG_API_KEY") - if dd_api_key: - config.sources.append(SourceConfig( - name="datadog", - type="datadog", - enabled=True, - api_key=dd_api_key, - credentials={"app_key": os.environ.get("DATADOG_APP_KEY", "")}, - interval=15, - )) - - return config - - -def load_config(config_path: Optional[str] = None) -> AgentConfig: - """Load configuration from file or environment.""" - # Try config file first - if config_path and Path(config_path).exists(): - return AgentConfig.from_file(config_path) - - # Try default locations - default_paths = [ - Path("prescale-agent.yaml"), - Path("prescale-agent.yml"), - Path.home() / ".prescale" / "agent.yaml", - Path("/etc/prescale/agent.yaml"), - ] - - for path in default_paths: - if path.exists(): - return AgentConfig.from_file(path) - - # Fall back 
to environment - return AgentConfig.from_env() diff --git a/agent/src/prescale_agent/sources/__init__.py b/agent/src/prescale_agent/sources/__init__.py deleted file mode 100644 index f38d1bf..0000000 --- a/agent/src/prescale_agent/sources/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -"""Metrics sources - pluggable backends for collecting metrics.""" - -from .base import MetricsSource, MetricSample, MetricType, SourceConfig -from .registry import SourceRegistry, get_source, register_source, list_sources - -__all__ = [ - "MetricsSource", - "MetricSample", - "MetricType", - "SourceConfig", - "SourceRegistry", - "get_source", - "register_source", - "list_sources", -] diff --git a/agent/src/prescale_agent/sources/azure_monitor.py b/agent/src/prescale_agent/sources/azure_monitor.py deleted file mode 100644 index a38fcd9..0000000 --- a/agent/src/prescale_agent/sources/azure_monitor.py +++ /dev/null @@ -1,190 +0,0 @@ -"""Azure Monitor metrics source.""" - -import time -from datetime import datetime, timezone, timedelta -import logging - -from .base import MetricsSource, MetricSample, MetricType, SourceConfig, CollectionResult -from .registry import register_source - -logger = logging.getLogger(__name__) - - -@register_source("azure_monitor") -class AzureMonitorSource(MetricsSource): - """ - Collect metrics from Azure Monitor. 
- - Config: - credentials: - tenant_id: str - Azure AD tenant ID - client_id: str - Service principal client ID - client_secret: str - Service principal secret - subscription_id: str - Azure subscription ID - - metrics: list[str] - Metric specs in format "ResourceType/MetricName" - e.g., ["Microsoft.Compute/virtualMachines/Percentage CPU"] - - Config options: - resource_group: str - Filter by resource group - timespan: str - ISO8601 duration (default: PT5M) - interval: str - Aggregation interval (default: PT1M) - """ - - source_type = "azure_monitor" - - def __init__(self, config: SourceConfig): - super().__init__(config) - self._credential = None - self._client = None - - async def initialize(self) -> bool: - """Initialize Azure Monitor client.""" - try: - from azure.identity import ClientSecretCredential - from azure.mgmt.monitor import MonitorManagementClient - - self._credential = ClientSecretCredential( - tenant_id=self.config.credentials.get("tenant_id"), - client_id=self.config.credentials.get("client_id"), - client_secret=self.config.credentials.get("client_secret"), - ) - - self._client = MonitorManagementClient( - credential=self._credential, - subscription_id=self.config.credentials.get("subscription_id"), - ) - - self._initialized = True - return True - - except ImportError: - logger.error("Azure SDK required: pip install azure-identity azure-mgmt-monitor") - return False - except Exception as e: - logger.error(f"Failed to initialize Azure Monitor: {e}") - return False - - async def health_check(self) -> bool: - """Check Azure Monitor connectivity.""" - if not self._client: - return False - - try: - # Try to list metric definitions for a known resource type - list(self._client.metric_definitions.list( - resource_uri=f"/subscriptions/{self.config.credentials.get('subscription_id')}", - )) - return True - except Exception: - return False - - async def collect(self) -> CollectionResult: - """Query Azure Monitor metrics.""" - if not self._client: - await 
self.initialize() - - start = time.time() - metrics = [] - - metric_specs = self.config.metrics or self.get_default_queries() - - try: - timespan = self.config.options.get("timespan", "PT5M") - interval = self.config.options.get("interval", "PT1M") - - for spec in metric_specs: - result = self._query_metric(spec, timespan, interval) - metrics.extend(result) - - return CollectionResult( - source=self.name, - success=True, - metrics=metrics, - duration_ms=(time.time() - start) * 1000, - ) - - except Exception as e: - logger.error(f"Error collecting Azure Monitor metrics: {e}") - return CollectionResult( - source=self.name, - success=False, - error=str(e), - duration_ms=(time.time() - start) * 1000, - ) - - def _query_metric( - self, - spec: str, - timespan: str, - interval: str, - ) -> list[MetricSample]: - """Query a single Azure Monitor metric.""" - metrics = [] - - # Parse spec: "ResourceUri/MetricName" or simplified format - # For simplicity, we'll query all resources of a type - - try: - # This is a simplified implementation - # In production, you'd iterate over resources - response = self._client.metrics.list( - resource_uri=spec, - timespan=timespan, - interval=interval, - metricnames=None, # All metrics - aggregation="Average", - ) - - for metric in response.value: - for ts in metric.timeseries: - for data in ts.data: - if data.average is not None: - # Build labels from dimensions - labels = {} - if ts.metadatavalues: - for mv in ts.metadatavalues: - labels[mv.name.value] = mv.value - - normalized = self._normalize_metric_name(metric.name.value) - - metrics.append(MetricSample( - name=normalized, - value=data.average, - timestamp=data.time_stamp.replace(tzinfo=timezone.utc), - metric_type=MetricType.GAUGE, - labels=labels, - source=self.name, - )) - break # Take first (most recent) value - - except Exception as e: - logger.warning(f"Failed to query Azure metric '{spec}': {e}") - - return metrics - - def _normalize_metric_name(self, name: str) -> str: - 
"""Convert Azure metric name to standard format.""" - # Percentage CPU -> cpu_percentage - import re - name = re.sub(r'(? list[str]: - """Azure requires service principal credentials.""" - return ["tenant_id", "client_id", "client_secret", "subscription_id"] - - @classmethod - def get_default_queries(cls) -> list[str]: - """Default Azure metrics.""" - return [ - # These would be resource URIs in production - "Percentage CPU", - "Available Memory Bytes", - "Disk Read Bytes", - "Disk Write Bytes", - "Network In Total", - "Network Out Total", - ] diff --git a/agent/src/prescale_agent/sources/base.py b/agent/src/prescale_agent/sources/base.py deleted file mode 100644 index a4d959c..0000000 --- a/agent/src/prescale_agent/sources/base.py +++ /dev/null @@ -1,156 +0,0 @@ -"""Base interface for all metrics sources.""" - -from abc import ABC, abstractmethod -from dataclasses import dataclass, field -from datetime import datetime -from enum import Enum -from typing import Any, Optional - - -class MetricType(str, Enum): - """Types of metrics.""" - GAUGE = "gauge" - COUNTER = "counter" - HISTOGRAM = "histogram" - SUMMARY = "summary" - - -@dataclass -class MetricSample: - """A single metric sample from any source.""" - - name: str - value: float - timestamp: datetime - metric_type: MetricType = MetricType.GAUGE - labels: dict[str, str] = field(default_factory=dict) - source: str = "" # Which source collected this - - def to_dict(self) -> dict[str, Any]: - """Convert to dictionary for API submission.""" - return { - "name": self.name, - "value": self.value, - "timestamp": self.timestamp.isoformat(), - "type": self.metric_type.value, - "labels": self.labels, - "source": self.source, - } - - -@dataclass -class SourceConfig: - """Configuration for a metrics source.""" - - name: str - type: str # e.g., "prometheus", "datadog", "cloudwatch" - enabled: bool = True - interval: int = 15 # Collection interval in seconds - - # Connection settings - endpoint: Optional[str] = None - api_key: 
Optional[str] = None - credentials: dict[str, Any] = field(default_factory=dict) - - # Query configuration - queries: list[str] = field(default_factory=list) - metrics: list[str] = field(default_factory=list) - - # Filters - namespaces: list[str] = field(default_factory=list) - labels: dict[str, str] = field(default_factory=dict) - - # Extra options specific to the source type - options: dict[str, Any] = field(default_factory=dict) - - -@dataclass -class CollectionResult: - """Result from a collection operation.""" - - source: str - success: bool - metrics: list[MetricSample] = field(default_factory=list) - error: Optional[str] = None - duration_ms: float = 0.0 - timestamp: datetime = field(default_factory=datetime.utcnow) - - -class MetricsSource(ABC): - """ - Abstract base class for all metrics sources. - - Implement this interface to add support for a new monitoring backend. - - Example: - class DatadogSource(MetricsSource): - source_type = "datadog" - - async def collect(self) -> CollectionResult: - # Fetch metrics from Datadog API - ... - """ - - # Override in subclass - used for registration - source_type: str = "base" - - def __init__(self, config: SourceConfig): - self.config = config - self.name = config.name - self._initialized = False - - @abstractmethod - async def initialize(self) -> bool: - """ - Initialize the source (connect, authenticate, etc.). - - Returns: - True if initialization succeeded - """ - pass - - @abstractmethod - async def collect(self) -> CollectionResult: - """ - Collect metrics from this source. - - Returns: - CollectionResult with metrics or error - """ - pass - - @abstractmethod - async def health_check(self) -> bool: - """ - Check if the source is healthy and reachable. - - Returns: - True if source is healthy - """ - pass - - async def close(self): - """Clean up resources. 
Override if needed.""" - pass - - def is_enabled(self) -> bool: - """Check if this source is enabled.""" - return self.config.enabled - - @classmethod - def get_required_credentials(cls) -> list[str]: - """ - Return list of required credential fields for this source. - - Override to specify required credentials like api_key, secret, etc. - """ - return [] - - @classmethod - def get_default_queries(cls) -> list[str]: - """ - Return default queries for this source type. - - Override to provide sensible defaults. - """ - return [] diff --git a/agent/src/prescale_agent/sources/cloudwatch.py b/agent/src/prescale_agent/sources/cloudwatch.py deleted file mode 100644 index 7a92419..0000000 --- a/agent/src/prescale_agent/sources/cloudwatch.py +++ /dev/null @@ -1,209 +0,0 @@ -"""AWS CloudWatch metrics source.""" - -import time -from datetime import datetime, timezone, timedelta -import logging - -from .base import MetricsSource, MetricSample, MetricType, SourceConfig, CollectionResult -from .registry import register_source - -logger = logging.getLogger(__name__) - - -@register_source("cloudwatch") -class CloudWatchSource(MetricsSource): - """ - Collect metrics from AWS CloudWatch. 
- - Config: - credentials: - aws_access_key_id: str - aws_secret_access_key: str - region: str - AWS region (default: us-east-1) - - metrics: list[str] - Metric specs in format "Namespace/MetricName" - e.g., ["AWS/EC2/CPUUtilization", "AWS/RDS/DatabaseConnections"] - - Config options: - period: int - CloudWatch period in seconds (default: 300) - lookback_minutes: int - How far back to query (default: 10) - dimensions: dict - Dimensions to filter by - """ - - source_type = "cloudwatch" - - def __init__(self, config: SourceConfig): - super().__init__(config) - self._client = None - - async def initialize(self) -> bool: - """Initialize AWS CloudWatch client.""" - try: - import boto3 - - region = self.config.credentials.get("region", "us-east-1") - - self._client = boto3.client( - "cloudwatch", - region_name=region, - aws_access_key_id=self.config.credentials.get("aws_access_key_id"), - aws_secret_access_key=self.config.credentials.get("aws_secret_access_key"), - ) - self._initialized = True - return True - - except ImportError: - logger.error("boto3 is required for CloudWatch source: pip install boto3") - return False - except Exception as e: - logger.error(f"Failed to initialize CloudWatch: {e}") - return False - - async def health_check(self) -> bool: - """Check CloudWatch connectivity.""" - if not self._client: - return False - - try: - self._client.list_metrics(Limit=1) - return True - except Exception: - return False - - async def collect(self) -> CollectionResult: - """Query CloudWatch metrics.""" - if not self._client: - await self.initialize() - - start = time.time() - metrics = [] - - metric_specs = self.config.metrics or self.get_default_queries() - - try: - lookback = self.config.options.get("lookback_minutes", 10) - period = self.config.options.get("period", 300) - now = datetime.now(timezone.utc) - start_time = now - timedelta(minutes=lookback) - - for spec in metric_specs: - result = self._query_metric(spec, start_time, now, period) - 
metrics.extend(result) - - return CollectionResult( - source=self.name, - success=True, - metrics=metrics, - duration_ms=(time.time() - start) * 1000, - ) - - except Exception as e: - logger.error(f"Error collecting CloudWatch metrics: {e}") - return CollectionResult( - source=self.name, - success=False, - error=str(e), - duration_ms=(time.time() - start) * 1000, - ) - - def _query_metric( - self, - spec: str, - start_time: datetime, - end_time: datetime, - period: int, - ) -> list[MetricSample]: - """Query a single CloudWatch metric.""" - metrics = [] - - # Parse spec: "Namespace/MetricName" or "Namespace/MetricName:Dim=Val" - parts = spec.split("/") - if len(parts) < 2: - logger.warning(f"Invalid metric spec: {spec}") - return metrics - - namespace = parts[0] - metric_part = "/".join(parts[1:]) - - # Parse dimensions - dimensions = [] - if ":" in metric_part: - metric_name, dim_str = metric_part.split(":", 1) - for dim in dim_str.split(","): - if "=" in dim: - name, value = dim.split("=", 1) - dimensions.append({"Name": name, "Value": value}) - else: - metric_name = metric_part - - # Add config dimensions - for name, value in self.config.options.get("dimensions", {}).items(): - dimensions.append({"Name": name, "Value": value}) - - try: - response = self._client.get_metric_statistics( - Namespace=namespace, - MetricName=metric_name, - Dimensions=dimensions, - StartTime=start_time, - EndTime=end_time, - Period=period, - Statistics=["Average"], - ) - - datapoints = response.get("Datapoints", []) - if datapoints: - # Sort by timestamp and take the latest - datapoints.sort(key=lambda x: x["Timestamp"]) - latest = datapoints[-1] - - # Build labels from dimensions - labels = {d["Name"]: d["Value"] for d in dimensions} - labels["namespace"] = namespace - - # Normalize metric name - normalized = self._normalize_metric_name(namespace, metric_name) - - metrics.append(MetricSample( - name=normalized, - value=latest.get("Average", 0.0), - 
timestamp=latest["Timestamp"].replace(tzinfo=timezone.utc), - metric_type=MetricType.GAUGE, - labels=labels, - source=self.name, - )) - - except Exception as e: - logger.warning(f"Failed to query CloudWatch metric '{spec}': {e}") - - return metrics - - def _normalize_metric_name(self, namespace: str, metric_name: str) -> str: - """Convert CloudWatch metric to standard format.""" - # AWS/EC2/CPUUtilization -> ec2_cpu_utilization - ns_parts = namespace.lower().split("/") - if ns_parts[0] == "aws": - ns_parts = ns_parts[1:] - - # Convert CamelCase to snake_case - import re - name = re.sub(r'(? list[str]: - """CloudWatch requires AWS credentials.""" - return ["aws_access_key_id", "aws_secret_access_key", "region"] - - @classmethod - def get_default_queries(cls) -> list[str]: - """Default CloudWatch metrics.""" - return [ - "AWS/EC2/CPUUtilization", - "AWS/EC2/NetworkIn", - "AWS/EC2/NetworkOut", - "AWS/RDS/CPUUtilization", - "AWS/RDS/DatabaseConnections", - "AWS/RDS/FreeableMemory", - ] diff --git a/agent/src/prescale_agent/sources/datadog.py b/agent/src/prescale_agent/sources/datadog.py deleted file mode 100644 index 60f9710..0000000 --- a/agent/src/prescale_agent/sources/datadog.py +++ /dev/null @@ -1,204 +0,0 @@ -"""Datadog metrics source - fetches metrics from Datadog API.""" - -import time -from datetime import datetime, timezone, timedelta -import logging - -from .base import MetricsSource, MetricSample, MetricType, SourceConfig, CollectionResult -from .registry import register_source - -logger = logging.getLogger(__name__) - - -@register_source("datadog") -class DatadogSource(MetricsSource): - """ - Collect metrics from Datadog API. 
- - Config: - endpoint: str - Datadog API endpoint (default: "https://api.datadoghq.com") - api_key: str - Datadog API key - credentials: - app_key: str - Datadog application key - - metrics: list[str] - Metric names to query (e.g., ["system.cpu.user", "system.mem.used"]) - - Config options: - site: str - Datadog site (us1, us3, us5, eu1, ap1) - lookback_minutes: int - How far back to query (default: 5) - """ - - source_type = "datadog" - - SITE_ENDPOINTS = { - "us1": "https://api.datadoghq.com", - "us3": "https://api.us3.datadoghq.com", - "us5": "https://api.us5.datadoghq.com", - "eu1": "https://api.datadoghq.eu", - "ap1": "https://api.ap1.datadoghq.com", - } - - def __init__(self, config: SourceConfig): - super().__init__(config) - self._client = None - - async def initialize(self) -> bool: - """Initialize Datadog client.""" - try: - import httpx - - # Determine endpoint - site = self.config.options.get("site", "us1") - endpoint = self.config.endpoint or self.SITE_ENDPOINTS.get(site, self.SITE_ENDPOINTS["us1"]) - - self._client = httpx.AsyncClient( - base_url=endpoint, - headers={ - "DD-API-KEY": self.config.api_key or "", - "DD-APPLICATION-KEY": self.config.credentials.get("app_key", ""), - }, - timeout=30, - ) - self._initialized = True - return True - - except ImportError: - logger.error("httpx is required for Datadog source") - return False - except Exception as e: - logger.error(f"Failed to initialize Datadog source: {e}") - return False - - async def close(self): - """Close HTTP client.""" - if self._client: - await self._client.aclose() - self._client = None - - async def health_check(self) -> bool: - """Check Datadog API connectivity.""" - if not self._client: - return False - - try: - response = await self._client.get("/api/v1/validate") - return response.status_code == 200 - except Exception: - return False - - async def collect(self) -> CollectionResult: - """Query Datadog metrics API.""" - if not self._client: - await self.initialize() - - start = 
time.time() - metrics = [] - - # Get metrics to query - metric_names = self.config.metrics or self.get_default_queries() - - try: - lookback = self.config.options.get("lookback_minutes", 5) - now = datetime.now(timezone.utc) - from_ts = int((now - timedelta(minutes=lookback)).timestamp()) - to_ts = int(now.timestamp()) - - for metric_name in metric_names: - result = await self._query_metric(metric_name, from_ts, to_ts) - metrics.extend(result) - - return CollectionResult( - source=self.name, - success=True, - metrics=metrics, - duration_ms=(time.time() - start) * 1000, - ) - - except Exception as e: - logger.error(f"Error collecting Datadog metrics: {e}") - return CollectionResult( - source=self.name, - success=False, - error=str(e), - duration_ms=(time.time() - start) * 1000, - ) - - async def _query_metric(self, metric_name: str, from_ts: int, to_ts: int) -> list[MetricSample]: - """Query a single metric from Datadog.""" - metrics = [] - - try: - response = await self._client.get( - "/api/v1/query", - params={ - "query": metric_name, - "from": from_ts, - "to": to_ts, - }, - ) - response.raise_for_status() - data = response.json() - - for series in data.get("series", []): - scope = series.get("scope", "") - pointlist = series.get("pointlist", []) - - # Parse scope into labels - labels = self._parse_scope(scope) - - # Take the most recent point - if pointlist: - timestamp_ms, value = pointlist[-1] - - # Normalize metric name - normalized_name = self._normalize_metric_name(series.get("metric", metric_name)) - - metrics.append(MetricSample( - name=normalized_name, - value=value if value is not None else 0.0, - timestamp=datetime.fromtimestamp(timestamp_ms / 1000, tz=timezone.utc), - metric_type=MetricType.GAUGE, - labels=labels, - source=self.name, - )) - - except Exception as e: - logger.warning(f"Failed to query Datadog metric '{metric_name}': {e}") - - return metrics - - def _parse_scope(self, scope: str) -> dict[str, str]: - """Parse Datadog scope string into 
labels dict.""" - labels = {} - if scope: - for part in scope.split(","): - if ":" in part: - key, value = part.split(":", 1) - labels[key.strip()] = value.strip() - return labels - - def _normalize_metric_name(self, name: str) -> str: - """Convert Datadog metric name to standard format.""" - # system.cpu.user -> cpu_user - parts = name.split(".") - if len(parts) > 1 and parts[0] in ("system", "aws", "azure", "gcp"): - parts = parts[1:] - return "_".join(parts) - - @classmethod - def get_required_credentials(cls) -> list[str]: - """Datadog requires API key and app key.""" - return ["api_key", "app_key"] - - @classmethod - def get_default_queries(cls) -> list[str]: - """Default Datadog metrics to collect.""" - return [ - "system.cpu.user", - "system.cpu.system", - "system.mem.used", - "system.mem.total", - "system.disk.used", - "system.net.bytes_rcvd", - "system.net.bytes_sent", - ] diff --git a/agent/src/prescale_agent/sources/gcp_monitoring.py b/agent/src/prescale_agent/sources/gcp_monitoring.py deleted file mode 100644 index 6d9c84e..0000000 --- a/agent/src/prescale_agent/sources/gcp_monitoring.py +++ /dev/null @@ -1,259 +0,0 @@ -"""Google Cloud Monitoring metrics source.""" - -import time -from datetime import datetime, timezone, timedelta -import logging - -from .base import MetricsSource, MetricSample, MetricType, SourceConfig, CollectionResult -from .registry import register_source - -logger = logging.getLogger(__name__) - - -@register_source("gcp_monitoring") -class GCPMonitoringSource(MetricsSource): - """ - Collect metrics from Google Cloud Monitoring (Stackdriver). 
- - Config: - credentials: - project_id: str - GCP project ID - credentials_file: str - Path to service account JSON (optional if using ADC) - - metrics: list[str] - Metric types to query - e.g., ["compute.googleapis.com/instance/cpu/utilization"] - - Config options: - lookback_minutes: int - How far back to query (default: 5) - alignment_period: int - Alignment period in seconds (default: 60) - """ - - source_type = "gcp_monitoring" - - def __init__(self, config: SourceConfig): - super().__init__(config) - self._client = None - - async def initialize(self) -> bool: - """Initialize GCP Monitoring client.""" - try: - from google.cloud import monitoring_v3 - from google.oauth2 import service_account - - credentials = None - creds_file = self.config.credentials.get("credentials_file") - - if creds_file: - credentials = service_account.Credentials.from_service_account_file(creds_file) - - self._client = monitoring_v3.MetricServiceClient(credentials=credentials) - self._project_id = self.config.credentials.get("project_id") - self._initialized = True - return True - - except ImportError: - logger.error("GCP SDK required: pip install google-cloud-monitoring") - return False - except Exception as e: - logger.error(f"Failed to initialize GCP Monitoring: {e}") - return False - - async def health_check(self) -> bool: - """Check GCP Monitoring connectivity.""" - if not self._client or not self._project_id: - return False - - try: - project_name = f"projects/{self._project_id}" - # Try to list metric descriptors - request = {"name": project_name, "page_size": 1} - list(self._client.list_metric_descriptors(request=request)) - return True - except Exception: - return False - - async def collect(self) -> CollectionResult: - """Query GCP Monitoring metrics.""" - if not self._client: - await self.initialize() - - start = time.time() - metrics = [] - - metric_types = self.config.metrics or self.get_default_queries() - - try: - lookback = self.config.options.get("lookback_minutes", 
5) - alignment = self.config.options.get("alignment_period", 60) - - now = datetime.now(timezone.utc) - start_time = now - timedelta(minutes=lookback) - - for metric_type in metric_types: - result = self._query_metric(metric_type, start_time, now, alignment) - metrics.extend(result) - - return CollectionResult( - source=self.name, - success=True, - metrics=metrics, - duration_ms=(time.time() - start) * 1000, - ) - - except Exception as e: - logger.error(f"Error collecting GCP Monitoring metrics: {e}") - return CollectionResult( - source=self.name, - success=False, - error=str(e), - duration_ms=(time.time() - start) * 1000, - ) - - def _query_metric( - self, - metric_type: str, - start_time: datetime, - end_time: datetime, - alignment_period: int, - ) -> list[MetricSample]: - """Query a single GCP metric type.""" - metrics = [] - - try: - from google.cloud.monitoring_v3 import Aggregation, TimeInterval - from google.protobuf.timestamp_pb2 import Timestamp - - project_name = f"projects/{self._project_id}" - - # Build time interval - start_pb = Timestamp() - start_pb.FromDatetime(start_time) - end_pb = Timestamp() - end_pb.FromDatetime(end_time) - - interval = TimeInterval( - start_time=start_pb, - end_time=end_pb, - ) - - # Choose aligner based on metric type - # Cumulative metrics (counters) need ALIGN_RATE, gauge metrics use ALIGN_MEAN - aligner = Aggregation.Aligner.ALIGN_RATE - if any(gauge in metric_type for gauge in ['utilization', 'memory/used', 'limit_utilization']): - aligner = Aggregation.Aligner.ALIGN_MEAN - - # Build aggregation - aggregation = Aggregation( - alignment_period={"seconds": alignment_period}, - per_series_aligner=aligner, - ) - - results = self._client.list_time_series( - request={ - "name": project_name, - "filter": f'metric.type = "{metric_type}"', - "interval": interval, - "view": "FULL", - "aggregation": aggregation, - } - ) - - for ts in results: - # Build labels from metric and resource labels - labels = dict(ts.metric.labels) - 
labels.update({f"resource_{k}": v for k, v in ts.resource.labels.items()}) - labels["resource_type"] = ts.resource.type - - # Merge static labels from config (e.g. deployment ID) - if self.config.labels: - labels.update(self.config.labels) - - # Get latest point - points = list(ts.points) - if points: - point = points[0] # Most recent - - # Extract value based on type - value = self._extract_value(point.value) - - normalized = self._normalize_metric_name(metric_type) - - # Handle timestamp - proto-plus returns DatetimeWithNanoseconds which is a datetime subclass - ts_value = point.interval.end_time - if hasattr(ts_value, 'ToDatetime'): - ts_value = ts_value.ToDatetime().replace(tzinfo=timezone.utc) - elif not ts_value.tzinfo: - ts_value = ts_value.replace(tzinfo=timezone.utc) - - metrics.append(MetricSample( - name=normalized, - value=value, - timestamp=ts_value, - metric_type=MetricType.GAUGE, - labels=labels, - source=self.name, - )) - - except Exception as e: - logger.warning(f"Failed to query GCP metric '{metric_type}': {e}") - - return metrics - - def _extract_value(self, typed_value) -> float: - """Extract numeric value from GCP TypedValue.""" - # The google-cloud-monitoring proto-plus wrapper uses simple attribute access - # Try each value type in order of likelihood - try: - if typed_value.double_value: - return typed_value.double_value - except (AttributeError, TypeError): - pass - - try: - if typed_value.int64_value: - return float(typed_value.int64_value) - except (AttributeError, TypeError): - pass - - try: - if typed_value.bool_value is not None: - return 1.0 if typed_value.bool_value else 0.0 - except (AttributeError, TypeError): - pass - - try: - if typed_value.distribution_value: - return typed_value.distribution_value.mean - except (AttributeError, TypeError): - pass - - return 0.0 - - def _normalize_metric_name(self, metric_type: str) -> str: - """Convert GCP metric type to standard format.""" - # compute.googleapis.com/instance/cpu/utilization -> 
instance_cpu_utilization - parts = metric_type.split("/") - if len(parts) > 1: - # Remove the domain part - parts = parts[1:] - return "_".join(parts).replace("-", "_") - - @classmethod - def get_required_credentials(cls) -> list[str]: - """GCP requires project ID, credentials optional if using ADC.""" - return ["project_id"] - - @classmethod - def get_default_queries(cls) -> list[str]: - """Default GCP Monitoring metrics.""" - return [ - "compute.googleapis.com/instance/cpu/utilization", - "compute.googleapis.com/instance/memory/balloon/ram_used", - "compute.googleapis.com/instance/disk/read_bytes_count", - "compute.googleapis.com/instance/disk/write_bytes_count", - "compute.googleapis.com/instance/network/received_bytes_count", - "compute.googleapis.com/instance/network/sent_bytes_count", - # Kubernetes Engine - "kubernetes.io/container/cpu/core_usage_time", - "kubernetes.io/container/memory/used_bytes", - ] diff --git a/agent/src/prescale_agent/sources/prometheus.py b/agent/src/prescale_agent/sources/prometheus.py deleted file mode 100644 index f911dc2..0000000 --- a/agent/src/prescale_agent/sources/prometheus.py +++ /dev/null @@ -1,206 +0,0 @@ -"""Prometheus metrics source - scrapes metrics from Prometheus server.""" - -import time -from datetime import datetime, timezone -from typing import Any -import logging - -import httpx - -from .base import MetricsSource, MetricSample, MetricType, SourceConfig, CollectionResult -from .registry import register_source - -logger = logging.getLogger(__name__) - - -@register_source("prometheus") -class PrometheusSource(MetricsSource): - """ - Collect metrics from a Prometheus server. - - This source executes PromQL queries and converts results to MetricSamples. 
- - Config: - endpoint: str - Prometheus server URL (e.g., "http://prometheus:9090") - queries: list[str] - PromQL queries to execute - - Config options: - timeout: int - Query timeout in seconds (default: 30) - step: str - Query step for range queries (default: "1m") - """ - - source_type = "prometheus" - - def __init__(self, config: SourceConfig): - super().__init__(config) - self._client: httpx.AsyncClient | None = None - - async def initialize(self) -> bool: - """Initialize HTTP client for Prometheus.""" - timeout = self.config.options.get("timeout", 30) - self._client = httpx.AsyncClient( - base_url=self.config.endpoint, - timeout=timeout, - ) - self._initialized = True - return True - - async def close(self): - """Close HTTP client.""" - if self._client: - await self._client.aclose() - self._client = None - - async def health_check(self) -> bool: - """Check Prometheus connectivity.""" - if not self._client: - return False - - try: - response = await self._client.get("/-/healthy") - return response.status_code == 200 - except Exception as e: - logger.debug(f"Prometheus health check failed: {e}") - return False - - async def collect(self) -> CollectionResult: - """Execute PromQL queries and collect metrics.""" - if not self._client: - await self.initialize() - - start = time.time() - metrics = [] - - # Use configured queries or defaults - queries = self.config.queries or self.get_default_queries() - - try: - for query in queries: - result = await self._execute_query(query) - metrics.extend(result) - - return CollectionResult( - source=self.name, - success=True, - metrics=metrics, - duration_ms=(time.time() - start) * 1000, - ) - - except Exception as e: - logger.error(f"Error collecting Prometheus metrics: {e}") - return CollectionResult( - source=self.name, - success=False, - error=str(e), - duration_ms=(time.time() - start) * 1000, - ) - - async def _execute_query(self, query: str) -> list[MetricSample]: - """Execute a PromQL query and parse results.""" - 
metrics = [] - - try: - response = await self._client.get( - "/api/v1/query", - params={"query": query}, - ) - response.raise_for_status() - data = response.json() - - if data.get("status") != "success": - logger.warning(f"Prometheus query failed: {data.get('error', 'unknown')}") - return metrics - - result = data.get("data", {}) - result_type = result.get("resultType") - - if result_type == "vector": - metrics.extend(self._parse_vector(result.get("result", []), query)) - elif result_type == "matrix": - metrics.extend(self._parse_matrix(result.get("result", []), query)) - - except Exception as e: - logger.warning(f"Failed to execute query '{query}': {e}") - - return metrics - - def _parse_vector(self, results: list[dict], query: str) -> list[MetricSample]: - """Parse instant vector results.""" - metrics = [] - - for item in results: - metric_labels = item.get("metric", {}) - timestamp_val, value = item.get("value", [0, "0"]) - - # Extract metric name from labels or query - name = metric_labels.pop("__name__", self._query_to_name(query)) - - try: - metrics.append(MetricSample( - name=name, - value=float(value), - timestamp=datetime.fromtimestamp(timestamp_val, tz=timezone.utc), - metric_type=MetricType.GAUGE, - labels=metric_labels, - source=self.name, - )) - except (ValueError, TypeError) as e: - logger.debug(f"Failed to parse metric value: {e}") - - return metrics - - def _parse_matrix(self, results: list[dict], query: str) -> list[MetricSample]: - """Parse range vector (matrix) results - take latest value.""" - metrics = [] - - for item in results: - metric_labels = item.get("metric", {}) - values = item.get("values", []) - - if not values: - continue - - # Take the most recent value - timestamp_val, value = values[-1] - name = metric_labels.pop("__name__", self._query_to_name(query)) - - try: - metrics.append(MetricSample( - name=name, - value=float(value), - timestamp=datetime.fromtimestamp(timestamp_val, tz=timezone.utc), - metric_type=MetricType.GAUGE, - 
labels=metric_labels, - source=self.name, - )) - except (ValueError, TypeError) as e: - logger.debug(f"Failed to parse metric value: {e}") - - return metrics - - def _query_to_name(self, query: str) -> str: - """Convert a PromQL query to a metric name.""" - # Simple heuristic: extract first word before parens or brackets - clean = query.strip() - for char in "({[": - if char in clean: - clean = clean.split(char)[0] - return clean.strip().replace(" ", "_").lower() - - @classmethod - def get_required_credentials(cls) -> list[str]: - """Prometheus may need basic auth.""" - return [] # Optional: ["username", "password"] - - @classmethod - def get_default_queries(cls) -> list[str]: - """Default Prometheus queries for Kubernetes metrics.""" - return [ - # CPU usage by pod - 'sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (namespace, pod)', - # Memory usage by pod - 'sum(container_memory_usage_bytes{container!=""}) by (namespace, pod)', - # Network I/O - 'sum(rate(container_network_receive_bytes_total[5m])) by (namespace, pod)', - 'sum(rate(container_network_transmit_bytes_total[5m])) by (namespace, pod)', - ] diff --git a/agent/src/prescale_agent/sources/registry.py b/agent/src/prescale_agent/sources/registry.py deleted file mode 100644 index 95e1fd5..0000000 --- a/agent/src/prescale_agent/sources/registry.py +++ /dev/null @@ -1,111 +0,0 @@ -"""Source registry - manages available metrics source plugins.""" - -from typing import Type, Optional -import logging - -from .base import MetricsSource, SourceConfig - -logger = logging.getLogger(__name__) - - -class SourceRegistry: - """ - Registry for metrics source plugins. - - Sources register themselves here and can be instantiated by type name. 
- """ - - _sources: dict[str, Type[MetricsSource]] = {} - - @classmethod - def register(cls, source_type: str, source_class: Type[MetricsSource]): - """Register a metrics source class.""" - cls._sources[source_type] = source_class - logger.debug(f"Registered metrics source: {source_type}") - - @classmethod - def get(cls, source_type: str) -> Optional[Type[MetricsSource]]: - """Get a source class by type name.""" - return cls._sources.get(source_type) - - @classmethod - def create(cls, config: SourceConfig) -> Optional[MetricsSource]: - """Create a source instance from config.""" - source_class = cls._sources.get(config.type) - if source_class is None: - logger.error(f"Unknown source type: {config.type}") - return None - return source_class(config) - - @classmethod - def list_types(cls) -> list[str]: - """List all registered source types.""" - return list(cls._sources.keys()) - - @classmethod - def is_registered(cls, source_type: str) -> bool: - """Check if a source type is registered.""" - return source_type in cls._sources - - -def register_source(source_type: str): - """ - Decorator to register a metrics source class. - - Usage: - @register_source("datadog") - class DatadogSource(MetricsSource): - ... - """ - def decorator(cls: Type[MetricsSource]): - cls.source_type = source_type - SourceRegistry.register(source_type, cls) - return cls - return decorator - - -def get_source(source_type: str) -> Optional[Type[MetricsSource]]: - """Get a source class by type name.""" - return SourceRegistry.get(source_type) - - -def list_sources() -> list[str]: - """List all registered source types.""" - return SourceRegistry.list_types() - - -# Auto-register built-in sources when this module is imported -def _register_builtin_sources(): - """Import and register all built-in source plugins.""" - try: - from . import prometheus - except ImportError as e: - logger.debug(f"Prometheus source not available: {e}") - - try: - from . 
import system - except ImportError as e: - logger.debug(f"System source not available: {e}") - - try: - from . import datadog - except ImportError as e: - logger.debug(f"Datadog source not available: {e}") - - try: - from . import cloudwatch - except ImportError as e: - logger.debug(f"CloudWatch source not available: {e}") - - try: - from . import azure_monitor - except ImportError as e: - logger.debug(f"Azure Monitor source not available: {e}") - - try: - from . import gcp_monitoring - except ImportError as e: - logger.debug(f"GCP Monitoring source not available: {e}") - - -_register_builtin_sources() diff --git a/agent/src/prescale_agent/sources/system.py b/agent/src/prescale_agent/sources/system.py deleted file mode 100644 index 62f9932..0000000 --- a/agent/src/prescale_agent/sources/system.py +++ /dev/null @@ -1,228 +0,0 @@ -"""System metrics source - collects host-level metrics via psutil.""" - -import time -from datetime import datetime, timezone -import logging - -import psutil - -from .base import MetricsSource, MetricSample, MetricType, SourceConfig, CollectionResult -from .registry import register_source - -logger = logging.getLogger(__name__) - - -@register_source("system") -class SystemSource(MetricsSource): - """ - Collect system metrics from the local host using psutil. 
- - This source collects: - - CPU utilization (overall and per-core) - - Memory usage - - Disk usage - - Network I/O - - Config options: - collect_cpu: bool - Collect CPU metrics (default: True) - collect_memory: bool - Collect memory metrics (default: True) - collect_disk: bool - Collect disk metrics (default: True) - collect_network: bool - Collect network metrics (default: True) - per_cpu: bool - Collect per-CPU core metrics (default: False) - """ - - source_type = "system" - - async def initialize(self) -> bool: - """Initialize system metrics collection.""" - # psutil doesn't need initialization - self._initialized = True - return True - - async def health_check(self) -> bool: - """System source is always healthy if psutil is available.""" - try: - psutil.cpu_percent(interval=None) - return True - except Exception: - return False - - async def collect(self) -> CollectionResult: - """Collect system metrics.""" - start = time.time() - metrics = [] - now = datetime.now(timezone.utc) - - options = self.config.options - - try: - # CPU metrics - if options.get("collect_cpu", True): - metrics.extend(self._collect_cpu(now, options.get("per_cpu", False))) - - # Memory metrics - if options.get("collect_memory", True): - metrics.extend(self._collect_memory(now)) - - # Disk metrics - if options.get("collect_disk", True): - metrics.extend(self._collect_disk(now)) - - # Network metrics - if options.get("collect_network", True): - metrics.extend(self._collect_network(now)) - - # Merge config-level labels (like deployment) into each metric - if self.config.labels: - for metric in metrics: - metric.labels.update(self.config.labels) - - duration = (time.time() - start) * 1000 - - return CollectionResult( - source=self.name, - success=True, - metrics=metrics, - duration_ms=duration, - ) - - except Exception as e: - logger.error(f"Error collecting system metrics: {e}") - return CollectionResult( - source=self.name, - success=False, - error=str(e), - duration_ms=(time.time() - 
start) * 1000, - ) - - def _collect_cpu(self, timestamp: datetime, per_cpu: bool = False) -> list[MetricSample]: - """Collect CPU metrics.""" - metrics = [] - - # Overall CPU - cpu_percent = psutil.cpu_percent(interval=0.1) - metrics.append(MetricSample( - name="cpu_utilization", - value=cpu_percent / 100.0, - timestamp=timestamp, - metric_type=MetricType.GAUGE, - labels={"host": self._get_hostname()}, - source=self.name, - )) - - # Per-CPU if requested - if per_cpu: - per_cpu_percent = psutil.cpu_percent(interval=None, percpu=True) - for i, pct in enumerate(per_cpu_percent): - metrics.append(MetricSample( - name="cpu_utilization", - value=pct / 100.0, - timestamp=timestamp, - metric_type=MetricType.GAUGE, - labels={"host": self._get_hostname(), "cpu": str(i)}, - source=self.name, - )) - - return metrics - - def _collect_memory(self, timestamp: datetime) -> list[MetricSample]: - """Collect memory metrics.""" - metrics = [] - mem = psutil.virtual_memory() - host = self._get_hostname() - - metrics.append(MetricSample( - name="memory_utilization", - value=mem.percent / 100.0, - timestamp=timestamp, - metric_type=MetricType.GAUGE, - labels={"host": host}, - source=self.name, - )) - - metrics.append(MetricSample( - name="memory_bytes", - value=float(mem.used), - timestamp=timestamp, - metric_type=MetricType.GAUGE, - labels={"host": host, "type": "used"}, - source=self.name, - )) - - metrics.append(MetricSample( - name="memory_bytes", - value=float(mem.total), - timestamp=timestamp, - metric_type=MetricType.GAUGE, - labels={"host": host, "type": "total"}, - source=self.name, - )) - - return metrics - - def _collect_disk(self, timestamp: datetime) -> list[MetricSample]: - """Collect disk metrics.""" - metrics = [] - host = self._get_hostname() - - for partition in psutil.disk_partitions(all=False): - try: - usage = psutil.disk_usage(partition.mountpoint) - - metrics.append(MetricSample( - name="disk_utilization", - value=usage.percent / 100.0, - timestamp=timestamp, - 
metric_type=MetricType.GAUGE, - labels={ - "host": host, - "device": partition.device, - "mountpoint": partition.mountpoint, - }, - source=self.name, - )) - except (PermissionError, OSError): - continue - - return metrics - - def _collect_network(self, timestamp: datetime) -> list[MetricSample]: - """Collect network metrics.""" - metrics = [] - host = self._get_hostname() - net = psutil.net_io_counters() - - metrics.append(MetricSample( - name="network_bytes_recv", - value=float(net.bytes_recv), - timestamp=timestamp, - metric_type=MetricType.COUNTER, - labels={"host": host}, - source=self.name, - )) - - metrics.append(MetricSample( - name="network_bytes_sent", - value=float(net.bytes_sent), - timestamp=timestamp, - metric_type=MetricType.COUNTER, - labels={"host": host}, - source=self.name, - )) - - return metrics - - def _get_hostname(self) -> str: - """Get the hostname.""" - import socket - return socket.gethostname() - - @classmethod - def get_required_credentials(cls) -> list[str]: - """System source doesn't need credentials.""" - return [] - - @classmethod - def get_default_queries(cls) -> list[str]: - """System source doesn't use queries.""" - return [] diff --git a/agent/tests/__init__.py b/agent/tests/__init__.py deleted file mode 100644 index 08737f3..0000000 --- a/agent/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Prescale Agent Tests diff --git a/agent/tests/test_agent.py b/agent/tests/test_agent.py deleted file mode 100644 index 7473597..0000000 --- a/agent/tests/test_agent.py +++ /dev/null @@ -1,465 +0,0 @@ -""" -Prescale Agent Tests - -Unit tests for the Prescale agent, sources, and client. 
-""" - -import asyncio -from datetime import datetime, timezone -from unittest.mock import AsyncMock, MagicMock, patch - -import pytest - - -class TestMetricSample: - """Tests for MetricSample dataclass.""" - - def test_import(self): - """Test that MetricSample can be imported.""" - from prescale_agent.sources.base import MetricSample, MetricType - assert MetricSample is not None - assert MetricType is not None - - def test_create_sample(self): - """Test creating a metric sample.""" - from prescale_agent.sources.base import MetricSample, MetricType - - sample = MetricSample( - name="cpu_usage", - value=65.5, - timestamp=datetime.now(timezone.utc), - metric_type=MetricType.GAUGE, - labels={"host": "server1"}, - source="system" - ) - - assert sample.name == "cpu_usage" - assert sample.value == 65.5 - assert sample.labels == {"host": "server1"} - assert sample.source == "system" - - def test_to_dict(self): - """Test converting sample to dictionary.""" - from prescale_agent.sources.base import MetricSample, MetricType - - ts = datetime.now(timezone.utc) - sample = MetricSample( - name="memory_percent", - value=75.0, - timestamp=ts, - metric_type=MetricType.GAUGE, - labels={"host": "server1"}, - source="system" - ) - - d = sample.to_dict() - - assert d["name"] == "memory_percent" - assert d["value"] == 75.0 - assert d["type"] == "gauge" - assert d["labels"] == {"host": "server1"} - assert d["source"] == "system" - - -class TestSourceConfig: - """Tests for SourceConfig dataclass.""" - - def test_import(self): - """Test that SourceConfig can be imported.""" - from prescale_agent.sources.base import SourceConfig - assert SourceConfig is not None - - def test_create_config(self): - """Test creating source configuration.""" - from prescale_agent.sources.base import SourceConfig - - config = SourceConfig( - name="prometheus-main", - type="prometheus", - enabled=True, - endpoint="http://prometheus:9090" - ) - - assert config.name == "prometheus-main" - assert config.type == 
"prometheus" - assert config.enabled is True - assert config.endpoint == "http://prometheus:9090" - - def test_default_values(self): - """Test default configuration values.""" - from prescale_agent.sources.base import SourceConfig - - config = SourceConfig(name="test", type="system") - - assert config.enabled is True - assert config.interval == 15 - assert config.queries == [] - assert config.metrics == [] - - -class TestSourceRegistry: - """Tests for the source registry.""" - - def test_import(self): - """Test that SourceRegistry can be imported.""" - from prescale_agent.sources.registry import SourceRegistry - assert SourceRegistry is not None - - def test_list_types(self): - """Test listing registered source types.""" - from prescale_agent.sources.registry import SourceRegistry - - types = SourceRegistry.list_types() - - assert isinstance(types, list) - assert "system" in types - # Other sources are conditionally registered based on dependencies - - def test_create_system_source(self): - """Test creating a system source.""" - from prescale_agent.sources.registry import SourceRegistry - from prescale_agent.sources.base import SourceConfig - - config = SourceConfig(name="test-system", type="system") - source = SourceRegistry.create(config) - - assert source is not None - assert source.name == "test-system" - assert source.source_type == "system" - - def test_create_unknown_source(self): - """Test creating an unknown source type returns None.""" - from prescale_agent.sources.registry import SourceRegistry - from prescale_agent.sources.base import SourceConfig - - config = SourceConfig(name="test-unknown", type="nonexistent") - source = SourceRegistry.create(config) - - assert source is None - - -class TestSystemSource: - """Tests for SystemSource.""" - - def test_import(self): - """Test that SystemSource can be imported.""" - from prescale_agent.sources.system import SystemSource - assert SystemSource is not None - - def test_source_type(self): - """Test system 
source type.""" - from prescale_agent.sources.system import SystemSource - assert SystemSource.source_type == "system" - - @pytest.mark.asyncio - async def test_initialize(self): - """Test system source initialization.""" - from prescale_agent.sources.system import SystemSource - from prescale_agent.sources.base import SourceConfig - - config = SourceConfig(name="system-test", type="system") - source = SystemSource(config) - - result = await source.initialize() - - assert result is True - assert source._initialized is True - - @pytest.mark.asyncio - async def test_health_check(self): - """Test system source health check.""" - from prescale_agent.sources.system import SystemSource - from prescale_agent.sources.base import SourceConfig - - config = SourceConfig(name="system-test", type="system") - source = SystemSource(config) - - healthy = await source.health_check() - - assert healthy is True - - @pytest.mark.asyncio - async def test_collect(self): - """Test system metrics collection.""" - from prescale_agent.sources.system import SystemSource - from prescale_agent.sources.base import SourceConfig, CollectionResult - - config = SourceConfig( - name="system-test", - type="system", - options={ - "collect_cpu": True, - "collect_memory": True, - "collect_disk": True, - "collect_network": True - } - ) - source = SystemSource(config) - await source.initialize() - - result = await source.collect() - - assert isinstance(result, CollectionResult) - assert result.success is True - assert len(result.metrics) > 0 - assert result.source == "system-test" - - @pytest.mark.asyncio - async def test_collect_cpu_only(self): - """Test collecting only CPU metrics.""" - from prescale_agent.sources.system import SystemSource - from prescale_agent.sources.base import SourceConfig - - config = SourceConfig( - name="cpu-only", - type="system", - options={ - "collect_cpu": True, - "collect_memory": False, - "collect_disk": False, - "collect_network": False - } - ) - source = 
SystemSource(config) - await source.initialize() - - result = await source.collect() - - assert result.success is True - # Should have at least CPU metric - cpu_metrics = [m for m in result.metrics if "cpu" in m.name] - assert len(cpu_metrics) > 0 - # Should NOT have memory metrics - mem_metrics = [m for m in result.metrics if "memory" in m.name] - assert len(mem_metrics) == 0 - - -class TestAgentConfig: - """Tests for AgentConfig.""" - - def test_import(self): - """Test that AgentConfig can be imported.""" - from prescale_agent.config import AgentConfig - assert AgentConfig is not None - - def test_load_from_dict(self): - """Test loading config from dictionary.""" - from prescale_agent.config import AgentConfig - - config_dict = { - "endpoint": { - "url": "http://localhost:8080", - "api_key": "test-key" - }, - "collection_interval": 30, - "batch_size": 50, - "sources": [ - { - "name": "local-system", - "type": "system", - "enabled": True - } - ] - } - - config = AgentConfig.from_dict(config_dict) - - assert config.endpoint.url == "http://localhost:8080" - assert config.endpoint.api_key == "test-key" - assert config.collection_interval == 30 - assert config.batch_size == 50 - assert len(config.sources) == 1 - - -class TestPrescaleClient: - """Tests for PrescaleClient.""" - - def test_import(self): - """Test that PrescaleClient can be imported.""" - from prescale_agent.client import PrescaleClient - assert PrescaleClient is not None - - def test_initialization(self): - """Test client initialization.""" - from prescale_agent.client import PrescaleClient - - client = PrescaleClient( - endpoint="http://localhost:8080", - api_key="test-key", - timeout=30 - ) - - assert client.endpoint == "http://localhost:8080" - assert client.api_key == "test-key" - assert client.timeout == 30 - - @pytest.mark.asyncio - async def test_send_metrics_success(self): - """Test sending metrics successfully.""" - from prescale_agent.client import PrescaleClient - from 
prescale_agent.sources.base import MetricSample, MetricType - - client = PrescaleClient(endpoint="http://localhost:8080") - - metrics = [ - MetricSample( - name="test_metric", - value=100.0, - timestamp=datetime.now(timezone.utc), - metric_type=MetricType.GAUGE, - source="test" - ) - ] - - with patch("aiohttp.ClientSession") as mock_session: - mock_response = AsyncMock() - mock_response.status = 200 - mock_response.json = AsyncMock(return_value={"status": "ok"}) - mock_response.__aenter__ = AsyncMock(return_value=mock_response) - mock_response.__aexit__ = AsyncMock(return_value=None) - - mock_session_instance = MagicMock() - mock_session_instance.post = MagicMock(return_value=mock_response) - mock_session_instance.__aenter__ = AsyncMock(return_value=mock_session_instance) - mock_session_instance.__aexit__ = AsyncMock(return_value=None) - mock_session.return_value = mock_session_instance - - result = await client.send_metrics(metrics) - - # Should attempt to send - assert mock_session_instance.post.called - - -class TestAgent: - """Tests for the main Agent class.""" - - def test_import(self): - """Test that Agent can be imported.""" - from prescale_agent.agent import Agent - assert Agent is not None - - def test_initialization(self): - """Test agent initialization.""" - from prescale_agent.agent import Agent - from prescale_agent.config import AgentConfig - - config_dict = { - "endpoint": {"url": "http://localhost:8080"}, - "sources": [] - } - config = AgentConfig.from_dict(config_dict) - - agent = Agent(config) - - assert agent.config == config - assert agent.sources == [] - assert agent._running is False - - @pytest.mark.asyncio - async def test_setup_no_sources(self): - """Test agent setup with no sources.""" - from prescale_agent.agent import Agent - from prescale_agent.config import AgentConfig - - config_dict = { - "endpoint": {"url": "http://localhost:8080"}, - "sources": [] - } - config = AgentConfig.from_dict(config_dict) - - agent = Agent(config) - await 
agent.setup() - - assert len(agent.sources) == 0 - assert agent.client is not None - - @pytest.mark.asyncio - async def test_setup_with_system_source(self): - """Test agent setup with system source.""" - from prescale_agent.agent import Agent - from prescale_agent.config import AgentConfig - - config_dict = { - "endpoint": {"url": "http://localhost:8080"}, - "sources": [ - { - "name": "local-system", - "type": "system", - "enabled": True - } - ] - } - config = AgentConfig.from_dict(config_dict) - - agent = Agent(config) - await agent.setup() - - assert len(agent.sources) == 1 - assert agent.sources[0].name == "local-system" - - -class TestCLI: - """Tests for the CLI module.""" - - def test_import(self): - """Test that CLI can be imported.""" - from prescale_agent.cli import main - assert main is not None - - -# Fixtures -@pytest.fixture -def sample_config(): - """Create sample agent configuration.""" - return { - "endpoint": { - "url": "http://localhost:8080", - "api_key": "test-api-key", - "timeout": 30 - }, - "collection_interval": 15, - "batch_size": 100, - "sources": [ - { - "name": "system-metrics", - "type": "system", - "enabled": True, - "options": { - "collect_cpu": True, - "collect_memory": True, - "collect_disk": False, - "collect_network": False - } - } - ] - } - - -@pytest.fixture -def mock_metrics(): - """Create mock metric samples.""" - from prescale_agent.sources.base import MetricSample, MetricType - - now = datetime.now(timezone.utc) - return [ - MetricSample( - name="cpu_usage_percent", - value=45.5, - timestamp=now, - metric_type=MetricType.GAUGE, - labels={"host": "test-host"}, - source="system" - ), - MetricSample( - name="memory_usage_percent", - value=62.3, - timestamp=now, - metric_type=MetricType.GAUGE, - labels={"host": "test-host"}, - source="system" - ) - ] - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/charts/prescale/Chart.yaml b/charts/prescale/Chart.yaml deleted file mode 100644 index 07a0826..0000000 
--- a/charts/prescale/Chart.yaml +++ /dev/null @@ -1,40 +0,0 @@ -apiVersion: v2 -name: prescale -description: Predictive Infrastructure Intelligence Platform - ML-powered resource forecasting and anomaly detection -type: application -version: 0.1.0 -appVersion: "0.1.0" -kubeVersion: ">=1.23.0-0" - -keywords: - - machine-learning - - kubernetes - - monitoring - - prediction - - anomaly-detection - - cost-optimization - - autoscaling - - observability - -home: https://github.com/pyjeebz/prescale -sources: - - https://github.com/pyjeebz/prescale - -maintainers: - - name: Prescale Platform - email: maintainers@prescale.dev - url: https://github.com/pyjeebz - -annotations: - artifacthub.io/license: Apache-2.0 - artifacthub.io/category: monitoring-logging - -dependencies: - - name: prometheus - version: "~25.0" - repository: https://prometheus-community.github.io/helm-charts - condition: prometheus.enabled - - name: grafana - version: "~7.0" - repository: https://grafana.github.io/helm-charts - condition: grafana.enabled diff --git a/charts/prescale/README.md b/charts/prescale/README.md deleted file mode 100644 index 865dc4a..0000000 --- a/charts/prescale/README.md +++ /dev/null @@ -1,143 +0,0 @@ -# Prescale Helm Chart - -Predictive Infrastructure Intelligence Platform - ML-powered resource forecasting and anomaly detection for Kubernetes. - -## Prerequisites - -- Kubernetes 1.23+ -- Helm 3.x -- Prometheus (optional, for metrics collection) - -## Installation - -```bash -# Add the Prescale Helm repository -helm repo add prescale https://pyjeebz.github.io/prescale -helm repo update - -# Install Prescale -helm install prescale prescale/prescale --namespace prescale --create-namespace -``` - -### Install from local chart - -```bash -helm install prescale ./charts/prescale --namespace prescale --create-namespace -``` - -## Configuration - -See [values.yaml](values.yaml) for the full list of configurable parameters. 
- -### Common configurations - -#### Basic installation - -```yaml -# values-basic.yaml -inference: - enabled: true - replicaCount: 2 - -costIntelligence: - enabled: true -``` - -#### Production installation - -```yaml -# values-production.yaml -inference: - enabled: true - replicaCount: 3 - autoscaling: - enabled: true - minReplicas: 3 - maxReplicas: 20 - resources: - limits: - cpu: 2000m - memory: 4Gi - requests: - cpu: 1000m - memory: 2Gi - ingress: - enabled: true - className: nginx - annotations: - cert-manager.io/cluster-issuer: letsencrypt-prod - hosts: - - host: prescale.example.com - paths: - - path: / - pathType: Prefix - tls: - - secretName: prescale-tls - hosts: - - prescale.example.com - -costIntelligence: - enabled: true - replicaCount: 2 - -models: - persistence: - enabled: true - size: 10Gi - -prometheus: - serviceMonitor: - enabled: true -``` - -### Install with custom values - -```bash -helm install prescale prescale/prescale \ - --namespace prescale \ - --create-namespace \ - -f values-production.yaml -``` - -## Upgrading - -```bash -helm upgrade prescale prescale/prescale --namespace prescale -f values.yaml -``` - -## Uninstalling - -```bash -helm uninstall prescale --namespace prescale -``` - -## Components - -| Component | Description | -|-----------|-------------| -| Inference Service | ML prediction API for CPU/memory forecasting | -| Cost Intelligence | Resource cost analysis and recommendations | - -## API Endpoints - -### Inference Service (port 8000) - -| Endpoint | Method | Description | -|----------|--------|-------------| -| `/predict` | POST | Generate resource predictions | -| `/detect` | POST | Detect anomalies in metrics | -| `/recommend` | POST | Get scaling recommendations | -| `/health` | GET | Health check | -| `/metrics` | GET | Prometheus metrics | - -### Cost Intelligence (port 8001) - -| Endpoint | Method | Description | -|----------|--------|-------------| -| `/analyze` | POST | Analyze resource costs | -| 
`/recommendations` | GET | Get cost optimization recommendations | -| `/health` | GET | Health check | - -## License - -Apache License 2.0 diff --git a/charts/prescale/templates/_helpers.tpl b/charts/prescale/templates/_helpers.tpl deleted file mode 100644 index 735378a..0000000 --- a/charts/prescale/templates/_helpers.tpl +++ /dev/null @@ -1,90 +0,0 @@ -{{/* -Expand the name of the chart. -*/}} -{{- define "prescale.name" -}} -{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* -Create a default fully qualified app name. -*/}} -{{- define "prescale.fullname" -}} -{{- if .Values.fullnameOverride }} -{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} -{{- else }} -{{- $name := default .Chart.Name .Values.nameOverride }} -{{- if contains $name .Release.Name }} -{{- .Release.Name | trunc 63 | trimSuffix "-" }} -{{- else }} -{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} -{{- end }} -{{- end }} -{{- end }} - -{{/* -Create chart name and version as used by the chart label. -*/}} -{{- define "prescale.chart" -}} -{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* -Common labels -*/}} -{{- define "prescale.labels" -}} -helm.sh/chart: {{ include "prescale.chart" . }} -{{ include "prescale.selectorLabels" . }} -{{- if .Chart.AppVersion }} -app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} -{{- end }} -app.kubernetes.io/managed-by: {{ .Release.Service }} -{{- end }} - -{{/* -Selector labels -*/}} -{{- define "prescale.selectorLabels" -}} -app.kubernetes.io/name: {{ include "prescale.name" . }} -app.kubernetes.io/instance: {{ .Release.Name }} -{{- end }} - -{{/* -Create the name of the service account to use -*/}} -{{- define "prescale.serviceAccountName" -}} -{{- if .Values.serviceAccount.create }} -{{- default (include "prescale.fullname" .) 
.Values.serviceAccount.name }} -{{- else }} -{{- default "default" .Values.serviceAccount.name }} -{{- end }} -{{- end }} - -{{/* -Inference service fullname -*/}} -{{- define "prescale.inference.fullname" -}} -{{- printf "%s-inference" (include "prescale.fullname" .) | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* -Cost Intelligence service fullname -*/}} -{{- define "prescale.costIntelligence.fullname" -}} -{{- printf "%s-cost-intelligence" (include "prescale.fullname" .) | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* -Inference image -*/}} -{{- define "prescale.inference.image" -}} -{{- $tag := default .Chart.AppVersion .Values.inference.image.tag }} -{{- printf "%s:%s" .Values.inference.image.repository $tag }} -{{- end }} - -{{/* -Cost Intelligence image -*/}} -{{- define "prescale.costIntelligence.image" -}} -{{- $tag := default .Chart.AppVersion .Values.costIntelligence.image.tag }} -{{- printf "%s:%s" .Values.costIntelligence.image.repository $tag }} -{{- end }} diff --git a/charts/prescale/templates/cost-intelligence-deployment.yaml b/charts/prescale/templates/cost-intelligence-deployment.yaml deleted file mode 100644 index e14c637..0000000 --- a/charts/prescale/templates/cost-intelligence-deployment.yaml +++ /dev/null @@ -1,68 +0,0 @@ -{{- if .Values.costIntelligence.enabled }} -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "prescale.costIntelligence.fullname" . }} - labels: - {{- include "prescale.labels" . | nindent 4 }} - app.kubernetes.io/component: cost-intelligence -spec: - replicas: {{ .Values.costIntelligence.replicaCount }} - selector: - matchLabels: - {{- include "prescale.selectorLabels" . | nindent 6 }} - app.kubernetes.io/component: cost-intelligence - template: - metadata: - {{- with .Values.podAnnotations }} - annotations: - {{- toYaml . | nindent 8 }} - {{- end }} - labels: - {{- include "prescale.selectorLabels" . 
| nindent 8 }} - app.kubernetes.io/component: cost-intelligence - spec: - {{- with .Values.global.imagePullSecrets }} - imagePullSecrets: - {{- toYaml . | nindent 8 }} - {{- end }} - serviceAccountName: {{ include "prescale.serviceAccountName" . }} - securityContext: - {{- toYaml .Values.podSecurityContext | nindent 8 }} - containers: - - name: cost-intelligence - securityContext: - {{- toYaml .Values.securityContext | nindent 12 }} - image: {{ include "prescale.costIntelligence.image" . }} - imagePullPolicy: {{ .Values.costIntelligence.image.pullPolicy }} - ports: - - name: http - containerPort: 8001 - protocol: TCP - livenessProbe: - httpGet: - path: /health - port: http - initialDelaySeconds: 30 - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: http - initialDelaySeconds: 5 - periodSeconds: 5 - resources: - {{- toYaml .Values.costIntelligence.resources | nindent 12 }} - {{- with .Values.costIntelligence.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.costIntelligence.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.costIntelligence.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} -{{- end }} diff --git a/charts/prescale/templates/cost-intelligence-service.yaml b/charts/prescale/templates/cost-intelligence-service.yaml deleted file mode 100644 index 9bbb122..0000000 --- a/charts/prescale/templates/cost-intelligence-service.yaml +++ /dev/null @@ -1,23 +0,0 @@ -{{- if .Values.costIntelligence.enabled }} -apiVersion: v1 -kind: Service -metadata: - name: {{ include "prescale.costIntelligence.fullname" . }} - labels: - {{- include "prescale.labels" . | nindent 4 }} - app.kubernetes.io/component: cost-intelligence - {{- with .Values.costIntelligence.service.annotations }} - annotations: - {{- toYaml . 
| nindent 4 }} - {{- end }} -spec: - type: {{ .Values.costIntelligence.service.type }} - ports: - - port: {{ .Values.costIntelligence.service.port }} - targetPort: http - protocol: TCP - name: http - selector: - {{- include "prescale.selectorLabels" . | nindent 4 }} - app.kubernetes.io/component: cost-intelligence -{{- end }} diff --git a/charts/prescale/templates/inference-deployment.yaml b/charts/prescale/templates/inference-deployment.yaml deleted file mode 100644 index c9cd4a4..0000000 --- a/charts/prescale/templates/inference-deployment.yaml +++ /dev/null @@ -1,82 +0,0 @@ -{{- if .Values.inference.enabled }} -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "prescale.inference.fullname" . }} - labels: - {{- include "prescale.labels" . | nindent 4 }} - app.kubernetes.io/component: inference -spec: - {{- if not .Values.inference.autoscaling.enabled }} - replicas: {{ .Values.inference.replicaCount }} - {{- end }} - selector: - matchLabels: - {{- include "prescale.selectorLabels" . | nindent 6 }} - app.kubernetes.io/component: inference - template: - metadata: - {{- with .Values.podAnnotations }} - annotations: - {{- toYaml . | nindent 8 }} - {{- end }} - labels: - {{- include "prescale.selectorLabels" . | nindent 8 }} - app.kubernetes.io/component: inference - spec: - {{- with .Values.global.imagePullSecrets }} - imagePullSecrets: - {{- toYaml . | nindent 8 }} - {{- end }} - serviceAccountName: {{ include "prescale.serviceAccountName" . }} - securityContext: - {{- toYaml .Values.podSecurityContext | nindent 8 }} - containers: - - name: inference - securityContext: - {{- toYaml .Values.securityContext | nindent 12 }} - image: {{ include "prescale.inference.image" . }} - imagePullPolicy: {{ .Values.inference.image.pullPolicy }} - ports: - - name: http - containerPort: 8080 - protocol: TCP - {{- with .Values.inference.env }} - env: - {{- toYaml . | nindent 12 }} - {{- end }} - {{- with .Values.inference.envFrom }} - envFrom: - {{- toYaml . 
| nindent 12 }} - {{- end }} - livenessProbe: - {{- toYaml .Values.inference.livenessProbe | nindent 12 }} - readinessProbe: - {{- toYaml .Values.inference.readinessProbe | nindent 12 }} - resources: - {{- toYaml .Values.inference.resources | nindent 12 }} - {{- if .Values.models.persistence.enabled }} - volumeMounts: - - name: models - mountPath: /app/models - readOnly: false - {{- end }} - {{- if .Values.models.persistence.enabled }} - volumes: - - name: models - persistentVolumeClaim: - claimName: {{ include "prescale.fullname" . }}-models - {{- end }} - {{- with .Values.inference.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.inference.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.inference.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} -{{- end }} diff --git a/charts/prescale/templates/inference-hpa.yaml b/charts/prescale/templates/inference-hpa.yaml deleted file mode 100644 index 718689b..0000000 --- a/charts/prescale/templates/inference-hpa.yaml +++ /dev/null @@ -1,33 +0,0 @@ -{{- if and .Values.inference.enabled .Values.inference.autoscaling.enabled }} -apiVersion: autoscaling/v2 -kind: HorizontalPodAutoscaler -metadata: - name: {{ include "prescale.inference.fullname" . }} - labels: - {{- include "prescale.labels" . | nindent 4 }} - app.kubernetes.io/component: inference -spec: - scaleTargetRef: - apiVersion: apps/v1 - kind: Deployment - name: {{ include "prescale.inference.fullname" . 
}} - minReplicas: {{ .Values.inference.autoscaling.minReplicas }} - maxReplicas: {{ .Values.inference.autoscaling.maxReplicas }} - metrics: - {{- if .Values.inference.autoscaling.targetCPUUtilizationPercentage }} - - type: Resource - resource: - name: cpu - target: - type: Utilization - averageUtilization: {{ .Values.inference.autoscaling.targetCPUUtilizationPercentage }} - {{- end }} - {{- if .Values.inference.autoscaling.targetMemoryUtilizationPercentage }} - - type: Resource - resource: - name: memory - target: - type: Utilization - averageUtilization: {{ .Values.inference.autoscaling.targetMemoryUtilizationPercentage }} - {{- end }} -{{- end }} diff --git a/charts/prescale/templates/inference-service.yaml b/charts/prescale/templates/inference-service.yaml deleted file mode 100644 index 84d5d1c..0000000 --- a/charts/prescale/templates/inference-service.yaml +++ /dev/null @@ -1,23 +0,0 @@ -{{- if .Values.inference.enabled }} -apiVersion: v1 -kind: Service -metadata: - name: {{ include "prescale.inference.fullname" . }} - labels: - {{- include "prescale.labels" . | nindent 4 }} - app.kubernetes.io/component: inference - {{- with .Values.inference.service.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -spec: - type: {{ .Values.inference.service.type }} - ports: - - port: {{ .Values.inference.service.port }} - targetPort: http - protocol: TCP - name: http - selector: - {{- include "prescale.selectorLabels" . | nindent 4 }} - app.kubernetes.io/component: inference -{{- end }} diff --git a/charts/prescale/templates/ingress.yaml b/charts/prescale/templates/ingress.yaml deleted file mode 100644 index 799b600..0000000 --- a/charts/prescale/templates/ingress.yaml +++ /dev/null @@ -1,42 +0,0 @@ -{{- if and .Values.inference.enabled .Values.inference.ingress.enabled }} -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: {{ include "prescale.inference.fullname" . }} - labels: - {{- include "prescale.labels" . 
| nindent 4 }} - app.kubernetes.io/component: inference - {{- with .Values.inference.ingress.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -spec: - {{- if .Values.inference.ingress.className }} - ingressClassName: {{ .Values.inference.ingress.className }} - {{- end }} - {{- if .Values.inference.ingress.tls }} - tls: - {{- range .Values.inference.ingress.tls }} - - hosts: - {{- range .hosts }} - - {{ . | quote }} - {{- end }} - secretName: {{ .secretName }} - {{- end }} - {{- end }} - rules: - {{- range .Values.inference.ingress.hosts }} - - host: {{ .host | quote }} - http: - paths: - {{- range .paths }} - - path: {{ .path }} - pathType: {{ .pathType }} - backend: - service: - name: {{ include "prescale.inference.fullname" $ }} - port: - name: http - {{- end }} - {{- end }} -{{- end }} diff --git a/charts/prescale/templates/pdb.yaml b/charts/prescale/templates/pdb.yaml deleted file mode 100644 index 0d7c00e..0000000 --- a/charts/prescale/templates/pdb.yaml +++ /dev/null @@ -1,20 +0,0 @@ -{{- if and .Values.inference.enabled .Values.inference.podDisruptionBudget.enabled }} -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - name: {{ include "prescale.inference.fullname" . }} - labels: - {{- include "prescale.labels" . | nindent 4 }} - app.kubernetes.io/component: inference -spec: - {{- if .Values.inference.podDisruptionBudget.minAvailable }} - minAvailable: {{ .Values.inference.podDisruptionBudget.minAvailable }} - {{- end }} - {{- if .Values.inference.podDisruptionBudget.maxUnavailable }} - maxUnavailable: {{ .Values.inference.podDisruptionBudget.maxUnavailable }} - {{- end }} - selector: - matchLabels: - {{- include "prescale.selectorLabels" . 
| nindent 6 }} - app.kubernetes.io/component: inference -{{- end }} diff --git a/charts/prescale/templates/pvc.yaml b/charts/prescale/templates/pvc.yaml deleted file mode 100644 index 263a49a..0000000 --- a/charts/prescale/templates/pvc.yaml +++ /dev/null @@ -1,17 +0,0 @@ -{{- if .Values.models.persistence.enabled }} -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: {{ include "prescale.fullname" . }}-models - labels: - {{- include "prescale.labels" . | nindent 4 }} -spec: - accessModes: - {{- toYaml .Values.models.persistence.accessModes | nindent 4 }} - {{- if or .Values.models.persistence.storageClass .Values.global.storageClass }} - storageClassName: {{ .Values.models.persistence.storageClass | default .Values.global.storageClass }} - {{- end }} - resources: - requests: - storage: {{ .Values.models.persistence.size }} -{{- end }} diff --git a/charts/prescale/templates/serviceaccount.yaml b/charts/prescale/templates/serviceaccount.yaml deleted file mode 100644 index 12d1506..0000000 --- a/charts/prescale/templates/serviceaccount.yaml +++ /dev/null @@ -1,12 +0,0 @@ -{{- if .Values.serviceAccount.create }} -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ include "prescale.serviceAccountName" . }} - labels: - {{- include "prescale.labels" . | nindent 4 }} - {{- with .Values.serviceAccount.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -{{- end }} diff --git a/charts/prescale/templates/servicemonitor.yaml b/charts/prescale/templates/servicemonitor.yaml deleted file mode 100644 index 8f826bc..0000000 --- a/charts/prescale/templates/servicemonitor.yaml +++ /dev/null @@ -1,25 +0,0 @@ -{{- if and .Values.inference.enabled .Values.prometheus.serviceMonitor.enabled }} -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: {{ include "prescale.inference.fullname" . }} - labels: - {{- include "prescale.labels" . 
| nindent 4 }} - app.kubernetes.io/component: inference - {{- with .Values.prometheus.serviceMonitor.labels }} - {{- toYaml . | nindent 4 }} - {{- end }} -spec: - selector: - matchLabels: - {{- include "prescale.selectorLabels" . | nindent 6 }} - app.kubernetes.io/component: inference - endpoints: - - port: http - path: /metrics - interval: {{ .Values.prometheus.serviceMonitor.interval }} - scrapeTimeout: {{ .Values.prometheus.serviceMonitor.scrapeTimeout }} - namespaceSelector: - matchNames: - - {{ .Release.Namespace }} -{{- end }} diff --git a/charts/prescale/values.yaml b/charts/prescale/values.yaml deleted file mode 100644 index 920d4b5..0000000 --- a/charts/prescale/values.yaml +++ /dev/null @@ -1,249 +0,0 @@ -# Prescale Helm Chart Values -# Default configuration values for the Prescale platform - -# -- Global settings -global: - # -- Image pull secrets for private registries - imagePullSecrets: [] - # -- Storage class for persistent volumes - storageClass: "" - -# -- Inference Service Configuration -inference: - # -- Enable the inference service - enabled: true - - # -- Number of replicas - replicaCount: 2 - - image: - # -- Image repository - repository: ghcr.io/pyjeebz/prescale/inference - # -- Image pull policy - pullPolicy: IfNotPresent - # -- Image tag (defaults to chart appVersion) - tag: "" - - # -- Service configuration - service: - type: ClusterIP - port: 8000 - annotations: {} - - # -- Ingress configuration - ingress: - enabled: false - className: "" - annotations: {} - # kubernetes.io/ingress.class: nginx - # cert-manager.io/cluster-issuer: letsencrypt-prod - hosts: - - host: prescale.local - paths: - - path: / - pathType: Prefix - tls: [] - # - secretName: prescale-tls - # hosts: - # - prescale.local - - # -- Resource limits and requests - resources: - limits: - cpu: 1000m - memory: 2Gi - requests: - cpu: 500m - memory: 1Gi - - # -- Autoscaling configuration - autoscaling: - enabled: true - minReplicas: 2 - maxReplicas: 10 - 
targetCPUUtilizationPercentage: 70 - targetMemoryUtilizationPercentage: 80 - - # -- Pod disruption budget - podDisruptionBudget: - enabled: true - minAvailable: 1 - - # -- Node selector - nodeSelector: {} - - # -- Tolerations - tolerations: [] - - # -- Affinity rules - affinity: {} - - # -- Additional environment variables - env: [] - # - name: LOG_LEVEL - # value: "INFO" - - # -- Additional environment variables from secrets/configmaps - envFrom: [] - # - secretRef: - # name: prescale-secrets - - # -- Liveness probe configuration - livenessProbe: - httpGet: - path: /health - port: http - initialDelaySeconds: 30 - periodSeconds: 10 - timeoutSeconds: 5 - failureThreshold: 3 - - # -- Readiness probe configuration - readinessProbe: - httpGet: - path: /health - port: http - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 3 - failureThreshold: 3 - -# -- Cost Intelligence Service Configuration -costIntelligence: - # -- Enable the cost intelligence service - enabled: true - - # -- Number of replicas - replicaCount: 1 - - image: - # -- Image repository - repository: ghcr.io/pyjeebz/prescale/cost-intelligence - # -- Image pull policy - pullPolicy: IfNotPresent - # -- Image tag (defaults to chart appVersion) - tag: "" - - # -- Service configuration - service: - type: ClusterIP - port: 8001 - annotations: {} - - # -- Resource limits and requests - resources: - limits: - cpu: 500m - memory: 1Gi - requests: - cpu: 250m - memory: 512Mi - - # -- Node selector - nodeSelector: {} - - # -- Tolerations - tolerations: [] - - # -- Affinity rules - affinity: {} - -# -- Model Configuration -models: - # -- Enable model persistence - persistence: - enabled: true - # -- Storage class (uses global.storageClass if empty) - storageClass: "" - # -- Size of the persistent volume - size: 5Gi - # -- Access modes - accessModes: - - ReadWriteOnce - - # -- Model training configuration - training: - # -- Enable scheduled training jobs - enabled: false - # -- Cron schedule for training - 
schedule: "0 2 * * *" # Daily at 2 AM - # -- Training job resource limits - resources: - limits: - cpu: 2000m - memory: 4Gi - -# -- Prometheus Integration -prometheus: - # -- Enable bundled Prometheus - enabled: false - - # -- External Prometheus URL (if not using bundled) - externalUrl: "" - - # -- ServiceMonitor configuration - serviceMonitor: - enabled: true - interval: 30s - scrapeTimeout: 10s - labels: {} - -# -- Grafana Integration -grafana: - # -- Enable bundled Grafana - enabled: false - - # -- Install Prescale dashboards - dashboards: - enabled: true - -# -- KEDA Autoscaling (prediction-driven) -keda: - # -- Enable KEDA integration - enabled: false - - # -- ScaledObject configuration - scaledObject: - # -- Polling interval - pollingInterval: 30 - # -- Cooldown period - cooldownPeriod: 300 - # -- Minimum replicas - minReplicaCount: 1 - # -- Maximum replicas - maxReplicaCount: 10 - -# -- Service Account -serviceAccount: - # -- Create service account - create: true - # -- Service account annotations - annotations: {} - # -- Service account name (generated if not set) - name: "" - -# -- Pod annotations -podAnnotations: {} - -# -- Pod security context -podSecurityContext: - fsGroup: 1000 - -# -- Container security context -securityContext: - runAsNonRoot: true - runAsUser: 1000 - readOnlyRootFilesystem: true - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - -# -- Network policies -networkPolicy: - # -- Enable network policies - enabled: false - # -- Ingress rules - ingress: [] - # -- Egress rules - egress: [] diff --git a/cli/pyproject.toml b/cli/pyproject.toml index 8d910d3..cc62950 100644 --- a/cli/pyproject.toml +++ b/cli/pyproject.toml @@ -5,26 +5,26 @@ build-backend = "hatchling.build" [project] name = "prescale-cli" version = "0.1.0" -description = "CLI for Prescale - Predictive Infrastructure Intelligence Platform" +description = "Launch-readiness load testing for solo/indie devs - find what breaks before your users do" readme = 
"README.md" license = "Apache-2.0" requires-python = ">=3.10" authors = [ - { name = "Prescale Team" } + { name = "Mujeeb Lawal-Saka", email = "lawalsakamujeeb@gmail.com" } ] -keywords = ["kubernetes", "ml", "infrastructure", "cli", "devops"] +keywords = ["load-testing", "performance", "stress-test", "cli", "devtools", "launch"] classifiers = [ - "Development Status :: 4 - Beta", + "Development Status :: 3 - Alpha", "Environment :: Console", "Intended Audience :: Developers", - "Intended Audience :: System Administrators", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", - "Topic :: System :: Systems Administration", + "Topic :: Software Development :: Testing", + "Topic :: Software Development :: Testing :: Traffic Generation", ] dependencies = [ "click>=8.0.0", diff --git a/cloudbuild-cost.yaml b/cloudbuild-cost.yaml deleted file mode 100644 index 6713f99..0000000 --- a/cloudbuild-cost.yaml +++ /dev/null @@ -1,29 +0,0 @@ -steps: - # Build the Cost Intelligence Docker image - - name: 'gcr.io/cloud-builders/docker' - args: - - 'build' - - '-t' - - 'gcr.io/$PROJECT_ID/prescale-cost-intelligence:${_TAG_NAME}' - - '-t' - - 'gcr.io/$PROJECT_ID/prescale-cost-intelligence:latest' - - '-f' - - 'ml/cost_intelligence/Dockerfile' - - 'ml/cost_intelligence' - - # Push the images - - name: 'gcr.io/cloud-builders/docker' - args: ['push', 'gcr.io/$PROJECT_ID/prescale-cost-intelligence:${_TAG_NAME}'] - - - name: 'gcr.io/cloud-builders/docker' - args: ['push', 'gcr.io/$PROJECT_ID/prescale-cost-intelligence:latest'] - -images: - - 'gcr.io/$PROJECT_ID/prescale-cost-intelligence:${_TAG_NAME}' - - 'gcr.io/$PROJECT_ID/prescale-cost-intelligence:latest' - -substitutions: - _TAG_NAME: '0.1.0' - -options: - logging: CLOUD_LOGGING_ONLY diff --git a/cloudbuild.yaml b/cloudbuild.yaml 
deleted file mode 100644 index 254bae9..0000000 --- a/cloudbuild.yaml +++ /dev/null @@ -1,22 +0,0 @@ -steps: - # Build the inference service image - - name: 'gcr.io/cloud-builders/docker' - args: - - 'build' - - '-t' - - 'gcr.io/$PROJECT_ID/prescale-inference:${_TAG_NAME}' - - '-t' - - 'gcr.io/$PROJECT_ID/prescale-inference:latest' - - '-f' - - 'ml/inference/Dockerfile' - - 'ml' - -images: - - 'gcr.io/$PROJECT_ID/prescale-inference:${_TAG_NAME}' - - 'gcr.io/$PROJECT_ID/prescale-inference:latest' - -substitutions: - _TAG_NAME: '0.1.0' - -options: - logging: CLOUD_LOGGING_ONLY diff --git a/commits.txt b/commits.txt deleted file mode 100644 index 105a08a..0000000 --- a/commits.txt +++ /dev/null @@ -1,90 +0,0 @@ -First commit -feat: Add Terraform modules for GCP infrastructure setup -Add load testing framework with personas and traffic patterns -feat: Add Kubernetes configuration for Locust load testing setup -feat: Enhance monitoring setup with Grafana and Prometheus configurations -feat: Phase 3 - Observability with GKE Managed Prometheus -Phase 4: ML pipeline with baseline, Prophet, and XGBoost models -feat(kubernetes): Add Prescale Inference Service with autoscaling and monitoring -Add predictor and recommender services for time-series forecasting and scaling decisions -feat(kubernetes): Add ConfigMap and Deployment for cost intelligence service; update inference service image and config - Implement cost forecasting and savings analysis modules -Add comprehensive documentation for Prescale ML Pipeline, Inference Service, and Cost Intelligence Dashboard -Add Windows and Bash setup scripts for GCP configuration -Update README.md with new content and formatting improvements -Update README.md with new content and formatting improvements -Reorganize repo: separate core product from demo environment -Remove obsolete metric fetching scripts and training data -Add Apache License 2.0 -Phase 1: Open Source Core Foundation -fix(ci): Make lint and security scan more lenient 
-fix: Update GitHub URLs from placeholder to pyjeebz/prescale -style: Auto-fix formatting and import sorting -Add unified metrics collection agent with pluggable sources -docs: Add prescale-agent documentation to README -fix: Correct Docker COPY paths for CI build context -feat: Add CI badges, quick deploy buttons, and Render config -fix: Use PYPI_API_TOKEN instead of OIDC for PyPI publishing -fix: Fix README encoding (UTF-16 to UTF-8) and add Helm dependency update step -fix: Rename packages to prescale-platform and prescale-platform-agent for PyPI uniqueness -feat: Add GitHub Pages workflow for Helm chart repository -chore: Trigger Helm Pages deployment -feat: Add ingestion endpoint for metrics from prescale-agent and create initial agent configuration -fix: open-source readiness - port alignment, docker-compose, Makefile -feat(agent): enhance GCP Cloud Monitoring collector with robust value extraction -feat(config): add GCP Cloud Monitoring backend configuration -feat(ml): add CLI args and portable model format for training pipeline -feat(inference): add portable model loaders for production deployment -feat(k8s): add GCS model download init container for inference deployment -feat(cli): enhance predict, detect, and recommend commands with rich output -build(ml): add Cloud Build configuration for inference container -build(cli): add pyproject.toml for prescale-cli package -chore: update .gitignore to exclude ML artifacts and temp files -chore(gitignore): add Prescale-specific exclusions for artifacts and temp files -docs(readme): comprehensive rewrite with updated architecture and features -docs(cli): complete CLI documentation with examples and options -docs(quickstart): rewrite with three deployment options -docs(infra): modernize deployment guide with current architecture -docs(examples): clarify demo environment purpose and usage -chore: remove unnecessary files and duplicate directories -Add example agent configuration with all sources documented -Add ML 
model tests for baseline, prophet, and xgboost -Add agent package tests and README for PyPI -Add CLI tests and fix pyproject.toml URLs -Update CI workflow with separate test jobs for ML, agent, and CLI -Update release workflow to publish both agent and CLI to PyPI -Add script to generate demo models for testing -Add optional API key authentication for inference service -Bump agent to v0.2.0, update default endpoint to deployed service -Rename package to prescale-agent -ci: retrigger build -chore: move files from web-ui to main -fix: resolve stash pop conflicts by keeping stashed versions -docs: update README with latest from web-ui -fix(web): Update vite proxy config for API routing -feat(web): Simplify dashboard to focus on AI insights - remove heavy metrics charts -feat(web): Add AgentInstall page with copy-to-clipboard and live status -feat(web): Add DeploymentsOverview page with create modal and environment badges -fix(web): Update Predictions page styling and fix API calls -fix(web): Update Anomalies page with improved detection display -feat(web): Add routes for AgentInstall and DeploymentsOverview pages -feat(web): Add deployments Pinia store for multi-deployment management -feat(web): Add agents Pinia store for agent status management -feat(web): Update App.vue with deployment selector in header and new navigation -feat(web): Add API service with ML endpoints for predict, detect, recommend -feat(web): Add reusable UI components (charts, badges, modals, stats cards) -style(web): Add global styles with Tailwind CSS configuration -chore(web): Add Vue app entry point and TypeScript declarations -chore(web): Add package.json with Vue, Tailwind, and Chart.js dependencies -chore(web): Add TypeScript configuration files -chore(web): Add Tailwind CSS and PostCSS configuration -chore(web): Add HTML entry point for Vue application -docs: Update README files with web dashboard documentation -docs: Enhance Quick Start with clearer setup options and first steps -docs: 
Update installation instructions to reflect package name change from prescale-platform-agent to prescale-agent -security: Remove GCP project ID from tracked config, add example file -fix: Update hostname retrieval logic in ingest_metrics function -fix: Update API target ports in Vite configuration -fix: Handle null values in anomaly deviation and value display -fix: Handle potential null values in action confidence display -fix: Merge config-level labels into collected metrics diff --git a/docker-compose.yml b/docker-compose.yml deleted file mode 100644 index 8491def..0000000 --- a/docker-compose.yml +++ /dev/null @@ -1,83 +0,0 @@ -# Prescale - Local Development Docker Compose -# Use for local development and testing - -services: - # Inference Service - ML predictions and anomaly detection - inference: - build: - context: ./ml - dockerfile: inference/Dockerfile - ports: - - "8080:8080" - environment: - - MODELS_DIR=/app/models - - LOG_LEVEL=INFO - volumes: - - ./ml/models:/app/models:ro - healthcheck: - test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8080/health')"] - interval: 30s - timeout: 5s - retries: 3 - start_period: 30s - restart: unless-stopped - - # Cost Intelligence Service - Cost analysis and recommendations - cost-intelligence: - build: - context: ./ml/cost_intelligence - dockerfile: Dockerfile - ports: - - "8001:8001" - environment: - - LOG_LEVEL=INFO - healthcheck: - test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8001/health')"] - interval: 30s - timeout: 5s - retries: 3 - start_period: 30s - restart: unless-stopped - - # Prometheus - Metrics collection (optional) - prometheus: - image: prom/prometheus:v2.48.0 - ports: - - "9090:9090" - volumes: - - ./infra/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro - - prometheus_data:/prometheus - command: - - '--config.file=/etc/prometheus/prometheus.yml' - - '--storage.tsdb.path=/prometheus' - - 
'--web.enable-lifecycle' - profiles: - - monitoring - restart: unless-stopped - - # Grafana - Visualization (optional) - grafana: - image: grafana/grafana:10.2.0 - ports: - - "3000:3000" - environment: - - GF_SECURITY_ADMIN_USER=admin - - GF_SECURITY_ADMIN_PASSWORD=admin - - GF_USERS_ALLOW_SIGN_UP=false - volumes: - - ./infra/grafana/dashboards:/etc/grafana/provisioning/dashboards:ro - - ./infra/grafana/datasources:/etc/grafana/provisioning/datasources:ro - - grafana_data:/var/lib/grafana - depends_on: - - prometheus - profiles: - - monitoring - restart: unless-stopped - -volumes: - prometheus_data: - grafana_data: - -networks: - default: - name: prescale-network diff --git a/docs/INTEGRATION.md b/docs/INTEGRATION.md deleted file mode 100644 index 4171b0f..0000000 --- a/docs/INTEGRATION.md +++ /dev/null @@ -1,775 +0,0 @@ -# Prescale Integration Guide - -## Integrating Prescale into Existing Infrastructure - -This guide explains how to deploy Prescale into an existing customer environment to provide predictive infrastructure intelligence. - ---- - -## Table of Contents - -1. [Integration Architecture](#integration-architecture) -2. [Deployment Options](#deployment-options) -3. [Step-by-Step Integration](#step-by-step-integration) -4. [Gaming App Example](#gaming-app-example) -5. [Custom Metrics Configuration](#custom-metrics-configuration) -6. 
[Multi-Tenant Setup](#multi-tenant-setup) - ---- - -## Integration Architecture - -Prescale connects to your existing observability stack - it doesn't replace it: - -``` -┌─────────────────────────────────────────────────────────────────────────────┐ -│ CUSTOMER INFRASTRUCTURE │ -├─────────────────────────────────────────────────────────────────────────────┤ -│ │ -│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ -│ │ Gaming App │ │ API Servers │ │ Database │ │ -│ │ (Unity/Unreal) │ │ (Node.js/Go) │ │ (PostgreSQL) │ │ -│ └────────┬────────┘ └────────┬────────┘ └────────┬────────┘ │ -│ │ │ │ │ -│ └──────────────────────┼──────────────────────┘ │ -│ │ │ -│ ▼ │ -│ ┌─────────────────────────┐ │ -│ │ Existing Prometheus │◄─── Customer's existing │ -│ │ (or Datadog/CloudWatch) │ monitoring stack │ -│ └────────────┬────────────┘ │ -│ │ │ -│ ┌──────────────────────────────┼───────────────────────────────────────┐ │ -│ │ │ PRESCALE (NEW) │ │ -│ │ ▼ │ │ -│ │ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ │ -│ │ │ Metrics Adapter │──▶│ ML Pipeline │──▶│ Inference API │ │ │ -│ │ │ • Prometheus │ │ • Training │ │ • /predict │ │ │ -│ │ │ • Datadog │ │ • XGBoost │ │ • /detect │ │ │ -│ │ │ • CloudWatch │ │ • Prophet │ │ • /recommend │ │ │ -│ │ └─────────────────┘ └─────────────────┘ └────────┬────────┘ │ │ -│ │ │ │ │ -│ │ ┌────────────┼────────────┐ │ │ -│ │ ▼ ▼ ▼ │ │ -│ │ ┌──────────┐ ┌──────────┐ ┌─────┐ │ │ -│ │ │ Webhook │ │ Slack │ │ API │ │ │ -│ │ │ (custom) │ │ Alerts │ │ │ │ │ -│ │ └──────────┘ └──────────┘ └─────┘ │ │ -│ └──────────────────────────────────────────────────────────────────────┘ │ -│ │ -└─────────────────────────────────────────────────────────────────────────────┘ -``` - ---- - -## Deployment Options - -### Option 1: Sidecar Deployment (Recommended for Single Customer) - -Deploy Prescale alongside existing services in the customer's cluster: - -```yaml -# prescale-customer-deployment.yaml -apiVersion: apps/v1 -kind: Deployment 
-metadata: - name: prescale-inference - namespace: prescale -spec: - replicas: 2 - template: - spec: - containers: - - name: prescale-inference - image: gcr.io/YOUR_PROJECT/prescale-inference:latest - env: - - name: PROMETHEUS_URL - value: "http://prometheus.monitoring:9090" # Customer's Prometheus - - name: CUSTOMER_ID - value: "gaming-corp" - - name: METRICS_NAMESPACE - value: "gaming-app" -``` - -### Option 2: SaaS / Multi-Tenant - -Run Prescale as a service that multiple customers connect to: - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ PRESCALE SaaS PLATFORM │ -├─────────────────────────────────────────────────────────────────┤ -│ │ -│ Customer A Customer B Customer C │ -│ (Gaming) (E-commerce) (Fintech) │ -│ │ │ │ │ -│ ▼ ▼ ▼ │ -│ ┌───────────────────────────────────────────────────────┐ │ -│ │ Prescale API Gateway │ │ -│ │ (tenant isolation) │ │ -│ └───────────────────────────────────────────────────────┘ │ -│ │ │ -│ ┌───────────┬───────────┼───────────┬───────────────┐ │ -│ ▼ ▼ ▼ ▼ ▼ │ -│ ┌─────┐ ┌─────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ -│ │ML-A │ │ML-B │ │ML-C │ │Models │ │Dashboard│ │ -│ │ │ │ │ │ │ │Store │ │(Grafana)│ │ -│ └─────┘ └─────┘ └─────────┘ └─────────┘ └─────────┘ │ -│ │ -└─────────────────────────────────────────────────────────────────┘ -``` - -### Option 3: Agent-Based (Remote Monitoring) - -Install a lightweight agent that pushes metrics to Prescale: - -```python -# prescale_agent.py - Install on customer infrastructure -from prescale_sdk import PrescaleAgent - -agent = PrescaleAgent( - api_key="customer-api-key", - endpoint="https://prescale.yourcompany.com", - customer_id="gaming-corp" -) - -# Auto-discover and forward metrics -agent.discover_prometheus("http://localhost:9090") -agent.start() -``` - ---- - -## Step-by-Step Integration - -### Step 1: Assess Customer's Current Stack - -```powershell -# Questions to answer: -# 1. What monitoring system? (Prometheus, Datadog, CloudWatch, etc.) -# 2. 
What cloud provider? (GCP, AWS, Azure, on-prem) -# 3. What orchestration? (Kubernetes, ECS, VMs) -# 4. What are the key services to monitor? -``` - -### Step 2: Configure Metrics Adapter - -Create a customer-specific configuration: - -```python -# ml/adapters/gaming_corp_config.py -from dataclasses import dataclass, field -from typing import List - -@dataclass -class GamingCorpConfig: - """Configuration for Gaming Corp integration.""" - - customer_id: str = "gaming-corp" - - # Their Prometheus endpoint - prometheus_url: str = "http://prometheus.gaming-corp.internal:9090" - - # Key metrics to monitor - metrics: List[str] = field(default_factory=lambda: [ - # Game server metrics - "game_server_active_players", - "game_server_matches_in_progress", - "game_server_queue_length", - "game_server_latency_ms", - - # Infrastructure metrics - "container_cpu_usage_seconds_total", - "container_memory_usage_bytes", - "http_requests_total", - "http_request_duration_seconds", - - # Database metrics - "pg_stat_activity_count", - "pg_database_size_bytes", - ]) - - # Gaming-specific thresholds - thresholds: dict = field(default_factory=lambda: { - "max_players_per_server": 100, - "latency_warning_ms": 50, - "latency_critical_ms": 100, - "queue_length_scale_trigger": 50, - }) -``` - -### Step 3: Train Models on Customer Data - -```python -# ml/train_customer.py -import os -from config import GamingCorpConfig -from pipeline.data_fetcher import DataFetcher -from models.xgboost_forecaster import XGBoostForecaster - -def train_for_customer(customer_config): - """Train models specific to customer's patterns.""" - - # Fetch historical data (7-30 days recommended) - fetcher = DataFetcher( - prometheus_url=customer_config.prometheus_url, - metrics=customer_config.metrics - ) - - data = fetcher.fetch_historical(days=14) - - # Train forecasting models - cpu_model = XGBoostForecaster(target="cpu") - cpu_model.train(data) - - memory_model = XGBoostForecaster(target="memory") - 
memory_model.train(data) - - # Train anomaly detection on their normal patterns - anomaly_model = IsolationForestDetector() - anomaly_model.train(data) - - # Save models with customer prefix - cpu_model.save(f"models/{customer_config.customer_id}_cpu_forecaster.joblib") - memory_model.save(f"models/{customer_config.customer_id}_memory_forecaster.joblib") - anomaly_model.save(f"models/{customer_config.customer_id}_anomaly_detector.joblib") - -if __name__ == "__main__": - config = GamingCorpConfig() - train_for_customer(config) -``` - -### Step 4: Deploy Customer Instance - -```yaml -# infra/kubernetes/customers/gaming-corp/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: prescale-gaming-corp - namespace: prescale - labels: - customer: gaming-corp -spec: - replicas: 2 - selector: - matchLabels: - app: prescale-inference - customer: gaming-corp - template: - metadata: - labels: - app: prescale-inference - customer: gaming-corp - spec: - containers: - - name: inference - image: gcr.io/YOUR_PROJECT/prescale-inference:latest - ports: - - containerPort: 8080 - env: - - name: CUSTOMER_ID - value: "gaming-corp" - - name: PROMETHEUS_URL - value: "http://prometheus.gaming-corp.internal:9090" - - name: MODEL_PREFIX - value: "gaming-corp" - - name: GCS_BUCKET - value: "prescale-models-production" - resources: - requests: - memory: "512Mi" - cpu: "250m" - limits: - memory: "1Gi" - cpu: "500m" -``` - -### Step 5: Configure Webhooks/Alerts - -```yaml -# Customer alert configuration -apiVersion: v1 -kind: ConfigMap -metadata: - name: prescale-alerts-gaming-corp -data: - alerts.yaml: | - webhooks: - - name: slack - url: "https://hooks.slack.com/services/CUSTOMER_WEBHOOK" - events: - - anomaly_detected - - scaling_recommended - - capacity_warning - - - name: pagerduty - url: "https://events.pagerduty.com/v2/enqueue" - routing_key: "CUSTOMER_ROUTING_KEY" - events: - - anomaly_detected - - capacity_critical - - thresholds: - anomaly_alert: 2.5 # Z-score - 
capacity_warning: 0.75 - capacity_critical: 0.90 -``` - ---- - -## Gaming App Example - -### Gaming-Specific Challenges - -| Challenge | Prescale Solution | -|-----------|-----------------| -| **Spike traffic** (game launches, events) | Prophet model with custom seasonality for scheduled events | -| **Match-based scaling** | Custom metrics: `active_matches`, `players_in_queue` | -| **Regional patterns** | Time-zone aware forecasting | -| **Real-time requirements** | Sub-second latency detection | - -### Custom Metrics for Gaming - -```python -# ml/adapters/gaming_metrics.py -""" -Gaming-specific metric transformations and features. -""" - -GAMING_METRICS = { - # Player activity - "concurrent_users": { - "prometheus_query": "sum(game_active_sessions)", - "forecast_horizon": "1h", - "seasonality": ["hourly", "daily", "weekly"], - }, - - # Match orchestration - "matches_in_progress": { - "prometheus_query": "sum(game_matches_active)", - "scale_trigger": "matches_in_queue > 100", - }, - - # Server capacity - "server_utilization": { - "prometheus_query": "avg(game_server_players / game_server_capacity)", - "warning_threshold": 0.8, - "critical_threshold": 0.95, - }, - - # Latency (critical for gaming) - "game_latency_p99": { - "prometheus_query": "histogram_quantile(0.99, game_request_latency_bucket)", - "anomaly_detection": True, - "alert_threshold_ms": 100, - }, -} - -def create_gaming_features(df): - """Create gaming-specific features for ML models.""" - - # Time-based features - df['hour'] = df.index.hour - df['day_of_week'] = df.index.dayofweek - df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int) - - # Peak hours (typically 6 PM - 11 PM local time) - df['is_peak_hour'] = df['hour'].between(18, 23).astype(int) - - # Rolling statistics for player counts - df['players_rolling_1h'] = df['concurrent_users'].rolling('1H').mean() - df['players_rolling_24h'] = df['concurrent_users'].rolling('24H').mean() - - # Match queue pressure - df['queue_pressure'] = 
df['matches_in_queue'] / df['matches_in_progress'].clip(lower=1) - - # Server headroom - df['server_headroom'] = 1 - df['server_utilization'] - - return df -``` - -### Gaming-Specific Scaling Rules - -```yaml -# infra/kubernetes/customers/gaming-corp/keda-scaledobject.yaml -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: game-server-scaler - namespace: gaming -spec: - scaleTargetRef: - name: game-server - minReplicaCount: 5 - maxReplicaCount: 100 - - triggers: - # Scale based on Prescale predictions - - type: prometheus - metadata: - serverAddress: http://prescale-inference.prescale:8080 - metricName: prescale_predicted_players_1h - threshold: "80" # Players per server - query: | - prescale_forecast_value{ - customer="gaming-corp", - metric="concurrent_users", - horizon="1h" - } - - # Emergency scale on queue length - - type: prometheus - metadata: - serverAddress: http://prometheus.monitoring:9090 - metricName: game_match_queue - threshold: "50" - query: sum(game_matches_in_queue) - - # Scale based on anomaly detection - - type: prometheus - metadata: - serverAddress: http://prescale-inference.prescale:8080 - metricName: prescale_anomaly_score - threshold: "2.5" - query: | - prescale_anomaly_score{ - customer="gaming-corp", - metric="concurrent_users" - } - - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleUp: - stabilizationWindowSeconds: 30 # Fast scale-up for gaming - policies: - - type: Percent - value: 100 - periodSeconds: 15 - scaleDown: - stabilizationWindowSeconds: 300 # Slow scale-down - policies: - - type: Percent - value: 10 - periodSeconds: 60 -``` - -### Event-Based Scaling (Game Launches, Tournaments) - -```python -# ml/adapters/gaming_events.py -""" -Handle scheduled gaming events that cause predictable traffic spikes. 
-""" - -from datetime import datetime, timedelta -from typing import List - -class GamingEventScheduler: - """Manage known events that affect traffic.""" - - def __init__(self): - self.events = [] - - def add_event(self, name: str, start: datetime, duration_hours: int, - expected_multiplier: float): - """ - Add a scheduled event. - - Args: - name: Event name (e.g., "Season 5 Launch") - start: Event start time - duration_hours: How long the spike lasts - expected_multiplier: Expected traffic multiplier (e.g., 3.0 = 3x normal) - """ - self.events.append({ - "name": name, - "start": start, - "end": start + timedelta(hours=duration_hours), - "multiplier": expected_multiplier, - }) - - def get_multiplier(self, timestamp: datetime) -> float: - """Get the expected traffic multiplier for a given time.""" - for event in self.events: - if event["start"] <= timestamp <= event["end"]: - return event["multiplier"] - return 1.0 - - def adjust_forecast(self, forecast: float, timestamp: datetime) -> float: - """Adjust ML forecast based on scheduled events.""" - multiplier = self.get_multiplier(timestamp) - return forecast * multiplier - - -# Example usage -scheduler = GamingEventScheduler() - -# Add known events -scheduler.add_event( - name="Season 5 Launch", - start=datetime(2026, 1, 15, 10, 0), # 10 AM UTC - duration_hours=48, - expected_multiplier=5.0 # Expect 5x normal traffic -) - -scheduler.add_event( - name="Weekend Tournament", - start=datetime(2026, 1, 18, 14, 0), # Saturday 2 PM UTC - duration_hours=8, - expected_multiplier=2.5 -) -``` - -### Gaming Dashboard (Grafana) - -```json -{ - "dashboard": { - "title": "Prescale - Gaming Corp", - "panels": [ - { - "title": "Player Forecast (1 Hour)", - "type": "timeseries", - "targets": [ - { - "expr": "game_active_sessions", - "legendFormat": "Actual Players" - }, - { - "expr": "prescale_forecast_value{customer='gaming-corp', metric='concurrent_users', horizon='1h'}", - "legendFormat": "Predicted (1h)" - } - ] - }, - { - 
"title": "Anomaly Detection", - "type": "stat", - "targets": [ - { - "expr": "prescale_anomaly_score{customer='gaming-corp'}", - "legendFormat": "Anomaly Score" - } - ], - "thresholds": [ - {"value": 0, "color": "green"}, - {"value": 2, "color": "yellow"}, - {"value": 3, "color": "red"} - ] - }, - { - "title": "Scaling Recommendations", - "type": "table", - "targets": [ - { - "expr": "prescale_recommendation{customer='gaming-corp'}", - "format": "table" - } - ] - }, - { - "title": "Server Capacity Headroom", - "type": "gauge", - "targets": [ - { - "expr": "1 - avg(game_server_players / game_server_capacity)", - "legendFormat": "Available Capacity" - } - ] - } - ] - } -} -``` - ---- - -## Custom Metrics Configuration - -### Adapter Interface - -```python -# ml/adapters/base.py -from abc import ABC, abstractmethod -from typing import Dict, List -import pandas as pd - -class MetricsAdapter(ABC): - """Base class for metrics adapters.""" - - @abstractmethod - def fetch_metrics(self, start_time, end_time) -> pd.DataFrame: - """Fetch metrics from the source system.""" - pass - - @abstractmethod - def get_current_metrics(self) -> Dict[str, float]: - """Get current metric values.""" - pass - - @abstractmethod - def list_available_metrics(self) -> List[str]: - """List all available metrics.""" - pass - - -class PrometheusAdapter(MetricsAdapter): - """Adapter for Prometheus-compatible systems.""" - - def __init__(self, url: str, queries: Dict[str, str]): - self.url = url - self.queries = queries - - def fetch_metrics(self, start_time, end_time) -> pd.DataFrame: - # Implementation... - pass - - -class DatadogAdapter(MetricsAdapter): - """Adapter for Datadog.""" - - def __init__(self, api_key: str, app_key: str): - self.api_key = api_key - self.app_key = app_key - - def fetch_metrics(self, start_time, end_time) -> pd.DataFrame: - # Implementation using Datadog API... 
- pass - - -class CloudWatchAdapter(MetricsAdapter): - """Adapter for AWS CloudWatch.""" - - def __init__(self, region: str, namespace: str): - self.region = region - self.namespace = namespace - - def fetch_metrics(self, start_time, end_time) -> pd.DataFrame: - # Implementation using boto3... - pass -``` - ---- - -## Multi-Tenant Setup - -### Database Schema - -```sql --- Multi-tenant schema for Prescale SaaS -CREATE TABLE customers ( - id UUID PRIMARY KEY, - name VARCHAR(255) NOT NULL, - api_key VARCHAR(64) UNIQUE NOT NULL, - created_at TIMESTAMP DEFAULT NOW(), - plan VARCHAR(50) DEFAULT 'standard', -- standard, pro, enterprise - settings JSONB DEFAULT '{}' -); - -CREATE TABLE customer_metrics ( - id BIGSERIAL PRIMARY KEY, - customer_id UUID REFERENCES customers(id), - metric_name VARCHAR(255) NOT NULL, - prometheus_query TEXT, - forecast_enabled BOOLEAN DEFAULT true, - anomaly_enabled BOOLEAN DEFAULT true, - thresholds JSONB DEFAULT '{}' -); - -CREATE TABLE models ( - id UUID PRIMARY KEY, - customer_id UUID REFERENCES customers(id), - model_type VARCHAR(50) NOT NULL, -- forecaster, anomaly_detector - model_path VARCHAR(500) NOT NULL, - trained_at TIMESTAMP DEFAULT NOW(), - metrics JSONB, -- Training metrics (MAE, accuracy, etc.) 
- is_active BOOLEAN DEFAULT true -); - -CREATE TABLE predictions ( - id BIGSERIAL PRIMARY KEY, - customer_id UUID REFERENCES customers(id), - timestamp TIMESTAMP NOT NULL, - metric_name VARCHAR(255) NOT NULL, - predicted_value FLOAT NOT NULL, - actual_value FLOAT, - horizon_minutes INT NOT NULL, - model_id UUID REFERENCES models(id) -); - -CREATE TABLE anomalies ( - id BIGSERIAL PRIMARY KEY, - customer_id UUID REFERENCES customers(id), - detected_at TIMESTAMP NOT NULL, - metric_name VARCHAR(255) NOT NULL, - anomaly_score FLOAT NOT NULL, - description TEXT, - acknowledged BOOLEAN DEFAULT false -); -``` - -### API with Tenant Isolation - -```python -# ml/inference/app_multitenant.py -from fastapi import FastAPI, Depends, HTTPException, Header -from typing import Optional - -app = FastAPI(title="Prescale Multi-Tenant API") - -async def get_customer(x_api_key: str = Header(...)): - """Validate API key and return customer context.""" - customer = await db.customers.find_one({"api_key": x_api_key}) - if not customer: - raise HTTPException(status_code=401, detail="Invalid API key") - return customer - -@app.get("/predict") -async def predict( - metric: str, - horizon: int = 15, - customer = Depends(get_customer) -): - """Get prediction for customer's metric.""" - - # Load customer-specific model - model = load_model(f"{customer['id']}_forecaster") - - # Get customer's current metrics - adapter = get_adapter(customer) - current_data = adapter.get_current_metrics() - - # Generate prediction - prediction = model.predict(current_data, horizon) - - return { - "customer": customer["name"], - "metric": metric, - "horizon_minutes": horizon, - "predicted_value": prediction, - "timestamp": datetime.utcnow().isoformat() - } -``` - ---- - -## Pricing Model Considerations - -| Tier | Features | Metrics | Retention | Price | -|------|----------|---------|-----------|-------| -| **Starter** | Basic forecasting | 10 | 7 days | $99/mo | -| **Pro** | + Anomaly detection, webhooks | 50 | 30 
days | $299/mo | -| **Enterprise** | + Custom models, SLA, support | Unlimited | 90 days | Custom | - ---- - -## Next Steps for Customer Integration - -1. **Discovery Call**: Understand their stack and pain points -2. **POC Deployment**: 2-week trial with their data -3. **Model Training**: Train on 14-30 days of historical data -4. **Integration**: Connect to their alerting systems -5. **Tuning**: Adjust thresholds based on feedback -6. **Production**: Full deployment with monitoring diff --git a/docs/QUICKSTART.md b/docs/QUICKSTART.md deleted file mode 100644 index 00694a8..0000000 --- a/docs/QUICKSTART.md +++ /dev/null @@ -1,288 +0,0 @@ -# Prescale Quick Start Guide - -Get Prescale running in minutes with this step-by-step guide. - -## Prerequisites - -- Python 3.11+ -- Docker (for local development) -- kubectl (for Kubernetes deployment) -- gcloud CLI (for GCP deployment) - ---- - -## Option 1: Local Development (Docker) - -The fastest way to try Prescale locally. - -```bash -# Clone the repository -git clone https://github.com/pyjeebz/prescale.git -cd prescale - -# Start inference service -docker compose up -d inference - -# Verify it's running -curl http://localhost:8080/health -# {"status": "healthy", "models_loaded": 3} - -# Test prediction endpoint -curl -X POST http://localhost:8080/api/v1/predict \ - -H "Content-Type: application/json" \ - -d '{"deployment": "test", "namespace": "default", "metric": "cpu", "periods": 6}' -``` - ---- - -## Option 2: GCP Deployment (Production) - -Deploy Prescale to Google Kubernetes Engine with GCP Cloud Monitoring integration. 
- -### Step 1: Setup GCP Project - -```bash -# Set your project ID -export GCP_PROJECT_ID="your-gcp-project-id" - -# Authenticate -gcloud auth login -gcloud config set project $GCP_PROJECT_ID - -# Enable required APIs -gcloud services enable \ - container.googleapis.com \ - cloudbuild.googleapis.com \ - monitoring.googleapis.com \ - storage.googleapis.com -``` - -### Step 2: Create GKE Cluster (if needed) - -```bash -# Create cluster -gcloud container clusters create prescale-cluster \ - --region us-central1 \ - --num-nodes 2 \ - --machine-type e2-medium - -# Get credentials -gcloud container clusters get-credentials prescale-cluster --region us-central1 -``` - -### Step 3: Create GCS Bucket for Models - -```bash -# Create bucket -gsutil mb gs://${GCP_PROJECT_ID}-prescale-models - -# Enable uniform bucket-level access -gsutil uniformbucketlevelaccess set on gs://${GCP_PROJECT_ID}-prescale-models -``` - -### Step 4: Train Models (Optional) - -Train on your GCP Cloud Monitoring data: - -```bash -# Setup Python environment -python -m venv .venv -source .venv/bin/activate # Windows: .venv\Scripts\activate -pip install -r ml/requirements.txt - -# Train models -cd ml -python train.py --namespace your-namespace --hours 24 - -# Upload to GCS -gsutil cp -r artifacts/* gs://${GCP_PROJECT_ID}-prescale-models/ -``` - -### Step 5: Deploy Inference Service - -```bash -# Build container image -gcloud builds submit --config cloudbuild.yaml - -# Update deployment with your project ID -sed -i "s/GCP_PROJECT_ID/$GCP_PROJECT_ID/g" infra/kubernetes/prescale-inference/deployment.yaml - -# Deploy to GKE -kubectl create namespace prescale -kubectl apply -f infra/kubernetes/prescale-inference/ - -# Wait for deployment -kubectl wait --for=condition=available deployment/prescale-inference -n prescale --timeout=300s - -# Get external IP -kubectl get svc prescale-inference -n prescale -``` - -### Step 6: Install CLI & Agent - -```bash -# Install CLI -pip install prescale-cli - -# Install agent 
with GCP support -pip install prescale-agent[gcp] -``` - -### Step 7: Configure Agent - -Create `prescale-agent.yaml`: - -```yaml -agent: - collection_interval: 60 - log_level: INFO - -sources: - - type: gcp-monitoring - enabled: true - config: - project_id: your-gcp-project-id - metrics: - - kubernetes.io/container/cpu/limit_utilization - - kubernetes.io/container/memory/limit_utilization - filters: - namespace: your-namespace - -prescale: - endpoint: http://EXTERNAL_IP:8080 # Replace with actual IP -``` - -### Step 8: Run Agent - -```bash -# Test collection -prescale-agent run --config prescale-agent.yaml --once - -# Run continuously -prescale-agent run --config prescale-agent.yaml -``` - -### Step 9: Use CLI - -```bash -# Set endpoint -export PRESCALE_ENDPOINT="http://EXTERNAL_IP:8080" - -# Get predictions -prescale predict cpu --deployment your-app --namespace your-namespace - -# Detect anomalies -prescale detect --deployment your-app --namespace your-namespace - -# Get recommendations -prescale recommend --deployment your-app --namespace your-namespace --replicas 2 -``` - ---- - -## Option 3: Helm Installation - -For existing Kubernetes clusters: - -```bash -# Add Helm repo -helm repo add prescale https://pyjeebz.github.io/prescale -helm repo update - -# Install -helm install prescale prescale/prescale \ - --namespace prescale \ - --create-namespace \ - --set inference.image.tag=latest - -# Verify -kubectl get pods -n prescale -``` - ---- - -## Verify Installation - -### Check Service Health - -```bash -curl http://localhost:8080/health -``` - -Expected response: -```json -{ - "status": "healthy", - "models_loaded": 3, - "models": ["baseline", "prophet", "xgboost"] -} -``` - -### Check Models - -```bash -curl http://localhost:8080/models -``` - -### Test Prediction - -```bash -curl -X POST http://localhost:8080/api/v1/predict \ - -H "Content-Type: application/json" \ - -d '{ - "deployment": "test-app", - "namespace": "default", - "metric": "cpu", - "periods": 
12 - }' -``` - ---- - -## Next Steps - -1. **Configure Alerting**: Set up Prometheus alerts based on Prescale predictions -2. **Enable KEDA**: Use predictions for predictive autoscaling -3. **Grafana Dashboards**: Import Prescale dashboards for visualization -4. **Web Dashboard**: Deploy the upcoming web UI for ClickOps management - ---- - -## Troubleshooting - -### Models Not Loading - -```bash -# Check pod logs -kubectl logs -n prescale deploy/prescale-inference - -# Verify GCS bucket access -gsutil ls gs://your-bucket/ -``` - -### Agent Not Collecting - -```bash -# Test with --once flag -prescale-agent run --config prescale-agent.yaml --once - -# Check GCP permissions -gcloud projects get-iam-policy $GCP_PROJECT_ID -``` - -### Connection Refused - -```bash -# Check service is running -kubectl get svc -n prescale - -# Verify endpoint -curl http://your-endpoint:8080/health -``` - ---- - -## Support - -- GitHub Issues: https://github.com/pyjeebz/prescale/issues -- Documentation: https://github.com/pyjeebz/prescale/tree/main/docs diff --git a/docs/architecture/ARCHITECTURE.md b/docs/architecture/ARCHITECTURE.md deleted file mode 100644 index 41f5fb5..0000000 --- a/docs/architecture/ARCHITECTURE.md +++ /dev/null @@ -1,698 +0,0 @@ -# Prescale Architecture Design - -## 1. System Overview - -Prescale is a **predictive infrastructure intelligence platform** that uses machine learning to forecast resource demand and provide proactive scaling recommendations. 
- -``` -┌─────────────────────────────────────────────────────────────────────────────────┐ -│ PRESCALE PLATFORM │ -├─────────────────────────────────────────────────────────────────────────────────┤ -│ │ -│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ -│ │ Metrics │───▶│ ML │───▶│ Inference │───▶│ Output │ │ -│ │ Adapter │ │ Pipeline │ │ Service │ │ Layer │ │ -│ └──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ │ -│ ▲ │ │ │ -│ │ ▼ ▼ │ -│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ -│ │ Cloud │ │ KEDA │ │ Grafana │ │ -│ │ Provider │ │ Autoscaler │ │ Dashboard │ │ -│ └──────────────┘ └──────────────┘ └──────────────┘ │ -│ ▲ │ │ -└─────────│───────────────────────────────────────│──────────────────────────────┘ - │ │ - │ ▼ -┌─────────────────────┐ ┌─────────────────────┐ -│ Target Workload │◀───────────────│ Kubernetes HPA │ -│ (e.g., Saleor) │ └─────────────────────┘ -└─────────────────────┘ -``` - -### Design Principles - -| Principle | Description | -|-----------|-------------| -| **Multi-Cloud** | Works on GKE, EKS, AKS, or any Kubernetes cluster | -| **Cloud-Agnostic** | Pluggable adapters for metrics sources | -| **Prometheus-Native** | Standard metrics format for easy integration | -| **Progressive** | Start simple (baseline), add complexity as needed | -| **Observable** | Full visibility into predictions and decisions | - ---- - -## 2. Component Architecture - -### 2.1 Metrics Adapter Layer - -**Purpose:** Abstract cloud provider differences, provide unified metrics interface. 
- -``` -┌─────────────────────────────────────────────────────────────────┐ -│ METRICS ADAPTER LAYER │ -├─────────────────────────────────────────────────────────────────┤ -│ │ -│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ -│ │ GCP │ │ AWS │ │ Azure │ │ -│ │ Adapter │ │ Adapter │ │ Adapter │ │ -│ │ (Cloud │ │ (Cloud │ │ (Azure │ │ -│ │ Monitoring)│ │ Watch) │ │ Monitor) │ │ -│ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │ -│ │ │ │ │ -│ └────────────────┼────────────────┘ │ -│ ▼ │ -│ ┌─────────────────┐ │ -│ │ Prometheus │ (Alternative: direct scrape)│ -│ │ Adapter │ │ -│ └────────┬────────┘ │ -│ ▼ │ -│ ┌─────────────────┐ │ -│ │ Unified Metrics │ │ -│ │ Interface │ │ -│ └─────────────────┘ │ -└─────────────────────────────────────────────────────────────────┘ -``` - -**Adapter Interface:** - -```python -class MetricsAdapter(ABC): - """Abstract base for cloud-agnostic metrics fetching.""" - - @abstractmethod - def fetch_container_metrics( - self, - namespace: str, - hours: int - ) -> pd.DataFrame: - """Fetch CPU, memory metrics for containers.""" - pass - - @abstractmethod - def fetch_database_metrics( - self, - instance_id: str, - hours: int - ) -> pd.DataFrame: - """Fetch database CPU, connections, etc.""" - pass -``` - -**Implemented Adapters:** - -| Adapter | Source | Status | -|---------|--------|--------| -| `GCPMetricsAdapter` | Cloud Monitoring API | ✅ Implemented | -| `AWSMetricsAdapter` | CloudWatch API | 🔲 Planned | -| `AzureMetricsAdapter` | Azure Monitor API | 🔲 Planned | -| `PrometheusMetricsAdapter` | Prometheus Query API | 🔲 Planned | - -**Metrics Collected:** - -```yaml -container_metrics: - - cpu_utilization # 0-1 ratio - - memory_utilization # 0-1 ratio - - memory_bytes # Absolute memory usage - - restart_count # Container restarts - -database_metrics: - - db_cpu_utilization # 0-1 ratio - - db_memory_utilization # 0-1 ratio - - db_connections # Active connections - - db_replication_lag # Seconds (if replica) - -cache_metrics: - - 
redis_memory_utilization # 0-1 ratio - - redis_connections # Connected clients - - redis_hit_rate # Cache hit ratio -``` - ---- - -### 2.2 ML Pipeline - -**Purpose:** Train and evaluate forecasting and anomaly detection models. - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ ML PIPELINE │ -├─────────────────────────────────────────────────────────────────┤ -│ │ -│ ┌─────────────────────────────────────────────────────────┐ │ -│ │ Data Fetcher │ │ -│ │ • Cloud Monitoring API integration │ │ -│ │ • Time alignment (5-min buckets) │ │ -│ │ • Multi-metric merging │ │ -│ └─────────────────────────┬───────────────────────────────┘ │ -│ ▼ │ -│ ┌─────────────────────────────────────────────────────────┐ │ -│ │ Feature Engineering │ │ -│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ │ -│ │ │ Temporal │ │ Lag │ │ Rolling │ │ Percent │ │ │ -│ │ │ Features │ │ Features │ │ Stats │ │ Change │ │ │ -│ │ └──────────┘ └──────────┘ └──────────┘ └──────────┘ │ │ -│ │ │ │ -│ │ Input: 7 raw metrics → Output: 108 features │ │ -│ └─────────────────────────┬───────────────────────────────┘ │ -│ ▼ │ -│ ┌─────────────────────────────────────────────────────────┐ │ -│ │ Model Training │ │ -│ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ -│ │ │ Baseline │ │ Prophet │ │ XGBoost │ │ │ -│ │ │ (MA+Trend) │ │ (Forecaster) │ │ (Anomaly) │ │ │ -│ │ │ │ │ │ │ │ │ │ -│ │ │ MAPE: 2.6% │ │ Cov: 46.9% │ │ Rate: 0.69% │ │ │ -│ │ └──────────────┘ └──────────────┘ └──────────────┘ │ │ -│ └─────────────────────────┬───────────────────────────────┘ │ -│ ▼ │ -│ ┌─────────────────────────────────────────────────────────┐ │ -│ │ Artifacts Storage │ │ -│ │ • Model weights (joblib/pickle) │ │ -│ │ • Training metrics (JSON) │ │ -│ │ • Data summaries │ │ -│ └─────────────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────────┘ -``` - -**Model Details:** - -| Model | Type | Purpose | Key Metrics | 
-|-------|------|---------|-------------| -| **Baseline** | Moving Average + Linear Trend | Simple forecasting benchmark | MAE, MAPE, Skill Score | -| **Prophet** | Facebook Prophet | Seasonality-aware forecasting | MAE, MAPE, Coverage | -| **XGBoost** | Gradient Boosting | Anomaly detection | Threshold, Anomaly Rate | - -**Feature Engineering (108 features from 7 inputs):** - -```python -# Temporal features (8) -hour, day_of_week, is_weekend -hour_sin, hour_cos, day_sin, day_cos -minutes_since_midnight - -# Per-metric features (14 per metric × 7 metrics = 98) -lag_1, lag_3, lag_6, lag_12 -rolling_mean_3, rolling_mean_6, rolling_mean_12 -rolling_std_3, rolling_std_6, rolling_std_12 -rolling_min_6, rolling_max_6 -pct_change_1, pct_change_3 - -# Cross-metric (2) -cpu_memory_ratio -``` - ---- - -### 2.3 Inference Service - -**Purpose:** Serve predictions, anomaly scores, and recommendations via REST API. - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ INFERENCE SERVICE (FastAPI) │ -├─────────────────────────────────────────────────────────────────┤ -│ │ -│ ┌─────────────────────────────────────────────────────────┐ │ -│ │ API Endpoints │ │ -│ │ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌─────────┐ │ │ -│ │ │ /predict │ │ /detect │ │/recommend │ │/metrics │ │ │ -│ │ │ │ │ │ │ │ │(Prom) │ │ │ -│ │ └───────────┘ └───────────┘ └───────────┘ └─────────┘ │ │ -│ └─────────────────────────┬───────────────────────────────┘ │ -│ │ │ -│ ┌─────────────────────────┼───────────────────────────────┐ │ -│ │ Model Manager │ │ -│ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ -│ │ │ Baseline │ │ Prophet │ │ XGBoost │ │ │ -│ │ │ Model │ │ Model │ │ Model │ │ │ -│ │ └──────────────┘ └──────────────┘ └──────────────┘ │ │ -│ └─────────────────────────────────────────────────────────┘ │ -│ │ -│ ┌─────────────────────────────────────────────────────────┐ │ -│ │ Recommendation Engine │ │ -│ │ • Scale up/down decisions │ │ -│ │ • Resource limit 
suggestions │ │ -│ │ • Proactive alerts │ │ -│ └─────────────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────────┘ -``` - -**API Specification:** - -| Endpoint | Method | Request | Response | -|----------|--------|---------|----------| -| `/health` | GET | - | `{status, models_loaded}` | -| `/models` | GET | - | List of loaded models | -| `/predict` | POST | `{metrics, periods}` | Forecasts + confidence | -| `/detect` | POST | `{metrics}` | Anomaly score + flag | -| `/recommend` | POST | `{metrics, forecasts}` | Scaling advice | -| `/metrics` | GET | - | Prometheus format | - -**Prometheus Metrics Exposed:** - -```prometheus -# Predictions as gauges -prescale_predicted_cpu{namespace="saleor", deployment="api"} 0.72 -prescale_predicted_memory{namespace="saleor", deployment="api"} 0.58 - -# Anomaly scores -prescale_anomaly_score{namespace="saleor", deployment="api"} 0.23 -prescale_anomaly_detected{namespace="saleor", deployment="api"} 0 - -# Recommendations -prescale_recommended_replicas{namespace="saleor", deployment="api"} 3 -prescale_recommendation_confidence{namespace="saleor", deployment="api"} 0.87 -``` - ---- - -### 2.4 Recommendation Engine - -**Purpose:** Convert forecasts into actionable scaling recommendations. 
- -``` -┌─────────────────────────────────────────────────────────────────┐ -│ RECOMMENDATION ENGINE │ -├─────────────────────────────────────────────────────────────────┤ -│ │ -│ ┌─────────────────────────────────────────────────────────┐ │ -│ │ Input Signals │ │ -│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │ -│ │ │ Forecasts │ │ Anomaly │ │ Current │ │ │ -│ │ │ (Prophet/ │ │ Scores │ │ State │ │ │ -│ │ │ Baseline) │ │ (XGBoost) │ │ (K8s API) │ │ │ -│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │ -│ └─────────────────────────┬───────────────────────────────┘ │ -│ ▼ │ -│ ┌─────────────────────────────────────────────────────────┐ │ -│ │ Decision Rules │ │ -│ │ │ │ -│ │ IF predicted_cpu_30min > 80% AND confidence > 0.8: │ │ -│ │ → SCALE_UP │ │ -│ │ │ │ -│ │ IF cpu_utilization < 20% for 1h: │ │ -│ │ → SCALE_DOWN │ │ -│ │ │ │ -│ │ IF anomaly_detected AND severity > threshold: │ │ -│ │ → ALERT (warning/critical) │ │ -│ │ │ │ -│ │ IF memory_utilization > 85%: │ │ -│ │ → INCREASE_MEMORY_LIMIT │ │ -│ │ │ │ -│ │ IF predicted_traffic_spike > 2x baseline: │ │ -│ │ → PREEMPTIVE_SCALE (before spike) │ │ -│ │ │ │ -│ └─────────────────────────┬───────────────────────────────┘ │ -│ ▼ │ -│ ┌─────────────────────────────────────────────────────────┐ │ -│ │ Recommendation Output │ │ -│ │ { │ │ -│ │ "action": "scale_up", │ │ -│ │ "target_replicas": 5, │ │ -│ │ "current_replicas": 3, │ │ -│ │ "reason": "Predicted CPU 82% in 30 min", │ │ -│ │ "confidence": 0.87, │ │ -│ │ "urgency": "medium" │ │ -│ │ } │ │ -│ └─────────────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────────┘ -``` - -**Recommendation Types:** - -| Type | Trigger | Action | -|------|---------|--------| -| `SCALE_UP` | Predicted high CPU/memory | Increase replicas | -| `SCALE_DOWN` | Low utilization for 1h+ | Decrease replicas | -| `INCREASE_MEMORY` | Memory > 85% | Adjust resource limits | -| `DECREASE_CPU` | CPU consistently < 20% | Reduce 
CPU requests | -| `ALERT_WARNING` | Anomaly detected | Notify operators | -| `ALERT_CRITICAL` | Severe anomaly | Page on-call | -| `PREEMPTIVE_SCALE` | Predicted spike | Scale before event | - ---- - -### 2.5 Autoscaling Integration (KEDA) - -**Purpose:** Automatically scale workloads based on Prescale predictions. - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ KEDA INTEGRATION │ -├─────────────────────────────────────────────────────────────────┤ -│ │ -│ ┌─────────────────────────────────────────────────────────┐ │ -│ │ Prescale Inference Service │ │ -│ │ │ │ -│ │ GET /metrics → prescale_predicted_cpu = 0.72 │ │ -│ │ │ │ -│ └─────────────────────────┬───────────────────────────────┘ │ -│ │ Prometheus scrape │ -│ ▼ │ -│ ┌─────────────────────────────────────────────────────────┐ │ -│ │ Prometheus │ │ -│ │ │ │ -│ │ prescale_predicted_cpu{deployment="saleor-api"} 0.72 │ │ -│ │ │ │ -│ └─────────────────────────┬───────────────────────────────┘ │ -│ │ PromQL query │ -│ ▼ │ -│ ┌─────────────────────────────────────────────────────────┐ │ -│ │ KEDA ScaledObject │ │ -│ │ │ │ -│ │ triggers: │ │ -│ │ - type: prometheus │ │ -│ │ metadata: │ │ -│ │ query: prescale_predicted_cpu{deployment="..."} │ │ -│ │ threshold: "70" │ │ -│ │ │ │ -│ └─────────────────────────┬───────────────────────────────┘ │ -│ │ Scale decision │ -│ ▼ │ -│ ┌─────────────────────────────────────────────────────────┐ │ -│ │ Kubernetes HPA / Deployment │ │ -│ │ │ │ -│ │ replicas: 3 → 5 │ │ -│ │ │ │ -│ └─────────────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────────┘ -``` - -**Why KEDA?** - -| Feature | Benefit | -|---------|---------| -| **Multi-cloud** | Works on GKE, EKS, AKS, any K8s | -| **Prometheus native** | Standard metrics integration | -| **Scale to zero** | Cost savings for idle workloads | -| **CNCF graduated** | Production-ready, vendor-neutral | -| **50+ scalers** | Future extensibility | - ---- - 
-## 3. Data Flow - -### 3.1 Training Flow - -``` -┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ -│ Cloud │───▶│ Data │───▶│ Feature │───▶│ Model │ -│ Monitoring │ │ Fetcher │ │ Engineering │ │ Training │ -└──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ - │ - ▼ - ┌──────────────┐ - │ Artifacts │ - │ Storage │ - └──────────────┘ -``` - -### 3.2 Inference Flow - -``` -┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ -│ Client │───▶│ Inference │───▶│ Model │───▶│ Response │ -│ Request │ │ Service │ │ Execution │ │ (JSON/Prom) │ -└──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ -``` - -### 3.3 Autoscaling Flow - -``` -┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ -│ Inference │───▶│ Prometheus │───▶│ KEDA │───▶│ HPA │ -│ /metrics │ │ (scrape) │ │ (evaluate) │ │ (scale) │ -└──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ - │ - ▼ - ┌──────────────┐ - │ Deployment │ - │ (replicas) │ - └──────────────┘ -``` - ---- - -## 4. 
Technology Stack - -### 4.1 ML Pipeline - -| Component | Technology | Purpose | -|-----------|------------|---------| -| **Language** | Python 3.11+ | ML ecosystem | -| **Data** | Pandas, NumPy | Data manipulation | -| **Forecasting** | Prophet | Time-series prediction | -| **Anomaly** | XGBoost, Scikit-learn | Anomaly detection | -| **API** | FastAPI | High-performance REST | -| **Metrics** | prometheus-client | Prometheus exposition | - -### 4.2 Infrastructure - -| Component | Technology | Purpose | -|-----------|------------|---------| -| **IaC** | Terraform | Multi-cloud provisioning | -| **Orchestration** | Kubernetes | Container orchestration | -| **Autoscaling** | KEDA | Event-driven autoscaling | -| **Monitoring** | Prometheus | Metrics collection | -| **Visualization** | Grafana | Dashboards | -| **Alerting** | Alertmanager | Alert routing | - -### 4.3 Cloud Support Matrix - -| Feature | GCP | AWS | Azure | On-Prem | -|---------|-----|-----|-------|---------| -| **Metrics Adapter** | ✅ Cloud Monitoring | 🔲 CloudWatch | 🔲 Azure Monitor | ✅ Prometheus | -| **Kubernetes** | ✅ GKE | ✅ EKS | ✅ AKS | ✅ Any | -| **Database** | ✅ Cloud SQL | ✅ RDS | ✅ Azure SQL | ✅ PostgreSQL | -| **Cache** | ✅ Memorystore | ✅ ElastiCache | ✅ Azure Cache | ✅ Redis | -| **Storage** | ✅ GCS | ✅ S3 | ✅ Blob | ✅ MinIO | - ---- - -## 5. 
Kubernetes Architecture - -### 5.1 Namespace Layout - -``` -Namespaces: -├── prescale # Prescale inference service -├── saleor # Demo application (Saleor e-commerce) -├── loadtest # Locust load testing -├── monitoring # Prometheus, Grafana, Alertmanager -├── keda # KEDA operator -└── gke-gmp-system # GKE Managed Prometheus (GCP only) -``` - -### 5.2 Prescale Deployment - -```yaml -# infra/kubernetes/prescale-inference/ -├── namespace.yaml -├── deployment.yaml # Inference service (FastAPI) -├── service.yaml # ClusterIP service -├── configmap.yaml # Configuration -├── serviceaccount.yaml # For Workload Identity -├── pod-monitoring.yaml # Prometheus scrape config -└── kustomization.yaml -``` - -### 5.3 Resource Estimates - -| Service | CPU Request | Memory Request | Replicas | -|---------|-------------|----------------|----------| -| **Inference Service** | 250m | 512Mi | 2 | -| **Saleor API** | 250m | 512Mi | 1-10 (scaled) | -| **Saleor Dashboard** | 50m | 64Mi | 1 | -| **Locust Master** | 100m | 256Mi | 1 | -| **Locust Worker** | 200m | 256Mi | 2 | - ---- - -## 6. Security - -### 6.1 Authentication & Access - -| Layer | Mechanism | -|-------|-----------| -| **API** | API keys / JWT tokens | -| **K8s** | RBAC, ServiceAccounts | -| **Cloud** | Workload Identity (GCP), IAM Roles (AWS) | -| **Network** | NetworkPolicies, ClusterIP | - -### 6.2 Workload Identity (GCP Example) - -```yaml -# ServiceAccount with GCP Workload Identity -apiVersion: v1 -kind: ServiceAccount -metadata: - name: prescale-inference - namespace: prescale - annotations: - iam.gke.io/gcp-service-account: prescale-inference@PROJECT.iam.gserviceaccount.com -``` - ---- - -## 7. 
Observability - -### 7.1 Prescale Self-Monitoring - -```yaml -Metrics (Prometheus): - # Inference latency - - prescale_inference_duration_seconds - - prescale_inference_requests_total - - # Model performance - - prescale_prediction_confidence - - prescale_anomaly_detections_total - - # Recommendation tracking - - prescale_recommendations_total - - prescale_scaling_actions_total - -Logging (Structured JSON): - - Request/response logs - - Model inference traces - - Recommendation audit trail - -Dashboards (Grafana): - - Prescale Overview (predictions vs actuals) - - Anomaly Timeline - - Scaling History - - Model Performance -``` - -### 7.2 Alert Rules - -```yaml -# Alertmanager rules -groups: - - name: prescale - rules: - - alert: PrescaleHighAnomalyRate - expr: rate(prescale_anomaly_detections_total[5m]) > 0.1 - for: 5m - labels: - severity: warning - annotations: - summary: "High anomaly detection rate" - - - alert: PrescaleInferenceLatency - expr: prescale_inference_duration_seconds > 1 - for: 5m - labels: - severity: warning - annotations: - summary: "Inference latency too high" -``` - ---- - -## 8. 
Development Phases - -### ✅ Phase 1: Foundation (Complete) -- [x] Project structure -- [x] Terraform modules (GKE, Cloud SQL, Redis, GCS) -- [x] GKE Autopilot cluster -- [x] Networking (VPC, subnets) - -### ✅ Phase 2: Demo Application (Complete) -- [x] Saleor e-commerce deployment -- [x] Cloud SQL PostgreSQL integration -- [x] Redis caching -- [x] GCS media storage - -### ✅ Phase 3: Observability (Complete) -- [x] GKE Managed Prometheus -- [x] PodMonitoring resources -- [x] Locust load testing (4 personas) -- [x] Metrics collection validated - -### ✅ Phase 4: ML Pipeline (Complete) -- [x] Cloud Monitoring data fetcher -- [x] Feature engineering (108 features) -- [x] Baseline model (MA + Trend) - 2.6% MAPE -- [x] Prophet forecasting - 46.9% coverage -- [x] XGBoost anomaly detection - 0.69% anomaly rate -- [x] Training pipeline orchestration - -### 🔲 Phase 5: Inference Service (In Progress) -- [ ] FastAPI inference service -- [ ] /predict, /detect, /recommend endpoints -- [ ] Prometheus /metrics endpoint -- [ ] Kubernetes deployment -- [ ] KEDA ScaledObject -- [ ] Grafana dashboards -- [ ] Alertmanager integration - -### 🔲 Phase 6: Multi-Cloud (Planned) -- [ ] AWS CloudWatch adapter -- [ ] Azure Monitor adapter -- [ ] Prometheus adapter (generic) -- [ ] Cross-cloud testing - -### 🔲 Phase 7: Advanced Models (Planned) -- [ ] LSTM sequence model -- [ ] Transformer architecture -- [ ] Ensemble methods -- [ ] Online learning - ---- - -## 9. Appendix - -### A. 
Configuration Reference - -```yaml -# Prescale configuration -gcp: - project_id: "your-project-id" - region: "us-central1" - -metrics: - lookback_hours: 24 - aggregation_interval_minutes: 5 - -models: - baseline: - moving_average_window: 12 - trend_window: 24 - prophet: - seasonality_mode: "multiplicative" - changepoint_prior_scale: 0.05 - xgboost: - n_estimators: 100 - max_depth: 6 - anomaly_threshold_sigma: 2.5 - -scaling: - cpu_scale_up_threshold: 0.80 - cpu_scale_down_threshold: 0.20 - memory_warning_threshold: 0.85 - min_replicas: 1 - max_replicas: 10 - cooldown_seconds: 300 -``` - -### B. Glossary - -| Term | Definition | -|------|------------| -| **MAPE** | Mean Absolute Percentage Error | -| **Coverage** | % of actuals within prediction interval | -| **Anomaly Score** | Distance from normal (in std deviations) | -| **KEDA** | Kubernetes Event-Driven Autoscaling | -| **HPA** | Horizontal Pod Autoscaler | -| **Workload Identity** | GCP service account for K8s pods | - -### C. References - -- [KEDA Documentation](https://keda.sh/docs/) -- [Prophet Documentation](https://facebook.github.io/prophet/) -- [XGBoost Documentation](https://xgboost.readthedocs.io/) -- [FastAPI Documentation](https://fastapi.tiangolo.com/) -- [Prometheus Client](https://github.com/prometheus/client_python) -- [GKE Managed Prometheus](https://cloud.google.com/stackdriver/docs/managed-prometheus) diff --git a/docs/cloudwatch_testing_guide.md b/docs/cloudwatch_testing_guide.md deleted file mode 100644 index 7aa20ae..0000000 --- a/docs/cloudwatch_testing_guide.md +++ /dev/null @@ -1,93 +0,0 @@ -# Prescale on AWS: Testing Guide - -This guide explains how to configure and test the Prescale ML Retraining service with AWS CloudWatch. - -## Prerequisites - -1. **AWS Account**: Access to an AWS account with CloudWatch metrics. -2. **Permissions**: User/Role needs `CloudWatchReadOnlyAccess` or `cloudwatch:GetMetricStatistics`. -3. 
**Data**: The account should have EC2, RDS, or ELB metrics available. - -## Configuration - -The Prescale Inference Service uses **Environment Variables** for configuration. - -### 1. Set Environment Variables - -Run these commands in PowerShell to configure the service for AWS: - -```powershell -# 1. Select CloudWatch as Data Source (REQUIRED) -$env:RETRAIN_DATA_SOURCE = "cloudwatch" -$env:RETRAIN_ENABLED = "true" - -# 2. AWS Credentials & Region -# Option A: Use standard AWS Env Vars (Recommended for local test) -$env:AWS_ACCESS_KEY_ID = "AKIA..." -$env:AWS_SECRET_ACCESS_KEY = "secret..." -$env:AWS_DEFAULT_REGION = "us-east-1" # Change to your region - -# Option B: Use AWS Profile (if you have ~/.aws/credentials) -# $env:AWS_PROFILE = "default" -# $env:AWS_DEFAULT_REGION = "us-east-1" -``` - -### 2. Verify Credentials - -Ensure you can list metrics (requires AWS CLI installed, optional but good for verification): - -```bash -aws cloudwatch list-metrics --namespace AWS/EC2 -``` - -## Running the Service - -Start the inference service: - -```powershell -# In c:\Users\Windows\Desktop\prescale -$env:PORT = "8001" -python -m ml.inference.app -``` - -You should see logs indicating the scheduler started: -`INFO: Retraining scheduler started: every 6h, source=cloudwatch` - -## Triggering a Test - -### Option A: Using the Dashboard - -1. Start the frontend: - ```bash - cd ml/inference/web - npm run dev - ``` -2. Open `http://localhost:3000`. -3. The **Model Training** card should show "Data Source: cloudwatch". -4. Click **Retrain Now**. 
- -### Option B: Using PowerShell / curl - -```powershell -# Trigger a retrain (fetch last 24h of data) -Invoke-RestMethod -Uri "http://localhost:8001/api/retrain/trigger" -Method Post -Body '{"hours": 24}' -ContentType "application/json" - -# Check status -Invoke-RestMethod -Uri "http://localhost:8001/api/retrain/status" -``` - -## Troubleshooting - -- **"No data returned"**: - - Verify your `AWS_DEFAULT_REGION` matches where your resources are. - - The fetcher looks for specific metrics by default: - - `AWS/EC2`: `CPUUtilization`, `NetworkIn`, `NetworkOut` - - `AWS/RDS`: `CPUUtilization` - - If your account is new/empty, you might not have these metrics. Launch an EC2 instance to generate data. - -- **"boto3 not installed"**: - - Run `pip install boto3`. - -- **Authentication Errors**: - - Double-check your Access Key and Secret Key. - - Ensure the IAM user has `CloudWatchReadOnlyAccess`. diff --git a/docs/deployment_to_gcp_vm.md b/docs/deployment_to_gcp_vm.md deleted file mode 100644 index 9ecce56..0000000 --- a/docs/deployment_to_gcp_vm.md +++ /dev/null @@ -1,122 +0,0 @@ -# Deploying Prescale to a Google Cloud (GCP) VM - -Yes, you can absolutely run Prescale on a GCP VM! This guide walks you through setting up a "Command Center" instance in the cloud. - -## 1. Create a VM Instance - -You can use the Google Cloud Console or the `gcloud` CLI. - -**Recommended Specs:** -- **Machine Type**: `e2-medium` (2 vCPU, 4GB RAM) - Good balance for dev/test. -- **OS**: Ubuntu 22.04 LTS. -- **Firewall**: Allow HTTP/HTTPS. - -### Using CLI (Quickest) -```bash -gcloud compute instances create prescale-server \ - --project=YOUR_PROJECT_ID \ - --zone=us-central1-a \ - --machine-type=e2-medium \ - --image-family=ubuntu-2204-lts \ - --image-project=ubuntu-os-cloud \ - --tags=http-server,https-server,prescale-web -``` - ---- - -## 2. Configure Firewall Rules - -By default, GCP blocks external ports. 
We need to open: -- **8080**: Prescale Backend API -- **3000**: Prescale Frontend (Web UI) - -```bash -gcloud compute firewall-rules create allow-prescale \ - --direction=INGRESS \ - --priority=1000 \ - --network=default \ - --action=ALLOW \ - --rules=tcp:8080,tcp:3000 \ - --source-ranges=0.0.0.0/0 \ - --target-tags=prescale-web -``` - ---- - -## 3. Install Dependencies - -SSH into your new VM: -```bash -gcloud compute ssh prescale-server -``` - -Run the following inside the VM: - -```bash -# Update and install system tools -sudo apt update && sudo apt install -y git python3-pip python3-venv nodejs npm - -# Install global node tools (optional but recommended) -sudo npm install -g n -sudo n stable -``` - ---- - -## 4. Deploy Prescale - -### A. Clone the Repository -```bash -git clone https://github.com/your-username/prescale.git -cd prescale -``` -*(Note: If using a private repo, you may need to set up an SSH key or use a Personal Access Token)* - -### B. Setup Backend -```bash -# Create virtual environment -python3 -m venv venv -source venv/bin/activate - -# Install dependencies -pip install -r requirements.txt -pip install prescale-agent # If agent is needed on the server itself - -# Run Backend (Background) -export PORT=8080 -export HOST=0.0.0.0 -nohup python3 -m prescale.ml.inference.app > backend.log 2>&1 & -``` - -### C. Setup Frontend -```bash -cd ml/inference/web - -# Install dependencies -npm install - -# Build for production (Recommended) -npm run build -npm run preview -- --host 0.0.0.0 --port 3000 -# OR for development mode -# npm run dev -- --host 0.0.0.0 --port 3000 -``` - ---- - -## 5. Access Your Prescale Instance - -1. Find your VM's **External IP**: - ```bash - gcloud compute instances list - ``` -2. Open your browser: - - **Dashboard**: `http://<EXTERNAL_IP>:3000` - - **API Docs**: `http://<EXTERNAL_IP>:8080/docs` - -## 6. Making it "Production Ready" (Optional) - -For a persistent setup, consider: -1. 
**Process Management**: Use `systemd` or `Supervisor` to keep the Python backend running. -2. **Reverse Proxy**: Use **Nginx** or **Caddy** to forward port 80 to 3000/8080 so you don't need to type ports in the URL. -3. **Docker**: Containerize the app for easier deployment (see `Dockerfile` if available). diff --git a/docs/gcp_agent_guide.md b/docs/gcp_agent_guide.md deleted file mode 100644 index 364bcb3..0000000 --- a/docs/gcp_agent_guide.md +++ /dev/null @@ -1,75 +0,0 @@ -# Prescale Agent on GCP: Usage Guide - -This guide explains how to install, configure, and run the Prescale Agent on Google Cloud Platform (GCP) to collect metrics. - -## 1. Installation - -The Prescale Agent is distributed as a Python package. You can install it with the GCP extra to include necessary dependencies (`google-cloud-monitoring`). - -```bash -# Install with GCP support -pip install prescale-agent[gcp] -``` - -## 2. Authentication - -The agent uses **Google Application Default Credentials (ADC)**. - -* **On a GKE Cluster or Compute Engine**: - * Ensure the attached Service Account has the **Monitoring Viewer** (`roles/monitoring.viewer`) IAM role. - * No further action is required; the agent auto-detects credentials. - -* **Locally (for testing)**: - ```bash - gcloud auth application-default login - ``` - -## 3. Configuration - -Create a configuration file named `prescale-agent.yaml`. - -```yaml -# prescale-agent.yaml - -# 1. Prescale Backend Endpoint -endpoint: - url: "http://localhost:8080" # URL of your Prescale Inference Service - api_key: "${PRESCALE_API_KEY}" # Optional, if auth is enabled - -# 2. 
Configure GCP Source -sources: - - name: gcp-production - type: gcp_monitoring - enabled: true - project_id: "your-gcp-project-id" # Optional: defaults to ADC project if omitted - interval: 60 # Collection interval in seconds - metrics: - - "kubernetes.io/container/cpu/limit_utilization" - - "kubernetes.io/container/memory/limit_utilization" - - "kubernetes.io/container/memory/used_bytes" -``` - -## 4. Running the Agent - -Run the agent pointing to your configuration file: - -```bash -# Set API Key if using variable expansion -export PRESCALE_API_KEY="your-secret-key" - -# Start the agent -prescale-agent --config prescale-agent.yaml -``` - -The agent will start, authenticate with GCP, fetch the configured metrics every 60 seconds, and push them to the Prescale backend. - -## 5. Verification - -You should see logs indicating successful collection: - -```text -INFO:prescale_agent.agent:Starting Prescale Agent v0.2.0... -INFO:prescale_agent.sources.gcp:Connected to GCP Project: your-gcp-project-id -INFO:prescale_agent.agent:Collected 15 metrics from gcp-production -INFO:prescale_agent.client:Successfully pushed batch of 15 metrics to http://localhost:8080 -``` diff --git a/docs/gcp_full_setup_guide.md b/docs/gcp_full_setup_guide.md deleted file mode 100644 index a7660b0..0000000 --- a/docs/gcp_full_setup_guide.md +++ /dev/null @@ -1,110 +0,0 @@ -# Prescale on GCP: Complete Setup Guide - -This guide details how to set up Prescale to: -1. **Collect Real-time Metrics** (using the Prescale Agent). -2. **Retrain Models Automatically** (using the Prescale Backend). - -## Prerequisites - -1. **GCP Project**: A Google Cloud Project with Cloud Monitoring API enabled. -2. **Credentials**: - * **Service Account** with `Monitoring Viewer` role. - * **Local Dev**: Run `gcloud auth application-default login`. 
- ---- - -## Part 1: Real-time Monitoring (The Agent) - -The **Prescale Agent** runs on your infrastructure (e.g., inside GKE), collects metrics, and pushes them to the Prescale Dashboard in real-time. - -### 1. Install the Agent - -```bash -pip install prescale-agent[gcp] -``` - -### 2. Configure the Agent - -Create a `prescale-agent.yaml` file: - -```yaml -# prescale-agent.yaml -endpoint: - url: "http://localhost:8001" # Backend URL - -sources: - - name: gcp-cluster-1 - type: gcp_monitoring - enabled: true - # Authentication uses ADC (gcloud login) automatically. - # explicit project_id is optional if ADC has a default project. - credentials: - project_id: "your-gcp-project-id" - metrics: - - "kubernetes.io/container/cpu/limit_utilization" - - "kubernetes.io/container/memory/limit_utilization" -``` - -### 3. Run the Agent - -```bash -prescale-agent --config prescale-agent.yaml -``` - -### 4. Verify on Dashboard - -1. Go to the **Dashboard**. -2. Look at the **Agents** card (Green icon). -3. You should see **"1 Agents Online"**. - ---- - -## Part 2: Automated Model Retraining (The Backend) - -The **Prescale Backend** can independently connect to GCP to fetch historical data (e.g., last 24h) to train new ML models. - -### 1. Configure the Backend - -Set these environment variables for the backend service: - -```powershell -# Windows PowerShell -$env:GOOGLE_CLOUD_PROJECT = "your-gcp-project-id" -$env:RETRAIN_ENABLED = "true" -$env:RETRAIN_DATA_SOURCE = "gcp" -``` - -### 2. Run the Backend - -```powershell -# Ensure you are in the prescale root directory -$env:PORT = "8001" -python -m ml.inference.app -``` - -### 3. Verify on Dashboard - -1. Go to the **Dashboard**. -2. Look at the **Model Training** card (Purple icon). -3. It should say **Source: GCP**. -4. Click **"Retrain Now"** to trigger an immediate fetch-and-train cycle. 
- ---- - -## Part 3: Running the Dashboard - -To see all of this, you need the frontend running: - -```bash -cd ml/inference/web -npm run dev -``` - -Open your browser to `http://localhost:3000`. - -## Summary - -| Component | Connected To | Purpose | Dashboard Verification | -|-----------|--------------|---------|------------------------| -| **Agent** | `prescale-agent.yaml` | Real-time stream | **Agents** Card | -| **Backend** | Env Vars | Historical training | **Model Training** Card | diff --git a/docs/gcp_testing_guide.md b/docs/gcp_testing_guide.md deleted file mode 100644 index 765ae78..0000000 --- a/docs/gcp_testing_guide.md +++ /dev/null @@ -1,90 +0,0 @@ -# Prescale on GCP: Testing Guide - -This guide explains how to configure and test the Prescale ML Retraining service with Google Cloud Platform (GCP) Cloud Monitoring. - -## Prerequisites - -1. **GCP Project**: You need a Google Cloud Project with Cloud Monitoring enabled. -2. **Permissions**: Your user/service account needs `Monitoring Viewer` role. -3. **Data**: The project should have GKE or other metrics available (e.g., `kubernetes.io/container/cpu/limit_utilization`). - -## Configuration - -The Prescale Inference Service uses **Environment Variables** for configuration. It does *not* read `prescale-agent.yaml` directly (that is for the Go agent). - -### 1. Set Environment Variables - -Run these commands in PowerShell before starting the service: - -```powershell -# 1. Set your GCP Project ID (REQUIRED) -$env:GOOGLE_CLOUD_PROJECT = "your-gcp-project-id" - -# 2. Authentication (ADC) -# If running locally, you likely don't need to set GOOGLE_APPLICATION_CREDENTIALS -# if you have run 'gcloud auth application-default login'. -# Otherwise: -# $env:GOOGLE_APPLICATION_CREDENTIALS = "C:\path\to\service-account-key.json" - -# 3. Configure Retraining -$env:RETRAIN_ENABLED = "true" -$env:RETRAIN_DATA_SOURCE = "gcp" -$env:RETRAIN_INTERVAL_HOURS = "6" -``` - -### 2. 
Verify Authentication - -Ensure you are authenticated locally: - -```bash -gcloud auth application-default login -``` - -## Running the Service - -Start the inference service: - -```powershell -# In c:\Users\Windows\Desktop\prescale -$env:PORT = "8001" -python -m ml.inference.app -``` - -You should see logs indicating the scheduler started: -`INFO: Retraining scheduler started: every 6h, source=gcp` - -## Triggering a Test - -You can manually trigger a retraining cycle to verify data fetching and model training without waiting for the schedule. - -### Option A: Using the Dashboard - -1. Start the frontend: - ```bash - cd ml/inference/web - npm run dev - ``` -2. Open `http://localhost:3000`. -3. Locate the **Model Training** card. -4. Click **Retrain Now**. - -### Option B: Using PowerShell / curl - -```powershell -# Trigger a retrain (fetch last 24h of data) -Invoke-RestMethod -Uri "http://localhost:8001/api/retrain/trigger" -Method Post -Body '{"hours": 24}' -ContentType "application/json" - -# Check status -Invoke-RestMethod -Uri "http://localhost:8001/api/retrain/status" -``` - -## Troubleshooting - -- **"No data returned"**: - - Verify your GCP Project ID is correct. - - Check if the project actually has metrics for the requested window (`kubernetes.io/container/...`). - - Try increasing the window: `{"hours": 168}` (7 days). - -- **Authentication Errors**: - - Run `gcloud auth application-default login` again. - - Check if `GOOGLE_APPLICATION_CREDENTIALS` is pointing to a valid key file. diff --git a/docs/phase5-design.md b/docs/phase5-design.md deleted file mode 100644 index 70deda2..0000000 --- a/docs/phase5-design.md +++ /dev/null @@ -1,510 +0,0 @@ -# Phase 5: Inference Service Design - -## Overview - -Phase 5 implements the **Prescale Inference Service** - a real-time prediction API that serves ML models and provides actionable scaling recommendations. - ---- - -## Goals - -1. **Serve trained models** via REST API -2. 
**Expose Prometheus metrics** for integration with KEDA -3. **Provide recommendations** for scaling and resource optimization -4. **Enable autoscaling** through KEDA + Prometheus integration - ---- - -## Architecture - -``` -┌─────────────────────────────────────────────────────────────────────────────┐ -│ PHASE 5 ARCHITECTURE │ -├─────────────────────────────────────────────────────────────────────────────┤ -│ │ -│ ┌─────────────────────────────────────────────────────────────────────┐ │ -│ │ INFERENCE SERVICE (FastAPI) │ │ -│ │ Port: 8080 │ │ -│ │ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ │ │ -│ │ │ /health │ │ /predict │ │ /detect │ │/recommend │ │ │ -│ │ │ /ready │ │ │ │ │ │ │ │ │ -│ │ │ /models │ │ │ │ │ │ │ │ │ -│ │ └───────────┘ └───────────┘ └───────────┘ └───────────┘ │ │ -│ │ │ │ -│ │ ┌───────────────────────────────────────────────────────────────┐ │ │ -│ │ │ /metrics (Prometheus format) │ │ │ -│ │ │ - prescale_predicted_cpu │ │ │ -│ │ │ - prescale_predicted_memory │ │ │ -│ │ │ - prescale_anomaly_score │ │ │ -│ │ │ - prescale_recommended_replicas │ │ │ -│ │ └───────────────────────────────────────────────────────────────┘ │ │ -│ └─────────────────────────────────────────────────────────────────────┘ │ -│ │ │ -│ │ Prometheus scrape │ -│ ▼ │ -│ ┌─────────────────────────────────────────────────────────────────────┐ │ -│ │ PROMETHEUS │ │ -│ │ - Scrapes /metrics endpoint │ │ -│ │ - Stores time-series data │ │ -│ │ - Provides PromQL interface │ │ -│ └─────────────────────────┬───────────────────────────────────────────┘ │ -│ │ │ -│ ┌──────────────────┼──────────────────┐ │ -│ ▼ ▼ ▼ │ -│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ -│ │ Grafana │ │ KEDA │ │Alertmanager │ │ -│ │ (dashboards)│ │ (autoscale) │ │ (alerts) │ │ -│ └─────────────┘ └──────┬──────┘ └─────────────┘ │ -│ │ │ -│ ▼ │ -│ ┌─────────────┐ │ -│ │ Target HPA │ │ -│ │ (Saleor) │ │ -│ └─────────────┘ │ -│ │ 
-└─────────────────────────────────────────────────────────────────────────────┘ -``` - ---- - -## Components - -### 1. Inference Service (FastAPI) - -**Location:** `ml/inference/` - -``` -ml/inference/ -├── __init__.py -├── app.py # FastAPI application -├── config.py # Service configuration -├── models.py # Pydantic request/response schemas -├── model_manager.py # Load and manage ML models -├── predictor.py # Prediction logic -├── anomaly_detector.py # Anomaly scoring logic -├── recommender.py # Recommendation engine -└── metrics.py # Prometheus metrics -``` - -### 2. Real-time Scoring Loop - -**Location:** `ml/scoring/` - -``` -ml/scoring/ -├── __init__.py -├── scorer.py # Main scoring loop -├── metrics_stream.py # Fetch latest metrics from cloud -└── publisher.py # Update Prometheus gauges -``` - -### 3. Kubernetes Manifests - -**Location:** `infra/kubernetes/prescale-inference/` - -``` -infra/kubernetes/prescale-inference/ -├── namespace.yaml -├── configmap.yaml -├── deployment.yaml -├── service.yaml -├── serviceaccount.yaml -├── pod-monitoring.yaml -└── kustomization.yaml -``` - -### 4. KEDA Integration - -**Location:** `infra/kubernetes/keda/` - -``` -infra/kubernetes/keda/ -├── namespace.yaml -├── scaledobject-saleor.yaml -└── kustomization.yaml -``` - -### 5. Grafana Dashboards - -**Location:** `infra/kubernetes/grafana/dashboards/` - -``` -infra/kubernetes/grafana/dashboards/ -├── prescale-overview.json -├── prescale-predictions.json -├── prescale-anomalies.json -└── prescale-recommendations.json -``` - -### 6. 
Alertmanager Rules - -**Location:** `infra/kubernetes/alertmanager/` - -``` -infra/kubernetes/alertmanager/ -├── alertmanager-config.yaml -└── prescale-rules.yaml -``` - ---- - -## API Specification - -### Health & Info - -```yaml -GET /health -Response: - { - "status": "healthy", - "timestamp": "2026-01-02T10:00:00Z" - } - -GET /ready -Response: - { - "ready": true, - "models_loaded": ["baseline", "prophet", "xgboost"] - } - -GET /models -Response: - { - "models": [ - { - "name": "baseline", - "version": "1.0.0", - "loaded_at": "2026-01-02T09:00:00Z", - "metrics": {"mape": 0.026} - }, - { - "name": "prophet", - "version": "1.0.0", - "loaded_at": "2026-01-02T09:00:00Z", - "metrics": {"coverage": 0.469} - }, - { - "name": "xgboost", - "version": "1.0.0", - "loaded_at": "2026-01-02T09:00:00Z", - "metrics": {"anomaly_rate": 0.0069} - } - ] - } -``` - -### Prediction - -```yaml -POST /predict -Request: - { - "metrics": { - "cpu_utilization": 0.45, - "memory_utilization": 0.62, - "memory_bytes": 536870912, - "db_cpu": 0.30, - "db_memory": 0.55, - "db_connections": 15 - }, - "periods": 12, - "model": "baseline" # Optional: baseline, prophet, or auto - } - -Response: - { - "predictions": [ - { - "period": 1, - "timestamp": "2026-01-02T10:05:00Z", - "cpu_utilization": 0.48, - "memory_utilization": 0.63, - "confidence_lower": 0.42, - "confidence_upper": 0.54 - }, - # ... 
12 periods - ], - "model_used": "baseline", - "confidence": 0.87 - } -``` - -### Anomaly Detection - -```yaml -POST /detect -Request: - { - "metrics": { - "cpu_utilization": 0.95, - "memory_utilization": 0.88, - "memory_bytes": 1073741824, - "db_cpu": 0.85, - "db_memory": 0.75, - "db_connections": 150 - } - } - -Response: - { - "anomaly_score": 2.8, - "is_anomaly": true, - "threshold": 2.5, - "severity": "warning", # normal, warning, critical - "contributing_features": [ - {"feature": "db_connections", "contribution": 0.45}, - {"feature": "cpu_utilization", "contribution": 0.30}, - {"feature": "memory_utilization", "contribution": 0.25} - ] - } -``` - -### Recommendations - -```yaml -POST /recommend -Request: - { - "namespace": "saleor", - "deployment": "saleor-api", - "current_replicas": 2, - "current_metrics": { - "cpu_utilization": 0.75, - "memory_utilization": 0.60 - }, - "predicted_metrics": { - "cpu_utilization": 0.85, - "memory_utilization": 0.65 - }, - "anomaly_score": 1.2 - } - -Response: - { - "recommendations": [ - { - "type": "scaling", - "action": "scale_up", - "target_replicas": 4, - "reason": "Predicted CPU 85% exceeds threshold 80%", - "confidence": 0.87, - "urgency": "medium", - "execute_by": "2026-01-02T10:15:00Z" - } - ], - "summary": "Scale up recommended before predicted load increase" - } -``` - -### Prometheus Metrics - -```yaml -GET /metrics -Response (text/plain): - # HELP prescale_predicted_cpu Predicted CPU utilization - # TYPE prescale_predicted_cpu gauge - prescale_predicted_cpu{namespace="saleor",deployment="saleor-api",horizon="5m"} 0.48 - prescale_predicted_cpu{namespace="saleor",deployment="saleor-api",horizon="30m"} 0.72 - - # HELP prescale_anomaly_score Current anomaly score - # TYPE prescale_anomaly_score gauge - prescale_anomaly_score{namespace="saleor",deployment="saleor-api"} 1.2 - - # HELP prescale_anomaly_detected Whether anomaly is detected (0/1) - # TYPE prescale_anomaly_detected gauge - 
prescale_anomaly_detected{namespace="saleor",deployment="saleor-api"} 0 - - # HELP prescale_recommended_replicas Recommended replica count - # TYPE prescale_recommended_replicas gauge - prescale_recommended_replicas{namespace="saleor",deployment="saleor-api"} 4 - - # HELP prescale_inference_duration_seconds Inference latency - # TYPE prescale_inference_duration_seconds histogram - prescale_inference_duration_seconds_bucket{endpoint="/predict",le="0.1"} 950 - prescale_inference_duration_seconds_bucket{endpoint="/predict",le="0.5"} 999 - prescale_inference_duration_seconds_bucket{endpoint="/predict",le="1.0"} 1000 -``` - ---- - -## KEDA ScaledObject - -```yaml -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: saleor-api-scaler - namespace: saleor -spec: - scaleTargetRef: - name: saleor-api - minReplicaCount: 1 - maxReplicaCount: 10 - pollingInterval: 30 - cooldownPeriod: 300 - triggers: - - type: prometheus - metadata: - serverAddress: http://prometheus.monitoring.svc:9090 - metricName: prescale_predicted_cpu - query: | - prescale_predicted_cpu{ - namespace="saleor", - deployment="saleor-api", - horizon="30m" - } - threshold: "70" - activationThreshold: "50" -``` - ---- - -## Configuration - -### Environment Variables - -```bash -# Service -PRESCALE_PORT=8080 -PRESCALE_LOG_LEVEL=INFO - -# Models -PRESCALE_MODEL_PATH=/app/models -PRESCALE_MODEL_REFRESH_INTERVAL=3600 - -# Metrics -PRESCALE_METRICS_NAMESPACE=saleor -PRESCALE_METRICS_DEPLOYMENT=saleor-api - -# Cloud (for real-time scoring) -GCP_PROJECT_ID=YOUR_GCP_PROJECT_ID -PRESCALE_SCORING_INTERVAL=300 - -# Thresholds -PRESCALE_CPU_SCALE_UP=0.80 -PRESCALE_CPU_SCALE_DOWN=0.20 -PRESCALE_MEMORY_WARNING=0.85 -PRESCALE_ANOMALY_THRESHOLD=2.5 -``` - -### ConfigMap - -```yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: prescale-config - namespace: prescale -data: - config.yaml: | - service: - port: 8080 - log_level: INFO - - scoring: - interval_seconds: 300 - namespace: saleor - deployment: 
saleor-api - - thresholds: - cpu_scale_up: 0.80 - cpu_scale_down: 0.20 - memory_warning: 0.85 - anomaly_sigma: 2.5 - - scaling: - min_replicas: 1 - max_replicas: 10 - cooldown_seconds: 300 -``` - ---- - -## Implementation Order - -| Step | Task | Est. Time | -|------|------|-----------| -| 5.1 | Pydantic schemas (`models.py`) | 15 min | -| 5.2 | Model manager (`model_manager.py`) | 20 min | -| 5.3 | Predictor (`predictor.py`) | 15 min | -| 5.4 | Anomaly detector (`anomaly_detector.py`) | 15 min | -| 5.5 | Recommender (`recommender.py`) | 25 min | -| 5.6 | Prometheus metrics (`metrics.py`) | 15 min | -| 5.7 | FastAPI app (`app.py`) | 20 min | -| 5.8 | Real-time scorer (`scoring/`) | 25 min | -| 5.9 | Dockerfile | 10 min | -| 5.10 | K8s manifests | 20 min | -| 5.11 | KEDA ScaledObject | 15 min | -| 5.12 | Grafana dashboards | 20 min | -| 5.13 | Alertmanager rules | 15 min | -| 5.14 | Testing & validation | 30 min | - -**Total: ~4 hours** - ---- - -## Success Criteria - -- [ ] `/health` responds < 100ms -- [ ] `/predict` returns forecasts with confidence intervals -- [ ] `/detect` flags anomalies correctly -- [ ] `/recommend` provides actionable scaling advice -- [ ] `/metrics` exposes Prometheus gauges -- [ ] KEDA scales Saleor based on predictions -- [ ] Grafana shows predictions vs actuals -- [ ] Alerts fire on anomaly detection - ---- - -## Testing Plan - -### Unit Tests -```bash -cd ml -pytest tests/inference/ -``` - -### Integration Tests -```bash -# Port-forward inference service -kubectl port-forward -n prescale svc/prescale-inference 8080:8080 - -# Test health -curl http://localhost:8080/health - -# Test predict -curl -X POST http://localhost:8080/predict \ - -H "Content-Type: application/json" \ - -d '{"metrics": {"cpu_utilization": 0.5}, "periods": 6}' - -# Test detect -curl -X POST http://localhost:8080/detect \ - -H "Content-Type: application/json" \ - -d '{"metrics": {"cpu_utilization": 0.95}}' - -# Test metrics -curl http://localhost:8080/metrics -``` - 
-### Load Test -```bash -# Use Locust to stress test inference service -locust -f loadtest/inference_test.py --host=http://localhost:8080 -``` - ---- - -## Rollback Plan - -1. **If inference service fails:** - - KEDA continues using last known metric values - - Revert to standard CPU-based HPA as fallback - -2. **If predictions are inaccurate:** - - Disable KEDA ScaledObject - - Investigate model drift - - Retrain with fresh data - -3. **If alerts are noisy:** - - Adjust thresholds in ConfigMap - - Add silencing rules in Alertmanager diff --git a/docs/real_world_testing.md b/docs/real_world_testing.md deleted file mode 100644 index 8e90ea3..0000000 --- a/docs/real_world_testing.md +++ /dev/null @@ -1,108 +0,0 @@ -# Real-World Scenario Testing Guide for Prescale - -This guide explains how to validate Prescale in a realistic environment by running the agent on a separate machine (or simulated VM) and generating synthetic load. - -## 1. Prerequisites -- **Prescale Server**: Identifying your machine's IP address. -- **Target Machine**: A Linux VM (Ubuntu/Debian recommended) or a Docker container to act as the "Production Server". -- **Traffic Generator**: `stress-ng` (Linux) or similar. - ---- - -## 2. Server Setup (Your Dev Machine) -Ensure Prescale is running and accessible from outside localhost. - -1. **Find your Local IP**: - ```bash - # Windows PowerShell - ipconfig - # Look for IPv4 Address (e.g., 192.168.1.50) - ``` - -2. **Start Prescale Backend**: - Ensure it listens on `0.0.0.0` (all interfaces), not just `127.0.0.1`. - ```bash - # In c:\Users\Windows\Desktop\prescale - $env:HOST="0.0.0.0" - $env:PORT="8080" - python -m prescale.ml.inference.app - ``` - *Note: If using `uvicorn` directly, add `--host 0.0.0.0`.* - ---- - -## 3. Agent Setup (The "Real World" Server) - -### Option A: Using Docker (Quickest) -If you have Docker, you can simulate a separate server instantly. - -1. 
**Run the Agent Container**: - Replace `YOUR_PC_IP` with your actual IP (e.g., `192.168.1.50`). - ```bash - docker run -d --name production-db-node \ - -e PRESCALE_ENDPOINT="http://YOUR_PC_IP:8080" \ - -e PRESCALE_DEPLOYMENT="production-cluster" \ - --network host \ - prescale-agent:latest - ``` - -### Option B: Using a Linux VM/VPS -If you have a cloud VM (AWS/GCP) or local VM (VirtualBox/WSL2). - -1. **Install Python & Pip**: - ```bash - sudo apt update && sudo apt install -y python3-pip stress-ng - ``` - -2. **Install Prescale Agent**: - ```bash - pip install prescale-agent - ``` - -3. **Run the Agent**: - ```bash - export PRESCALE_ENDPOINT="http://YOUR_PC_IP:8080" - export PRESCALE_DEPLOYMENT="production-cluster" - prescale-agent run - ``` - ---- - -## 4. Scenario: The "Memory Leak" Simulation -Let's trick Prescale into thinking there's an anomaly. - -1. **Go to Prescale Dashboard**: - - Create a new deployment called `production-cluster` (if not auto-created). - - Verify the agent `production-db-node` is **Online**. - -2. **Generate Normal Load (Baseline)**: - Let the agent run for ~5 minutes to establish a baseline. - - CPU: ~10-20% - - Memory: Stable. - -3. **Trigger the Incident**: - Run `stress-ng` on the target machine (or inside the Docker container) to simulate a memory leak and CPU spike. - - ```bash - # Simulate high load: 2 CPU cores, growing memory usage - stress-ng --cpu 2 --vm 1 --vm-bytes 500M --timeout 60s - ``` - -4. **Observe in Prescale**: - - **Real-time**: Watch the **Dashboard** CPU/Memory gauges spike. - - **Predictions**: See the **Predictions** graph diverge from the baseline (green line). - - **Anomalies**: Wait ~1-2 minutes. Go to the **Anomalies** tab. You should see a `Critical` or `High` severity anomaly detected for `cpu_percent` or `memory_percent`. - ---- - -## 5. Scenario: Network Failure -Simulate an agent disconnection to test resilience. - -1. 
**Kill the Agent**: - - Docker: `docker stop production-db-node` - - Linux: `Ctrl+C` or `kill AGENT_PID` (the agent's process ID) -2. **Observe**: - - The Agent status on the **Agents** page should turn **Offline** (Red) after the heartbeat timeout (usually ~30-60s). - -## Summary -By separating the Agent from the Server and injecting synthetic faults (`stress-ng`), you demonstrate Prescale's true value: **Observability without access to the machine itself.** diff --git a/docs/research_infra_chat.md b/docs/research_infra_chat.md deleted file mode 100644 index b098a8a..0000000 --- a/docs/research_infra_chat.md +++ /dev/null @@ -1,186 +0,0 @@ -# Infrastructure Chat Feature — Research Report - -## 1. LLM Provider Comparison - -| Provider | Model | Input Cost (per 1M tokens) | Output Cost (per 1M tokens) | Tool Calling Maturity | Latency | -|---|---|---|---|---|---| -| **Google Gemini 2.5 Flash** | gemini-2.5-flash | **$0.30** | $2.50 | Good (improving) | ~263 tok/s | -| Google Gemini 2.5 Pro | gemini-2.5-pro | $1.25 | $10.00 | Good | Higher (Deep Think) | -| Google Gemini 3 Flash | gemini-3-flash | $0.50 | $3.00 | Newest | Sub-200ms p95 | -| **OpenAI GPT-4o mini** | gpt-4o-mini | **$0.15** | $0.60 | Most mature | Fast | -| OpenAI GPT-4o | gpt-4o | $2.50 | $10.00 | Most mature | Moderate | -| Anthropic Claude 3.5 Sonnet | claude-3.5-sonnet | $3.00 | $15.00 | Good | Moderate | - -### Recommendation for Prescale - -**Primary: Gemini 2.5 Flash** — Prescale is already a GCP-native platform, so Gemini keeps the ecosystem consistent. At $0.30/1M input tokens, it's extremely cost-effective for a monitoring tool that makes many small tool-call queries. - -**Fallback consideration: GPT-4o mini** — Cheapest option overall and has the most mature tool-calling. Worth supporting as an alternative if users prefer OpenAI. - -> **Key Insight**: For Prescale's use case (short queries + tool calls + short answers), each chat turn will use roughly **500–2,000 tokens**. 
At ~1,000 tokens/turn with Gemini Flash, that's **$0.0003 per question** — effectively free for individual developers. - ---- - -## 2. Architecture Patterns - -### Option A: Raw Google Gemini SDK (Recommended for Prescale) - -``` -User → FastAPI endpoint → Gemini SDK (with tool defs) → Tool execution → Response -``` - -**Pros**: Simplest, no extra dependencies, full control, lowest latency -**Cons**: Must handle conversation state and tool-call loops manually -**Best for**: Prescale — our use case is straightforward (single-agent, ~7 tools, no multi-agent coordination) - -### Option B: LangChain - -**Pros**: Quick prototyping, many integrations, familiar ecosystem -**Cons**: Opaque control flow, debugging is harder, overkill for our tool count -**Best for**: RAG-heavy applications, rapid prototyping - -### Option C: LangGraph - -**Pros**: Graph-based state machine, production-grade, human-in-the-loop, multi-agent support -**Cons**: Steeper learning curve, heavier dependency, unnecessary complexity for <10 tools -**Best for**: Complex multi-agent systems, applications requiring branching workflows - -### Verdict for Prescale - -**Use raw Gemini SDK (`google-genai`).** Our chat feature is a single agent with ~7 well-defined tools querying internal Prescale data. LangGraph's state machine and multi-agent coordination are overkill. We can always migrate later if complexity grows. - -The tool-call loop is simple: -1. Send user message + tool definitions to Gemini -2. If Gemini returns a tool call → execute it against Prescale internals → send result back -3. Repeat until Gemini returns a text response -4. Stream the final text to the frontend via SSE - ---- - -## 3. 
Competitor Analysis - -### Datadog — Bits AI (Market Leader) - -- Acts as an **autonomous SRE agent** that can investigate outages, suggest code changes, and execute remediation playbooks -- Users can trigger safe, audited **actions from chat** (restart services, flush caches, quarantine accounts) -- All actions are validated against **role-based policies** and logged for accountability -- GA since Dec 2025, 1,000+ customers using AI products -- **Takeaway for Prescale**: The "action" layer (executing changes from chat) is the next frontier. We should start with **read-only** queries and consider actions later. - -### New Relic — AI / Grok - -- Conversational interface for natural language queries across telemetry data -- Root cause hypothesis generation and impact surface mapping -- **Predictive analytics** (forecast anomalies from chat) — directly relevant to Prescale -- **Agentic integrations** with external tools (ServiceNow, etc.) -- **Takeaway for Prescale**: Their approach of surfacing predictions and forecasts through chat is exactly what we should do. Our ML models give us a differentiator here. - -### Positioning for Prescale - -| Feature | Datadog Bits AI | New Relic Grok | Prescale Chat (Proposed) | -|---|---|---|---| -| Natural language queries | ✅ | ✅ | ✅ | -| Anomaly explanation | ✅ | ✅ | ✅ | -| **Predictive forecasting from chat** | ❌ | ✅ (limited) | **✅ (core strength)** | -| Execute remediation actions | ✅ | ❌ | ❌ (v1, future) | -| Multi-cloud support | ✅ | ✅ | ✅ (6+ sources) | -| Self-hosted / Open-source | ❌ | ❌ | **✅** | -| Cost | $$$$ | $$$ | **Free (self-hosted)** | - -**Prescale's edge**: Open-source + built-in ML predictions. No other open-source observability tool has an LLM chat grounded in its own forecasting models. - ---- - -## 4. UX Research — What SREs Actually Ask - -Research shows SRE questions fall into **5 categories**: - -### Category 1: Incident Investigation (Most Common) -> "What caused the latency spike in the auth service?" 
-> "Show me error logs from the last 30 minutes" -> "Correlate recent deployments with CPU spikes" - -**Prescale tools needed**: `get_metric_data`, `get_latest_anomalies` - -### Category 2: Health & Performance Monitoring -> "What's the current health of my cluster?" -> "Show P99 latency for the search service" -> "Are any services exceeding their error budget?" - -**Prescale tools needed**: `get_metric_latest`, `get_agent_status`, `get_metric_names` - -### Category 3: Capacity Planning & Prediction -> "Will my database handle next month's traffic?" -> "When will disk usage hit 90%?" -> "What scaling do you recommend?" - -**Prescale tools needed**: `get_predictions`, `get_recommendations` — **this is our strongest category** - -### Category 4: Alert Management -> "What are the top alerts from the last 12 hours?" -> "Summarize incidents while I was off-call" - -**Prescale tools needed**: `get_latest_anomalies` (with time filters) - -### Category 5: System Understanding -> "Explain the data flow between service A and B" -> "What monitoring is configured for the dashboard?" - -**Prescale tools needed**: `get_deployments`, `get_agent_status`, `get_metric_names` - -### Suggested Starter Prompts for Prescale Chat - -``` -"Any anomalies right now?" -"How's CPU looking across my agents?" -"Predict memory usage for the next 6 hours" -"What scaling changes do you recommend?" -"Which agents are offline?" -"Summarize the health of my infrastructure" -``` - ---- - -## 5. 
Cost & Feasibility Analysis - -### Token Usage Estimates - -| Interaction Type | Input Tokens | Output Tokens | Cost (Gemini 2.5 Flash) | -|---|---|---|---| -| Simple query ("any anomalies?") | ~800 | ~200 | $0.0007 | -| Tool-call query (1 tool) | ~1,500 | ~500 | $0.002 | -| Complex query (3 tools) | ~3,000 | ~800 | $0.003 | -| Heavy session (20 questions) | ~30,000 | ~8,000 | $0.03 | - -**Monthly cost estimates** (per user): -- Light use (5 questions/day): **~$0.50/month** -- Medium use (20 questions/day): **~$2.00/month** -- Heavy use (50 questions/day): **~$5.00/month** - -### Latency Expectations - -- **Gemini 2.5 Flash**: ~263 tokens/sec → a 200-token answer streams in <1 second -- **With 1 tool call**: Add ~200ms for internal Prescale query → total ~1.5s -- **With 3 tool calls**: Add ~600ms → total ~2.5s -- **SSE streaming**: User sees first tokens within ~500ms, creating a responsive feel - -### Implementation Effort - -| Component | Estimated Effort | Complexity | -|---|---|---| -| `chat.py` backend module | 2–3 hours | Medium (tool-call loop, SSE streaming) | -| Chat API endpoints in `app.py` | 30 min | Low | -| `InfraChat.vue` frontend | 2–3 hours | Medium (streaming UI, markdown rendering) | -| Router + sidebar updates | 15 min | Low | -| Testing & polish | 1–2 hours | Low | -| **Total** | **~6–8 hours** | — | - -### Risk Assessment - -| Risk | Severity | Mitigation | -|---|---|---| -| LLM hallucination (wrong metric values) | Medium | Ground all answers in real tool-call data; never let LLM guess values | -| API key exposure | Low | Server-side only; key never reaches frontend | -| Cost runaway | Low | Rate limiting on `/api/chat` endpoint | -| Gemini API downtime | Low | Feature is optional; rest of Prescale works without it | -| Slow responses on complex queries | Medium | Stream tokens via SSE; show "thinking" indicator | diff --git a/docs/user_journey_gcp_locust.md b/docs/user_journey_gcp_locust.md deleted file mode 100644 index 495cf71..0000000 --- 
a/docs/user_journey_gcp_locust.md +++ /dev/null @@ -1,186 +0,0 @@ -# User Journey: GCP & Locust Load Testing - -This guide simulates a real-world "Developer" scenario: -1. **You have an app on GCP.** -2. **You monitor it with Prescale.** -3. **You stress-test it with Locust.** -4. **You watch Prescale react (Predictions, Anomalies, Recommendations).** - ---- - -## Prerequisites (The "Third Party" Setup) - -1. **Google Cloud Project**: You need a project ID (e.g., `my-cool-project`). -2. **GCP CLI (`gcloud`)**: Installed and authenticated. -3. **Python 3.10+**: For running Prescale. - -### Authentication -Run this once to let Prescale access your GCP metrics: -```bash -gcloud auth application-default login -``` - ---- - -## Step 1: Deploy a Target Service (The "App") - -First, we need something to monitor. Let's deploy a simple "Hello World" service to **Cloud Run**. - -```bash -# Deploy a sample container (Google's "hello-cloud-run") -gcloud run deploy prescale-demo-target \ - --image=gcr.io/cloudrun/hello \ - --platform=managed \ - --region=us-central1 \ - --allow-unauthenticated -``` -*Take note of the URL provided (e.g., `https://prescale-demo-target-xyz.a.run.app`).* - ---- - -## Step 2: Configure Prescale Backend (The "Brain") - -We need to tell Prescale to pull metrics from GCP for its ML models. - -### 1. Set Environment Variables -In your terminal (PowerShell or Bash), configure the backend to use GCP as the data source. - -**PowerShell:** -```powershell -$env:GOOGLE_CLOUD_PROJECT = "YOUR_PROJECT_ID_HERE" -$env:RETRAIN_DATA_SOURCE = "gcp" - -# Important: Lower requirements for testing so we don't have to wait 24h -$env:RETRAIN_MIN_DATA_POINTS = "12" # Only require 1 hour of data (5-min intervals) -$env:RETRAIN_TRAINING_HOURS = "1" # Fetch last 1 hour -``` - -### 2. Run the Backend -```powershell -# From prescale root directory -python -m ml.inference.app -``` - -### 3. 
Run the Frontend (New Terminal) -```powershell -cd ml/inference/web -npm run dev -``` - ---- - -## Step 3: Configure Prescale Agent (The "Eyes") - -Now, we point the agent at your Cloud Run service. - -### 1. Edit `prescale-agent-gcp.yaml` -Update the `metrics` and `credentials` sections. - -```yaml -endpoint: - url: "http://localhost:8080" - -sources: - - name: gcp-demo-target - type: gcp_monitoring - enabled: true - interval_seconds: 60 - credentials: - project_id: "YOUR_PROJECT_ID_HERE" - - # Cloud Run Metrics - metrics: - - "run.googleapis.com/container/cpu/utilizations" - - "run.googleapis.com/container/memory/utilizations" - - "run.googleapis.com/request_count" - - labels: - service_name: "prescale-demo-target" - location: "us-central1" -``` - -### 2. Run the Agent -```powershell -prescale-agent --config prescale-agent-gcp.yaml -``` - -**Check Dashboard**: Go to `http://localhost:3000/agents`. You should see `gcp-demo-target` is **Online**. - ---- - -## Step 4: Establish a Baseline (Wait ~5-10 mins) - -Prescale needs data to learn what "normal" looks like. -1. Let the agent run for 5-10 minutes. -2. Go to **Dashboard** -> **Model Training**. -3. Click **"Retrain Now"**. -4. If successful, you'll see "Last Run: Completed" and data points > 0. - -*Note: If it says "Skipped (Insufficient data)", wait another 5 minutes and try again.* - ---- - -## Step 5: The Attack (Locust Load Test) - -Now we disrupt the system to trigger predictions and anomalies. - -### 1. Install Locust -```bash -pip install locust -``` - -### 2. Create `locustfile.py` -Save this file in your root folder: - -```python -from locust import HttpUser, task, between - -class WebsiteUser(HttpUser): - wait_time = between(1, 3) - - @task - def index(self): - self.client.get("/") -``` - -### 3. Launch the Attack -Run Locust pointing at your Cloud Run URL: -```bash -locust -f locustfile.py --host https://prescale-demo-target-xyz.a.run.app -``` - -1. Open `http://localhost:8089` (Locust Interface). -2. 
Start with **10 users**, Spawn rate **1**. -3. Let it run for 2 minutes (Baseline traffic). -4. **Ramp Up**: Edit the test to **500 users**, Spawn rate **50**. This causes a massive spike. - ---- - -## Step 6: Verify Prescale Features - -Go back to `http://localhost:3000`. - -### 1. Predictions -* Go to the **Predictions** tab. -* You should see the forecast (green line) expecting low/stable traffic. -* The actual values (blue line) will skyrocket above the confidence interval. - -### 2. Anomalies -* Wait ~1-2 minutes for the agent to report the spike. -* Go to the **Anomalies** tab. -* You should see a **High/Critical Severity** anomaly for `request_count` or `cpu/utilizations`. -* Description: "Value X is significantly higher than expected Y". - -### 3. Recommendations -* Go to the **Dashboard**. -* Look at the "AI Recommendations" card. -* You might see: "Scale UP prescale-demo-target to 5 replicas (Reason: High CPU utilization predicted)". - ---- - -## Summary -You have successfully: -1. Deployed a cloud service. -2. Connected Prescale (Backend & Agent) to it. -3. Simulated a traffic surge with Locust. -4. Confirmed Prescale detected the anomaly and offered a recommendation. diff --git a/docs/user_scenarios.md b/docs/user_scenarios.md deleted file mode 100644 index b6cc866..0000000 --- a/docs/user_scenarios.md +++ /dev/null @@ -1,103 +0,0 @@ -# Prescale Deployment Scenarios - -Prescale is designed to be flexible. Whether you are a solo developer or an enterprise SRE team, you can deploy Prescale to fit your infrastructure. - -## Architecture Overview - -Prescale consists of two main components: -1. **Prescale Server (The Brain)**: - * **Backend**: Python FastAPI service (Inference, Anomaly Detection). - * **Frontend**: Vue.js Web Dashboard. - * **Storage**: SQLite (default) or PostgreSQL. -2. **Prescale Agent (The Eyes)**: - * Python service running on *each* server/node you want to monitor. 
- ---- - -## Scenario 1: The Local Developer & Tester -**Best For:** Evaluating Prescale, developing plugins, or monitoring your local dev machine. - -In this setup, everything runs on your laptop. - -### Setup -1. **Backend**: Run via Docker Compose. - ```bash - docker-compose up -d inference - ``` -2. **Frontend**: Run locally with Node.js. - ```bash - cd ml/inference/web - npm run dev - ``` -3. **Agent**: Run locally to monitor your own machine. - ```bash - pip install prescale-agent - export PRESCALE_ENDPOINT="http://localhost:8080" - prescale-agent run - ``` - ---- - -## Scenario 2: The "v1" Production Setup (Small Team) -**Best For:** Startups, Homelabs, or specific project monitoring. - -You deploy the Prescale Server on a dedicated VM (e.g., AWS EC2, GCP Compute Engine, DigitalOcean Droplet) and install agents on your application servers. - -### Architecture -* **1 x VM (e.g., Ubuntu)**: Runs Prescale Backend + Frontend. -* **N x App Servers**: Run Prescale Agent. - -### Setup -1. **Provision VM**: follow the [GCP Deployment Guide](deployment_to_gcp_vm.md). -2. **Network**: Open ports `8080` (API) and `3000` (UI). -3. **Agents**: On your app servers (DB, Web, Worker), install and point to your VM. - ```bash - export PRESCALE_ENDPOINT="http://:8080" - prescale-agent run - ``` - ---- - -## Scenario 3: The "Cloud Native" Setup (Kubernetes) -**Best For:** Large scale deployments, SRE teams. - -Prescale Server runs as a Service in your K8s cluster. Agents run as a **DaemonSet**, ensuring every node in your cluster is automatically monitored. - -### Setup -1. **Prescale Server Deployment**: - * Deploy `prescale-backend` Deployment + Service. - * Deploy `prescale-frontend` Deployment + Service (LoadBalancer/Ingress). -2. 
**Agent DaemonSet**: - ```yaml - apiVersion: apps/v1 - kind: DaemonSet - metadata: - name: prescale-agent - spec: - template: - spec: - containers: - - name: agent - image: prescale-agent:latest - env: - - name: PRESCALE_ENDPOINT - value: "http://prescale-backend.default.svc.cluster.local:8080" - ``` - ---- - -## Scenario 4: The Hybrid / Multi-Cloud Monitor -**Best For:** Enterprises with on-prem legacy servers AND cloud resources. - -Prescale Server acts as a central hub. Agents from different environments connect back to it. - -### Architecture -* **Central Hub**: Prescale Server running in a robust location (e.g., Cluster or Main Region). -* **Edge Agents**: - * **On-Prem Server**: Agent installed via `pip`. - * **AWS EC2**: Agent via UserData script. - * **GCP VM**: Agent via Startup script. - -### Configuration -* **Public Access**: The Prescale Server needs a public IP or DNS (e.g., `monitor.company.com`). -* **Security**: Use API Keys (configured in `config.py`) to prevent unauthorized agents from pushing data. diff --git a/examples/README.md b/examples/README.md deleted file mode 100644 index 98572e2..0000000 --- a/examples/README.md +++ /dev/null @@ -1,99 +0,0 @@ -# Prescale Examples - -Demo environments and examples for testing Prescale capabilities. - -> **Note**: These are for testing and demos only, not part of the core Prescale product. 
- -## Contents - -### demo-environment/ - -A complete test environment with: - -| Component | Description | -|-----------|-------------| -| `kubernetes/` | K8s manifests for demo apps | -| `loadtest/` | Locust load testing scripts | -| `data/` | Sample metrics data | - -## Quick Start - -### Deploy Load Test Environment - -```bash -# Create namespace -kubectl create namespace loadtest - -# Deploy Locust load testing -kubectl apply -f demo-environment/kubernetes/locust/ - -# Wait for pods -kubectl wait --for=condition=ready pod -l app=locust-master -n loadtest --timeout=120s -``` - -### Access Locust UI - -```bash -# Port-forward (or use LoadBalancer) -kubectl port-forward svc/locust-master 8089:8089 -n loadtest - -# Open http://localhost:8089 -``` - -### Run Load Test - -1. Open Locust UI at http://localhost:8089 -2. Set target host (e.g., `http://your-app-service`) -3. Configure users: 50 users, spawn rate 5/s -4. Click "Start swarming" - -## Using with Prescale - -### Collect Metrics from Load Test - -```yaml -# prescale-agent.yaml -sources: - - type: gcp-monitoring - enabled: true - config: - project_id: your-project - filters: - namespace: loadtest -``` - -### Train Models on Load Test Data - -```bash -cd ml -python train.py --namespace loadtest --hours 24 -``` - -### Test Predictions - -```bash -# After training -prescale predict cpu --deployment locust-worker --namespace loadtest -prescale detect --deployment locust-worker --namespace loadtest -``` - -## Sample Data - -The `data/` directory contains sample metrics for offline testing: - -```bash -# Use sample data for training without cloud access -python train.py --data-file examples/demo-environment/data/sample_metrics.json -``` - -## What Ships to Customers - -Customers receive only the **core Prescale product**: - -- ✅ Prescale Agent -- ✅ Prescale CLI -- ✅ Inference Service -- ✅ Helm Charts -- ✅ Documentation - -These demo environments are for internal development and testing only. 
diff --git a/examples/demo-environment/data/.gitkeep b/examples/demo-environment/data/.gitkeep deleted file mode 100644 index ea06d26..0000000 --- a/examples/demo-environment/data/.gitkeep +++ /dev/null @@ -1,6 +0,0 @@ -# Load test data for ML training -# Place Locust CSV exports here: -# - *_stats.csv - Request statistics (RPS, response times) -# - *_failures.csv - Error details -# - *_exceptions.csv - Python exceptions -# - *_stats_history.csv - Time-series data (most useful for ML) diff --git a/examples/demo-environment/data/2025-12-31_100users_20mins_exceptions.csv b/examples/demo-environment/data/2025-12-31_100users_20mins_exceptions.csv deleted file mode 100644 index c7d81b8..0000000 --- a/examples/demo-environment/data/2025-12-31_100users_20mins_exceptions.csv +++ /dev/null @@ -1 +0,0 @@ -Count,Message,Traceback,Nodes diff --git a/examples/demo-environment/data/2025-12-31_100users_20mins_failures.csv b/examples/demo-environment/data/2025-12-31_100users_20mins_failures.csv deleted file mode 100644 index 4e00162..0000000 --- a/examples/demo-environment/data/2025-12-31_100users_20mins_failures.csv +++ /dev/null @@ -1,7 +0,0 @@ -Method,Name,Error,Occurrences -POST,[Searcher] Search,"""CatchResponseError('HTTP 0')""",170 -POST,[Browser] Categories,"""CatchResponseError('HTTP 0')""",68 -POST,[Searcher] Search+Sort,"""CatchResponseError('HTTP 0')""",100 -POST,[Buyer] Products,"""CatchResponseError('HTTP 0')""",32 -POST,[Browser] Products,"""CatchResponseError('HTTP 0')""",155 -POST,[Searcher] Browse,"""CatchResponseError('HTTP 0')""",27 diff --git a/examples/demo-environment/data/2025-12-31_100users_20mins_stats.csv b/examples/demo-environment/data/2025-12-31_100users_20mins_stats.csv deleted file mode 100644 index c84f5a9..0000000 --- a/examples/demo-environment/data/2025-12-31_100users_20mins_stats.csv +++ /dev/null @@ -1,11 +0,0 @@ -Type,Name,Request Count,Failure Count,Median Response Time,Average Response Time,Min Response Time,Max Response Time,Average 
Content Size,Requests/s,Failures/s,50%,66%,75%,80%,90%,95%,98%,99%,99.9%,99.99%,100% -POST,[Admin] Categories,355,0,1400.0,1433.6367448929575,49.89497999849846,4550.572435000049,210.0,0.17728200484095682,0.0,1400,1800,2000,2200,2600,2900,3200,3900,4600,4600,4600 -POST,[Admin] FetchList,360,0,2100.0,1924.2016613250019,69.35519599937834,5231.891444000013,133.0,0.17977893448660412,0.0,2100,2500,2800,2900,3400,3700,4000,4200,5200,5200,5200 -POST,[Admin] Products,497,0,2000.0,1822.5595172897772,73.31700999930035,5503.53131899999,133.0,0.24819480677733957,0.0,2000,2400,2700,2900,3300,3600,4500,4700,5500,5500,5500 -POST,[Browser] Categories,4839,68,1300.0,1309.3086601950806,2.9919059999201636,5286.7371900000535,207.04897706137632,2.416528511057437,0.033958243180803,1300,1700,1900,2100,2500,2900,3300,3600,4700,5300,5300 -POST,[Browser] Products,11559,155,2000.0,1892.2878983412875,2.4844750000738713,6909.976255999936,131.2165412232892,5.77240195480738,0.07740481901506566,2000,2400,2700,2900,3400,3800,4200,4600,5500,6100,6900 -POST,[Buyer] Products,3418,32,2100.0,1969.9734858376232,16.14462699944852,5853.077951000159,131.75482738443534,1.70690110576448,0.015980349732142587,2100,2500,2800,2900,3400,3800,4200,4600,5700,5900,5900 -POST,[Searcher] Browse,1069,27,1900.0,1724.6707263928797,8.597456999950737,5204.810786999986,129.64078578110383,0.5338435582393883,0.013483420086495309,1900,2300,2600,2800,3200,3600,4200,4500,5200,5200,5200 -POST,[Searcher] Search,6456,170,2100.0,1862.8749708410755,2.4630780000052255,6044.65987399999,129.49783147459726,3.224035558459767,0.0848956079520075,2100,2500,2800,3000,3500,3800,4300,4600,5800,6000,6000 -POST,[Searcher] Search+Sort,3414,100,2100.0,1925.0054441397074,1.7898959999911312,6343.0799470000065,129.10427650849442,1.7049035620479622,0.049938592912945584,2100,2600,2800,3000,3500,3900,4400,4700,5600,6300,6300 
-,Aggregated,31967,552,1900.0,1798.4764890470751,1.7898959999911312,6909.976255999936,143.050552131886,15.963869996481316,0.27566103287945964,1900,2400,2700,2800,3300,3700,4200,4500,5500,6100,6900 diff --git a/examples/demo-environment/data/2025-12-31_100users_report.html b/examples/demo-environment/data/2025-12-31_100users_report.html deleted file mode 100644 index deafc49..0000000 --- a/examples/demo-environment/data/2025-12-31_100users_report.html +++ /dev/null @@ -1,859 +0,0 @@ - - - - Test Report for locustfile.py - - - - -
-

Locust Test Report

- -
- -

During: 2025-12-31 18:16:25 - 2025-12-31 18:49:48

-

Target Host: http://saleor-api.saleor.svc.cluster.local

-

Script: locustfile.py

-
- -
-

Request Statistics

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
MethodName# Requests# FailsAverage (ms)Min (ms)Max (ms)Average size (bytes)RPSFailures/s
POST[Admin] Categories355014334945502100.20.0
POST[Admin] FetchList360019246952311330.20.0
POST[Admin] Products497018227355031330.20.0
POST[Browser] Categories4839681309252862072.40.0
POST[Browser] Products115591551892269091315.80.1
POST[Buyer] Products34183219691658531311.70.0
POST[Searcher] Browse1069271724852041290.50.0
POST[Searcher] Search64561701862260441293.20.1
POST[Searcher] Search+Sort34141001925163431291.70.0
Aggregated3196755217981690914316.00.3
-
- -
-

Response Time Statistics

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
MethodName50%ile (ms)60%ile (ms)70%ile (ms)80%ile (ms)90%ile (ms)95%ile (ms)99%ile (ms)100%ile (ms)
POST[Admin] Categories14001600190022002600290039004600
POST[Admin] FetchList21002400260029003400370042005200
POST[Admin] Products20002200250029003300360047005500
POST[Browser] Categories13001600180021002500290036005300
POST[Browser] Products20002300260029003400380046006900
POST[Buyer] Products21002400260029003400380046005900
POST[Searcher] Browse19002200240028003200360045005200
POST[Searcher] Search21002400260030003500380046006000
POST[Searcher] Search+Sort21002400270030003500390047006300
Aggregated19002200250028003300370045006900
-
- - -
-

Failures Statistics

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
MethodNameErrorOccurrences
POST[Searcher] SearchCatchResponseError('HTTP 0')170
POST[Browser] CategoriesCatchResponseError('HTTP 0')68
POST[Searcher] Search+SortCatchResponseError('HTTP 0')100
POST[Buyer] ProductsCatchResponseError('HTTP 0')32
POST[Browser] ProductsCatchResponseError('HTTP 0')155
POST[Searcher] BrowseCatchResponseError('HTTP 0')27
-
- - - - - -
-

Charts

-
- - -
-

Final ratio

-
-
- -
- - - - - - - - - - - - - \ No newline at end of file diff --git a/examples/demo-environment/kubernetes/locust/base/configmap.yaml b/examples/demo-environment/kubernetes/locust/base/configmap.yaml deleted file mode 100644 index 0aa8a28..0000000 --- a/examples/demo-environment/kubernetes/locust/base/configmap.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: locust-config - namespace: loadtest -data: - # Target Saleor API (internal cluster DNS) - LOCUST_HOST: "http://saleor-api.saleor.svc.cluster.local" - - # Admin credentials for admin persona - ADMIN_EMAIL: "admin@example.com" - ADMIN_PASSWORD: "admin123456" - - # Python path for imports - PYTHONPATH: "/mnt/locust" diff --git a/examples/demo-environment/kubernetes/locust/base/deployment.yaml b/examples/demo-environment/kubernetes/locust/base/deployment.yaml deleted file mode 100644 index ff80ce4..0000000 --- a/examples/demo-environment/kubernetes/locust/base/deployment.yaml +++ /dev/null @@ -1,116 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: locust-master - namespace: loadtest - labels: - app: locust - role: master -spec: - replicas: 1 - selector: - matchLabels: - app: locust - role: master - template: - metadata: - labels: - app: locust - role: master - spec: - containers: - - name: locust-master - image: locustio/locust:2.20.0 - ports: - - containerPort: 8089 - name: web - - containerPort: 5557 - name: master - - containerPort: 5558 - name: master-bind - - containerPort: 9646 - name: prometheus - envFrom: - - configMapRef: - name: locust-config - command: - - locust - args: - - -f - - /mnt/locust/locustfile.py - - --master - - --web-host=0.0.0.0 - resources: - requests: - cpu: 100m - memory: 256Mi - limits: - cpu: 500m - memory: 512Mi - volumeMounts: - - name: locust-scripts - mountPath: /mnt/locust - livenessProbe: - httpGet: - path: / - port: 8089 - initialDelaySeconds: 10 - periodSeconds: 30 - readinessProbe: - httpGet: - path: / - port: 8089 - 
initialDelaySeconds: 5 - periodSeconds: 10 - volumes: - - name: locust-scripts - configMap: - name: locust-scripts ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: locust-worker - namespace: loadtest - labels: - app: locust - role: worker -spec: - replicas: 2 - selector: - matchLabels: - app: locust - role: worker - template: - metadata: - labels: - app: locust - role: worker - spec: - containers: - - name: locust-worker - image: locustio/locust:2.20.0 - envFrom: - - configMapRef: - name: locust-config - command: - - locust - args: - - -f - - /mnt/locust/locustfile.py - - --worker - - --master-host=locust-master - resources: - requests: - cpu: 200m - memory: 256Mi - limits: - cpu: 1 - memory: 512Mi - volumeMounts: - - name: locust-scripts - mountPath: /mnt/locust - volumes: - - name: locust-scripts - configMap: - name: locust-scripts diff --git a/examples/demo-environment/kubernetes/locust/base/kustomization.yaml b/examples/demo-environment/kubernetes/locust/base/kustomization.yaml deleted file mode 100644 index a2535b4..0000000 --- a/examples/demo-environment/kubernetes/locust/base/kustomization.yaml +++ /dev/null @@ -1,21 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -namespace: loadtest - -resources: -- namespace.yaml -- configmap.yaml -- deployment.yaml -- service.yaml - -# ConfigMap generator for locust scripts (single file for simplicity) -configMapGenerator: -- name: locust-scripts - files: - - locustfile.py=scripts/locustfile.py - -commonLabels: - app.kubernetes.io/name: locust - app.kubernetes.io/part-of: prescale - app.kubernetes.io/component: loadtest diff --git a/examples/demo-environment/kubernetes/locust/base/namespace.yaml b/examples/demo-environment/kubernetes/locust/base/namespace.yaml deleted file mode 100644 index d04afca..0000000 --- a/examples/demo-environment/kubernetes/locust/base/namespace.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: loadtest - labels: - 
app.kubernetes.io/name: loadtest - app.kubernetes.io/part-of: prescale diff --git a/examples/demo-environment/kubernetes/locust/base/scripts/locustfile.py b/examples/demo-environment/kubernetes/locust/base/scripts/locustfile.py deleted file mode 100644 index 683ecaa..0000000 --- a/examples/demo-environment/kubernetes/locust/base/scripts/locustfile.py +++ /dev/null @@ -1,291 +0,0 @@ -""" -Simplified locustfile for Kubernetes deployment. -All code in single file for ConfigMap deployment. -""" -import json -import random -import string -from typing import Any, Optional, List -from locust import HttpUser, task, between, tag - - -# ============== UTILITIES ============== - -SEARCH_TERMS = [ - "shirt", "shoes", "pants", "jacket", "dress", "hat", "bag", "watch", - "phone", "laptop", "headphones", "camera", "book", "toy", "game" -] - -FIRST_NAMES = ["John", "Jane", "Michael", "Sarah", "David", "Emily", "Chris", "Amanda"] -LAST_NAMES = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller"] -CITIES = [ - ("New York", "NY", "10001"), - ("Los Angeles", "CA", "90001"), - ("Chicago", "IL", "60601"), -] - -def random_email() -> str: - username = ''.join(random.choices(string.ascii_lowercase, k=8)) - return f"{username}@loadtest.local" - -def random_address() -> dict: - city, state, postal = random.choice(CITIES) - return { - "firstName": random.choice(FIRST_NAMES), - "lastName": random.choice(LAST_NAMES), - "streetAddress1": f"{random.randint(100, 999)} Test Street", - "city": city, - "postalCode": postal, - "country": "US", - "countryArea": state, - } - - -# ============== GRAPHQL QUERIES ============== - -CATEGORIES_QUERY = """ -query Categories($first: Int!) 
{ - categories(first: $first) { - edges { node { id name slug } } - } -} -""" - -PRODUCTS_QUERY = """ -query Products($first: Int!, $filter: ProductFilterInput, $sortBy: ProductOrder) { - products(first: $first, filter: $filter, sortBy: $sortBy, channel: "default-channel") { - edges { node { id name slug } } - totalCount - } -} -""" - -PRODUCT_DETAIL_QUERY = """ -query ProductDetail($slug: String!) { - product(slug: $slug, channel: "default-channel") { - id name slug description - variants { id name sku quantityAvailable } - } -} -""" - -SEARCH_QUERY = """ -query Search($search: String!, $first: Int!) { - products(first: $first, filter: { search: $search }, channel: "default-channel") { - edges { node { id name slug } } - totalCount - } -} -""" - -CHECKOUT_CREATE = """ -mutation CheckoutCreate($input: CheckoutCreateInput!) { - checkoutCreate(input: $input) { - checkout { id token lines { id } } - errors { field message } - } -} -""" - -CHECKOUT_LINES_ADD = """ -mutation CheckoutLinesAdd($id: ID!, $lines: [CheckoutLineInput!]!) 
{ - checkoutLinesAdd(id: $id, lines: $lines) { - checkout { id lines { id } totalPrice { gross { amount } } } - errors { field message } - } -} -""" - - -# ============== GRAPHQL MIXIN ============== - -class GraphQLMixin: - def graphql(self, query: str, variables: dict = None, name: str = "GraphQL"): - headers = {"Content-Type": "application/json"} - payload = {"query": query} - if variables: - payload["variables"] = variables - - with self.client.post("/graphql/", json=payload, headers=headers, - name=name, catch_response=True) as response: - if response.status_code != 200: - response.failure(f"HTTP {response.status_code}") - return {} - try: - data = response.json() - if "errors" in data and data["errors"]: - error_msg = data["errors"][0].get("message", "Unknown") - if "not found" not in error_msg.lower(): - response.failure(f"GraphQL: {error_msg}") - return data - except json.JSONDecodeError: - response.failure("Invalid JSON") - return {} - - -# ============== USER PERSONAS ============== - -class BrowserUser(HttpUser, GraphQLMixin): - """Casual browser - read-heavy.""" - wait_time = between(2, 5) - weight = 50 - - def on_start(self): - self.products = [] - self._fetch_products() - - def _fetch_products(self): - data = self.graphql(PRODUCTS_QUERY, {"first": 20}, "[Browser] Products") - if data and "data" in data: - edges = data.get("data", {}).get("products", {}).get("edges", []) - self.products = [e["node"] for e in edges] - - @task(30) - @tag("read") - def browse_categories(self): - self.graphql(CATEGORIES_QUERY, {"first": 20}, "[Browser] Categories") - - @task(50) - @tag("read") - def browse_products(self): - self.graphql(PRODUCTS_QUERY, {"first": 20}, "[Browser] Products") - - @task(20) - @tag("read") - def view_product(self): - if not self.products: - self._fetch_products() - if self.products: - product = random.choice(self.products) - self.graphql(PRODUCT_DETAIL_QUERY, {"slug": product["slug"]}, "[Browser] Detail") - - -class SearcherUser(HttpUser, 
GraphQLMixin): - """Search-focused user.""" - wait_time = between(1, 3) - weight = 25 - - @task(60) - @tag("read", "search") - def search_products(self): - term = random.choice(SEARCH_TERMS) - self.graphql(SEARCH_QUERY, {"search": term, "first": 20}, "[Searcher] Search") - - @task(30) - @tag("read", "search") - def search_with_sort(self): - term = random.choice(SEARCH_TERMS) - sorts = [ - {"field": "NAME", "direction": "ASC"}, - {"field": "PRICE", "direction": "ASC"}, - {"field": "PRICE", "direction": "DESC"}, - ] - self.graphql(PRODUCTS_QUERY, { - "first": 20, - "filter": {"search": term}, - "sortBy": random.choice(sorts) - }, "[Searcher] Search+Sort") - - @task(10) - @tag("read") - def browse(self): - self.graphql(PRODUCTS_QUERY, {"first": 20}, "[Searcher] Browse") - - -class BuyerUser(HttpUser, GraphQLMixin): - """Buyer going through purchase journey.""" - wait_time = between(2, 4) - weight = 20 - - def on_start(self): - self.products = [] - self.checkout_id = None - self._fetch_products() - - def _fetch_products(self): - data = self.graphql(PRODUCTS_QUERY, {"first": 30}, "[Buyer] Products") - if data and "data" in data: - edges = data.get("data", {}).get("products", {}).get("edges", []) - self.products = [e["node"] for e in edges] - - def _get_variant(self, slug: str): - data = self.graphql(PRODUCT_DETAIL_QUERY, {"slug": slug}, "[Buyer] GetVariant") - if data and "data" in data: - product = data.get("data", {}).get("product", {}) - if product: - variants = [v for v in product.get("variants", []) - if v.get("quantityAvailable", 0) > 0] - return variants[0]["id"] if variants else None - return None - - @task(30) - @tag("read") - def browse_products(self): - self._fetch_products() - - @task(25) - @tag("read") - def view_product(self): - if self.products: - product = random.choice(self.products) - self.graphql(PRODUCT_DETAIL_QUERY, {"slug": product["slug"]}, "[Buyer] Detail") - - @task(20) - @tag("write", "cart") - def add_to_cart(self): - if not self.products: - 
return - - product = random.choice(self.products) - variant_id = self._get_variant(product["slug"]) - if not variant_id: - return - - if not self.checkout_id: - data = self.graphql(CHECKOUT_CREATE, { - "input": { - "channel": "default-channel", - "email": random_email(), - "lines": [{"variantId": variant_id, "quantity": 1}] - } - }, "[Buyer] CreateCart") - if data and "data" in data: - checkout = data.get("data", {}).get("checkoutCreate", {}).get("checkout", {}) - self.checkout_id = checkout.get("id") - else: - self.graphql(CHECKOUT_LINES_ADD, { - "id": self.checkout_id, - "lines": [{"variantId": variant_id, "quantity": 1}] - }, "[Buyer] AddToCart") - - @task(5) - @tag("write") - def abandon_cart(self): - if self.checkout_id and random.random() < 0.3: - self.checkout_id = None - - -class AdminUser(HttpUser, GraphQLMixin): - """Admin operations - read-heavy for now.""" - wait_time = between(3, 8) - weight = 5 - - @task(40) - @tag("read", "admin") - def view_products(self): - self.graphql(PRODUCTS_QUERY, {"first": 50}, "[Admin] Products") - - @task(30) - @tag("read", "admin") - def view_categories(self): - self.graphql(CATEGORIES_QUERY, {"first": 50}, "[Admin] Categories") - - @task(30) - @tag("read", "admin") - def view_detail(self): - data = self.graphql(PRODUCTS_QUERY, {"first": 10}, "[Admin] FetchList") - if data and "data" in data: - edges = data.get("data", {}).get("products", {}).get("edges", []) - if edges: - product = random.choice(edges)["node"] - self.graphql(PRODUCT_DETAIL_QUERY, {"slug": product["slug"]}, "[Admin] Detail") diff --git a/examples/demo-environment/kubernetes/locust/base/service.yaml b/examples/demo-environment/kubernetes/locust/base/service.yaml deleted file mode 100644 index 5a06b32..0000000 --- a/examples/demo-environment/kubernetes/locust/base/service.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: locust-master - namespace: loadtest - labels: - app: locust - role: master -spec: - type: ClusterIP - 
selector: - app: locust - role: master - ports: - - name: web - port: 8089 - targetPort: 8089 - - name: master - port: 5557 - targetPort: 5557 - - name: master-bind - port: 5558 - targetPort: 5558 - - name: prometheus - port: 9646 - targetPort: 9646 diff --git a/examples/demo-environment/kubernetes/saleor/base/api-deployment.yaml b/examples/demo-environment/kubernetes/saleor/base/api-deployment.yaml deleted file mode 100644 index de6c55b..0000000 --- a/examples/demo-environment/kubernetes/saleor/base/api-deployment.yaml +++ /dev/null @@ -1,134 +0,0 @@ -# ============================================================================= -# Saleor API Deployment -# ============================================================================= -apiVersion: apps/v1 -kind: Deployment -metadata: - name: saleor-api - namespace: saleor - labels: - app.kubernetes.io/name: saleor-api - app.kubernetes.io/component: api -spec: - replicas: 2 - selector: - matchLabels: - app.kubernetes.io/name: saleor-api - app.kubernetes.io/component: api - template: - metadata: - labels: - app.kubernetes.io/name: saleor-api - app.kubernetes.io/component: api - annotations: - # Force rolling update on config changes - checksum/config: "CONFIG_HASH" - spec: - serviceAccountName: saleor-api - - # Security context - securityContext: - runAsNonRoot: true - runAsUser: 1000 - fsGroup: 1000 - - # Graceful shutdown - terminationGracePeriodSeconds: 30 - - containers: - - name: saleor-api - image: saleor-api - imagePullPolicy: IfNotPresent - - ports: - - name: http - containerPort: 8000 - protocol: TCP - - # Environment from ConfigMap - envFrom: - - configMapRef: - name: saleor-config - - secretRef: - name: saleor-secrets - - # Additional environment variables - env: - - name: PYTHONUNBUFFERED - value: "1" - - name: GUNICORN_WORKERS - value: "4" - - name: GUNICORN_THREADS - value: "2" - - name: GUNICORN_TIMEOUT - value: "300" - - # Resource limits for Autopilot - resources: - requests: - cpu: "500m" - memory: 
"1Gi" - ephemeral-storage: "1Gi" - limits: - cpu: "2" - memory: "4Gi" - ephemeral-storage: "2Gi" - - # Health checks - livenessProbe: - httpGet: - path: /health/ - port: http - initialDelaySeconds: 30 - periodSeconds: 10 - timeoutSeconds: 5 - failureThreshold: 3 - - readinessProbe: - httpGet: - path: /health/ - port: http - initialDelaySeconds: 10 - periodSeconds: 5 - timeoutSeconds: 3 - failureThreshold: 3 - - # Startup probe for slow starts - startupProbe: - httpGet: - path: /health/ - port: http - initialDelaySeconds: 10 - periodSeconds: 10 - timeoutSeconds: 5 - failureThreshold: 30 - - # Security context - securityContext: - allowPrivilegeEscalation: false - readOnlyRootFilesystem: false - capabilities: - drop: - - ALL - - # Volume mounts - volumeMounts: - - name: tmp - mountPath: /tmp - - name: media - mountPath: /app/media - - volumes: - - name: tmp - emptyDir: {} - - name: media - emptyDir: {} - - # Topology spread for high availability - topologySpreadConstraints: - - maxSkew: 1 - topologyKey: topology.kubernetes.io/zone - whenUnsatisfiable: ScheduleAnyway - labelSelector: - matchLabels: - app.kubernetes.io/name: saleor-api diff --git a/examples/demo-environment/kubernetes/saleor/base/api-service.yaml b/examples/demo-environment/kubernetes/saleor/base/api-service.yaml deleted file mode 100644 index 46294bb..0000000 --- a/examples/demo-environment/kubernetes/saleor/base/api-service.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# ============================================================================= -# Saleor API Service -# ============================================================================= -apiVersion: v1 -kind: Service -metadata: - name: saleor-api - namespace: saleor - labels: - app.kubernetes.io/name: saleor-api - app.kubernetes.io/component: api - annotations: - # Enable container-native load balancing - cloud.google.com/neg: '{"ingress": true}' -spec: - type: ClusterIP - selector: - app.kubernetes.io/name: saleor-api - 
app.kubernetes.io/component: api - ports: - - name: http - port: 80 - targetPort: http - protocol: TCP diff --git a/examples/demo-environment/kubernetes/saleor/base/configmap.yaml b/examples/demo-environment/kubernetes/saleor/base/configmap.yaml deleted file mode 100644 index 2eaade0..0000000 --- a/examples/demo-environment/kubernetes/saleor/base/configmap.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# ============================================================================= -# Saleor ConfigMap -# ============================================================================= -apiVersion: v1 -kind: ConfigMap -metadata: - name: saleor-config - namespace: saleor - labels: - app.kubernetes.io/name: saleor - app.kubernetes.io/component: config -data: - # Django settings - DEBUG: "False" - ALLOWED_HOSTS: "*" - - # Database (connection details - password from secret) - DATABASE_HOST: "DB_HOST_PLACEHOLDER" - DATABASE_PORT: "5432" - DATABASE_NAME: "saleor" - - # Redis - REDIS_URL: "REDIS_URL_PLACEHOLDER" - CELERY_BROKER_URL: "REDIS_URL_PLACEHOLDER" - - # GCS Storage - DEFAULT_FILE_STORAGE: "storages.backends.gcloud.GoogleCloudStorage" - GS_BUCKET_NAME: "MEDIA_BUCKET_PLACEHOLDER" - GS_DEFAULT_ACL: "publicRead" - GS_QUERYSTRING_AUTH: "False" - - # Email (use console backend for dev) - EMAIL_BACKEND: "django.core.mail.backends.console.EmailBackend" - - # Saleor specific - DEFAULT_COUNTRY: "US" - DEFAULT_CURRENCY: "USD" - - # GraphQL - GRAPHQL_PLAYGROUND: "True" - - # Logging - LOG_LEVEL: "INFO" - - # JWT - JWT_TTL_ACCESS: "300" # 5 minutes - JWT_TTL_REFRESH: "2592000" # 30 days - - # Performance - ENABLE_DEBUG_TOOLBAR: "False" - - # Plugins - PLUGINS: "saleor.plugins.webhook.plugin.WebhookPlugin" - - # Replace YOUR_GCP_PROJECT_ID with your actual GCP project ID - project_id: "YOUR_GCP_PROJECT_ID" - region: "us-central1" diff --git a/examples/demo-environment/kubernetes/saleor/base/dashboard-deployment.yaml b/examples/demo-environment/kubernetes/saleor/base/dashboard-deployment.yaml 
deleted file mode 100644 index eac7ca1..0000000 --- a/examples/demo-environment/kubernetes/saleor/base/dashboard-deployment.yaml +++ /dev/null @@ -1,99 +0,0 @@ -# ============================================================================= -# Saleor Dashboard Deployment -# ============================================================================= -apiVersion: apps/v1 -kind: Deployment -metadata: - name: saleor-dashboard - namespace: saleor - labels: - app.kubernetes.io/name: saleor-dashboard - app.kubernetes.io/component: dashboard -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: saleor-dashboard - app.kubernetes.io/component: dashboard - template: - metadata: - labels: - app.kubernetes.io/name: saleor-dashboard - app.kubernetes.io/component: dashboard - spec: - serviceAccountName: saleor-dashboard - - securityContext: - runAsNonRoot: true - runAsUser: 101 # nginx user - fsGroup: 101 - - containers: - - name: saleor-dashboard - image: saleor-dashboard - imagePullPolicy: IfNotPresent - - ports: - - name: http - containerPort: 80 - protocol: TCP - - env: - # API URL - will be the external URL in production - - name: API_URI - value: "API_URI_PLACEHOLDER" - - name: APP_MOUNT_URI - value: "/" - - name: STATIC_URL - value: "/" - - resources: - requests: - cpu: "100m" - memory: "128Mi" - ephemeral-storage: "256Mi" - limits: - cpu: "500m" - memory: "256Mi" - ephemeral-storage: "512Mi" - - livenessProbe: - httpGet: - path: / - port: http - initialDelaySeconds: 10 - periodSeconds: 10 - timeoutSeconds: 5 - failureThreshold: 3 - - readinessProbe: - httpGet: - path: / - port: http - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 3 - failureThreshold: 3 - - securityContext: - allowPrivilegeEscalation: false - readOnlyRootFilesystem: true - capabilities: - drop: - - ALL - - volumeMounts: - - name: nginx-cache - mountPath: /var/cache/nginx - - name: nginx-run - mountPath: /var/run - - name: tmp - mountPath: /tmp - - volumes: - - name: 
nginx-cache - emptyDir: {} - - name: nginx-run - emptyDir: {} - - name: tmp - emptyDir: {} diff --git a/examples/demo-environment/kubernetes/saleor/base/dashboard-service.yaml b/examples/demo-environment/kubernetes/saleor/base/dashboard-service.yaml deleted file mode 100644 index f657ece..0000000 --- a/examples/demo-environment/kubernetes/saleor/base/dashboard-service.yaml +++ /dev/null @@ -1,23 +0,0 @@ -# ============================================================================= -# Saleor Dashboard Service -# ============================================================================= -apiVersion: v1 -kind: Service -metadata: - name: saleor-dashboard - namespace: saleor - labels: - app.kubernetes.io/name: saleor-dashboard - app.kubernetes.io/component: dashboard - annotations: - cloud.google.com/neg: '{"ingress": true}' -spec: - type: ClusterIP - selector: - app.kubernetes.io/name: saleor-dashboard - app.kubernetes.io/component: dashboard - ports: - - name: http - port: 80 - targetPort: http - protocol: TCP diff --git a/examples/demo-environment/kubernetes/saleor/base/hpa.yaml b/examples/demo-environment/kubernetes/saleor/base/hpa.yaml deleted file mode 100644 index cb2926a..0000000 --- a/examples/demo-environment/kubernetes/saleor/base/hpa.yaml +++ /dev/null @@ -1,101 +0,0 @@ -# ============================================================================= -# Horizontal Pod Autoscaler -# ============================================================================= -apiVersion: autoscaling/v2 -kind: HorizontalPodAutoscaler -metadata: - name: saleor-api-hpa - namespace: saleor - labels: - app.kubernetes.io/name: saleor-api - app.kubernetes.io/component: api -spec: - scaleTargetRef: - apiVersion: apps/v1 - kind: Deployment - name: saleor-api - - minReplicas: 2 - maxReplicas: 10 - - metrics: - # CPU-based scaling - - type: Resource - resource: - name: cpu - target: - type: Utilization - averageUtilization: 70 - - # Memory-based scaling (optional) - - type: 
Resource - resource: - name: memory - target: - type: Utilization - averageUtilization: 80 - - behavior: - scaleDown: - stabilizationWindowSeconds: 300 # 5 minutes - policies: - - type: Percent - value: 25 - periodSeconds: 60 - - type: Pods - value: 2 - periodSeconds: 60 - selectPolicy: Min - - scaleUp: - stabilizationWindowSeconds: 0 # Scale up immediately - policies: - - type: Percent - value: 100 - periodSeconds: 15 - - type: Pods - value: 4 - periodSeconds: 15 - selectPolicy: Max ---- -# ============================================================================= -# Worker HPA (optional - for high async workload) -# ============================================================================= -apiVersion: autoscaling/v2 -kind: HorizontalPodAutoscaler -metadata: - name: saleor-worker-hpa - namespace: saleor - labels: - app.kubernetes.io/name: saleor-worker - app.kubernetes.io/component: worker -spec: - scaleTargetRef: - apiVersion: apps/v1 - kind: Deployment - name: saleor-worker - - minReplicas: 1 - maxReplicas: 5 - - metrics: - - type: Resource - resource: - name: cpu - target: - type: Utilization - averageUtilization: 70 - - behavior: - scaleDown: - stabilizationWindowSeconds: 300 - policies: - - type: Pods - value: 1 - periodSeconds: 60 - scaleUp: - stabilizationWindowSeconds: 60 - policies: - - type: Pods - value: 2 - periodSeconds: 30 diff --git a/examples/demo-environment/kubernetes/saleor/base/ingress.yaml b/examples/demo-environment/kubernetes/saleor/base/ingress.yaml deleted file mode 100644 index 2c53fc8..0000000 --- a/examples/demo-environment/kubernetes/saleor/base/ingress.yaml +++ /dev/null @@ -1,112 +0,0 @@ -# ============================================================================= -# Ingress Configuration -# ============================================================================= -# Uses GKE Ingress with Google-managed SSL certificate -# ============================================================================= -apiVersion: 
networking.k8s.io/v1 -kind: Ingress -metadata: - name: saleor-ingress - namespace: saleor - labels: - app.kubernetes.io/name: saleor - app.kubernetes.io/component: ingress - annotations: - # Use GKE Ingress controller - kubernetes.io/ingress.class: "gce" - - # Enable Google-managed SSL certificate - networking.gke.io/managed-certificates: "saleor-certificate" - - # Redirect HTTP to HTTPS - kubernetes.io/ingress.allow-http: "false" - - # Backend configuration for health checks - cloud.google.com/backend-config: '{"default": "saleor-backend-config"}' -spec: - rules: - # API endpoint - - host: "api.DOMAIN_PLACEHOLDER" - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: saleor-api - port: - number: 80 - - # Dashboard endpoint - - host: "dashboard.DOMAIN_PLACEHOLDER" - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: saleor-dashboard - port: - number: 80 - - # GraphQL Playground (optional, same as API) - - host: "graphql.DOMAIN_PLACEHOLDER" - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: saleor-api - port: - number: 80 ---- -# ============================================================================= -# Backend Configuration -# ============================================================================= -apiVersion: cloud.google.com/v1 -kind: BackendConfig -metadata: - name: saleor-backend-config - namespace: saleor -spec: - # Health check configuration - healthCheck: - checkIntervalSec: 15 - timeoutSec: 5 - healthyThreshold: 1 - unhealthyThreshold: 2 - type: HTTP - requestPath: /health/ - port: 8000 - - # Connection draining - connectionDraining: - drainingTimeoutSec: 30 - - # Session affinity (optional, can help with WebSocket connections) - # sessionAffinity: - # affinityType: "CLIENT_IP" - # affinityCookieTtlSec: 3600 - - # Cloud CDN (optional) - # cdn: - # enabled: true - # cachePolicy: - # includeHost: true - # includeProtocol: true - # includeQueryString: false ---- -# 
============================================================================= -# Google-Managed Certificate -# ============================================================================= -apiVersion: networking.gke.io/v1 -kind: ManagedCertificate -metadata: - name: saleor-certificate - namespace: saleor -spec: - domains: - - "api.DOMAIN_PLACEHOLDER" - - "dashboard.DOMAIN_PLACEHOLDER" - - "graphql.DOMAIN_PLACEHOLDER" diff --git a/examples/demo-environment/kubernetes/saleor/base/kustomization.yaml b/examples/demo-environment/kubernetes/saleor/base/kustomization.yaml deleted file mode 100644 index 51d6da2..0000000 --- a/examples/demo-environment/kubernetes/saleor/base/kustomization.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# ============================================================================= -# Saleor Kubernetes Deployment - Kustomize Base -# ============================================================================= -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -metadata: - name: saleor-base - -namespace: saleor - -resources: - - namespace.yaml - - serviceaccounts.yaml - - configmap.yaml - - secrets.yaml - - api-deployment.yaml - - api-service.yaml - - worker-deployment.yaml - - dashboard-deployment.yaml - - dashboard-service.yaml - - ingress.yaml - - hpa.yaml - - pdb.yaml - -commonLabels: - app.kubernetes.io/part-of: saleor - app.kubernetes.io/managed-by: kustomize - -images: - - name: saleor-api - newName: ghcr.io/saleor/saleor - newTag: "3.20" - - name: saleor-dashboard - newName: ghcr.io/saleor/saleor-dashboard - newTag: "3.20" diff --git a/examples/demo-environment/kubernetes/saleor/base/namespace.yaml b/examples/demo-environment/kubernetes/saleor/base/namespace.yaml deleted file mode 100644 index 36c8d54..0000000 --- a/examples/demo-environment/kubernetes/saleor/base/namespace.yaml +++ /dev/null @@ -1,10 +0,0 @@ -# ============================================================================= -# Saleor Namespace -# 
============================================================================= -apiVersion: v1 -kind: Namespace -metadata: - name: saleor - labels: - name: saleor - app.kubernetes.io/part-of: saleor diff --git a/examples/demo-environment/kubernetes/saleor/base/pdb.yaml b/examples/demo-environment/kubernetes/saleor/base/pdb.yaml deleted file mode 100644 index a27e9b2..0000000 --- a/examples/demo-environment/kubernetes/saleor/base/pdb.yaml +++ /dev/null @@ -1,34 +0,0 @@ -# ============================================================================= -# Pod Disruption Budgets -# ============================================================================= -# Ensures minimum availability during voluntary disruptions -# ============================================================================= -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - name: saleor-api-pdb - namespace: saleor - labels: - app.kubernetes.io/name: saleor-api - app.kubernetes.io/component: api -spec: - minAvailable: 1 - selector: - matchLabels: - app.kubernetes.io/name: saleor-api - app.kubernetes.io/component: api ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - name: saleor-worker-pdb - namespace: saleor - labels: - app.kubernetes.io/name: saleor-worker - app.kubernetes.io/component: worker -spec: - minAvailable: 0 # Workers can all be disrupted (tasks will retry) - selector: - matchLabels: - app.kubernetes.io/name: saleor-worker - app.kubernetes.io/component: worker diff --git a/examples/demo-environment/kubernetes/saleor/base/secrets.yaml b/examples/demo-environment/kubernetes/saleor/base/secrets.yaml deleted file mode 100644 index b8da556..0000000 --- a/examples/demo-environment/kubernetes/saleor/base/secrets.yaml +++ /dev/null @@ -1,54 +0,0 @@ -# ============================================================================= -# Saleor Secrets Template -# ============================================================================= -# NOTE: In production, use 
External Secrets Operator or Sealed Secrets -# This is a template - actual values should be managed securely -# ============================================================================= -apiVersion: v1 -kind: Secret -metadata: - name: saleor-secrets - namespace: saleor - labels: - app.kubernetes.io/name: saleor - app.kubernetes.io/component: secrets -type: Opaque -stringData: - # Database credentials - DATABASE_USER: "saleor" - DATABASE_PASSWORD: "CHANGE_ME" - - # Full database URL - DATABASE_URL: "postgresql://saleor:CHANGE_ME@DB_HOST:5432/saleor" - - # Django secret key (generate with: openssl rand -hex 32) - SECRET_KEY: "CHANGE_ME_GENERATE_RANDOM_STRING" - - # Redis AUTH password (if enabled) - REDIS_PASSWORD: "" - - # Superuser credentials for initial setup - DJANGO_SUPERUSER_EMAIL: "admin@example.com" - DJANGO_SUPERUSER_PASSWORD: "CHANGE_ME" ---- -# External Secrets template (when using External Secrets Operator) -# apiVersion: external-secrets.io/v1beta1 -# kind: ExternalSecret -# metadata: -# name: saleor-secrets -# namespace: saleor -# spec: -# refreshInterval: 1h -# secretStoreRef: -# kind: ClusterSecretStore -# name: gcp-secret-manager -# target: -# name: saleor-secrets -# creationPolicy: Owner -# data: -# - secretKey: DATABASE_PASSWORD -# remoteRef: -# key: prescale-dev-db-password -# - secretKey: SECRET_KEY -# remoteRef: -# key: prescale-dev-saleor-secret-key diff --git a/examples/demo-environment/kubernetes/saleor/base/serviceaccounts.yaml b/examples/demo-environment/kubernetes/saleor/base/serviceaccounts.yaml deleted file mode 100644 index 2224b60..0000000 --- a/examples/demo-environment/kubernetes/saleor/base/serviceaccounts.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# ============================================================================= -# Service Accounts with Workload Identity -# ============================================================================= ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: saleor-api - namespace: 
saleor - labels: - app.kubernetes.io/name: saleor-api - app.kubernetes.io/component: api - annotations: - # Workload Identity annotation - will be patched per environment - iam.gke.io/gcp-service-account: "SALEOR_API_SA_EMAIL" ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: saleor-worker - namespace: saleor - labels: - app.kubernetes.io/name: saleor-worker - app.kubernetes.io/component: worker - annotations: - # Workload Identity annotation - will be patched per environment - iam.gke.io/gcp-service-account: "SALEOR_WORKER_SA_EMAIL" ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: saleor-dashboard - namespace: saleor - labels: - app.kubernetes.io/name: saleor-dashboard - app.kubernetes.io/component: dashboard diff --git a/examples/demo-environment/kubernetes/saleor/base/worker-deployment.yaml b/examples/demo-environment/kubernetes/saleor/base/worker-deployment.yaml deleted file mode 100644 index 1d12b6b..0000000 --- a/examples/demo-environment/kubernetes/saleor/base/worker-deployment.yaml +++ /dev/null @@ -1,185 +0,0 @@ -# ============================================================================= -# Saleor Worker (Celery) Deployment -# ============================================================================= -apiVersion: apps/v1 -kind: Deployment -metadata: - name: saleor-worker - namespace: saleor - labels: - app.kubernetes.io/name: saleor-worker - app.kubernetes.io/component: worker -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: saleor-worker - app.kubernetes.io/component: worker - template: - metadata: - labels: - app.kubernetes.io/name: saleor-worker - app.kubernetes.io/component: worker - annotations: - checksum/config: "CONFIG_HASH" - spec: - serviceAccountName: saleor-worker - - securityContext: - runAsNonRoot: true - runAsUser: 1000 - fsGroup: 1000 - - terminationGracePeriodSeconds: 60 - - containers: - - name: saleor-worker - image: saleor-api - imagePullPolicy: IfNotPresent - - # Override command 
to run Celery worker - command: - - celery - - -A - - saleor - - worker - - --loglevel=INFO - - --concurrency=2 - - -E - - envFrom: - - configMapRef: - name: saleor-config - - secretRef: - name: saleor-secrets - - env: - - name: PYTHONUNBUFFERED - value: "1" - - name: C_FORCE_ROOT - value: "false" - - resources: - requests: - cpu: "250m" - memory: "512Mi" - ephemeral-storage: "1Gi" - limits: - cpu: "1" - memory: "2Gi" - ephemeral-storage: "2Gi" - - # Liveness probe - check if worker is responsive - livenessProbe: - exec: - command: - - celery - - -A - - saleor - - inspect - - ping - - --timeout=10 - initialDelaySeconds: 60 - periodSeconds: 60 - timeoutSeconds: 15 - failureThreshold: 3 - - securityContext: - allowPrivilegeEscalation: false - readOnlyRootFilesystem: false - capabilities: - drop: - - ALL - - volumeMounts: - - name: tmp - mountPath: /tmp - - name: media - mountPath: /app/media - - volumes: - - name: tmp - emptyDir: {} - - name: media - emptyDir: {} ---- -# ============================================================================= -# Saleor Beat (Celery Scheduler) Deployment -# ============================================================================= -apiVersion: apps/v1 -kind: Deployment -metadata: - name: saleor-beat - namespace: saleor - labels: - app.kubernetes.io/name: saleor-beat - app.kubernetes.io/component: scheduler -spec: - replicas: 1 - strategy: - type: Recreate # Only one beat instance allowed - selector: - matchLabels: - app.kubernetes.io/name: saleor-beat - app.kubernetes.io/component: scheduler - template: - metadata: - labels: - app.kubernetes.io/name: saleor-beat - app.kubernetes.io/component: scheduler - annotations: - checksum/config: "CONFIG_HASH" - spec: - serviceAccountName: saleor-worker - - securityContext: - runAsNonRoot: true - runAsUser: 1000 - fsGroup: 1000 - - containers: - - name: saleor-beat - image: saleor-api - imagePullPolicy: IfNotPresent - - command: - - celery - - -A - - saleor - - beat - - --loglevel=INFO - 
- envFrom: - - configMapRef: - name: saleor-config - - secretRef: - name: saleor-secrets - - env: - - name: PYTHONUNBUFFERED - value: "1" - - resources: - requests: - cpu: "100m" - memory: "256Mi" - ephemeral-storage: "512Mi" - limits: - cpu: "500m" - memory: "512Mi" - ephemeral-storage: "1Gi" - - securityContext: - allowPrivilegeEscalation: false - readOnlyRootFilesystem: false - capabilities: - drop: - - ALL - - volumeMounts: - - name: tmp - mountPath: /tmp - - volumes: - - name: tmp - emptyDir: {} diff --git a/examples/demo-environment/kubernetes/saleor/jobs/migration-jobs.yaml b/examples/demo-environment/kubernetes/saleor/jobs/migration-jobs.yaml deleted file mode 100644 index e118f14..0000000 --- a/examples/demo-environment/kubernetes/saleor/jobs/migration-jobs.yaml +++ /dev/null @@ -1,216 +0,0 @@ -# ============================================================================= -# Database Migration Job -# ============================================================================= -# Run this job after deploying to initialize/migrate the database -# ============================================================================= -apiVersion: batch/v1 -kind: Job -metadata: - name: saleor-migrate - namespace: saleor - labels: - app.kubernetes.io/name: saleor-migrate - app.kubernetes.io/component: migration -spec: - ttlSecondsAfterFinished: 300 - backoffLimit: 3 - template: - metadata: - labels: - app.kubernetes.io/name: saleor-migrate - app.kubernetes.io/component: migration - spec: - serviceAccountName: saleor-api - restartPolicy: Never - - securityContext: - runAsNonRoot: true - runAsUser: 1000 - fsGroup: 1000 - - containers: - - name: migrate - image: ghcr.io/saleor/saleor:3.20 - imagePullPolicy: IfNotPresent - - command: - - /bin/bash - - -c - - | - set -e - echo "Running database migrations..." - python manage.py migrate --noinput - - echo "Creating collection point for media..." 
- python manage.py collectstatic --noinput || true - - echo "Migration completed successfully!" - - envFrom: - - configMapRef: - name: saleor-config - - secretRef: - name: saleor-secrets - - env: - - name: PYTHONUNBUFFERED - value: "1" - - resources: - requests: - cpu: "250m" - memory: "512Mi" - ephemeral-storage: "1Gi" - limits: - cpu: "1" - memory: "1Gi" - ephemeral-storage: "2Gi" - - securityContext: - allowPrivilegeEscalation: false - readOnlyRootFilesystem: false - capabilities: - drop: - - ALL ---- -# ============================================================================= -# Create Superuser Job -# ============================================================================= -# Run this job once to create the admin superuser -# ============================================================================= -apiVersion: batch/v1 -kind: Job -metadata: - name: saleor-create-superuser - namespace: saleor - labels: - app.kubernetes.io/name: saleor-create-superuser - app.kubernetes.io/component: setup -spec: - ttlSecondsAfterFinished: 300 - backoffLimit: 1 - template: - metadata: - labels: - app.kubernetes.io/name: saleor-create-superuser - app.kubernetes.io/component: setup - spec: - serviceAccountName: saleor-api - restartPolicy: Never - - securityContext: - runAsNonRoot: true - runAsUser: 1000 - fsGroup: 1000 - - containers: - - name: create-superuser - image: ghcr.io/saleor/saleor:3.20 - imagePullPolicy: IfNotPresent - - command: - - /bin/bash - - -c - - | - set -e - echo "Creating superuser..." - python manage.py createsuperuser --noinput --email=$DJANGO_SUPERUSER_EMAIL || echo "Superuser may already exist" - echo "Superuser setup completed!" 
- - envFrom: - - configMapRef: - name: saleor-config - - secretRef: - name: saleor-secrets - - env: - - name: PYTHONUNBUFFERED - value: "1" - - resources: - requests: - cpu: "100m" - memory: "256Mi" - ephemeral-storage: "512Mi" - limits: - cpu: "500m" - memory: "512Mi" - ephemeral-storage: "1Gi" - - securityContext: - allowPrivilegeEscalation: false - readOnlyRootFilesystem: false - capabilities: - drop: - - ALL ---- -# ============================================================================= -# Populate Sample Data Job (Optional) -# ============================================================================= -# Run this job to populate sample products for testing -# ============================================================================= -apiVersion: batch/v1 -kind: Job -metadata: - name: saleor-populate-data - namespace: saleor - labels: - app.kubernetes.io/name: saleor-populate-data - app.kubernetes.io/component: setup -spec: - ttlSecondsAfterFinished: 600 - backoffLimit: 1 - template: - metadata: - labels: - app.kubernetes.io/name: saleor-populate-data - app.kubernetes.io/component: setup - spec: - serviceAccountName: saleor-api - restartPolicy: Never - - securityContext: - runAsNonRoot: true - runAsUser: 1000 - fsGroup: 1000 - - containers: - - name: populate-data - image: ghcr.io/saleor/saleor:3.20 - imagePullPolicy: IfNotPresent - - command: - - /bin/bash - - -c - - | - set -e - echo "Populating sample data..." - python manage.py populatedb --createsuperuser || echo "Data may already exist" - echo "Sample data populated!" 
- - envFrom: - - configMapRef: - name: saleor-config - - secretRef: - name: saleor-secrets - - env: - - name: PYTHONUNBUFFERED - value: "1" - - resources: - requests: - cpu: "250m" - memory: "512Mi" - ephemeral-storage: "1Gi" - limits: - cpu: "1" - memory: "2Gi" - ephemeral-storage: "2Gi" - - securityContext: - allowPrivilegeEscalation: false - readOnlyRootFilesystem: false - capabilities: - drop: - - ALL diff --git a/examples/demo-environment/kubernetes/saleor/overlays/dev/kustomization.yaml b/examples/demo-environment/kubernetes/saleor/overlays/dev/kustomization.yaml deleted file mode 100644 index 28e3ff7..0000000 --- a/examples/demo-environment/kubernetes/saleor/overlays/dev/kustomization.yaml +++ /dev/null @@ -1,82 +0,0 @@ -# ============================================================================= -# Dev Environment Kustomization -# ============================================================================= -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -metadata: - name: saleor-dev - -namespace: saleor - -resources: - - ../../base - -# Image overrides for dev -images: - - name: saleor-api - newName: ghcr.io/saleor/saleor - newTag: "3.20" - - name: saleor-dashboard - newName: ghcr.io/saleor/saleor-dashboard - newTag: "3.20" - -# Labels for dev environment -commonLabels: - environment: dev - -# Config patches -patches: - # Reduce replicas for dev - - target: - kind: Deployment - name: saleor-api - patch: |- - - op: replace - path: /spec/replicas - value: 1 - - - target: - kind: HorizontalPodAutoscaler - name: saleor-api-hpa - patch: |- - - op: replace - path: /spec/minReplicas - value: 1 - - op: replace - path: /spec/maxReplicas - value: 3 - - # Reduce resources for dev - - target: - kind: Deployment - name: saleor-api - patch: |- - - op: replace - path: /spec/template/spec/containers/0/resources/requests/cpu - value: "250m" - - op: replace - path: /spec/template/spec/containers/0/resources/requests/memory - value: "512Mi" - - op: 
replace - path: /spec/template/spec/containers/0/resources/limits/cpu - value: "1" - - op: replace - path: /spec/template/spec/containers/0/resources/limits/memory - value: "2Gi" - -# ConfigMap generator for dev-specific values -configMapGenerator: - - name: saleor-config - behavior: merge - literals: - - DEBUG=True - - LOG_LEVEL=DEBUG - - GRAPHQL_PLAYGROUND=True - -# Secret generator (example - use external secrets in real deployments) -secretGenerator: - - name: saleor-secrets - behavior: merge - literals: - - SECRET_KEY=dev-secret-key-change-in-production diff --git a/examples/demo-environment/loadtest/Dockerfile b/examples/demo-environment/loadtest/Dockerfile deleted file mode 100644 index 5d3c235..0000000 --- a/examples/demo-environment/loadtest/Dockerfile +++ /dev/null @@ -1,27 +0,0 @@ -FROM python:3.12-slim - -LABEL maintainer="Helios Team" -LABEL description="Locust load testing for Saleor e-commerce" - -WORKDIR /app - -# Install dependencies -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -# Copy locust files -COPY locustfiles/ ./locustfiles/ - -# Set Python path -ENV PYTHONPATH=/app/locustfiles - -# Default environment variables -ENV LOCUST_HOST=http://saleor-api.saleor.svc.cluster.local -ENV ADMIN_EMAIL=admin@example.com -ENV ADMIN_PASSWORD=admin123456 - -# Expose Locust web UI and worker ports -EXPOSE 8089 5557 5558 - -# Default command runs web UI mode -CMD ["locust", "-f", "/app/locustfiles/locustfile.py", "--web-host", "0.0.0.0"] diff --git a/examples/demo-environment/loadtest/README.md b/examples/demo-environment/loadtest/README.md deleted file mode 100644 index 059cbbd..0000000 --- a/examples/demo-environment/loadtest/README.md +++ /dev/null @@ -1,184 +0,0 @@ -# Prescale Load Testing with Locust - -Load testing suite for generating realistic e-commerce traffic patterns that Prescale can learn from. 
- -## Quick Start - -### Local Development - -```powershell -# Install dependencies -cd loadtest -pip install -r requirements.txt - -# Run with web UI (interactive) -cd locustfiles -locust -f locustfile.py --host=http://localhost:8000 - -# Open http://localhost:8089 in browser -``` - -### Kubernetes Deployment - -```powershell -# Deploy Locust to cluster -kubectl apply -k infra/kubernetes/locust/base - -# Port forward the web UI -kubectl port-forward -n loadtest svc/locust-master 8089:8089 - -# Open http://localhost:8089 -``` - -## Traffic Patterns - -### 1. Mixed Traffic (Default) -Realistic mix of user personas: -- **Browsers (50%)**: Casual browsing, category views, product listing -- **Searchers (25%)**: Heavy search, filtering, sorting -- **Buyers (20%)**: Add to cart, checkout flow -- **Admins (5%)**: Product management (write-heavy) - -```powershell -locust -f locustfile.py --host=http://localhost:8000 -``` - -### 2. Read-Heavy Mode -For testing cache effectiveness and read scaling: - -```powershell -locust -f read_heavy.py --host=http://localhost:8000 -``` - -### 3. 
Write-Heavy Mode -For testing database write scaling: - -```powershell -locust -f write_heavy.py --host=http://localhost:8000 -``` - -## Load Shapes (ML Training Patterns) - -### Ramp Pattern (Trend Detection) -Gradual growth followed by plateau and decline: - -```powershell -locust -f ramp_pattern.py --host=http://localhost:8000 --headless -t 20m -``` - -Pattern: `0 → 100 users (5min) → hold (10min) → 0 users (5min)` - -### Wave Pattern (Seasonality Detection) -Sinusoidal oscillation simulating daily cycles: - -```powershell -locust -f wave_pattern.py --host=http://localhost:8000 --headless -t 30m -``` - -Pattern: `10 ↔ 100 users, 5-minute cycles, 6 cycles total` - -### Spike Pattern (Anomaly Detection) -Flash-sale bursts with recovery periods: - -```powershell -locust -f spike_pattern.py --host=http://localhost:8000 --headless -t 20m -``` - -Pattern: `baseline(20) → spike(200) → recovery, 3 cycles` - -## Headless Mode (CI/CD) - -Run without web UI for automated testing: - -```powershell -# 50 users, 5/sec spawn rate, 10 minute duration -locust -f locustfile.py --host=http://localhost:8000 \ - --headless -u 50 -r 5 -t 10m \ - --csv=results/test -``` - -## Metrics Output - -Locust generates metrics in multiple formats: - -### CSV Reports -``` ---csv=results/test -``` -Generates: -- `test_stats.csv` - Request statistics -- `test_failures.csv` - Failed requests -- `test_stats_history.csv` - Time-series data -- `test_exceptions.csv` - Exceptions - -### JSON Report (Single File) -``` ---json -``` - -### Prometheus Metrics -The Locust web UI exposes metrics at `/metrics` in Prometheus format. 
- -## Environment Variables - -| Variable | Default | Description | -|----------|---------|-------------| -| `LOCUST_HOST` | - | Target API URL | -| `ADMIN_EMAIL` | admin@example.com | Admin user email | -| `ADMIN_PASSWORD` | admin123456 | Admin user password | -| `LOCUST_USERS` | 50 | Number of simulated users | -| `LOCUST_SPAWN_RATE` | 5 | Users spawned per second | -| `LOCUST_RUN_TIME` | 30m | Test duration | - -## Project Structure - -``` -loadtest/ -├── Dockerfile # Container image -├── requirements.txt # Python dependencies -├── README.md # This file -└── locustfiles/ - ├── locustfile.py # Combined (default) - ├── read_heavy.py # Read-focused - ├── write_heavy.py # Write-focused - ├── ramp_pattern.py # Ramp shape - ├── wave_pattern.py # Wave shape - ├── spike_pattern.py # Spike shape - ├── common/ - │ ├── __init__.py - │ ├── utils.py # Helpers - │ └── graphql_client.py # Saleor queries - ├── personas/ - │ ├── __init__.py - │ ├── browser.py # Casual browsing - │ ├── searcher.py # Search-heavy - │ ├── buyer.py # Purchase flow - │ └── admin.py # Admin ops - └── shapes/ - ├── __init__.py - └── load_shapes.py # Custom shapes -``` - -## Scaling Workers (Kubernetes) - -For higher load, scale the worker pods: - -```powershell -kubectl scale deployment locust-worker -n loadtest --replicas=5 -``` - -Each worker can simulate ~500-1000 users depending on resources. - -## Integration with Prescale - -The generated traffic produces: -1. **QPS metrics** - Requests per second by endpoint -2. **Latency distributions** - p50, p95, p99 response times -3. **Error rates** - Failed request percentages -4. 
**Throughput patterns** - Time-series data for ML training - -These metrics feed into Prescale for: -- Trend detection and forecasting -- Anomaly detection -- Capacity planning predictions -- Auto-scaling decisions diff --git a/examples/demo-environment/loadtest/locustfiles/common/__init__.py b/examples/demo-environment/loadtest/locustfiles/common/__init__.py deleted file mode 100644 index 67bf087..0000000 --- a/examples/demo-environment/loadtest/locustfiles/common/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Common utilities package -from .utils import ( - GraphQLMixin, - random_email, - random_address, - random_search_term, - random_product_name, - SEARCH_TERMS, -) -from .graphql_client import SaleorGraphQL - -__all__ = [ - "GraphQLMixin", - "SaleorGraphQL", - "random_email", - "random_address", - "random_search_term", - "random_product_name", - "SEARCH_TERMS", -] diff --git a/examples/demo-environment/loadtest/locustfiles/common/graphql_client.py b/examples/demo-environment/loadtest/locustfiles/common/graphql_client.py deleted file mode 100644 index 91d34c3..0000000 --- a/examples/demo-environment/loadtest/locustfiles/common/graphql_client.py +++ /dev/null @@ -1,308 +0,0 @@ -""" -GraphQL client utilities for Saleor API interactions. -""" -import json -from typing import Any, Optional - - -class SaleorGraphQL: - """Helper class for Saleor GraphQL operations.""" - - # ============== QUERIES ============== - - SHOP_INFO = """ - query ShopInfo { - shop { - name - description - defaultCountry { code country } - } - } - """ - - CATEGORIES = """ - query Categories($first: Int!) 
{ - categories(first: $first) { - edges { - node { - id - name - slug - description - products(first: 5) { - totalCount - } - } - } - } - } - """ - - CATEGORY_PRODUCTS = """ - query CategoryProducts($slug: String!, $first: Int!, $after: String) { - category(slug: $slug) { - id - name - products(first: $first, after: $after, channel: "default-channel") { - edges { - node { - id - name - slug - thumbnail { url } - pricing { - priceRange { - start { gross { amount currency } } - } - } - } - } - pageInfo { hasNextPage endCursor } - } - } - } - """ - - PRODUCTS = """ - query Products($first: Int!, $after: String, $filter: ProductFilterInput, $sortBy: ProductOrder) { - products(first: $first, after: $after, filter: $filter, sortBy: $sortBy, channel: "default-channel") { - edges { - node { - id - name - slug - thumbnail { url } - category { name } - pricing { - priceRange { - start { gross { amount currency } } - stop { gross { amount currency } } - } - } - } - } - pageInfo { hasNextPage endCursor } - totalCount - } - } - """ - - PRODUCT_DETAIL = """ - query ProductDetail($slug: String!) { - product(slug: $slug, channel: "default-channel") { - id - name - slug - description - category { id name } - thumbnail { url } - media { url type } - variants { - id - name - sku - pricing { - price { gross { amount currency } } - } - quantityAvailable - } - pricing { - priceRange { - start { gross { amount currency } } - stop { gross { amount currency } } - } - } - } - } - """ - - SEARCH_PRODUCTS = """ - query SearchProducts($search: String!, $first: Int!) { - products(first: $first, filter: { search: $search }, channel: "default-channel") { - edges { - node { - id - name - slug - thumbnail { url } - pricing { - priceRange { - start { gross { amount currency } } - } - } - } - } - totalCount - } - } - """ - - CHECKOUT = """ - query Checkout($id: ID!) 
{ - checkout(id: $id) { - id - token - email - lines { - id - quantity - variant { id name } - totalPrice { gross { amount currency } } - } - subtotalPrice { gross { amount currency } } - totalPrice { gross { amount currency } } - shippingMethods { - id - name - price { amount currency } - } - } - } - """ - - # ============== MUTATIONS ============== - - CHECKOUT_CREATE = """ - mutation CheckoutCreate($input: CheckoutCreateInput!) { - checkoutCreate(input: $input) { - checkout { - id - token - lines { id quantity variant { id name } } - } - errors { field message code } - } - } - """ - - CHECKOUT_LINES_ADD = """ - mutation CheckoutLinesAdd($id: ID!, $lines: [CheckoutLineInput!]!) { - checkoutLinesAdd(id: $id, lines: $lines) { - checkout { - id - lines { id quantity variant { id name } } - totalPrice { gross { amount currency } } - } - errors { field message code } - } - } - """ - - CHECKOUT_EMAIL_UPDATE = """ - mutation CheckoutEmailUpdate($id: ID!, $email: String!) { - checkoutEmailUpdate(id: $id, email: $email) { - checkout { id email } - errors { field message code } - } - } - """ - - CHECKOUT_SHIPPING_ADDRESS_UPDATE = """ - mutation CheckoutShippingAddressUpdate($id: ID!, $address: AddressInput!) { - checkoutShippingAddressUpdate(id: $id, shippingAddress: $address) { - checkout { - id - shippingAddress { firstName lastName streetAddress1 city country { code } } - } - errors { field message code } - } - } - """ - - CHECKOUT_BILLING_ADDRESS_UPDATE = """ - mutation CheckoutBillingAddressUpdate($id: ID!, $address: AddressInput!) { - checkoutBillingAddressUpdate(id: $id, billingAddress: $address) { - checkout { - id - billingAddress { firstName lastName streetAddress1 city country { code } } - } - errors { field message code } - } - } - """ - - CHECKOUT_DELIVERY_METHOD_UPDATE = """ - mutation CheckoutDeliveryMethodUpdate($id: ID!, $deliveryMethodId: ID!) 
{ - checkoutDeliveryMethodUpdate(id: $id, deliveryMethodId: $deliveryMethodId) { - checkout { - id - deliveryMethod { - ... on ShippingMethod { id name } - } - } - errors { field message code } - } - } - """ - - CHECKOUT_COMPLETE = """ - mutation CheckoutComplete($id: ID!) { - checkoutComplete(id: $id) { - order { - id - number - status - total { gross { amount currency } } - } - errors { field message code } - } - } - """ - - # Admin mutations - PRODUCT_CREATE = """ - mutation ProductCreate($input: ProductCreateInput!) { - productCreate(input: $input) { - product { - id - name - slug - } - errors { field message code } - } - } - """ - - PRODUCT_UPDATE = """ - mutation ProductUpdate($id: ID!, $input: ProductInput!) { - productUpdate(id: $id, input: $input) { - product { - id - name - description - } - errors { field message code } - } - } - """ - - PRODUCT_VARIANT_CREATE = """ - mutation ProductVariantCreate($input: ProductVariantCreateInput!) { - productVariantCreate(input: $input) { - productVariant { - id - name - sku - } - errors { field message code } - } - } - """ - - TOKEN_CREATE = """ - mutation TokenCreate($email: String!, $password: String!) { - tokenCreate(email: $email, password: $password) { - token - refreshToken - errors { field message code } - } - } - """ - - @staticmethod - def build_request(query: str, variables: Optional[dict] = None) -> dict: - """Build a GraphQL request payload.""" - payload = {"query": query} - if variables: - payload["variables"] = variables - return payload diff --git a/examples/demo-environment/loadtest/locustfiles/common/utils.py b/examples/demo-environment/loadtest/locustfiles/common/utils.py deleted file mode 100644 index f9a5369..0000000 --- a/examples/demo-environment/loadtest/locustfiles/common/utils.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -Common utilities and base classes for Locust load tests. 
-""" -import json -import random -import string -from typing import Any, Optional, List -from locust import HttpUser, between - - -# Sample data for generating realistic requests -SEARCH_TERMS = [ - "shirt", "shoes", "pants", "jacket", "dress", "hat", "bag", "watch", - "phone", "laptop", "headphones", "camera", "book", "toy", "game", - "kitchen", "furniture", "decor", "outdoor", "sports", "fitness" -] - -FIRST_NAMES = [ - "John", "Jane", "Michael", "Sarah", "David", "Emily", "Chris", "Amanda", - "James", "Jessica", "Robert", "Ashley", "William", "Megan", "Daniel", "Lauren" -] - -LAST_NAMES = [ - "Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", - "Rodriguez", "Martinez", "Anderson", "Taylor", "Thomas", "Moore", "Jackson" -] - -CITIES = [ - ("New York", "NY", "10001"), - ("Los Angeles", "CA", "90001"), - ("Chicago", "IL", "60601"), - ("Houston", "TX", "77001"), - ("Phoenix", "AZ", "85001"), - ("Philadelphia", "PA", "19101"), - ("San Antonio", "TX", "78201"), - ("San Diego", "CA", "92101"), -] - -STREET_TYPES = ["Street", "Avenue", "Boulevard", "Drive", "Lane", "Road", "Way"] - - -def random_email() -> str: - """Generate a random email address.""" - username = ''.join(random.choices(string.ascii_lowercase, k=8)) - domain = random.choice(["example.com", "test.com", "loadtest.local"]) - return f"{username}@{domain}" - - -def random_address() -> dict: - """Generate a random US address.""" - city, state, postal = random.choice(CITIES) - street_num = random.randint(100, 9999) - street_name = ''.join(random.choices(string.ascii_uppercase, k=6)).title() - street_type = random.choice(STREET_TYPES) - - return { - "firstName": random.choice(FIRST_NAMES), - "lastName": random.choice(LAST_NAMES), - "streetAddress1": f"{street_num} {street_name} {street_type}", - "city": city, - "postalCode": postal, - "country": "US", - "countryArea": state, - } - - -def random_search_term() -> str: - """Get a random search term.""" - return 
random.choice(SEARCH_TERMS) - - -def random_product_name() -> str: - """Generate a random product name.""" - adjectives = ["Premium", "Deluxe", "Classic", "Modern", "Vintage", "Ultra", "Pro"] - nouns = ["Widget", "Gadget", "Device", "Tool", "Item", "Product", "Gear"] - return f"{random.choice(adjectives)} {random.choice(nouns)} {random.randint(100, 999)}" - - -class GraphQLMixin: - """Mixin for GraphQL operations in Locust users.""" - - def graphql( - self, - query: str, - variables: Optional[dict] = None, - name: Optional[str] = None, - auth_token: Optional[str] = None - ) -> dict: - """Execute a GraphQL query/mutation.""" - headers = {"Content-Type": "application/json"} - if auth_token: - headers["Authorization"] = f"Bearer {auth_token}" - - payload = {"query": query} - if variables: - payload["variables"] = variables - - with self.client.post( - "/graphql/", - json=payload, - headers=headers, - name=name or "GraphQL", - catch_response=True - ) as response: - if response.status_code != 200: - response.failure(f"HTTP {response.status_code}") - return {} - - try: - data = response.json() - if "errors" in data and data["errors"]: - # Check if it's a critical error - error_msg = data["errors"][0].get("message", "Unknown error") - if "not found" not in error_msg.lower(): - response.failure(f"GraphQL Error: {error_msg}") - return data - except json.JSONDecodeError: - response.failure("Invalid JSON response") - return {} - - def extract_ids(self, data: dict, path: str) -> List[str]: - """Extract IDs from a GraphQL response using a dot-notation path.""" - try: - parts = path.split(".") - current = data - for part in parts: - if part == "edges": - return [edge["node"]["id"] for edge in current.get("edges", [])] - current = current.get(part, {}) - return [] - except (KeyError, TypeError): - return [] diff --git a/examples/demo-environment/loadtest/locustfiles/locustfile.py b/examples/demo-environment/loadtest/locustfiles/locustfile.py deleted file mode 100644 index 
9d17fc2..0000000 --- a/examples/demo-environment/loadtest/locustfiles/locustfile.py +++ /dev/null @@ -1,15 +0,0 @@ -""" -Combined locustfile with all personas. -Use this for realistic mixed traffic patterns. - -Usage: - locust -f locustfile.py --host=http://localhost:8000 -""" -from personas import BrowserUser, SearcherUser, BuyerUser, AdminUser - -# All user classes are automatically discovered by Locust -# Weight distribution (defined in each class): -# BrowserUser: 50 (casual browsing) -# SearcherUser: 25 (search-heavy) -# BuyerUser: 20 (purchase journey) -# AdminUser: 5 (admin operations) diff --git a/examples/demo-environment/loadtest/locustfiles/personas/__init__.py b/examples/demo-environment/loadtest/locustfiles/personas/__init__.py deleted file mode 100644 index 37e6d94..0000000 --- a/examples/demo-environment/loadtest/locustfiles/personas/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Persona exports -from .browser import BrowserUser -from .searcher import SearcherUser -from .buyer import BuyerUser -from .admin import AdminUser - -__all__ = ["BrowserUser", "SearcherUser", "BuyerUser", "AdminUser"] diff --git a/examples/demo-environment/loadtest/locustfiles/personas/admin.py b/examples/demo-environment/loadtest/locustfiles/personas/admin.py deleted file mode 100644 index 976ef41..0000000 --- a/examples/demo-environment/loadtest/locustfiles/personas/admin.py +++ /dev/null @@ -1,203 +0,0 @@ -""" -Admin Persona - Administrative operations. - -Simulates admin users who: -- Create and update products -- Manage inventory -- Generate write pressure on the system -""" -import random -import string -from locust import HttpUser, task, between, tag - -from common import GraphQLMixin, SaleorGraphQL, random_product_name - - -class AdminUser(HttpUser, GraphQLMixin): - """ - Simulates an admin user performing write-heavy operations. - Requires authentication. 
- """ - - # Admins work more slowly and deliberately - wait_time = between(3, 8) - weight = 5 # Rare but impactful - - def on_start(self): - """Initialize admin session and authenticate.""" - self.auth_token = None - self.product_types = [] - self.categories = [] - self.created_products = [] - - # Authenticate - self._authenticate() - - # Fetch product types and categories - if self.auth_token: - self._fetch_metadata() - - def _authenticate(self): - """Authenticate as admin user.""" - # Use environment variables in production - import os - email = os.environ.get("ADMIN_EMAIL", "admin@example.com") - password = os.environ.get("ADMIN_PASSWORD", "admin123456") - - data = self.graphql( - SaleorGraphQL.TOKEN_CREATE, - variables={"email": email, "password": password}, - name="[Admin] Login" - ) - - if data and "data" in data: - token_data = data.get("data", {}).get("tokenCreate", {}) - self.auth_token = token_data.get("token") - - def _fetch_metadata(self): - """Fetch product types and categories for creating products.""" - # Fetch categories - data = self.graphql( - SaleorGraphQL.CATEGORIES, - variables={"first": 50}, - name="[Admin] Fetch Categories", - auth_token=self.auth_token - ) - - if data and "data" in data: - edges = data.get("data", {}).get("categories", {}).get("edges", []) - self.categories = [e["node"]["id"] for e in edges] - - @task(15) - @tag("read", "admin") - def view_products(self): - """View product listing (admin dashboard).""" - if not self.auth_token: - self._authenticate() - return - - self.graphql( - SaleorGraphQL.PRODUCTS, - variables={"first": 50}, - name="[Admin] View Products", - auth_token=self.auth_token - ) - - @task(10) - @tag("write", "admin", "product") - def create_product(self): - """Create a new product.""" - if not self.auth_token: - self._authenticate() - return - - if not self.categories: - self._fetch_metadata() - return - - product_name = random_product_name() - slug = product_name.lower().replace(" ", "-") + "-" + 
''.join(random.choices(string.ascii_lowercase, k=4)) - - # Note: This requires a product type ID which we'd need to fetch - # For now, this demonstrates the pattern - data = self.graphql( - """ - mutation CreateProduct($input: ProductCreateInput!) { - productCreate(input: $input) { - product { id name slug } - errors { field message } - } - } - """, - variables={ - "input": { - "name": product_name, - "slug": slug, - "description": f"Load test product: {product_name}", - "category": random.choice(self.categories) if self.categories else None, - } - }, - name="[Admin] Create Product", - auth_token=self.auth_token - ) - - if data and "data" in data: - product = data.get("data", {}).get("productCreate", {}).get("product", {}) - if product: - self.created_products.append(product.get("id")) - - @task(20) - @tag("write", "admin", "product") - def update_product(self): - """Update an existing product.""" - if not self.auth_token: - self._authenticate() - return - - # First fetch a product to update - data = self.graphql( - SaleorGraphQL.PRODUCTS, - variables={"first": 10}, - name="[Admin] Fetch for Update", - auth_token=self.auth_token - ) - - if data and "data" in data: - edges = data.get("data", {}).get("products", {}).get("edges", []) - if edges: - product = random.choice(edges)["node"] - - # Update the product - self.graphql( - SaleorGraphQL.PRODUCT_UPDATE, - variables={ - "id": product["id"], - "input": { - "description": f"Updated at {random.randint(1000, 9999)} by load test" - } - }, - name="[Admin] Update Product", - auth_token=self.auth_token - ) - - @task(5) - @tag("read", "admin") - def view_categories(self): - """View category listing.""" - if not self.auth_token: - self._authenticate() - return - - self.graphql( - SaleorGraphQL.CATEGORIES, - variables={"first": 50}, - name="[Admin] View Categories", - auth_token=self.auth_token - ) - - @task(3) - @tag("read", "admin") - def view_product_detail(self): - """View detailed product info (admin editing).""" - if 
not self.auth_token: - self._authenticate() - return - - # Fetch products first - data = self.graphql( - SaleorGraphQL.PRODUCTS, - variables={"first": 20}, - name="[Admin] Fetch Product List", - auth_token=self.auth_token - ) - - if data and "data" in data: - edges = data.get("data", {}).get("products", {}).get("edges", []) - if edges: - product = random.choice(edges)["node"] - self.graphql( - SaleorGraphQL.PRODUCT_DETAIL, - variables={"slug": product["slug"]}, - name="[Admin] View Product Detail", - auth_token=self.auth_token - ) diff --git a/examples/demo-environment/loadtest/locustfiles/personas/browser.py b/examples/demo-environment/loadtest/locustfiles/personas/browser.py deleted file mode 100644 index 46d572e..0000000 --- a/examples/demo-environment/loadtest/locustfiles/personas/browser.py +++ /dev/null @@ -1,148 +0,0 @@ -""" -Browser Persona - Casual browsing behavior. - -Simulates users who: -- Browse category listings -- View product details -- Look at multiple pages -- Don't necessarily buy -""" -import random -from locust import HttpUser, task, between, tag - -from common import GraphQLMixin, SaleorGraphQL - - -class BrowserUser(HttpUser, GraphQLMixin): - """ - Simulates a casual browser who browses categories and views products. - This is read-heavy traffic - no cart or checkout operations. 
- """ - - # Wait 2-5 seconds between requests (simulates reading time) - wait_time = between(2, 5) - weight = 50 # Most common user type - - def on_start(self): - """Initialize user session and fetch available categories/products.""" - self.categories = [] - self.products = [] - self.product_slugs = [] - - # Fetch categories on start - self._fetch_categories() - - def _fetch_categories(self): - """Fetch available categories.""" - data = self.graphql( - SaleorGraphQL.CATEGORIES, - variables={"first": 20}, - name="[Browser] Get Categories" - ) - - if data and "data" in data: - edges = data.get("data", {}).get("categories", {}).get("edges", []) - self.categories = [ - {"id": e["node"]["id"], "slug": e["node"]["slug"], "name": e["node"]["name"]} - for e in edges - ] - - def _fetch_products(self, category_slug: str = None): - """Fetch products, optionally filtered by category.""" - if category_slug: - data = self.graphql( - SaleorGraphQL.CATEGORY_PRODUCTS, - variables={"slug": category_slug, "first": 12}, - name="[Browser] Category Products" - ) - if data and "data" in data: - products = data.get("data", {}).get("category", {}).get("products", {}).get("edges", []) - self.products = [ - {"id": e["node"]["id"], "slug": e["node"]["slug"], "name": e["node"]["name"]} - for e in products - ] - else: - data = self.graphql( - SaleorGraphQL.PRODUCTS, - variables={"first": 20}, - name="[Browser] All Products" - ) - if data and "data" in data: - edges = data.get("data", {}).get("products", {}).get("edges", []) - self.products = [ - {"id": e["node"]["id"], "slug": e["node"]["slug"], "name": e["node"]["name"]} - for e in edges - ] - - @task(10) - @tag("read", "browse") - def browse_categories(self): - """Browse the category listing.""" - self.graphql( - SaleorGraphQL.CATEGORIES, - variables={"first": 20}, - name="[Browser] Browse Categories" - ) - - @task(30) - @tag("read", "browse") - def browse_category_products(self): - """Browse products in a specific category.""" - if not 
self.categories: - self._fetch_categories() - - if self.categories: - category = random.choice(self.categories) - self._fetch_products(category["slug"]) - - @task(40) - @tag("read", "browse") - def view_product_list(self): - """View the main product listing with pagination.""" - page_size = random.choice([12, 24, 48]) - - data = self.graphql( - SaleorGraphQL.PRODUCTS, - variables={ - "first": page_size, - "sortBy": random.choice([ - {"field": "NAME", "direction": "ASC"}, - {"field": "PRICE", "direction": "ASC"}, - {"field": "PRICE", "direction": "DESC"}, - {"field": "DATE", "direction": "DESC"}, - ]) - }, - name="[Browser] Product List" - ) - - # Cache some products for detail views - if data and "data" in data: - edges = data.get("data", {}).get("products", {}).get("edges", []) - self.products = [ - {"id": e["node"]["id"], "slug": e["node"]["slug"], "name": e["node"]["name"]} - for e in edges - ] - - @task(20) - @tag("read", "browse", "detail") - def view_product_detail(self): - """View a specific product's detail page.""" - if not self.products: - self._fetch_products() - - if self.products: - product = random.choice(self.products) - self.graphql( - SaleorGraphQL.PRODUCT_DETAIL, - variables={"slug": product["slug"]}, - name="[Browser] Product Detail" - ) - - @task(5) - @tag("read") - def get_shop_info(self): - """Fetch shop information (header/footer data).""" - self.graphql( - SaleorGraphQL.SHOP_INFO, - name="[Browser] Shop Info" - ) diff --git a/examples/demo-environment/loadtest/locustfiles/personas/buyer.py b/examples/demo-environment/loadtest/locustfiles/personas/buyer.py deleted file mode 100644 index c8169d3..0000000 --- a/examples/demo-environment/loadtest/locustfiles/personas/buyer.py +++ /dev/null @@ -1,256 +0,0 @@ -""" -Buyer Persona - Full purchase journey. 
- -Simulates users who: -- Browse products -- Add items to cart -- Go through checkout process -- Complete (or abandon) purchases -""" -import random -from locust import HttpUser, task, between, tag, events - -from common import GraphQLMixin, SaleorGraphQL, random_email, random_address - - -class BuyerUser(HttpUser, GraphQLMixin): - """ - Simulates a buyer going through the full purchase journey. - Mix of read and write operations. - """ - - # Moderate wait times - buyers are engaged but thoughtful - wait_time = between(2, 4) - weight = 20 # Less common but high value - - def on_start(self): - """Initialize buyer session.""" - self.products = [] - self.product_variants = {} # product_id -> [variant_ids] - self.checkout_id = None - self.checkout_token = None - self.cart_items = 0 - - # Pre-fetch some products - self._fetch_products() - - def _fetch_products(self): - """Fetch available products with variants.""" - data = self.graphql( - SaleorGraphQL.PRODUCTS, - variables={"first": 30}, - name="[Buyer] Fetch Products" - ) - - if data and "data" in data: - edges = data.get("data", {}).get("products", {}).get("edges", []) - self.products = [ - {"id": e["node"]["id"], "slug": e["node"]["slug"], "name": e["node"]["name"]} - for e in edges - ] - - def _get_product_variants(self, product_slug: str) -> list: - """Get variants for a product.""" - if product_slug in self.product_variants: - return self.product_variants[product_slug] - - data = self.graphql( - SaleorGraphQL.PRODUCT_DETAIL, - variables={"slug": product_slug}, - name="[Buyer] Get Variants" - ) - - variants = [] - if data and "data" in data: - product = data.get("data", {}).get("product", {}) - if product: - variants = [ - {"id": v["id"], "name": v["name"]} - for v in product.get("variants", []) - if v.get("quantityAvailable", 0) > 0 - ] - self.product_variants[product_slug] = variants - - return variants - - def _create_checkout(self, variant_id: str, quantity: int = 1): - """Create a new checkout with an 
item.""" - email = random_email() - - data = self.graphql( - SaleorGraphQL.CHECKOUT_CREATE, - variables={ - "input": { - "channel": "default-channel", - "email": email, - "lines": [{"variantId": variant_id, "quantity": quantity}] - } - }, - name="[Buyer] Create Checkout" - ) - - if data and "data" in data: - checkout = data.get("data", {}).get("checkoutCreate", {}).get("checkout", {}) - if checkout: - self.checkout_id = checkout.get("id") - self.checkout_token = checkout.get("token") - self.cart_items = quantity - return True - - return False - - @task(25) - @tag("read", "browse") - def browse_products(self): - """Browse products (buyer still browses).""" - data = self.graphql( - SaleorGraphQL.PRODUCTS, - variables={"first": 20}, - name="[Buyer] Browse Products" - ) - - if data and "data" in data: - edges = data.get("data", {}).get("products", {}).get("edges", []) - self.products = [ - {"id": e["node"]["id"], "slug": e["node"]["slug"], "name": e["node"]["name"]} - for e in edges - ] - - @task(20) - @tag("read", "browse", "detail") - def view_product_detail(self): - """View product detail (pre-purchase research).""" - if not self.products: - self._fetch_products() - - if self.products: - product = random.choice(self.products) - self._get_product_variants(product["slug"]) - - @task(15) - @tag("write", "cart") - def add_to_cart(self): - """Add item to cart.""" - if not self.products: - self._fetch_products() - - if not self.products: - return - - product = random.choice(self.products) - variants = self._get_product_variants(product["slug"]) - - if not variants: - return - - variant = random.choice(variants) - quantity = random.randint(1, 3) - - if not self.checkout_id: - # Create new checkout - self._create_checkout(variant["id"], quantity) - else: - # Add to existing checkout - data = self.graphql( - SaleorGraphQL.CHECKOUT_LINES_ADD, - variables={ - "id": self.checkout_id, - "lines": [{"variantId": variant["id"], "quantity": quantity}] - }, - name="[Buyer] Add to 
Cart" - ) - - if data and "data" in data: - checkout = data.get("data", {}).get("checkoutLinesAdd", {}).get("checkout", {}) - if checkout: - self.cart_items += quantity - - @task(10) - @tag("read", "cart") - def view_cart(self): - """View current cart.""" - if self.checkout_id: - self.graphql( - SaleorGraphQL.CHECKOUT, - variables={"id": self.checkout_id}, - name="[Buyer] View Cart" - ) - - @task(8) - @tag("write", "checkout") - def set_shipping_address(self): - """Set shipping address on checkout.""" - if not self.checkout_id or self.cart_items == 0: - return - - address = random_address() - - self.graphql( - SaleorGraphQL.CHECKOUT_SHIPPING_ADDRESS_UPDATE, - variables={ - "id": self.checkout_id, - "address": address - }, - name="[Buyer] Set Shipping Address" - ) - - @task(8) - @tag("write", "checkout") - def set_billing_address(self): - """Set billing address on checkout.""" - if not self.checkout_id or self.cart_items == 0: - return - - address = random_address() - - self.graphql( - SaleorGraphQL.CHECKOUT_BILLING_ADDRESS_UPDATE, - variables={ - "id": self.checkout_id, - "address": address - }, - name="[Buyer] Set Billing Address" - ) - - @task(5) - @tag("write", "checkout") - def complete_checkout(self): - """Attempt to complete the checkout (order placement).""" - if not self.checkout_id or self.cart_items == 0: - return - - # Set addresses first - address = random_address() - - self.graphql( - SaleorGraphQL.CHECKOUT_SHIPPING_ADDRESS_UPDATE, - variables={"id": self.checkout_id, "address": address}, - name="[Buyer] Checkout - Shipping" - ) - - self.graphql( - SaleorGraphQL.CHECKOUT_BILLING_ADDRESS_UPDATE, - variables={"id": self.checkout_id, "address": address}, - name="[Buyer] Checkout - Billing" - ) - - # Complete checkout - data = self.graphql( - SaleorGraphQL.CHECKOUT_COMPLETE, - variables={"id": self.checkout_id}, - name="[Buyer] Complete Checkout" - ) - - # Reset cart regardless of outcome - self.checkout_id = None - self.checkout_token = None - 
self.cart_items = 0 - - @task(4) - @tag("write", "cart") - def abandon_cart(self): - """Abandon current cart (simulate cart abandonment).""" - if self.checkout_id and random.random() < 0.3: # 30% abandon rate - self.checkout_id = None - self.checkout_token = None - self.cart_items = 0 diff --git a/examples/demo-environment/loadtest/locustfiles/personas/searcher.py b/examples/demo-environment/loadtest/locustfiles/personas/searcher.py deleted file mode 100644 index 7d468ad..0000000 --- a/examples/demo-environment/loadtest/locustfiles/personas/searcher.py +++ /dev/null @@ -1,162 +0,0 @@ -""" -Searcher Persona - Search-focused behavior. - -Simulates users who: -- Use search functionality heavily -- Apply filters and sorting -- Compare products -- May or may not proceed to purchase -""" -import random -from locust import HttpUser, task, between, tag - -from common import GraphQLMixin, SaleorGraphQL, random_search_term, SEARCH_TERMS - - -class SearcherUser(HttpUser, GraphQLMixin): - """ - Simulates a user who heavily uses search and filtering. - Read-heavy with emphasis on search operations. 
- """ - - # Shorter wait times - searchers are more impatient - wait_time = between(1, 3) - weight = 25 # Common but less than browsers - - def on_start(self): - """Initialize search session.""" - self.recent_searches = [] - self.found_products = [] - - @task(40) - @tag("read", "search") - def search_products(self): - """Perform a product search.""" - search_term = random_search_term() - - data = self.graphql( - SaleorGraphQL.SEARCH_PRODUCTS, - variables={"search": search_term, "first": 20}, - name=f"[Searcher] Search" - ) - - if data and "data" in data: - edges = data.get("data", {}).get("products", {}).get("edges", []) - self.found_products = [ - {"id": e["node"]["id"], "slug": e["node"]["slug"]} - for e in edges - ] - self.recent_searches.append(search_term) - # Keep only last 5 searches - self.recent_searches = self.recent_searches[-5:] - - @task(20) - @tag("read", "search", "filter") - def search_with_filter(self): - """Search with price filters.""" - search_term = random_search_term() - - # Random price range - min_price = random.choice([0, 10, 25, 50, 100]) - max_price = min_price + random.choice([50, 100, 200, 500]) - - self.graphql( - SaleorGraphQL.PRODUCTS, - variables={ - "first": 20, - "filter": { - "search": search_term, - "price": { - "gte": min_price, - "lte": max_price - } - } - }, - name="[Searcher] Search + Filter" - ) - - @task(15) - @tag("read", "search", "sort") - def search_with_sort(self): - """Search with sorting applied.""" - search_term = random_search_term() - - sort_options = [ - {"field": "NAME", "direction": "ASC"}, - {"field": "NAME", "direction": "DESC"}, - {"field": "PRICE", "direction": "ASC"}, - {"field": "PRICE", "direction": "DESC"}, - {"field": "RATING", "direction": "DESC"}, - {"field": "DATE", "direction": "DESC"}, - ] - - self.graphql( - SaleorGraphQL.PRODUCTS, - variables={ - "first": 20, - "filter": {"search": search_term}, - "sortBy": random.choice(sort_options) - }, - name="[Searcher] Search + Sort" - ) - - @task(10) - 
@tag("read", "search") - def refine_search(self): - """Refine a previous search (simulate user adjusting query).""" - if self.recent_searches: - base_term = random.choice(self.recent_searches) - # Add a modifier - modifiers = ["best", "cheap", "premium", "new", "sale", "top"] - refined_term = f"{random.choice(modifiers)} {base_term}" - - self.graphql( - SaleorGraphQL.SEARCH_PRODUCTS, - variables={"search": refined_term, "first": 20}, - name="[Searcher] Refined Search" - ) - else: - self.search_products() - - @task(15) - @tag("read", "search", "detail") - def view_search_result(self): - """View a product from search results.""" - if not self.found_products: - self.search_products() - - if self.found_products: - product = random.choice(self.found_products) - self.graphql( - SaleorGraphQL.PRODUCT_DETAIL, - variables={"slug": product["slug"]}, - name="[Searcher] View Result" - ) - - @task(5) - @tag("read", "search") - def paginate_results(self): - """Paginate through search results.""" - search_term = random_search_term() - - # First page - data = self.graphql( - SaleorGraphQL.PRODUCTS, - variables={"first": 12, "filter": {"search": search_term}}, - name="[Searcher] Results Page 1" - ) - - # Get next page cursor - if data and "data" in data: - page_info = data.get("data", {}).get("products", {}).get("pageInfo", {}) - if page_info.get("hasNextPage"): - cursor = page_info.get("endCursor") - self.graphql( - SaleorGraphQL.PRODUCTS, - variables={ - "first": 12, - "after": cursor, - "filter": {"search": search_term} - }, - name="[Searcher] Results Page 2" - ) diff --git a/examples/demo-environment/loadtest/locustfiles/ramp_pattern.py b/examples/demo-environment/loadtest/locustfiles/ramp_pattern.py deleted file mode 100644 index 44ddfe9..0000000 --- a/examples/demo-environment/loadtest/locustfiles/ramp_pattern.py +++ /dev/null @@ -1,17 +0,0 @@ -""" -Ramp load pattern with mixed traffic. -Gradual growth - perfect for trend detection training. 
- -Usage: - locust -f ramp_pattern.py --host=http://localhost:8000 --headless -t 20m -""" -from personas import BrowserUser, SearcherUser, BuyerUser -from shapes import RampLoadShape - - -class RampShape(RampLoadShape): - """Ramp from 0 to 100 users over 5 minutes, hold 10 minutes, ramp down.""" - max_users = 100 - ramp_duration = 300 # 5 min ramp up - hold_duration = 600 # 10 min hold - spawn_rate = 5 diff --git a/examples/demo-environment/loadtest/locustfiles/read_heavy.py b/examples/demo-environment/loadtest/locustfiles/read_heavy.py deleted file mode 100644 index a17599d..0000000 --- a/examples/demo-environment/loadtest/locustfiles/read_heavy.py +++ /dev/null @@ -1,20 +0,0 @@ -""" -Read-heavy traffic pattern. -Use this to test read scaling and cache effectiveness. - -Usage: - locust -f read_heavy.py --host=http://localhost:8000 -""" -from locust import HttpUser, between -from personas.browser import BrowserUser -from personas.searcher import SearcherUser - - -class ReadOnlyBrowserUser(BrowserUser): - """Browser with higher weight for read-heavy tests.""" - weight = 70 - - -class ReadOnlySearcherUser(SearcherUser): - """Searcher with adjusted weight for read-heavy tests.""" - weight = 30 diff --git a/examples/demo-environment/loadtest/locustfiles/shapes/__init__.py b/examples/demo-environment/loadtest/locustfiles/shapes/__init__.py deleted file mode 100644 index b468bf2..0000000 --- a/examples/demo-environment/loadtest/locustfiles/shapes/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from .load_shapes import ( - RampLoadShape, - WaveLoadShape, - SpikeLoadShape, - StepLoadShape, - ChaosLoadShape, -) - -__all__ = [ - "RampLoadShape", - "WaveLoadShape", - "SpikeLoadShape", - "StepLoadShape", - "ChaosLoadShape", -] diff --git a/examples/demo-environment/loadtest/locustfiles/shapes/load_shapes.py b/examples/demo-environment/loadtest/locustfiles/shapes/load_shapes.py deleted file mode 100644 index 2397cfb..0000000 --- 
a/examples/demo-environment/loadtest/locustfiles/shapes/load_shapes.py +++ /dev/null @@ -1,213 +0,0 @@ -""" -Load shape definitions for different traffic patterns. - -These shapes control HOW users are spawned over time, creating -patterns that are useful for ML training: -- Ramp: Gradual growth (predictable trend) -- Wave: Periodic surges (seasonality) -- Spike: Flash-sale burst (shock event) -""" -import math -from locust import LoadTestShape - - -class RampLoadShape(LoadTestShape): - """ - Gradual ramp-up pattern - simulates organic traffic growth. - - Perfect for testing: - - Trend detection in time series - - Gradual scaling behavior - - Capacity planning predictions - - Pattern: - - Linear increase from 0 to max_users over ramp_duration - - Hold at max_users for hold_duration - - Linear decrease back to 0 - """ - - # Configuration (can be overridden via environment variables) - max_users = 100 # Peak number of users - ramp_duration = 300 # 5 minutes to ramp up - hold_duration = 600 # 10 minutes at peak - spawn_rate = 5 # Users per second during ramp - - def tick(self): - run_time = self.get_run_time() - - total_duration = self.ramp_duration * 2 + self.hold_duration - - if run_time > total_duration: - return None # Stop the test - - if run_time < self.ramp_duration: - # Ramp up phase - current_users = int(self.max_users * (run_time / self.ramp_duration)) - return (max(1, current_users), self.spawn_rate) - - elif run_time < self.ramp_duration + self.hold_duration: - # Hold phase - return (self.max_users, self.spawn_rate) - - else: - # Ramp down phase - ramp_down_time = run_time - self.ramp_duration - self.hold_duration - current_users = int(self.max_users * (1 - ramp_down_time / self.ramp_duration)) - return (max(1, current_users), self.spawn_rate) - - -class WaveLoadShape(LoadTestShape): - """ - Sinusoidal wave pattern - simulates daily/weekly traffic cycles. 
- - Perfect for testing: - - Seasonality detection - - Periodic scaling patterns - - Time-based capacity adjustments - - Pattern: - - Sine wave oscillation between min_users and max_users - - Period controls the cycle length - - Multiple cycles for better pattern learning - """ - - min_users = 10 # Valley (low traffic) - max_users = 100 # Peak (high traffic) - period = 300 # 5 minutes per cycle - num_cycles = 6 # Number of complete cycles - spawn_rate = 10 # Users per second - - def tick(self): - run_time = self.get_run_time() - total_duration = self.period * self.num_cycles - - if run_time > total_duration: - return None - - # Sine wave: oscillates between -1 and 1 - # Scale to oscillate between min_users and max_users - amplitude = (self.max_users - self.min_users) / 2 - midpoint = self.min_users + amplitude - - current_users = int(midpoint + amplitude * math.sin(2 * math.pi * run_time / self.period)) - current_users = max(self.min_users, min(self.max_users, current_users)) - - return (current_users, self.spawn_rate) - - -class SpikeLoadShape(LoadTestShape): - """ - Spike/burst pattern - simulates flash sales or viral events. 
- - Perfect for testing: - - Anomaly detection - - Burst handling capacity - - Recovery behavior - - Pattern: - - Baseline traffic - - Sudden spike to peak - - Gradual recovery - - Multiple spikes possible - """ - - baseline_users = 20 # Normal traffic - spike_users = 200 # Spike peak - baseline_duration = 120 # 2 minutes baseline before spike - spike_duration = 60 # 1 minute spike - recovery_duration = 180 # 3 minutes recovery - num_spikes = 3 # Number of spikes - spawn_rate = 50 # Fast spawn during spike - - def tick(self): - run_time = self.get_run_time() - cycle_duration = self.baseline_duration + self.spike_duration + self.recovery_duration - total_duration = cycle_duration * self.num_spikes - - if run_time > total_duration: - return None - - # Determine which phase we're in within the current cycle - cycle_time = run_time % cycle_duration - - if cycle_time < self.baseline_duration: - # Baseline phase - return (self.baseline_users, self.spawn_rate // 5) - - elif cycle_time < self.baseline_duration + self.spike_duration: - # Spike phase - rapid increase - spike_progress = (cycle_time - self.baseline_duration) / self.spike_duration - if spike_progress < 0.3: - # Rapid rise (30% of spike duration) - current_users = int(self.baseline_users + (self.spike_users - self.baseline_users) * (spike_progress / 0.3)) - else: - # Hold at peak (70% of spike duration) - current_users = self.spike_users - return (current_users, self.spawn_rate) - - else: - # Recovery phase - gradual decrease - recovery_progress = (cycle_time - self.baseline_duration - self.spike_duration) / self.recovery_duration - current_users = int(self.spike_users - (self.spike_users - self.baseline_users) * recovery_progress) - return (max(self.baseline_users, current_users), self.spawn_rate // 2) - - -class StepLoadShape(LoadTestShape): - """ - Step function pattern - discrete load levels. 
- - Perfect for testing: - - Finding breaking points - - Capacity thresholds - - Step-change behavior - """ - - step_users = [10, 25, 50, 75, 100, 150, 200] # User levels - step_duration = 180 # 3 minutes per step - spawn_rate = 20 - - def tick(self): - run_time = self.get_run_time() - total_duration = self.step_duration * len(self.step_users) - - if run_time > total_duration: - return None - - current_step = int(run_time // self.step_duration) - current_step = min(current_step, len(self.step_users) - 1) - - return (self.step_users[current_step], self.spawn_rate) - - -class ChaosLoadShape(LoadTestShape): - """ - Chaotic/random pattern - unpredictable traffic. - - Perfect for testing: - - Robustness to noise - - Model generalization - - Real-world unpredictability - """ - - min_users = 10 - max_users = 150 - change_interval = 30 # Change every 30 seconds - duration = 1800 # 30 minutes total - spawn_rate = 20 - - import random - _random = random.Random(42) # Seeded for reproducibility - - def tick(self): - run_time = self.get_run_time() - - if run_time > self.duration: - return None - - # Seed based on time interval for reproducibility - interval = int(run_time // self.change_interval) - self._random.seed(42 + interval) - - current_users = self._random.randint(self.min_users, self.max_users) - - return (current_users, self.spawn_rate) diff --git a/examples/demo-environment/loadtest/locustfiles/spike_pattern.py b/examples/demo-environment/loadtest/locustfiles/spike_pattern.py deleted file mode 100644 index 8efa5ee..0000000 --- a/examples/demo-environment/loadtest/locustfiles/spike_pattern.py +++ /dev/null @@ -1,20 +0,0 @@ -""" -Spike load pattern with mixed traffic. -Flash-sale simulation - perfect for anomaly detection training. 
- -Usage: - locust -f spike_pattern.py --host=http://localhost:8000 --headless -t 20m -""" -from personas import BrowserUser, SearcherUser, BuyerUser -from shapes import SpikeLoadShape - - -class SpikeShape(SpikeLoadShape): - """3 spikes with 2 min baseline, 1 min spike, 3 min recovery each.""" - baseline_users = 20 - spike_users = 200 - baseline_duration = 120 # 2 min baseline - spike_duration = 60 # 1 min spike - recovery_duration = 180 # 3 min recovery - num_spikes = 3 - spawn_rate = 50 diff --git a/examples/demo-environment/loadtest/locustfiles/wave_pattern.py b/examples/demo-environment/loadtest/locustfiles/wave_pattern.py deleted file mode 100644 index 29d4d77..0000000 --- a/examples/demo-environment/loadtest/locustfiles/wave_pattern.py +++ /dev/null @@ -1,18 +0,0 @@ -""" -Wave load pattern with mixed traffic. -Sinusoidal pattern - perfect for seasonality detection training. - -Usage: - locust -f wave_pattern.py --host=http://localhost:8000 --headless -t 30m -""" -from personas import BrowserUser, SearcherUser, BuyerUser -from shapes import WaveLoadShape - - -class WaveShape(WaveLoadShape): - """6 sine wave cycles over 30 minutes.""" - min_users = 10 - max_users = 100 - period = 300 # 5 minute cycles - num_cycles = 6 # 30 minutes total - spawn_rate = 10 diff --git a/examples/demo-environment/loadtest/locustfiles/write_heavy.py b/examples/demo-environment/loadtest/locustfiles/write_heavy.py deleted file mode 100644 index a2760cb..0000000 --- a/examples/demo-environment/loadtest/locustfiles/write_heavy.py +++ /dev/null @@ -1,27 +0,0 @@ -""" -Write-heavy traffic pattern. -Use this to test database write scaling and transaction handling. 
- -Usage: - locust -f write_heavy.py --host=http://localhost:8000 -""" -from personas.buyer import BuyerUser -from personas.admin import AdminUser - - -class HighVolumeBuyerUser(BuyerUser): - """Buyer with increased purchase frequency for write-heavy tests.""" - weight = 60 - - # Faster actions for more write pressure - from locust import between - wait_time = between(1, 2) - - -class ActiveAdminUser(AdminUser): - """Admin with higher weight for write-heavy tests.""" - weight = 40 - - # More frequent admin actions - from locust import between - wait_time = between(2, 4) diff --git a/examples/demo-environment/loadtest/probe_metrics.py b/examples/demo-environment/loadtest/probe_metrics.py deleted file mode 100644 index cfe6e27..0000000 --- a/examples/demo-environment/loadtest/probe_metrics.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python3 -# probe_metrics.py -# Usage: python probe_metrics.py --project YOUR_PROJECT --hours 1 - -import argparse, json -from datetime import datetime, timedelta -from google.cloud import monitoring_v3 - -METRIC_CANDIDATES = { - "rps": [ - "prometheus.googleapis.com/locust_requests_total", - "prometheus.googleapis.com/http_server_requests_total", - "prometheus.googleapis.com/requests_total", - ], - "latency_histogram": [ - "prometheus.googleapis.com/locust_request_latency_seconds", - "prometheus.googleapis.com/http_server_request_latency_seconds", - "prometheus.googleapis.com/request_latency_seconds", - ], - "cpu": [ - "kubernetes.io/container/cpu/core_usage_time", - "kubernetes.io/container/cpu/usage_time", - ], - "memory": [ - "kubernetes.io/container/memory/used_bytes", - "kubernetes.io/container/memory/working_set_bytes", - ], - "db_connections": [ - "cloudsql.googleapis.com/database/postgresql/num_backends", - "cloudsql.googleapis.com/database/connections", - ], -} - -def probe(project: str, hours: int = 1, namespace: str | None = None): - client = monitoring_v3.MetricServiceClient() - now = datetime.utcnow() - interval = 
monitoring_v3.TimeInterval({ - "end_time": {"seconds": int(now.timestamp())}, - "start_time": {"seconds": int((now - timedelta(hours=hours)).timestamp())}, - }) - - results = {} - for friendly, candidates in METRIC_CANDIDATES.items(): - results[friendly] = [] - for metric_type in candidates: - # Build filter; optionally restrict to namespace if provided - if namespace: - filter_expr = f'metric.type="{metric_type}" AND resource.labels.namespace_name="{namespace}"' - else: - filter_expr = f'metric.type="{metric_type}"' - - series = client.list_time_series( - request={ - "name": f"projects/{project}", - "filter": filter_expr, - "interval": interval, - "view": monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.HEADERS, - "page_size": 5, - } - ) - - pts = 0 - resources = set() - for ts in series: - pts += len(ts.points) - resources.add(tuple(sorted(ts.resource.labels.items()))) - results[friendly].append({ - "metric_type": metric_type, - "series_found": pts > 0, - "points_count": pts, - "resources_sample": [dict(r) for r in list(resources)[:3]], - }) - print(json.dumps(results, indent=2)) - -if __name__ == "__main__": - p = argparse.ArgumentParser() - p.add_argument("--project", required=True) - p.add_argument("--hours", type=int, default=1) - p.add_argument("--namespace", default=None, help="K8s namespace to filter (e.g. 
saleor)") - args = p.parse_args() - probe(args.project, args.hours, args.namespace) \ No newline at end of file diff --git a/examples/demo-environment/loadtest/requirements.txt b/examples/demo-environment/loadtest/requirements.txt deleted file mode 100644 index dc39d75..0000000 --- a/examples/demo-environment/loadtest/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -locust>=2.20.0 -gevent>=23.9.0 -requests>=2.31.0 diff --git a/examples/demo-environment/loadtest/run-tests.ps1 b/examples/demo-environment/loadtest/run-tests.ps1 deleted file mode 100644 index 2a2e6aa..0000000 --- a/examples/demo-environment/loadtest/run-tests.ps1 +++ /dev/null @@ -1,114 +0,0 @@ -# Helios Load Test Runner Scripts -# Quick scripts to run different test scenarios - -param( - [Parameter(Position=0)] - [ValidateSet("mixed", "read", "write", "ramp", "wave", "spike", "help")] - [string]$Pattern = "help", - - [string]$Host = "http://localhost:8000", - [int]$Users = 50, - [int]$SpawnRate = 5, - [string]$Duration = "10m", - [switch]$Headless, - [string]$CsvOutput = "" -) - -$ErrorActionPreference = "Stop" -$ScriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path -$LocustDir = Join-Path $ScriptDir "locustfiles" - -function Show-Help { - Write-Host @" - -Helios Load Test Runner -======================= - -Usage: .\run-tests.ps1 [options] - -Patterns: - mixed - Realistic mix of all user types (default behavior) - read - Read-heavy traffic (browsers + searchers) - write - Write-heavy traffic (buyers + admins) - ramp - Gradual ramp up/down pattern - wave - Sinusoidal wave pattern - spike - Flash-sale spike pattern - help - Show this help - -Options: - -Host Target API URL (default: http://localhost:8000) - -Users Number of users (default: 50) - -SpawnRate Users per second (default: 5) - -Duration