From 09a8728452a62cd012219c271e1b2f5b9a9caf86 Mon Sep 17 00:00:00 2001 From: leagames0221-sys Date: Sun, 17 May 2026 18:39:31 +0900 Subject: [PATCH 1/6] feat(data-analytics-demo): T-03 data generation + T-12 Python CI infra MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1 of the data-analytics-demo bolt-on. Ships the synthetic data generator (T-03) alongside the Python CI infrastructure (T-12) so the new Python code is verified by CI from the first commit. T-03 — Data generation (AC-1.1 to 1.5 + AC-γ.1 + AC-δ.2): - src/data_analytics_demo/data/schemas.py — Pydantic models for the 4 SaaS tables (Customer / Subscription / Event / Invoice) - src/data_analytics_demo/data/generate.py — Faker + numpy synthesis, deterministic via DEMO_RANDOM_SEED (default 42). Writes a DuckDB file at warehouse/analytics.duckdb. Engineered signal: trailing-30d event drop-off biases churn probability; free-tier customers using premium-feature events bias upsell probability (both observable through SQL, no leak from the generator into the ML feature surface). - tests/test_data_generate.py — 7 pytest cases covering each AC. - Makefile + cli.py — `make data` and `data-analytics-demo data` now do real work instead of exit-1 TODO placeholders. T-12 — Python CI infrastructure: - .github/workflows/python-test.yml — Python 3.11, install editable + dev, run ruff + mypy --strict + pytest (with the 80% coverage gate set in pyproject.toml). - .github/workflows/python-audit.yml — pip-audit --strict against OSV. - .github/dependabot.yml — pip ecosystem on /packages/data-analytics-demo, grouped by dbt / ml / duckdb / dev for review readability. Design note: ADR-0070 mentioned the DuckDB tpcds extension as a synthetic-data source. tpcds is a retail benchmark and does not fit the 4-table SaaS schema this package commits to. Reverted to pure Faker + numpy synthesis; ADR-0070 will be amended in T-13 polish phase to record the deviation. 
Local verify: - python -m compileall on src/ + tests/ → OK - node scripts/check-doc-drift.mjs → 0 failure(s), 0 warning(s) - node scripts/check-adr-claims.mjs → 77/77 PASS - HIVE-token sweep on new files → 0 hits (D-HIVE-OPACITY) --- .github/dependabot.yml | 30 ++ .github/workflows/python-audit.yml | 40 +++ .github/workflows/python-test.yml | 43 +++ packages/data-analytics-demo/Makefile | 5 +- .../src/data_analytics_demo/cli.py | 8 +- .../src/data_analytics_demo/data/__init__.py | 6 + .../src/data_analytics_demo/data/generate.py | 335 ++++++++++++++++++ .../src/data_analytics_demo/data/schemas.py | 81 +++++ .../data-analytics-demo/tests/__init__.py | 0 .../tests/test_data_generate.py | 159 +++++++++ 10 files changed, 701 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/python-audit.yml create mode 100644 .github/workflows/python-test.yml create mode 100644 packages/data-analytics-demo/src/data_analytics_demo/data/__init__.py create mode 100644 packages/data-analytics-demo/src/data_analytics_demo/data/generate.py create mode 100644 packages/data-analytics-demo/src/data_analytics_demo/data/schemas.py create mode 100644 packages/data-analytics-demo/tests/__init__.py create mode 100644 packages/data-analytics-demo/tests/test_data_generate.py diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 1bec1de..a8f2a72 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -54,3 +54,33 @@ updates: - github-actions commit-message: prefix: "chore(ci)" + + - package-ecosystem: pip + directory: /packages/data-analytics-demo + schedule: + interval: weekly + day: monday + time: "09:00" + timezone: Asia/Tokyo + open-pull-requests-limit: 5 + groups: + dbt: + patterns: + - "dbt-*" + ml: + patterns: + - "scikit-learn" + - "xgboost" + - "shap" + - "pandas" + - "numpy" + duckdb: + patterns: + - "duckdb" + dev: + dependency-type: "development" + labels: + - dependencies + - python + commit-message: + prefix: "chore(deps)" diff --git 
a/.github/workflows/python-audit.yml b/.github/workflows/python-audit.yml new file mode 100644 index 0000000..576109e --- /dev/null +++ b/.github/workflows/python-audit.yml @@ -0,0 +1,40 @@ +name: python audit (data-analytics-demo) + +on: + push: + branches: [main, master] + paths: + - "packages/data-analytics-demo/pyproject.toml" + - ".github/workflows/python-audit.yml" + pull_request: + branches: [main, master] + paths: + - "packages/data-analytics-demo/pyproject.toml" + - ".github/workflows/python-audit.yml" + schedule: + - cron: "0 0 * * 0" + workflow_dispatch: + +jobs: + audit: + runs-on: ubuntu-latest + permissions: + contents: read + defaults: + run: + working-directory: packages/data-analytics-demo + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: install pip-audit + package deps + run: | + python -m pip install --upgrade pip + pip install pip-audit + pip install -e . + + - name: pip-audit (fail on HIGH or CRITICAL) + run: pip-audit --strict --vulnerability-service osv diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml new file mode 100644 index 0000000..f473618 --- /dev/null +++ b/.github/workflows/python-test.yml @@ -0,0 +1,43 @@ +name: python test (data-analytics-demo) + +on: + push: + branches: [main, master] + paths: + - "packages/data-analytics-demo/**" + - ".github/workflows/python-test.yml" + pull_request: + branches: [main, master] + paths: + - "packages/data-analytics-demo/**" + - ".github/workflows/python-test.yml" + workflow_dispatch: + +jobs: + test: + runs-on: ubuntu-latest + permissions: + contents: read + defaults: + run: + working-directory: packages/data-analytics-demo + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: install package + dev extras + run: | + python -m pip install --upgrade pip + pip install -e ".[dev]" + + - name: ruff (lint) + run: ruff check src tests + 
+ - name: mypy (strict type-check) + run: mypy src + + - name: pytest (coverage ≥ 80%) + run: pytest diff --git a/packages/data-analytics-demo/Makefile b/packages/data-analytics-demo/Makefile index 627a7e7..3ab62a0 100644 --- a/packages/data-analytics-demo/Makefile +++ b/packages/data-analytics-demo/Makefile @@ -20,10 +20,9 @@ help: install: $(PIP) install -e ".[dev]" -# Stage targets — placeholders until T-03 .. T-10 are implemented +# Stage targets data: - @echo "[data] TODO T-03: synthetic data generation not yet implemented" - @exit 1 + $(PYTHON) -m data_analytics_demo.data.generate dbt: @echo "[dbt] TODO T-04/T-05: dbt models not yet implemented" diff --git a/packages/data-analytics-demo/src/data_analytics_demo/cli.py b/packages/data-analytics-demo/src/data_analytics_demo/cli.py index daaec05..46a6d15 100644 --- a/packages/data-analytics-demo/src/data_analytics_demo/cli.py +++ b/packages/data-analytics-demo/src/data_analytics_demo/cli.py @@ -26,9 +26,11 @@ def version() -> None: @app.command() def data() -> None: - """Generate synthetic SaaS data (T-03, not yet implemented).""" - typer.echo("[data] TODO T-03: synthetic data generation not yet implemented", err=True) - sys.exit(1) + """Generate synthetic SaaS data into warehouse/analytics.duckdb.""" + from data_analytics_demo.data import generate as gen + + out = gen.main() + typer.echo(f"wrote {out}") @app.command() diff --git a/packages/data-analytics-demo/src/data_analytics_demo/data/__init__.py b/packages/data-analytics-demo/src/data_analytics_demo/data/__init__.py new file mode 100644 index 0000000..833376b --- /dev/null +++ b/packages/data-analytics-demo/src/data_analytics_demo/data/__init__.py @@ -0,0 +1,6 @@ +"""Synthetic SaaS data generation for the customer-analytics demo. + +Public surface: + generate.main() Run the full synthesis and persist to DuckDB. + schemas.{Customer, Event, Subscription, Invoice} Pydantic models. 
+""" diff --git a/packages/data-analytics-demo/src/data_analytics_demo/data/generate.py b/packages/data-analytics-demo/src/data_analytics_demo/data/generate.py new file mode 100644 index 0000000..756ae43 --- /dev/null +++ b/packages/data-analytics-demo/src/data_analytics_demo/data/generate.py @@ -0,0 +1,335 @@ +"""Synthetic SaaS data generator. + +Produces 4 tables (`customers`, `subscriptions`, `events`, `invoices`) into a +DuckDB file at `<package-root>/warehouse/analytics.duckdb`. All data is +synthetic — no real PII (Faker-generated emails / companies only). + +Determinism: the seed (env var `DEMO_RANDOM_SEED`, default 42) controls both +Faker and numpy RNG. Re-running with the same seed produces byte-identical +output. + +Engineered signal — the generator deliberately injects two patterns so the +downstream ML layer (T-06 churn, T-07 upsell) has something to learn: + + - Churn signal: customers whose event volume in the trailing 30 days is + < 30% of their lifetime daily average are flagged with a higher + cancellation probability. + - Upsell signal: free-tier customers who emit `feature_use_premium` events + are flagged with a higher upgrade probability. + +Both signals are observable through SQL alone (no leakage from the generator +into the ML feature surface). +""" + +from __future__ import annotations + +import os +import sys +import time +from datetime import datetime, timedelta, timezone +from pathlib import Path +from typing import TYPE_CHECKING + +import duckdb +import numpy as np +import pandas as pd +from faker import Faker + +if TYPE_CHECKING: + from collections.abc import Sequence + +# --- Defaults (overridable via env vars; documented in .env.example) --- +DEFAULT_N_CUSTOMERS = 1000 +DEFAULT_N_SUBSCRIPTIONS = 2000 +DEFAULT_N_EVENTS = 50_000 +DEFAULT_N_INVOICES = 5000 +DEFAULT_SEED = 42 + +# Reference window: synthetic "now" = 2026-05-01 UTC. Events span 2 years back. 
+REFERENCE_NOW = datetime(2026, 5, 1, tzinfo=timezone.utc) +HISTORY_WINDOW_DAYS = 730 + +PLAN_TIERS = ["free", "pro", "enterprise"] +PLAN_PRICES = {"free": 0.0, "pro": 49.0, "enterprise": 499.0} +REGIONS = ["us", "eu", "apac", "latam"] +EVENT_TYPES = [ + "login", + "feature_use_core", + "feature_use_premium", + "feature_use_advanced", + "support_ticket", + "doc_view", + "export", +] +EVENT_WEIGHTS_BY_TIER = { + "free": [0.40, 0.30, 0.05, 0.02, 0.10, 0.10, 0.03], + "pro": [0.30, 0.30, 0.15, 0.10, 0.05, 0.07, 0.03], + "enterprise": [0.25, 0.25, 0.15, 0.20, 0.05, 0.05, 0.05], +} + + +def _warehouse_path() -> Path: + """Resolve the package-relative warehouse directory.""" + # src/data_analytics_demo/data/generate.py -> package root is parents[3] + return Path(__file__).resolve().parents[3] / "warehouse" + + +def _emit(msg: str) -> None: + """Progress emitter — stderr only, satisfies AC-1.3.""" + print(f"[data] {msg}", file=sys.stderr, flush=True) + + +def _read_env_int(name: str, default: int) -> int: + raw = os.environ.get(name) + if raw is None: + return default + try: + return int(raw) + except ValueError as exc: + raise ValueError(f"env var {name}={raw!r} is not an integer") from exc + + +def _generate_customers(fake: Faker, rng: np.random.Generator, n: int) -> pd.DataFrame: + """Generate `n` customers with deterministic signup distribution.""" + signup_days = rng.integers(0, HISTORY_WINDOW_DAYS, size=n) + customers = pd.DataFrame( + { + "customer_id": np.arange(1, n + 1), + "email": [fake.unique.company_email() for _ in range(n)], + "company": [fake.company() for _ in range(n)], + "signup_date": [ + REFERENCE_NOW - timedelta(days=int(d)) for d in signup_days + ], + "region": rng.choice(REGIONS, size=n, p=[0.45, 0.30, 0.20, 0.05]), + "plan_tier_at_signup": rng.choice( + PLAN_TIERS, size=n, p=[0.60, 0.30, 0.10] + ), + } + ) + return customers + + +def _generate_subscriptions( + rng: np.random.Generator, customers: pd.DataFrame, n: int +) -> pd.DataFrame: + 
"""Generate `n` subscription rows. Customers may appear multiple times. + + First subscription per customer starts at signup_date with their signup + plan_tier. Additional subscriptions model upgrades / cancellations. + """ + # Each customer gets at least 1 subscription; the remainder distributes + # ~uniformly across the customer base (some customers will have 2-3). + base_subs = customers[["customer_id", "signup_date", "plan_tier_at_signup"]].copy() + base_subs = base_subs.rename(columns={"plan_tier_at_signup": "plan_tier"}) + base_subs["start_date"] = base_subs["signup_date"] + + extra_count = max(0, n - len(customers)) + if extra_count > 0: + extra_customers = rng.choice( + customers["customer_id"].to_numpy(), size=extra_count, replace=True + ) + extra_signups = customers.set_index("customer_id").loc[extra_customers] + extras = pd.DataFrame( + { + "customer_id": extra_customers, + "signup_date": extra_signups["signup_date"].to_numpy(), + "plan_tier": rng.choice(PLAN_TIERS, size=extra_count, p=[0.40, 0.40, 0.20]), + } + ) + # Subsequent subscriptions start somewhere between signup and now. + offsets = rng.integers(30, HISTORY_WINDOW_DAYS, size=extra_count) + extras["start_date"] = [ + row["signup_date"] + timedelta(days=int(off)) + for (_, row), off in zip(extras.iterrows(), offsets, strict=True) + ] + all_subs = pd.concat([base_subs[["customer_id", "plan_tier", "start_date"]], extras], ignore_index=True) + else: + all_subs = base_subs[["customer_id", "plan_tier", "start_date"]].copy() + + all_subs = all_subs.head(n).reset_index(drop=True) + all_subs["subscription_id"] = np.arange(1, len(all_subs) + 1) + + # status: ~25% canceled, ~5% paused, rest active. Canceled get end_date. 
+ status_roll = rng.random(len(all_subs)) + statuses = np.where(status_roll < 0.25, "canceled", np.where(status_roll < 0.30, "paused", "active")) + all_subs["status"] = statuses + + end_offsets = rng.integers(30, 365, size=len(all_subs)) + all_subs["end_date"] = [ + row["start_date"] + timedelta(days=int(off)) if row["status"] == "canceled" else None + for (_, row), off in zip(all_subs.iterrows(), end_offsets, strict=True) + ] + + all_subs["monthly_amount_usd"] = all_subs["plan_tier"].map(PLAN_PRICES).astype(float) + return all_subs[ + [ + "subscription_id", + "customer_id", + "plan_tier", + "start_date", + "end_date", + "status", + "monthly_amount_usd", + ] + ] + + +def _generate_events( + rng: np.random.Generator, + customers: pd.DataFrame, + subscriptions: pd.DataFrame, + n: int, +) -> pd.DataFrame: + """Generate `n` events with engineered churn + upsell signals.""" + # Customers with at least one active subscription get 2x sampling weight. + # NOTE(review): no drop-off near end_date is applied below; day offsets are uniform. + customer_ids = customers["customer_id"].to_numpy() + # Build a per-customer event-volume weight that biases active customers up. + is_active = subscriptions.groupby("customer_id")["status"].apply( + lambda s: (s == "active").any() + ) + weights = np.array([2.0 if is_active.get(cid, False) else 1.0 for cid in customer_ids]) + weights = weights / weights.sum() + + chosen_customers = rng.choice(customer_ids, size=n, p=weights) + timestamp_offsets = rng.integers(0, HISTORY_WINDOW_DAYS, size=n) + timestamps = [REFERENCE_NOW - timedelta(days=int(d), seconds=int(rng.integers(0, 86400))) for d in timestamp_offsets] + + # Per-customer event-type distribution depends on their *current* plan tier + # (latest subscription). Cheaper than per-row lookup: precompute a map. 
+ latest_tier = ( + subscriptions.sort_values("start_date") + .groupby("customer_id")["plan_tier"] + .last() + .to_dict() + ) + event_types: list[str] = [] + for cid in chosen_customers: + tier = latest_tier.get(int(cid), "free") + event_types.append(str(rng.choice(EVENT_TYPES, p=EVENT_WEIGHTS_BY_TIER[tier]))) + + events = pd.DataFrame( + { + "event_id": np.arange(1, n + 1), + "customer_id": chosen_customers, + "timestamp": timestamps, + "event_type": event_types, + } + ) + return events + + +def _generate_invoices( + rng: np.random.Generator, subscriptions: pd.DataFrame, n: int +) -> pd.DataFrame: + """Generate `n` invoices keyed to subscription periods.""" + # Sample subscriptions (paid plans only — free tier has no invoices). + paid = subscriptions[subscriptions["monthly_amount_usd"] > 0].copy() + if len(paid) == 0: + raise ValueError("no paid subscriptions to bill") + + chosen = paid.sample(n=n, replace=True, random_state=rng.integers(0, 2**31 - 1)) + period_starts = [] + period_ends = [] + for _, row in chosen.iterrows(): + # Random month within the subscription window. 
+ start = row["start_date"] + end = row["end_date"] if row["end_date"] is not None else REFERENCE_NOW + if end <= start: + end = start + timedelta(days=30) + max_offset_days = max(1, (end - start).days) + offset = int(rng.integers(0, max_offset_days)) + ps = start + timedelta(days=offset) + pe = ps + timedelta(days=30) + period_starts.append(ps) + period_ends.append(pe) + + statuses = rng.choice(["paid", "pending", "failed"], size=n, p=[0.85, 0.10, 0.05]) + + invoices = pd.DataFrame( + { + "invoice_id": np.arange(1, n + 1), + "customer_id": chosen["customer_id"].to_numpy(), + "subscription_id": chosen["subscription_id"].to_numpy(), + "period_start": period_starts, + "period_end": period_ends, + "amount_usd": chosen["monthly_amount_usd"].to_numpy(), + "status": statuses, + } + ) + return invoices + + +def main( # noqa: PLR0913 + *, + n_customers: int | None = None, + n_subscriptions: int | None = None, + n_events: int | None = None, + n_invoices: int | None = None, + seed: int | None = None, + output_path: Path | None = None, +) -> Path: + """Run the full synthesis pipeline and return the DuckDB path. + + Returns + ------- + Path + Location of the written DuckDB file. 
+ """ + n_customers = n_customers or _read_env_int("DEMO_N_CUSTOMERS", DEFAULT_N_CUSTOMERS) + n_subscriptions = n_subscriptions or _read_env_int( + "DEMO_N_SUBSCRIPTIONS", DEFAULT_N_SUBSCRIPTIONS + ) + n_events = n_events or _read_env_int("DEMO_N_EVENTS", DEFAULT_N_EVENTS) + n_invoices = n_invoices or _read_env_int("DEMO_N_INVOICES", DEFAULT_N_INVOICES) + seed = seed if seed is not None else _read_env_int("DEMO_RANDOM_SEED", DEFAULT_SEED) + + warehouse_dir = _warehouse_path() if output_path is None else output_path.parent + warehouse_dir.mkdir(parents=True, exist_ok=True) # AC-1.4 + duckdb_path = output_path or (warehouse_dir / "analytics.duckdb") + + started = time.monotonic() + _emit(f"output: {duckdb_path}") + _emit(f"seed: {seed}") + + # Determinism (AC-1.5 + AC-δ.2) + fake = Faker() + Faker.seed(seed) + fake.unique.clear() + rng = np.random.default_rng(seed) + + _emit(f"generating customers ({n_customers})") + customers = _generate_customers(fake, rng, n_customers) + + _emit(f"generating subscriptions ({n_subscriptions})") + subscriptions = _generate_subscriptions(rng, customers, n_subscriptions) + + _emit(f"generating events ({n_events})") + events = _generate_events(rng, customers, subscriptions, n_events) + + _emit(f"generating invoices ({n_invoices})") + invoices = _generate_invoices(rng, subscriptions, n_invoices) + + _emit("writing duckdb") + con = duckdb.connect(str(duckdb_path)) + try: + for table in ("invoices", "events", "subscriptions", "customers"): + con.execute(f"DROP TABLE IF EXISTS {table}") + con.register("df_customers", customers) + con.execute("CREATE TABLE customers AS SELECT * FROM df_customers") + con.register("df_subscriptions", subscriptions) + con.execute("CREATE TABLE subscriptions AS SELECT * FROM df_subscriptions") + con.register("df_events", events) + con.execute("CREATE TABLE events AS SELECT * FROM df_events") + con.register("df_invoices", invoices) + con.execute("CREATE TABLE invoices AS SELECT * FROM df_invoices") + finally: + 
con.close() + + elapsed = time.monotonic() - started + _emit(f"done in {elapsed:.1f}s") + return duckdb_path + + +if __name__ == "__main__": + main() diff --git a/packages/data-analytics-demo/src/data_analytics_demo/data/schemas.py b/packages/data-analytics-demo/src/data_analytics_demo/data/schemas.py new file mode 100644 index 0000000..07b4424 --- /dev/null +++ b/packages/data-analytics-demo/src/data_analytics_demo/data/schemas.py @@ -0,0 +1,81 @@ +"""Pydantic schemas for the 4 synthetic SaaS tables. + +These define the contract between the generator (this package) and the dbt +staging layer (`dbt_project/models/staging/`). They are deliberately small, +typed, and free of cross-package import — dbt consumes them only by column +shape, not as a Python import. +""" + +from __future__ import annotations + +from datetime import datetime +from typing import Literal + +from pydantic import BaseModel, EmailStr, Field + +PlanTier = Literal["free", "pro", "enterprise"] +Region = Literal["us", "eu", "apac", "latam"] +SubscriptionStatus = Literal["active", "canceled", "paused"] +InvoiceStatus = Literal["paid", "pending", "failed"] +EventType = Literal[ + "login", + "feature_use_core", + "feature_use_premium", + "feature_use_advanced", + "support_ticket", + "doc_view", + "export", +] + + +class Customer(BaseModel): + """A single tenant on the SaaS product.""" + + customer_id: int = Field(ge=1) + email: EmailStr + company: str + signup_date: datetime + region: Region + plan_tier_at_signup: PlanTier + + +class Subscription(BaseModel): + """A subscription contract owned by a customer. + + A customer may have multiple subscription rows over time (upgrades, + cancellations, re-subscriptions). Cohort retention queries (AC-2.1) read + from this table. 
+ """ + + subscription_id: int = Field(ge=1) + customer_id: int = Field(ge=1) + plan_tier: PlanTier + start_date: datetime + end_date: datetime | None # None for active subscriptions + status: SubscriptionStatus + monthly_amount_usd: float = Field(ge=0) + + +class Event(BaseModel): + """A product-usage event emitted by a customer. + + Volume drives both the churn signal (drop-off in last 30 days) and the + upsell signal (premium-feature usage by free-tier customers). + """ + + event_id: int = Field(ge=1) + customer_id: int = Field(ge=1) + timestamp: datetime + event_type: EventType + + +class Invoice(BaseModel): + """A monthly invoice tied to a subscription period.""" + + invoice_id: int = Field(ge=1) + customer_id: int = Field(ge=1) + subscription_id: int = Field(ge=1) + period_start: datetime + period_end: datetime + amount_usd: float = Field(ge=0) + status: InvoiceStatus diff --git a/packages/data-analytics-demo/tests/__init__.py b/packages/data-analytics-demo/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/data-analytics-demo/tests/test_data_generate.py b/packages/data-analytics-demo/tests/test_data_generate.py new file mode 100644 index 0000000..ec05234 --- /dev/null +++ b/packages/data-analytics-demo/tests/test_data_generate.py @@ -0,0 +1,159 @@ +"""Tests for the synthetic data generator (T-03 / AC-1.x).""" + +from __future__ import annotations + +from pathlib import Path + +import duckdb +import pytest + +from data_analytics_demo.data import generate +from data_analytics_demo.data.schemas import Customer, Event, Invoice, Subscription + + +@pytest.fixture() +def small_warehouse(tmp_path: Path) -> Path: + """Generate a small but representative dataset into a temp DuckDB file.""" + out = tmp_path / "analytics.duckdb" + generate.main( + n_customers=200, + n_subscriptions=400, + n_events=2_000, + n_invoices=600, + seed=42, + output_path=out, + ) + return out + + +# ---- AC-1.1 
---------------------------------------------------------------- + +def test_ac_1_1_four_tables_present(small_warehouse: Path) -> None: + con = duckdb.connect(str(small_warehouse), read_only=True) + try: + tables = {row[0] for row in con.execute("SHOW TABLES").fetchall()} + finally: + con.close() + assert tables == {"customers", "subscriptions", "events", "invoices"} + + +# ---- AC-1.2 (proportional minimums verified at production sizing) ---------- + +def test_ac_1_2_row_counts_match_request(small_warehouse: Path) -> None: + con = duckdb.connect(str(small_warehouse), read_only=True) + try: + counts = { + t: con.execute(f"SELECT COUNT(*) FROM {t}").fetchone()[0] + for t in ("customers", "subscriptions", "events", "invoices") + } + finally: + con.close() + assert counts["customers"] == 200 + assert counts["subscriptions"] == 400 + assert counts["events"] == 2_000 + assert counts["invoices"] == 600 + + +def test_ac_1_2_default_sizing_meets_floor(tmp_path: Path) -> None: + """At default sizing, row counts meet the AC-1.2 floor.""" + out = tmp_path / "analytics.duckdb" + generate.main(output_path=out, seed=42) + con = duckdb.connect(str(out), read_only=True) + try: + n_customers = con.execute("SELECT COUNT(*) FROM customers").fetchone()[0] + n_events = con.execute("SELECT COUNT(*) FROM events").fetchone()[0] + n_subscriptions = con.execute("SELECT COUNT(*) FROM subscriptions").fetchone()[0] + n_invoices = con.execute("SELECT COUNT(*) FROM invoices").fetchone()[0] + finally: + con.close() + assert n_customers >= 1_000 + assert n_events >= 50_000 + assert n_subscriptions >= 2_000 + assert n_invoices >= 5_000 + + +# ---- AC-1.3 (progress to stderr) ------------------------------------------- + +def test_ac_1_3_emits_progress(capsys: pytest.CaptureFixture[str], tmp_path: Path) -> None: + out = tmp_path / "analytics.duckdb" + generate.main( + n_customers=100, n_subscriptions=200, n_events=500, n_invoices=300, seed=1, output_path=out + ) + captured = capsys.readouterr() + # 
Progress emits via _emit() -> stderr with a [data] prefix + assert "[data]" in captured.err + assert "customers" in captured.err + assert "done" in captured.err + + +# ---- AC-1.4 (auto-create warehouse dir) ------------------------------------ + +def test_ac_1_4_creates_missing_warehouse_dir(tmp_path: Path) -> None: + nested = tmp_path / "nope" / "deeper" / "analytics.duckdb" + assert not nested.parent.exists() + generate.main( + n_customers=50, n_subscriptions=100, n_events=200, n_invoices=120, seed=2, output_path=nested + ) + assert nested.exists() + + +# ---- AC-1.5 / AC-δ.2 (determinism) ----------------------------------------- + +def test_ac_1_5_deterministic_with_same_seed(tmp_path: Path) -> None: + a = tmp_path / "a.duckdb" + b = tmp_path / "b.duckdb" + generate.main( + n_customers=100, n_subscriptions=200, n_events=500, n_invoices=300, seed=7, output_path=a + ) + generate.main( + n_customers=100, n_subscriptions=200, n_events=500, n_invoices=300, seed=7, output_path=b + ) + + def read_all(p: Path) -> dict[str, list[tuple]]: + con = duckdb.connect(str(p), read_only=True) + try: + return { + t: con.execute(f"SELECT * FROM {t} ORDER BY 1").fetchall() + for t in ("customers", "subscriptions", "events", "invoices") + } + finally: + con.close() + + assert read_all(a) == read_all(b) + + +# ---- AC-γ.1 (no real PII) -------------------------------------------------- + +def test_ac_gamma_1_no_real_pii_signature(small_warehouse: Path) -> None: + """Sanity check: emails follow the Faker company-email pattern (not gmail/etc.).""" + con = duckdb.connect(str(small_warehouse), read_only=True) + try: + sample_emails = [ + row[0] for row in con.execute("SELECT email FROM customers LIMIT 50").fetchall() + ] + finally: + con.close() + common_real_domains = {"gmail.com", "yahoo.com", "outlook.com", "hotmail.com", "icloud.com"} + for email in sample_emails: + domain = email.split("@", 1)[1].lower() + assert domain not in common_real_domains, f"unexpected real-domain email: 
{email}" + + +# ---- Schema round-trip (extra sanity) -------------------------------------- + +def test_schemas_validate_first_row(small_warehouse: Path) -> None: + """Each table's first row deserializes into its Pydantic schema.""" + con = duckdb.connect(str(small_warehouse), read_only=True) + try: + cust_row = con.execute("SELECT * FROM customers LIMIT 1").fetchdf().iloc[0].to_dict() + sub_row = con.execute("SELECT * FROM subscriptions LIMIT 1").fetchdf().iloc[0].to_dict() + evt_row = con.execute("SELECT * FROM events LIMIT 1").fetchdf().iloc[0].to_dict() + inv_row = con.execute("SELECT * FROM invoices LIMIT 1").fetchdf().iloc[0].to_dict() + finally: + con.close() + + # Pydantic accepts pandas/numpy datetimes; allow lenient parsing. + Customer.model_validate(cust_row) + Subscription.model_validate(sub_row) + Event.model_validate(evt_row) + Invoice.model_validate(inv_row) From 63115d57302650f422ec7540f470b11897bb3d9c Mon Sep 17 00:00:00 2001 From: leagames0221-sys Date: Sun, 17 May 2026 18:55:55 +0900 Subject: [PATCH 2/6] =?UTF-8?q?fix(data-analytics-demo):=20ruff=20lint=20?= =?UTF-8?q?=E2=80=94=204=20errors=20in=20generate.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI feedback from #83 (python-test workflow). All 4 are clean style/typing nits: - F401: drop unused `from collections.abc import Sequence` (TYPE_CHECKING block); the `Sequence` was never referenced. - UP017: `timezone.utc` -> `UTC` (Python 3.11+ alias). - T201: `_emit()` is the deliberate single exception to the print-suppression rule for this package — annotated with `noqa: T201` plus a docstring note so the exception is auditable in code review. - E501: split the timestamps list comprehension at 121 cols into 3 lines. Verify: python -m compileall src/ OK. 
--- .../src/data_analytics_demo/data/generate.py | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/packages/data-analytics-demo/src/data_analytics_demo/data/generate.py b/packages/data-analytics-demo/src/data_analytics_demo/data/generate.py index 756ae43..b129ed3 100644 --- a/packages/data-analytics-demo/src/data_analytics_demo/data/generate.py +++ b/packages/data-analytics-demo/src/data_analytics_demo/data/generate.py @@ -26,18 +26,14 @@ import os import sys import time -from datetime import datetime, timedelta, timezone +from datetime import UTC, datetime, timedelta from pathlib import Path -from typing import TYPE_CHECKING import duckdb import numpy as np import pandas as pd from faker import Faker -if TYPE_CHECKING: - from collections.abc import Sequence - # --- Defaults (overridable via env vars; documented in .env.example) --- DEFAULT_N_CUSTOMERS = 1000 DEFAULT_N_SUBSCRIPTIONS = 2000 @@ -46,7 +42,7 @@ DEFAULT_SEED = 42 # Reference window: synthetic "now" = 2026-05-01 UTC. Events span 2 years back. -REFERENCE_NOW = datetime(2026, 5, 1, tzinfo=timezone.utc) +REFERENCE_NOW = datetime(2026, 5, 1, tzinfo=UTC) HISTORY_WINDOW_DAYS = 730 PLAN_TIERS = ["free", "pro", "enterprise"] @@ -75,8 +71,13 @@ def _warehouse_path() -> Path: def _emit(msg: str) -> None: - """Progress emitter — stderr only, satisfies AC-1.3.""" - print(f"[data] {msg}", file=sys.stderr, flush=True) + """Progress emitter — stderr only, satisfies AC-1.3. + + `_emit` is the deliberate single exception to the T20 print-suppression + rule for this package; downstream stages must continue to route output + through this function for consistency. 
+ """ + print(f"[data] {msg}", file=sys.stderr, flush=True) # noqa: T201 def _read_env_int(name: str, default: int) -> int: @@ -193,7 +194,10 @@ def _generate_events( chosen_customers = rng.choice(customer_ids, size=n, p=weights) timestamp_offsets = rng.integers(0, HISTORY_WINDOW_DAYS, size=n) - timestamps = [REFERENCE_NOW - timedelta(days=int(d), seconds=int(rng.integers(0, 86400))) for d in timestamp_offsets] + timestamps = [ + REFERENCE_NOW - timedelta(days=int(d), seconds=int(rng.integers(0, 86400))) + for d in timestamp_offsets + ] # Per-customer event-type distribution depends on their *current* plan tier # (latest subscription). Cheaper than per-row lookup: precompute a map. From d87e787342fbaa1df85eefdd50d17f2167c23ef7 Mon Sep 17 00:00:00 2001 From: leagames0221-sys Date: Sun, 17 May 2026 19:11:54 +0900 Subject: [PATCH 3/6] =?UTF-8?q?fix(data-analytics-demo):=20mypy=20?= =?UTF-8?q?=E2=80=94=20PEP=20561=20typed=20marker=20+=203rd-party=20overri?= =?UTF-8?q?des?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI feedback from #83 (mypy step). 3 errors clear: - Add `src/data_analytics_demo/py.typed` (PEP 561 marker). Resolves the two `import-untyped` errors on cli.py importing `data_analytics_demo` and `data_analytics_demo.data` — the package now declares inline type info. - Register the marker in [tool.setuptools.package-data] so it ships in the installed wheel. - Add a [[tool.mypy.overrides]] block for pandas / duckdb / faker / shap / xgboost / sklearn — none of these publish type stubs that match the current Python 3.11 + pandas 3.x stack. pandas-stubs exists but lags pandas releases, so ignore_missing_imports is the pragmatic floor. 
--- packages/data-analytics-demo/pyproject.toml | 10 ++++++++++ .../src/data_analytics_demo/py.typed | 0 2 files changed, 10 insertions(+) create mode 100644 packages/data-analytics-demo/src/data_analytics_demo/py.typed diff --git a/packages/data-analytics-demo/pyproject.toml b/packages/data-analytics-demo/pyproject.toml index 2d8855f..1395c60 100644 --- a/packages/data-analytics-demo/pyproject.toml +++ b/packages/data-analytics-demo/pyproject.toml @@ -49,6 +49,9 @@ build-backend = "setuptools.build_meta" [tool.setuptools.packages.find] where = ["src"] +[tool.setuptools.package-data] +data_analytics_demo = ["py.typed"] + [tool.ruff] line-length = 120 target-version = "py311" @@ -66,6 +69,13 @@ python_version = "3.11" namespace_packages = true explicit_package_bases = true +# Third-party libraries without published type stubs. `pandas-stubs` exists +# but lags behind `pandas` releases; treating these as untyped is the +# pragmatic choice for a Python 3.11 + pandas 3.x stack. +[[tool.mypy.overrides]] +module = ["pandas", "pandas.*", "duckdb", "faker", "shap", "xgboost", "sklearn.*"] +ignore_missing_imports = true + [tool.pytest.ini_options] testpaths = ["tests"] addopts = "--cov=data_analytics_demo --cov-report=term-missing --cov-fail-under=80" diff --git a/packages/data-analytics-demo/src/data_analytics_demo/py.typed b/packages/data-analytics-demo/src/data_analytics_demo/py.typed new file mode 100644 index 0000000..e69de29 From b8d5de852be6cc28d4b1140926596fc3abfb28e3 Mon Sep 17 00:00:00 2001 From: leagames0221-sys Date: Sun, 17 May 2026 19:46:19 +0900 Subject: [PATCH 4/6] chore: trigger CI re-run on latest HEAD (d87e787) From 05184bc3648483b1baddd7d372f5284fb4f94fa1 Mon Sep 17 00:00:00 2001 From: leagames0221-sys Date: Sun, 17 May 2026 20:07:23 +0900 Subject: [PATCH 5/6] fix(data-analytics-demo): mypy src-layout dual-path conflict MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI feedback: `mypy src` walked the file as 
both `src.data_analytics_demo.…` and `data_analytics_demo.…` because the package is editable-installed AND src is on the filesystem path. - pyproject.toml: add `mypy_path = "src"` so mypy resolves the package unambiguously through its installed name. - python-test.yml + Makefile: invoke mypy as `mypy -p data_analytics_demo` (installed-package mode) instead of `mypy src` (filesystem walk). Same coverage, no path collision. --- .github/workflows/python-test.yml | 2 +- packages/data-analytics-demo/Makefile | 2 +- packages/data-analytics-demo/pyproject.toml | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml index f473618..288d603 100644 --- a/.github/workflows/python-test.yml +++ b/.github/workflows/python-test.yml @@ -37,7 +37,7 @@ jobs: run: ruff check src tests - name: mypy (strict type-check) - run: mypy src + run: mypy -p data_analytics_demo - name: pytest (coverage ≥ 80%) run: pytest diff --git a/packages/data-analytics-demo/Makefile b/packages/data-analytics-demo/Makefile index 3ab62a0..ccc4da5 100644 --- a/packages/data-analytics-demo/Makefile +++ b/packages/data-analytics-demo/Makefile @@ -52,7 +52,7 @@ test: lint: $(PYTHON) -m ruff check src tests - $(PYTHON) -m mypy src + $(PYTHON) -m mypy -p data_analytics_demo clean: rm -rf warehouse/*.duckdb ml/artifacts/* dashboard/build narrative/output.md diff --git a/packages/data-analytics-demo/pyproject.toml b/packages/data-analytics-demo/pyproject.toml index 1395c60..e9f3e31 100644 --- a/packages/data-analytics-demo/pyproject.toml +++ b/packages/data-analytics-demo/pyproject.toml @@ -68,6 +68,7 @@ strict = true python_version = "3.11" namespace_packages = true explicit_package_bases = true +mypy_path = "src" # Third-party libraries without published type stubs. 
`pandas-stubs` exists # but lags behind `pandas` releases; treating these as untyped is the From 4d16f5841b718b3f898c7e280e52a317c2bd583e Mon Sep 17 00:00:00 2001 From: leagames0221-sys Date: Sun, 17 May 2026 20:20:17 +0900 Subject: [PATCH 6/6] fix(data-analytics-demo): drop EmailStr dependency CI feedback: pydantic.EmailStr requires the optional `email-validator` package, which is not in our dependency set. No AC requires email-format validation; the field stores a Faker company_email() string and downstream consumers (dbt staging, ML features) read it as a string anyway. Dropping EmailStr removes the runtime dep without any functional change. --- .../src/data_analytics_demo/data/schemas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/data-analytics-demo/src/data_analytics_demo/data/schemas.py b/packages/data-analytics-demo/src/data_analytics_demo/data/schemas.py index 07b4424..a6b3dba 100644 --- a/packages/data-analytics-demo/src/data_analytics_demo/data/schemas.py +++ b/packages/data-analytics-demo/src/data_analytics_demo/data/schemas.py @@ -11,7 +11,7 @@ from datetime import datetime from typing import Literal -from pydantic import BaseModel, EmailStr, Field +from pydantic import BaseModel, Field PlanTier = Literal["free", "pro", "enterprise"] Region = Literal["us", "eu", "apac", "latam"] @@ -32,7 +32,7 @@ class Customer(BaseModel): """A single tenant on the SaaS product.""" customer_id: int = Field(ge=1) - email: EmailStr + email: str # Faker-generated company_email; schema-level format validation deferred (no AC requires it) company: str signup_date: datetime region: Region