diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 56becfc..4d5b85f 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -606,6 +606,7 @@ datasight quality [OPTIONS] | --- | --- | | `--project-dir` | Project directory containing .env and config files. Default: `.`. | | `--table` | Audit a specific table. | +| `--deep` | Run expensive detectors: whole-row and PK-shaped duplicates, text whitespace/empty-string flags, IQR-based numeric outliers, and orphan foreign-key-shaped values. | | `--format` | Output format (default: table). Default: `table`. | | `--output`, `-o` | Write the quality audit to a file instead of stdout. | diff --git a/docs/use/how-to/audit-data-quality.md b/docs/use/how-to/audit-data-quality.md index 9592f62..75fae68 100644 --- a/docs/use/how-to/audit-data-quality.md +++ b/docs/use/how-to/audit-data-quality.md @@ -65,6 +65,40 @@ Use it to spot: - quick notes worth turning into follow-up questions - temporal completeness issues when [`time_series.yaml`](../../project-setup/how-to/declare-time-series.md) is present +The default pass batches per-column null and numeric scans into one query +per table, so it stays cheap on wide schemas. 
+ +### Deeper checks: `datasight quality --deep` + +Add `--deep` to run the more expensive detectors and emit previewable +cleanup SQL alongside each finding: + +```bash +datasight quality --deep +datasight quality --deep --format markdown -o quality-deep.md +``` + +`--deep` adds: + +- **Whole-row duplicates** — rows that are exact duplicates across all columns +- **Primary-key-shaped duplicates** — values appearing more than once in + any `id`, `*_id`, or `id_*` column +- **Text cleanliness** — counts of values with leading or trailing + whitespace, and counts of empty strings used in place of NULL +- **Numeric outliers (IQR)** — counts of values outside the + `[Q1 − 1.5·IQR, Q3 + 1.5·IQR]` fence (skipped on SQLite, which has no + percentile aggregate) +- **Orphan foreign-key-shaped values** — values in `<parent>_id` columns + that don't appear in `<parent>.<id column>`, detected against any parent + table with exactly one ID-shaped column + +Each finding includes a `cleanup_sql` field with a previewable `SELECT` +that shows the candidate rows. Destructive forms +(`UPDATE`, `CREATE OR REPLACE TABLE`) appear only as comments inside the +preview — datasight never auto-mutates your tables. The CLI table and +markdown outputs render the cleanup SQL in a dedicated +**Suggested Cleanup** section. + ## Detect untidy column shapes !!! warning "Experimental" @@ -398,7 +432,7 @@ directory name appears in the report title. For a thorough data quality audit, run the commands in this order: 1. **Profile** — understand the shape of the data -2. **Quality** — find nulls, range issues, date gaps, and untidy column shapes +2. **Quality** — find nulls, range issues, date gaps, and untidy column shapes (add `--deep` for duplicates, text cleanliness, outliers, and orphan-FK checks with previewable cleanup SQL) 3. **Integrity** — verify primary keys, foreign keys, and join behavior 4. **Distribution** — inspect percentiles, outliers, and temporal spikes 5. 
**Measures** — identify metrics and verify aggregation defaults diff --git a/src/datasight/audit_report.py b/src/datasight/audit_report.py index 840d280..32d885e 100644 --- a/src/datasight/audit_report.py +++ b/src/datasight/audit_report.py @@ -20,10 +20,15 @@ async def build_audit_report( validation_rules: list[dict[str, Any]] | None = None, declared_joins: list[dict[str, Any]] | None = None, project_name: str | None = None, + *, + sql_dialect: str = "duckdb", + deep: bool = False, ) -> dict[str, Any]: """Run all audit checks and assemble a composite report.""" dataset_overview = await build_dataset_overview(schema_info, run_sql) - quality = await build_quality_overview(schema_info, run_sql) + quality = await build_quality_overview( + schema_info, run_sql, sql_dialect=sql_dialect, deep=deep + ) integrity = await build_integrity_overview(schema_info, run_sql, declared_joins) distribution = await build_distribution_overview(schema_info, run_sql, overrides) diff --git a/src/datasight/cleanup.py b/src/datasight/cleanup.py new file mode 100644 index 0000000..58c3f20 --- /dev/null +++ b/src/datasight/cleanup.py @@ -0,0 +1,130 @@ +"""Emit previewable cleanup SQL for findings from ``build_quality_overview``. + +Mirrors the ``tidy`` module's preview-first ethos: each function returns a +single SQL string that lets the user *see* the rows in question, never an +UPDATE/DELETE that auto-mutates the table. The returned SQL is safe to copy +into a query window or attach to a quality report. + +For destructive operations (deduplication, NULLIF rewrites, TRIM rewrites) +the preview is a ``SELECT`` that shows the candidate rows or the rewritten +column alongside the original; the caller can wrap it in an +``UPDATE``/``CREATE OR REPLACE TABLE`` once they've reviewed the preview. + +Dialects supported: ``duckdb``, ``sqlite``, ``postgres``. Where a dialect +lacks a feature (e.g. SQLite has no ``QUALIFY`` and no percentile +aggregates), the function falls back to portable SQL. 
+""" + +from __future__ import annotations + +from datasight.schema import _quote_identifier + + +def empty_string_preview(table: str, column: str, dialect: str) -> str: + """Show rows where the column is the empty string, candidates for NULLIF.""" + qt = _quote_identifier(table) + qc = _quote_identifier(column) + return ( + f"-- Rows where {column!r} is an empty string. " + f"To fix: UPDATE {qt} SET {qc} = NULL WHERE {qc} = '';\n" + f"SELECT * FROM {qt} WHERE {qc} = '';" + ) + + +def whitespace_preview(table: str, column: str, dialect: str) -> str: + """Show rows whose column value has surrounding whitespace.""" + qt = _quote_identifier(table) + qc = _quote_identifier(column) + return ( + f"-- Rows where {column!r} has leading/trailing whitespace. " + f"To fix: UPDATE {qt} SET {qc} = TRIM({qc}) WHERE {qc} <> TRIM({qc});\n" + f"SELECT {qc} AS original, TRIM({qc}) AS trimmed FROM {qt} " + f"WHERE {qc} IS NOT NULL AND {qc} <> TRIM({qc});" + ) + + +def whole_row_dedup_preview(table: str, dialect: str) -> str: + """Preview a deduplicated copy of the table.""" + qt = _quote_identifier(table) + if dialect == "duckdb": + materialize = ( + f"-- To materialize: CREATE OR REPLACE TABLE {qt} AS SELECT DISTINCT * FROM {qt};" + ) + else: + materialize = ( + f"-- To materialize: BEGIN; DROP TABLE IF EXISTS {qt}_deduped; " + f"CREATE TABLE {qt}_deduped AS SELECT DISTINCT * FROM {qt}; COMMIT;" + ) + return f"{materialize}\nSELECT DISTINCT * FROM {qt};" + + +def pk_dedup_preview(table: str, pk_column: str, dialect: str) -> str: + """Show one canonical row per duplicate PK value. + + DuckDB uses ``QUALIFY`` for a one-liner; Postgres uses a CTE with + ``ROW_NUMBER``; SQLite falls back to ``MIN(rowid)``. 
+ """ + qt = _quote_identifier(table) + qc = _quote_identifier(pk_column) + if dialect == "duckdb": + return ( + f"-- One canonical row per duplicate {pk_column!r} value.\n" + f"SELECT * FROM {qt} " + f"QUALIFY ROW_NUMBER() OVER (PARTITION BY {qc} ORDER BY {qc}) = 1;" + ) + if dialect == "postgres": + return ( + f"-- One canonical row per duplicate {pk_column!r} value.\n" + f"WITH ranked AS (\n" + f" SELECT *, ROW_NUMBER() OVER (PARTITION BY {qc} ORDER BY {qc}) AS rn " + f"FROM {qt}\n" + f") SELECT * FROM ranked WHERE rn = 1;" + ) + # sqlite + return ( + f"-- One canonical row per duplicate {pk_column!r} value (SQLite uses rowid).\n" + f"SELECT * FROM {qt} WHERE rowid IN " + f"(SELECT MIN(rowid) FROM {qt} GROUP BY {qc});" + ) + + +def outlier_preview(table: str, column: str, q1: str | None, q3: str | None, dialect: str) -> str: + """Show rows whose value falls outside the IQR fence. + + ``q1`` / ``q3`` are stringified scalars from the original detector + query so we can inline them as literals instead of re-running the + percentile aggregate. 
+ """ + qt = _quote_identifier(table) + qc = _quote_identifier(column) + if q1 is None or q3 is None: + return ( + f"-- Inspect outliers in {column!r} (recompute IQR fence as needed).\n" + f"SELECT * FROM {qt} WHERE {qc} IS NOT NULL ORDER BY {qc} DESC LIMIT 20;" + ) + return ( + f"-- Rows in {column!r} outside the IQR fence [q1={q1}, q3={q3}].\n" + f"SELECT * FROM {qt} WHERE {qc} IS NOT NULL " + f"AND ({qc} < {q1} - 1.5 * ({q3} - {q1}) " + f"OR {qc} > {q3} + 1.5 * ({q3} - {q1}));" + ) + + +def orphan_fk_preview( + table: str, + column: str, + parent_table: str, + parent_column: str, + dialect: str, +) -> str: + """Show distinct child values not present in the parent's PK column.""" + qt = _quote_identifier(table) + qc = _quote_identifier(column) + qpt = _quote_identifier(parent_table) + qpc = _quote_identifier(parent_column) + return ( + f"-- Distinct {table}.{column} values not present in {parent_table}.{parent_column}.\n" + f"SELECT DISTINCT {qc} FROM {qt} " + f"WHERE {qc} IS NOT NULL " + f"AND {qc} NOT IN (SELECT {qpc} FROM {qpt} WHERE {qpc} IS NOT NULL);" + ) diff --git a/src/datasight/cli.py b/src/datasight/cli.py index 525b549..4955f76 100644 --- a/src/datasight/cli.py +++ b/src/datasight/cli.py @@ -496,6 +496,73 @@ def render_quality_markdown(quality_data: dict[str, Any]) -> str: # noqa: C901 lines.extend(["", "## Wide Tables"]) for item in quality_data["wide_tables"]: lines.append(f"- `{item['table']}`: {item['reason']}") + + cleanup_blocks: list[tuple[str, str]] = [] + + if quality_data.get("duplicate_rows"): + lines.extend(["", "## Whole-Row Duplicates"]) + for item in quality_data["duplicate_rows"]: + lines.append(f"- `{item['table']}`: {item['duplicate_count']} duplicate row(s)") + if item.get("cleanup_sql"): + cleanup_blocks.append((f"{item['table']} (whole-row dedup)", item["cleanup_sql"])) + if quality_data.get("pk_duplicates"): + lines.extend(["", "## Primary-Key-Shaped Duplicates"]) + for item in quality_data["pk_duplicates"]: + sample = ", 
".join(f"{e['value']} (×{e['count']})" for e in item["examples"][:3]) + lines.append(f"- `{item['table']}.{item['column']}`: duplicate values — {sample}") + if item.get("cleanup_sql"): + cleanup_blocks.append( + (f"{item['table']}.{item['column']} (PK dedup)", item["cleanup_sql"]) + ) + if quality_data.get("text_flags"): + lines.extend(["", "## Text Cleanliness"]) + for item in quality_data["text_flags"]: + lines.append( + f"- `{item['table']}.{item['column']}`: {item['issue']} ({item['count']} row(s))" + ) + if item.get("cleanup_sql"): + cleanup_blocks.append( + ( + f"{item['table']}.{item['column']} ({item['issue']})", + item["cleanup_sql"], + ) + ) + if quality_data.get("outlier_flags"): + lines.extend(["", "## Numeric Outliers (IQR)"]) + for item in quality_data["outlier_flags"]: + lines.append( + f"- `{item['table']}.{item['column']}`: {item['outlier_count']} row(s) outside " + f"IQR fence [q1={item.get('q1')}, q3={item.get('q3')}]" + ) + if item.get("cleanup_sql"): + cleanup_blocks.append( + (f"{item['table']}.{item['column']} (outliers)", item["cleanup_sql"]) + ) + if quality_data.get("orphan_flags"): + lines.extend(["", "## Orphan Foreign-Key-Shaped Values"]) + for item in quality_data["orphan_flags"]: + lines.append( + f"- `{item['table']}.{item['column']}` → `{item['parent_table']}.{item['parent_column']}`: " + f"{item['orphan_count']} orphan value(s)" + ) + if item.get("cleanup_sql"): + cleanup_blocks.append( + ( + f"{item['table']}.{item['column']} (orphans → {item['parent_table']})", + item["cleanup_sql"], + ) + ) + + if cleanup_blocks: + lines.extend(["", "## Suggested Cleanup"]) + for title, sql in cleanup_blocks: + lines.append("") + lines.append(f"### {title}") + lines.append("") + lines.append("```sql") + lines.extend(sql.splitlines()) + lines.append("```") + if quality_data["notes"]: lines.extend(["", "## Notes"]) for item in quality_data["notes"]: diff --git a/src/datasight/cli_commands/audit_report.py 
b/src/datasight/cli_commands/audit_report.py index 17f4731..421f513 100644 --- a/src/datasight/cli_commands/audit_report.py +++ b/src/datasight/cli_commands/audit_report.py @@ -104,6 +104,7 @@ async def _run_audit_report(): validation_rules, declared_joins, project_name=Path(project_dir).name, + sql_dialect=settings.database.sql_dialect, ) report_data = asyncio.run(_run_audit_report()) diff --git a/src/datasight/cli_commands/inspect.py b/src/datasight/cli_commands/inspect.py index 06b2e66..dd56bbd 100644 --- a/src/datasight/cli_commands/inspect.py +++ b/src/datasight/cli_commands/inspect.py @@ -94,7 +94,12 @@ async def _run_all(): "profiling tables", build_dataset_overview(schema_info, runner.run_sql) ) quality_data = await _run_phase( - "running quality checks", build_quality_overview(schema_info, runner.run_sql) + "running quality checks", + build_quality_overview( + schema_info, + runner.run_sql, + sql_dialect=(db_settings.sql_dialect if db_settings else "duckdb"), + ), ) measure_data = await _run_phase( "discovering measures", diff --git a/src/datasight/cli_commands/quality.py b/src/datasight/cli_commands/quality.py index b46b1b2..0e1c006 100644 --- a/src/datasight/cli_commands/quality.py +++ b/src/datasight/cli_commands/quality.py @@ -35,6 +35,16 @@ help="Project directory containing .env and config files.", ) @click.option("--table", default=None, help="Audit a specific table.") +@click.option( + "--deep", + is_flag=True, + default=False, + help=( + "Run expensive detectors: whole-row and PK-shaped duplicates, text " + "whitespace/empty-string flags, IQR-based numeric outliers, and " + "orphan foreign-key-shaped values." 
+ ), +) @click.option( "--format", "output_format", @@ -50,7 +60,7 @@ default=None, help="Write the quality audit to a file instead of stdout.", ) -def quality(project_dir, table, output_format, output_path): # noqa: C901 +def quality(project_dir, table, deep, output_format, output_path): # noqa: C901 """Audit data quality - nulls, suspicious ranges, and date coverage. Also checks temporal completeness when time_series.yaml defines expected @@ -79,7 +89,12 @@ async def _run_quality(): msg = f"Table not found: {table}" raise click.ClickException(msg) schema_info = [table_info] - base = await build_quality_overview(schema_info, sql_runner.run_sql) + base = await build_quality_overview( + schema_info, + sql_runner.run_sql, + sql_dialect=settings.database.sql_dialect, + deep=deep, + ) ts_configs = time_series_configs if table and ts_configs: ts_configs = [c for c in ts_configs if c["table"].lower() == table.lower()] @@ -220,6 +235,121 @@ async def _run_quality(): ], ) ) + cleanup_blocks: list[tuple[str, str]] = [] + if quality_data.get("duplicate_rows"): + console.print( + cli.build_profile_detail_table( + "Whole-Row Duplicates", + [("Table", "left"), ("Duplicate rows", "right")], + [ + [item["table"], str(item["duplicate_count"])] + for item in quality_data["duplicate_rows"] + ], + ) + ) + for item in quality_data["duplicate_rows"]: + if item.get("cleanup_sql"): + cleanup_blocks.append((f"{item['table']} (whole-row dedup)", item["cleanup_sql"])) + if quality_data.get("pk_duplicates"): + console.print( + cli.build_profile_detail_table( + "Primary-Key-Shaped Duplicates", + [("Column", "left"), ("Sample duplicates", "left")], + [ + [ + f"{item['table']}.{item['column']}", + ", ".join(f"{e['value']} (×{e['count']})" for e in item["examples"][:3]), + ] + for item in quality_data["pk_duplicates"] + ], + ) + ) + for item in quality_data["pk_duplicates"]: + if item.get("cleanup_sql"): + cleanup_blocks.append( + (f"{item['table']}.{item['column']} (PK dedup)", 
item["cleanup_sql"]) + ) + if quality_data.get("text_flags"): + console.print( + cli.build_profile_detail_table( + "Text Cleanliness", + [("Column", "left"), ("Issue", "left"), ("Rows", "right")], + [ + [ + f"{item['table']}.{item['column']}", + item["issue"], + str(item["count"]), + ] + for item in quality_data["text_flags"] + ], + ) + ) + for item in quality_data["text_flags"]: + if item.get("cleanup_sql"): + cleanup_blocks.append( + ( + f"{item['table']}.{item['column']} ({item['issue']})", + item["cleanup_sql"], + ) + ) + if quality_data.get("outlier_flags"): + console.print( + cli.build_profile_detail_table( + "Numeric Outliers (IQR)", + [("Column", "left"), ("Rows", "right"), ("Q1", "right"), ("Q3", "right")], + [ + [ + f"{item['table']}.{item['column']}", + str(item["outlier_count"]), + cli.format_profile_value(item.get("q1")), + cli.format_profile_value(item.get("q3")), + ] + for item in quality_data["outlier_flags"] + ], + ) + ) + for item in quality_data["outlier_flags"]: + if item.get("cleanup_sql"): + cleanup_blocks.append( + (f"{item['table']}.{item['column']} (outliers)", item["cleanup_sql"]) + ) + if quality_data.get("orphan_flags"): + console.print( + cli.build_profile_detail_table( + "Orphan Foreign-Key-Shaped Values", + [("Child", "left"), ("Parent", "left"), ("Orphans", "right")], + [ + [ + f"{item['table']}.{item['column']}", + f"{item['parent_table']}.{item['parent_column']}", + str(item["orphan_count"]), + ] + for item in quality_data["orphan_flags"] + ], + ) + ) + for item in quality_data["orphan_flags"]: + if item.get("cleanup_sql"): + cleanup_blocks.append( + ( + f"{item['table']}.{item['column']} (orphans → {item['parent_table']})", + item["cleanup_sql"], + ) + ) + if cleanup_blocks: + from rich.panel import Panel + from rich.syntax import Syntax + + console.print() + console.print("[bold]Suggested Cleanup[/bold]") + for title, sql in cleanup_blocks: + console.print( + Panel( + Syntax(sql, "sql", theme="ansi_dark", word_wrap=True), + 
title=title, + border_style="dim", + ) + ) if quality_data["notes"]: console.print( cli.build_profile_detail_table( diff --git a/src/datasight/data_profile.py b/src/datasight/data_profile.py index d8f2abf..033b7d6 100644 --- a/src/datasight/data_profile.py +++ b/src/datasight/data_profile.py @@ -2,6 +2,7 @@ from __future__ import annotations +import math from typing import Any import yaml @@ -841,28 +842,50 @@ def format_measure_prompt_context(measure_data: dict[str, Any]) -> str: async def build_quality_overview( # noqa: C901 schema_info: list[dict[str, Any]], run_sql: RunSql, + *, + sql_dialect: str = "duckdb", + deep: bool = False, ) -> dict[str, Any]: - """Build a deterministic overview focused on data quality signals.""" + """Build a deterministic overview focused on data quality signals. + + ``sql_dialect`` selects between ``duckdb``, ``sqlite``, and ``postgres`` + SQL where the checks need dialect-specific syntax (percentile, qualify, + multi-column distinct). ``deep=True`` enables the more expensive + detectors: whole-row duplicates, PK-shaped duplicates, text-cleanliness + flags, IQR-based numeric outliers, and orphan foreign-key-shaped values. 
+ """ null_columns: list[dict[str, Any]] = [] numeric_flags: list[dict[str, Any]] = [] date_columns: list[dict[str, Any]] = [] + duplicate_rows: list[dict[str, Any]] = [] + pk_duplicates: list[dict[str, Any]] = [] + text_flags: list[dict[str, Any]] = [] + outlier_flags: list[dict[str, Any]] = [] + orphan_flags: list[dict[str, Any]] = [] notes: list[str] = [] + parent_keys = _index_pk_shaped_columns(schema_info) if deep else {} + for table in schema_info: table_name = table["name"] - row_count = table.get("row_count") - for column in table.get("columns", []): + schema_row_count = table.get("row_count") + columns = table.get("columns", []) + if not columns: + continue + + batch = await _batched_column_scan( + run_sql, table_name, columns, deep=deep, sql_dialect=sql_dialect + ) + row_count = schema_row_count if schema_row_count is not None else batch.get("__row_count") + + for column in columns: column_name = column["name"] dtype = column.get("dtype", "") + stats = batch.get(column_name) + if not stats: + continue - null_count = await _run_scalar( - run_sql, - ( - f"SELECT SUM(CASE WHEN {_quote_identifier(column_name)} IS NULL THEN 1 ELSE 0 END) " - f"AS value FROM {_quote_identifier(table_name)}" - ), - "value", - ) + null_count = stats.get("null_count") if null_count and row_count: try: null_rate = round((float(null_count or 0) / row_count) * 100, 1) @@ -879,40 +902,94 @@ async def build_quality_overview( # noqa: C901 ) if _is_numeric_dtype(dtype) and not _looks_like_identifier(column_name): - stats = await _get_numeric_stats(run_sql, table_name, column_name) - if stats: - min_value = stats.get("min") - max_value = stats.get("max") - avg_value = stats.get("avg") - if min_value == max_value and min_value is not None: - numeric_flags.append( - { - "table": table_name, - "column": column_name, - "issue": f"constant numeric value ({min_value})", - } - ) - elif avg_value in {min_value, max_value} and min_value != max_value: - numeric_flags.append( - { - "table": 
table_name, - "column": column_name, - "issue": f"average sits on boundary ({avg_value})", - } - ) + min_value = stats.get("min") + max_value = stats.get("max") + avg_value = stats.get("avg") + if min_value == max_value and min_value is not None: + numeric_flags.append( + { + "table": table_name, + "column": column_name, + "issue": f"constant numeric value ({min_value})", + } + ) + elif avg_value in {min_value, max_value} and min_value != max_value: + numeric_flags.append( + { + "table": table_name, + "column": column_name, + "issue": f"average sits on boundary ({avg_value})", + } + ) elif _is_date_dtype(dtype): coverage = await _get_date_coverage(run_sql, table_name, column_name) if coverage: date_columns.append(coverage) + if deep and _is_text_dtype(dtype): + from datasight.cleanup import ( + empty_string_preview, + whitespace_preview, + ) + + ws = stats.get("whitespace_count") + empty = stats.get("empty_count") + if ws: + text_flags.append( + { + "table": table_name, + "column": column_name, + "issue": "leading/trailing whitespace", + "count": int(ws), + "cleanup_sql": whitespace_preview( + table_name, column_name, sql_dialect + ), + } + ) + if empty: + text_flags.append( + { + "table": table_name, + "column": column_name, + "issue": "empty string used in place of NULL", + "count": int(empty), + "cleanup_sql": empty_string_preview( + table_name, column_name, sql_dialect + ), + } + ) + + if deep: + duplicate_rows.extend( + await _detect_whole_row_duplicates(run_sql, table_name, sql_dialect) + ) + pk_duplicates.extend( + await _detect_pk_duplicates(run_sql, table_name, columns, sql_dialect) + ) + outlier_flags.extend( + await _detect_numeric_outliers(run_sql, table_name, columns, sql_dialect) + ) + orphan_flags.extend( + await _detect_orphan_fks(run_sql, table_name, columns, parent_keys, sql_dialect) + ) + if not null_columns: notes.append("No null-heavy columns detected in the sampled profiling pass.") if not date_columns: notes.append("No obvious date columns 
detected for freshness checks.") if not numeric_flags: notes.append("No obviously degenerate numeric ranges detected.") + if deep: + if not duplicate_rows and not pk_duplicates: + notes.append("No duplicate rows or duplicate primary-key-shaped values detected.") + if not text_flags: + notes.append("No text-cleanliness issues (whitespace, empty strings) detected.") + if not outlier_flags: + notes.append("No IQR outliers detected in numeric columns.") + if not orphan_flags: + notes.append("No orphan foreign-key-shaped values detected.") - return { + result: dict[str, Any] = { "table_count": len(schema_info), "null_columns": sorted( null_columns, @@ -923,6 +1000,334 @@ async def build_quality_overview( # noqa: C901 "date_columns": date_columns[:6], "notes": notes, } + if deep: + result["deep"] = True + result["duplicate_rows"] = duplicate_rows + result["pk_duplicates"] = pk_duplicates[:8] + result["text_flags"] = text_flags[:12] + result["outlier_flags"] = outlier_flags[:12] + result["orphan_flags"] = orphan_flags[:12] + return result + + +# --------------------------------------------------------------------------- +# Batched per-table column scan +# --------------------------------------------------------------------------- + + +async def _batched_column_scan( + run_sql: RunSql, + table_name: str, + columns: list[dict[str, Any]], + *, + deep: bool, + sql_dialect: str, +) -> dict[str, Any]: + """Project counts and (for numeric/text columns) stats in a single query. + + Returns a mapping of column name → dict with keys ``null_count``, + ``min``, ``max``, ``avg``, ``whitespace_count``, ``empty_count``. The + special key ``__row_count`` holds ``COUNT(*)`` for the table. Missing + keys mean the check didn't apply to that column. 
+ """ + select_parts: list[str] = ["COUNT(*) AS __row_count"] + plans: list[tuple[int, dict[str, Any]]] = [] + + for idx, column in enumerate(columns): + name = column["name"] + dtype = column.get("dtype", "") + quoted = _quote_identifier(name) + select_parts.append(f"COUNT({quoted}) AS nn_{idx}") + plan: dict[str, Any] = {"name": name, "is_numeric": False, "is_text": False} + if _is_numeric_dtype(dtype) and not _looks_like_identifier(name): + select_parts.append(f"MIN({quoted}) AS mn_{idx}") + select_parts.append(f"MAX({quoted}) AS mx_{idx}") + select_parts.append(f"AVG({quoted}) AS av_{idx}") + plan["is_numeric"] = True + if deep and _is_text_dtype(dtype): + select_parts.append( + f"SUM(CASE WHEN {quoted} IS NOT NULL AND {quoted} <> TRIM({quoted}) " + f"THEN 1 ELSE 0 END) AS ws_{idx}" + ) + select_parts.append(f"SUM(CASE WHEN {quoted} = '' THEN 1 ELSE 0 END) AS em_{idx}") + plan["is_text"] = True + plans.append((idx, plan)) + + sql = f"SELECT {', '.join(select_parts)} FROM {_quote_identifier(table_name)}" + try: + df = await run_sql(sql) + except Exception as exc: + logger.debug(f"Batched column scan failed for {table_name}: {exc}") + return {} + if df.empty: + return {} + + row = df.iloc[0] + row_count = _to_int_or_none(row.get("__row_count")) + nonnull_total = row_count if row_count is not None else 0 + + out: dict[str, Any] = {"__row_count": row_count} + for idx, plan in plans: + name = plan["name"] + nn = _to_int_or_none(row.get(f"nn_{idx}")) + entry: dict[str, Any] = {} + if nn is not None and row_count is not None: + entry["null_count"] = max(0, nonnull_total - nn) + if plan["is_numeric"]: + entry["min"] = _scalar_or_none(row.get(f"mn_{idx}")) + entry["max"] = _scalar_or_none(row.get(f"mx_{idx}")) + entry["avg"] = _scalar_or_none(row.get(f"av_{idx}")) + if plan["is_text"]: + entry["whitespace_count"] = _to_int_or_none(row.get(f"ws_{idx}")) or 0 + entry["empty_count"] = _to_int_or_none(row.get(f"em_{idx}")) or 0 + out[name] = entry + return out + + +def 
_scalar_or_none(value: Any) -> str | None: + """Convert a SQL scalar to a stringified value or None.""" + if value is None: + return None + if isinstance(value, float) and math.isnan(value): + return None + return str(value) + + +# --------------------------------------------------------------------------- +# Deep detectors +# --------------------------------------------------------------------------- + + +def _index_pk_shaped_columns( + schema_info: list[dict[str, Any]], +) -> dict[str, tuple[str, str]]: + """Map ``<lowercased table name>`` → ``(parent_table, parent_column)``. + + A table contributes if it has exactly one ID-shaped column. Used for the + orphan-FK detector: a child column named ``<parent>_id`` (or ``id_<parent>``) + can be checked against the parent's PK-shaped column. + """ + parent_keys: dict[str, tuple[str, str]] = {} + for table in schema_info: + table_name = table["name"] + id_cols = [ + c["name"] for c in table.get("columns", []) if _looks_like_identifier(c["name"]) + ] + if len(id_cols) == 1: + parent_keys[table_name.lower()] = (table_name, id_cols[0]) + return parent_keys + + +async def _detect_whole_row_duplicates( + run_sql: RunSql, table_name: str, sql_dialect: str +) -> list[dict[str, Any]]: + """Count rows that are exact duplicates across all columns.""" + qt = _quote_identifier(table_name) + if sql_dialect == "duckdb": + sql = ( + f"SELECT (SELECT COUNT(*) FROM {qt}) - " + f"(SELECT COUNT(*) FROM (SELECT DISTINCT * FROM {qt})) AS dup_count" + ) + else: + # SQLite and Postgres both accept this subquery form. 
+ sql = ( + f"SELECT (SELECT COUNT(*) FROM {qt}) - " + f"(SELECT COUNT(*) FROM (SELECT DISTINCT * FROM {qt}) AS _d) " + f"AS dup_count" + ) + try: + df = await run_sql(sql) + except Exception as exc: + logger.debug(f"Whole-row duplicate check failed for {table_name}: {exc}") + return [] + if df.empty: + return [] + dup = _to_int_or_none(df.iloc[0].get("dup_count")) + if not dup: + return [] + from datasight.cleanup import whole_row_dedup_preview + + return [ + { + "table": table_name, + "duplicate_count": dup, + "cleanup_sql": whole_row_dedup_preview(table_name, sql_dialect), + } + ] + + +async def _detect_pk_duplicates( + run_sql: RunSql, + table_name: str, + columns: list[dict[str, Any]], + sql_dialect: str, +) -> list[dict[str, Any]]: + """For each ID-shaped column, find values appearing more than once.""" + findings: list[dict[str, Any]] = [] + qt = _quote_identifier(table_name) + for column in columns: + name = column["name"] + if not _looks_like_identifier(name): + continue + qc = _quote_identifier(name) + sql = ( + f"SELECT {qc} AS value, COUNT(*) AS n FROM {qt} " + f"WHERE {qc} IS NOT NULL " + f"GROUP BY {qc} HAVING COUNT(*) > 1 " + f"ORDER BY COUNT(*) DESC LIMIT 5" + ) + try: + df = await run_sql(sql) + except Exception as exc: + logger.debug(f"PK duplicate check failed for {table_name}.{name}: {exc}") + continue + if df.empty: + continue + examples = [{"value": str(r["value"]), "count": int(r["n"])} for _, r in df.iterrows()] + from datasight.cleanup import pk_dedup_preview + + findings.append( + { + "table": table_name, + "column": name, + "examples": examples, + "cleanup_sql": pk_dedup_preview(table_name, name, sql_dialect), + } + ) + return findings + + +async def _detect_numeric_outliers( + run_sql: RunSql, + table_name: str, + columns: list[dict[str, Any]], + sql_dialect: str, +) -> list[dict[str, Any]]: + """Flag numeric columns whose IQR-based outlier fence is exceeded. 
+ + Skipped for SQLite (no percentile aggregate available without + extensions). Identifier-shaped columns are skipped to avoid false + positives on sparse ID ranges. + """ + if sql_dialect == "sqlite": + return [] + findings: list[dict[str, Any]] = [] + qt = _quote_identifier(table_name) + for column in columns: + name = column["name"] + dtype = column.get("dtype", "") + if not _is_numeric_dtype(dtype) or _looks_like_identifier(name): + continue + qc = _quote_identifier(name) + if sql_dialect == "postgres": + q1_expr = f"percentile_cont(0.25) WITHIN GROUP (ORDER BY {qc})" + q3_expr = f"percentile_cont(0.75) WITHIN GROUP (ORDER BY {qc})" + else: # duckdb + q1_expr = f"quantile_cont({qc}, 0.25)" + q3_expr = f"quantile_cont({qc}, 0.75)" + sql = ( + f"WITH q AS (SELECT {q1_expr} AS q1, {q3_expr} AS q3 FROM {qt}) " + f"SELECT q.q1 AS q1, q.q3 AS q3, " + f"(SELECT COUNT(*) FROM {qt}, q " + f"WHERE {qc} IS NOT NULL " + f"AND ({qc} < q.q1 - 1.5 * (q.q3 - q.q1) " + f"OR {qc} > q.q3 + 1.5 * (q.q3 - q.q1))) AS outlier_count " + f"FROM q" + ) + try: + df = await run_sql(sql) + except Exception as exc: + logger.debug(f"Outlier check failed for {table_name}.{name}: {exc}") + continue + if df.empty: + continue + row = df.iloc[0] + count = _to_int_or_none(row.get("outlier_count")) + if not count: + continue + q1 = _scalar_or_none(row.get("q1")) + q3 = _scalar_or_none(row.get("q3")) + from datasight.cleanup import outlier_preview + + findings.append( + { + "table": table_name, + "column": name, + "outlier_count": count, + "q1": q1, + "q3": q3, + "cleanup_sql": outlier_preview(table_name, name, q1, q3, sql_dialect), + } + ) + return findings + + +async def _detect_orphan_fks( # noqa: C901 + run_sql: RunSql, + table_name: str, + columns: list[dict[str, Any]], + parent_keys: dict[str, tuple[str, str]], + sql_dialect: str, +) -> list[dict[str, Any]]: + """For columns shaped like ``<parent>_id``, count values not in parent. 
+ + Self-references (parent == child) are skipped to avoid double-counting + a table's own PK. + """ + findings: list[dict[str, Any]] = [] + qt = _quote_identifier(table_name) + for column in columns: + name = column["name"] + if not _looks_like_identifier(name): + continue + parent_lookup = name.lower() + if parent_lookup.endswith("_id"): + parent_lookup = parent_lookup[:-3] + elif parent_lookup.startswith("id_"): + parent_lookup = parent_lookup[3:] + elif parent_lookup == "id": + continue + # Try both singular and a naive pluralization. + candidates = [parent_lookup, parent_lookup + "s"] + match = next((parent_keys[c] for c in candidates if c in parent_keys), None) + if not match: + continue + parent_table, parent_column = match + if parent_table.lower() == table_name.lower(): + continue + qc = _quote_identifier(name) + qpt = _quote_identifier(parent_table) + qpc = _quote_identifier(parent_column) + sql = ( + f"SELECT COUNT(DISTINCT {qc}) AS orphan_count FROM {qt} " + f"WHERE {qc} IS NOT NULL " + f"AND {qc} NOT IN (SELECT {qpc} FROM {qpt} WHERE {qpc} IS NOT NULL)" + ) + try: + df = await run_sql(sql) + except Exception as exc: + logger.debug(f"Orphan FK check failed for {table_name}.{name}: {exc}") + continue + if df.empty: + continue + count = _to_int_or_none(df.iloc[0].get("orphan_count")) + if not count: + continue + from datasight.cleanup import orphan_fk_preview + + findings.append( + { + "table": table_name, + "column": name, + "parent_table": parent_table, + "parent_column": parent_column, + "orphan_count": count, + "cleanup_sql": orphan_fk_preview( + table_name, name, parent_table, parent_column, sql_dialect + ), + } + ) + return findings async def build_trend_overview( diff --git a/src/datasight/web/app.py b/src/datasight/web/app.py index 62f80ae..89644ca 100644 --- a/src/datasight/web/app.py +++ b/src/datasight/web/app.py @@ -1979,7 +1979,9 @@ async def get_quality_overview(table: str | None = None, state: AppState = Depen overview, cached = await 
_get_cached_insight( state, cache_key, - lambda: build_quality_overview(schema_info, sql_runner.run_sql), + lambda: build_quality_overview( + schema_info, sql_runner.run_sql, sql_dialect=state.sql_dialect + ), ) return {"overview": overview, "cached": cached} diff --git a/tests/test_cli_tools.py b/tests/test_cli_tools.py index 500620f..c732b3c 100644 --- a/tests/test_cli_tools.py +++ b/tests/test_cli_tools.py @@ -184,6 +184,162 @@ def test_quality_markdown_output_writes_file(project_dir, tmp_path): assert "## Date Coverage" in text +def _fake_deep_quality_data() -> dict: + """Synthesized output from build_quality_overview with deep=True.""" + return { + "table_count": 2, + "null_columns": [ + {"table": "orders", "column": "notes", "null_count": 9, "null_rate": 90.0} + ], + "numeric_flags": [ + {"table": "orders", "column": "qty", "issue": "constant numeric value (1)"} + ], + "date_columns": [ + {"table": "orders", "column": "order_date", "min": "2024-01-01", "max": "2024-12-31"} + ], + "notes": ["All checks ran."], + "deep": True, + "duplicate_rows": [ + { + "table": "orders", + "duplicate_count": 3, + "cleanup_sql": 'SELECT DISTINCT * FROM "orders";', + } + ], + "pk_duplicates": [ + { + "table": "orders", + "column": "id", + "examples": [{"value": "7", "count": 2}, {"value": "9", "count": 2}], + "cleanup_sql": 'SELECT * FROM "orders" QUALIFY ROW_NUMBER() OVER (PARTITION BY "id" ORDER BY "id") = 1;', + } + ], + "text_flags": [ + { + "table": "orders", + "column": "region", + "issue": "leading/trailing whitespace", + "count": 4, + "cleanup_sql": 'SELECT "region" AS original, TRIM("region") AS trimmed FROM "orders";', + }, + { + "table": "orders", + "column": "region", + "issue": "empty string used in place of NULL", + "count": 2, + "cleanup_sql": 'SELECT * FROM "orders" WHERE "region" = \'\';', + }, + ], + "outlier_flags": [ + { + "table": "orders", + "column": "amount", + "outlier_count": 11, + "q1": "10.0", + "q3": "30.0", + "cleanup_sql": ( + '-- Rows in 
\'amount\' outside the IQR fence [q1=10.0, q3=30.0].\n' + 'SELECT * FROM "orders" WHERE "amount" IS NOT NULL;' + ), + } + ], + "orphan_flags": [ + { + "table": "orders", + "column": "product_id", + "parent_table": "products", + "parent_column": "id", + "orphan_count": 2, + "cleanup_sql": ( + "SELECT DISTINCT \"product_id\" FROM \"orders\" " + "WHERE \"product_id\" NOT IN (SELECT \"id\" FROM \"products\");" + ), + } + ], + } + + +def test_render_quality_markdown_deep_sections(): + """The markdown renderer surfaces every deep finding plus cleanup SQL.""" + from datasight.cli import render_quality_markdown + + md = render_quality_markdown(_fake_deep_quality_data()) + assert "## Whole-Row Duplicates" in md + assert "3 duplicate row(s)" in md + assert "## Primary-Key-Shaped Duplicates" in md + assert "7 (×2)" in md + assert "## Text Cleanliness" in md + assert "leading/trailing whitespace" in md + assert "empty string used in place of NULL" in md + assert "## Numeric Outliers (IQR)" in md + assert "11 row(s) outside" in md + assert "## Orphan Foreign-Key-Shaped Values" in md + assert "products.id" in md + assert "## Suggested Cleanup" in md + # Each cleanup SQL block should appear under a level-3 heading. + assert "### orders (whole-row dedup)" in md + assert "### orders.id (PK dedup)" in md + assert "### orders.region (leading/trailing whitespace)" in md + assert "### orders.amount (outliers)" in md + assert "### orders.product_id (orphans → products)" in md + # ```sql fences enclose at least one SQL snippet. 
+ assert md.count("```sql") >= 5 + + +def test_quality_cli_deep_renders_all_sections(project_dir, monkeypatch): + """End-to-end: --deep CLI output renders the Rich tables and cleanup panel.""" + from datasight.cli_commands import quality as quality_cmd + + async def fake_overview(schema_info, run_sql, **kwargs): # noqa: ARG001 + assert kwargs.get("deep") is True + assert kwargs.get("sql_dialect") in {"duckdb", "sqlite", "postgres"} + return _fake_deep_quality_data() + + monkeypatch.setattr(quality_cmd, "build_quality_overview", fake_overview) + + runner = CliRunner() + result = runner.invoke(cli, ["quality", "--project-dir", project_dir, "--deep"]) + assert result.exit_code == 0, result.output + output = result.output + assert "Whole-Row Duplicates" in output + assert "Primary-Key-Shaped Duplicates" in output + assert "Text Cleanliness" in output + assert "Numeric Outliers" in output + assert "Orphan Foreign-Key-Shaped Values" in output + assert "Suggested Cleanup" in output + # One of the previewed cleanup SQL snippets should appear in the panel. 
+ assert "SELECT DISTINCT" in output + + +def test_quality_cli_deep_markdown_output(project_dir, monkeypatch, tmp_path): + """--deep --format markdown writes deep sections to file.""" + from datasight.cli_commands import quality as quality_cmd + + async def fake_overview(schema_info, run_sql, **_): # noqa: ARG001 + return _fake_deep_quality_data() + + monkeypatch.setattr(quality_cmd, "build_quality_overview", fake_overview) + output_path = tmp_path / "deep.md" + runner = CliRunner() + result = runner.invoke( + cli, + [ + "quality", + "--project-dir", + project_dir, + "--deep", + "--format", + "markdown", + "--output", + str(output_path), + ], + ) + assert result.exit_code == 0, result.output + text = output_path.read_text(encoding="utf-8") + assert "## Suggested Cleanup" in text + assert "### orders (whole-row dedup)" in text + + def test_quality_json_output_writes_file(project_dir, tmp_path): output_path = tmp_path / "quality.json" runner = CliRunner() diff --git a/tests/test_data_profile_extra.py b/tests/test_data_profile_extra.py index 97b0cf6..e876241 100644 --- a/tests/test_data_profile_extra.py +++ b/tests/test_data_profile_extra.py @@ -511,3 +511,213 @@ async def test_build_dimension_overview(energy_conn): out = await build_dimension_overview(_schema_info(), _rs(energy_conn)) assert "dimension_columns" in out assert "join_hints" in out + + +# --------------------------------------------------------------------------- +# Deep-mode quality checks +# --------------------------------------------------------------------------- + + +@pytest.fixture +def messy_conn(tmp_path): + """Fixture that intentionally exercises each deep detector.""" + db = tmp_path / "m.duckdb" + conn = duckdb.connect(str(db)) + conn.execute("CREATE TABLE plants (plant_id INTEGER, plant_name VARCHAR)") + conn.execute("INSERT INTO plants VALUES (1, 'Alpha'), (2, 'Beta'), (3, 'Gamma')") + conn.execute( + "CREATE TABLE generation (plant_id INTEGER, fuel_type VARCHAR, state VARCHAR, mwh DOUBLE)" 
+ ) + # Most rows are tight; one outlier on mwh. + base_rows = [(i % 3 + 1, "coal", "CA", 100.0 + i) for i in range(40)] + # Whole-row duplicate (entire row appears twice) + base_rows.append((1, "gas", "OR", 50.0)) + base_rows.append((1, "gas", "OR", 50.0)) + # PK-shaped duplicate on plant_id (already true via base rows, but make + # sure at least one value is duplicated with differing other columns). + base_rows.append((4, "gas", "WA", 200.0)) + base_rows.append((4, "gas", "TX", 210.0)) + # Whitespace and empty-string in fuel_type/state + base_rows.append((1, " coal ", "CA", 100.0)) + base_rows.append((2, "", "CA", 100.0)) + # Numeric outlier (well outside IQR). + base_rows.append((1, "coal", "CA", 99999.0)) + # Orphan FK: plant_id=99 not in plants. + base_rows.append((99, "coal", "CA", 120.0)) + conn.executemany("INSERT INTO generation VALUES (?, ?, ?, ?)", base_rows) + yield conn + conn.close() + + +def _messy_schema_info() -> list[dict[str, object]]: + return [ + { + "name": "plants", + "row_count": 3, + "columns": [ + {"name": "plant_id", "dtype": "INTEGER"}, + {"name": "plant_name", "dtype": "VARCHAR"}, + ], + }, + { + "name": "generation", + "row_count": 47, + "columns": [ + {"name": "plant_id", "dtype": "INTEGER"}, + {"name": "fuel_type", "dtype": "VARCHAR"}, + {"name": "state", "dtype": "VARCHAR"}, + {"name": "mwh", "dtype": "DOUBLE"}, + ], + }, + ] + + +@pytest.mark.asyncio +async def test_quality_overview_shallow_omits_deep_keys(energy_conn): + out = await build_quality_overview(_schema_info(), _rs(energy_conn)) + assert "duplicate_rows" not in out + assert "outlier_flags" not in out + assert out.get("deep") is not True + + +@pytest.mark.asyncio +async def test_quality_overview_deep_finds_everything(messy_conn): + out = await build_quality_overview( + _messy_schema_info(), + _rs(messy_conn), + sql_dialect="duckdb", + deep=True, + ) + assert out.get("deep") is True + + # Whole-row dup: the (1, gas, OR, 50) pair. 
+ assert any( + item["table"] == "generation" and item["duplicate_count"] >= 1 + for item in out["duplicate_rows"] + ) + assert all("cleanup_sql" in item for item in out["duplicate_rows"]) + + # PK duplicates on generation.plant_id (multiple rows per plant_id). + pk = [ + item + for item in out["pk_duplicates"] + if item["table"] == "generation" and item["column"] == "plant_id" + ] + assert pk and pk[0]["examples"] + assert "cleanup_sql" in pk[0] + + # Text flags: whitespace on fuel_type, empty string on fuel_type. + issues = {(item["column"], item["issue"]) for item in out["text_flags"]} + assert ("fuel_type", "leading/trailing whitespace") in issues + assert ("fuel_type", "empty string used in place of NULL") in issues + for item in out["text_flags"]: + assert "cleanup_sql" in item + + # Outlier on mwh. + outliers = [item for item in out["outlier_flags"] if item["column"] == "mwh"] + assert outliers and outliers[0]["outlier_count"] >= 1 + assert "cleanup_sql" in outliers[0] + + # Orphan FK: generation.plant_id=99 has no parent in plants. + orphans = [ + item + for item in out["orphan_flags"] + if item["table"] == "generation" and item["column"] == "plant_id" + ] + assert orphans and orphans[0]["parent_table"] == "plants" + assert orphans[0]["orphan_count"] >= 1 + assert "cleanup_sql" in orphans[0] + + +@pytest.mark.asyncio +async def test_quality_overview_batched_scan_single_query(messy_conn): + """The batched-scan refactor should issue one scan SQL per table.""" + seen: list[str] = [] + + async def tracking_run(sql): + seen.append(sql) + return await _rs(messy_conn)(sql) + + await build_quality_overview(_messy_schema_info(), tracking_run) + # One COUNT/MIN/MAX/AVG SELECT per table — find them by the marker alias. 
+ batched = [s for s in seen if "__row_count" in s] + assert len(batched) == 2 # one per table + + +def test_cleanup_dedup_sql_dialects(): + from datasight.cleanup import pk_dedup_preview, whole_row_dedup_preview + + assert "QUALIFY" in pk_dedup_preview("t", "id", "duckdb") + assert "ROW_NUMBER" in pk_dedup_preview("t", "id", "postgres") + assert "rowid" in pk_dedup_preview("t", "id", "sqlite") + assert "DISTINCT" in whole_row_dedup_preview("t", "duckdb") + assert "DISTINCT" in whole_row_dedup_preview("t", "sqlite") + + +def test_cleanup_text_and_outlier_and_orphan_previews(): + from datasight.cleanup import ( + empty_string_preview, + orphan_fk_preview, + outlier_preview, + whitespace_preview, + ) + + assert "= ''" in empty_string_preview("t", "c", "duckdb") + assert "TRIM" in whitespace_preview("t", "c", "duckdb") + # Outlier preview inlines q1/q3 as literals. + sql = outlier_preview("t", "c", "1.0", "9.0", "duckdb") + assert "1.0" in sql and "9.0" in sql + # Fallback when q1/q3 are unknown. 
+ assert "ORDER BY" in outlier_preview("t", "c", None, None, "duckdb") + fk = orphan_fk_preview("child", "fk", "parent", "id", "duckdb") + assert "NOT IN" in fk and "parent" in fk + + +@pytest.mark.asyncio +async def test_deep_detectors_swallow_query_errors(): + """Each detector should return [] when the underlying SQL fails.""" + from datasight.data_profile import ( + _detect_numeric_outliers, + _detect_orphan_fks, + _detect_pk_duplicates, + _detect_whole_row_duplicates, + ) + + async def boom(sql): # noqa: ARG001 + msg = "no such table" + raise RuntimeError(msg) + + cols = [{"name": "plant_id", "dtype": "INTEGER"}, {"name": "mwh", "dtype": "DOUBLE"}] + parents = {"plants": ("plants", "plant_id")} + assert await _detect_whole_row_duplicates(boom, "t", "duckdb") == [] + assert await _detect_pk_duplicates(boom, "t", cols, "duckdb") == [] + assert await _detect_numeric_outliers(boom, "t", cols, "duckdb") == [] + assert await _detect_orphan_fks(boom, "t", cols, parents, "duckdb") == [] + + +@pytest.mark.asyncio +async def test_outlier_detector_skipped_on_sqlite(): + from datasight.data_profile import _detect_numeric_outliers + + async def boom(sql): # noqa: ARG001 — should never be called + msg = "SQL should not run on sqlite" + raise AssertionError(msg) + + cols = [{"name": "mwh", "dtype": "DOUBLE"}] + assert await _detect_numeric_outliers(boom, "t", cols, "sqlite") == [] + + +@pytest.mark.asyncio +async def test_orphan_detector_skips_self_and_unmatched(messy_conn): + """Orphan check requires a parent table with one ID-shaped column.""" + from datasight.data_profile import _detect_orphan_fks + + # No parent indexed → no findings, regardless of column shape. + cols = [{"name": "plant_id", "dtype": "INTEGER"}] + assert await _detect_orphan_fks(_rs(messy_conn), "generation", cols, {}, "duckdb") == [] + # Self-reference (child is also the parent) is skipped. 
+ parents = {"generation": ("generation", "plant_id")} + assert ( + await _detect_orphan_fks(_rs(messy_conn), "generation", cols, parents, "duckdb") + == [] + ) diff --git a/tests/test_web_app.py b/tests/test_web_app.py index e820dcc..74144e9 100644 --- a/tests/test_web_app.py +++ b/tests/test_web_app.py @@ -821,7 +821,7 @@ async def run_sql(self, sql): # noqa: ARG002 monkeypatch.setattr( web_app, "build_quality_overview", - lambda schema_info, run_sql: _fake_quality_overview(schema_info, run_sql), # noqa: ARG005 + lambda schema_info, run_sql, **_: _fake_quality_overview(schema_info, run_sql), # noqa: ARG005 ) web_app._state.project_loaded = True @@ -853,7 +853,7 @@ async def run_sql(self, sql): # noqa: ARG002 original_sql_runner = web_app._state.sql_runner original_insight_cache = dict(web_app._state._insight_cache) - async def fake_overview(schema_info, run_sql): # noqa: ARG001 + async def fake_overview(schema_info, run_sql, **_): # noqa: ARG001 captured["tables"] = [table["name"] for table in schema_info] return await _fake_quality_overview(schema_info, run_sql)