dsgrid · daniel-thom · May 16, 2026 · May 16, 2026 · May 16, 2026 · May 16, 2026
diff --git a/docs/reference/cli.md b/docs/reference/cli.md
@@ -606,6 +606,7 @@ datasight quality [OPTIONS]
 | --- | --- |
 | `--project-dir` | Project directory containing .env and config files. Default: `.`. |
 | `--table` | Audit a specific table. |
+| `--deep` | Run expensive detectors: whole-row and PK-shaped duplicates, text whitespace/empty-string flags, IQR-based numeric outliers, and orphan foreign-key-shaped values. |
 | `--format` | Output format (default: table). Default: `table`. |
 | `--output`, `-o` | Write the quality audit to a file instead of stdout. |
 

diff --git a/docs/use/how-to/audit-data-quality.md b/docs/use/how-to/audit-data-quality.md
@@ -65,6 +65,40 @@ Use it to spot:
 - quick notes worth turning into follow-up questions
 - temporal completeness issues when [`time_series.yaml`](../../project-setup/how-to/declare-time-series.md) is present
 
+The default pass batches per-column null and numeric scans into one query
+per table, so it stays cheap on wide schemas.
+
+### Deeper checks: `datasight quality --deep`
+
+Add `--deep` to run the more expensive detectors and emit previewable
+cleanup SQL alongside each finding:
+
+```bash
+datasight quality --deep
+datasight quality --deep --format markdown -o quality-deep.md
+```
+
+`--deep` adds:
+
+- **Whole-row duplicates** — rows that are exact duplicates across all columns
+- **Primary-key-shaped duplicates** — values appearing more than once in
+  any `id`, `*_id`, or `id_*` column
+- **Text cleanliness** — counts of values with leading or trailing
+  whitespace, and counts of empty strings used in place of NULL
+- **Numeric outliers (IQR)** — counts of values outside the
+  `[Q1 − 1.5·IQR, Q3 + 1.5·IQR]` fence (skipped on SQLite, which has no
+  percentile aggregate)
+- **Orphan foreign-key-shaped values** — values in a `<parent>_id` column
+  that don't appear in `<parent>.<id>`. The check runs only when the
+  `<parent>` table has exactly one ID-shaped column
+
+Each finding includes a `cleanup_sql` field with a previewable `SELECT`
+that shows the candidate rows. Destructive forms
+(`UPDATE`, `CREATE OR REPLACE TABLE`) appear only as comments inside the
+preview — datasight never auto-mutates your tables. The CLI table and
+markdown outputs render the cleanup SQL in a dedicated
+**Suggested Cleanup** section.
+
 ## Detect untidy column shapes
 
 !!! warning "Experimental"
@@ -398,7 +432,7 @@ directory name appears in the report title.
 For a thorough data quality audit, run the commands in this order:
 
 1. **Profile** — understand the shape of the data
-2. **Quality** — find nulls, range issues, date gaps, and untidy column shapes
+2. **Quality** — find nulls, range issues, date gaps, and untidy column shapes (add `--deep` for duplicates, text cleanliness, outliers, and orphan-FK checks with previewable cleanup SQL)
 3. **Integrity** — verify primary keys, foreign keys, and join behavior
 4. **Distribution** — inspect percentiles, outliers, and temporal spikes
 5. **Measures** — identify metrics and verify aggregation defaults

diff --git a/src/datasight/audit_report.py b/src/datasight/audit_report.py
@@ -20,10 +20,15 @@ async def build_audit_report(
     validation_rules: list[dict[str, Any]] | None = None,
     declared_joins: list[dict[str, Any]] | None = None,
     project_name: str | None = None,
+    *,
+    sql_dialect: str = "duckdb",
+    deep: bool = False,
 ) -> dict[str, Any]:
     """Run all audit checks and assemble a composite report."""
     dataset_overview = await build_dataset_overview(schema_info, run_sql)
-    quality = await build_quality_overview(schema_info, run_sql)
+    quality = await build_quality_overview(
+        schema_info, run_sql, sql_dialect=sql_dialect, deep=deep
+    )
     integrity = await build_integrity_overview(schema_info, run_sql, declared_joins)
     distribution = await build_distribution_overview(schema_info, run_sql, overrides)
 

diff --git a/src/datasight/cleanup.py b/src/datasight/cleanup.py
@@ -0,0 +1,130 @@
+"""Emit previewable cleanup SQL for findings from ``build_quality_overview``.
+
+Mirrors the ``tidy`` module's preview-first ethos: each function returns a
+single SQL string that lets the user *see* the rows in question, never an
+UPDATE/DELETE that auto-mutates the table. The returned SQL is safe to copy
+into a query window or attach to a quality report.
+
+For destructive operations (deduplication, NULLIF rewrites, TRIM rewrites)
+the preview is a ``SELECT`` that shows the candidate rows or the rewritten
+column alongside the original; the caller can wrap it in an
+``UPDATE``/``CREATE OR REPLACE TABLE`` once they've reviewed the preview.
+
+Dialects supported: ``duckdb``, ``sqlite``, ``postgres``. Where a dialect
+lacks a feature (e.g. SQLite has no ``QUALIFY`` and no percentile
+aggregates), the function falls back to portable SQL.
+"""
+
+from __future__ import annotations
+
+from datasight.schema import _quote_identifier
+
+
+def empty_string_preview(table: str, column: str, dialect: str) -> str:
+    """Build a SELECT previewing rows whose column holds ``''`` (NULLIF candidates).
+
+    The suggested UPDATE is emitted only as a leading SQL comment.
+    """
+    tbl, col = _quote_identifier(table), _quote_identifier(column)
+    header = f"-- Rows where {column!r} is an empty string. "
+    fix_hint = f"To fix: UPDATE {tbl} SET {col} = NULL WHERE {col} = '';\n"
+    return header + fix_hint + f"SELECT * FROM {tbl} WHERE {col} = '';"
+
+
+def whitespace_preview(table: str, column: str, dialect: str) -> str:
+    """Build a SELECT pairing each padded value with its TRIM()-ed form."""
+    tbl, col = _quote_identifier(table), _quote_identifier(column)
+    trimmed = f"TRIM({col})"
+    differs = f"{col} <> {trimmed}"
+    # The mutating UPDATE is offered only inside the leading SQL comment.
+    note = f"-- Rows where {column!r} has leading/trailing whitespace. "
+    fix_hint = f"To fix: UPDATE {tbl} SET {col} = {trimmed} WHERE {differs};\n"
+    query = f"SELECT {col} AS original, {trimmed} AS trimmed FROM {tbl} "
+    return f"{note}{fix_hint}{query}WHERE {col} IS NOT NULL AND {differs};"
+
+
+def whole_row_dedup_preview(table: str, dialect: str) -> str:
+    """Preview a deduplicated copy of the table; DDL appears only as a SQL comment."""
+    qt = _quote_identifier(table)
+    select = f"SELECT DISTINCT * FROM {qt}"
+    if dialect == "duckdb":
+        materialize = f"-- To materialize: CREATE OR REPLACE TABLE {qt} AS {select};"
+    else:
+        # Quote the suffixed name as one identifier: if _quote_identifier wraps
+        # names in quotes, appending "_deduped" after it would be invalid SQL.
+        qd = _quote_identifier(f"{table}_deduped")
+        ddl = f"DROP TABLE IF EXISTS {qd}; CREATE TABLE {qd} AS {select};"
+        materialize = f"-- To materialize: BEGIN; {ddl} COMMIT;"
+    return f"{materialize}\n{select};"
+
+
+def pk_dedup_preview(table: str, pk_column: str, dialect: str) -> str:
+    """Preview the table reduced to a single row per ``pk_column`` value.
+
+    Strategy by dialect:
+
+    * ``duckdb`` -- ``QUALIFY`` over ``ROW_NUMBER()``.
+    * ``postgres`` -- a ``ranked`` CTE filtered on ``rn = 1``.
+    * anything else -- the SQLite form, keeping ``MIN(rowid)`` per value.
+
+    The window orders by the partition column itself, so every row in a
+    partition ties and the query does not pin down which duplicate is
+    kept.
+    """
+    tbl = _quote_identifier(table)
+    key = _quote_identifier(pk_column)
+    window = f"ROW_NUMBER() OVER (PARTITION BY {key} ORDER BY {key})"
+    banner = f"-- One canonical row per duplicate {pk_column!r} value"
+    if dialect == "duckdb":
+        return f"{banner}.\nSELECT * FROM {tbl} QUALIFY {window} = 1;"
+    if dialect == "postgres":
+        return (
+            f"{banner}.\nWITH ranked AS (\n"
+            f"  SELECT *, {window} AS rn FROM {tbl}\n"
+            ") SELECT * FROM ranked WHERE rn = 1;"
+        )
+    # Fallback: rowid-based selection (SQLite).
+    keep = f"SELECT MIN(rowid) FROM {tbl} GROUP BY {key}"
+    return f"{banner} (SQLite uses rowid).\nSELECT * FROM {tbl} WHERE rowid IN ({keep});"
+
+
+def outlier_preview(table: str, column: str, q1: str | None, q3: str | None, dialect: str) -> str:
+    """Build a SELECT for rows lying outside the 1.5 * IQR fence.
+
+    ``q1`` and ``q3`` are spliced into the SQL text verbatim as literals.
+    When either is ``None`` no fence can be written, so the preview falls
+    back to the 20 largest non-NULL values of the column.
+    """
+    tbl, col = _quote_identifier(table), _quote_identifier(column)
+    not_null = f"SELECT * FROM {tbl} WHERE {col} IS NOT NULL"
+    if q1 is not None and q3 is not None:
+        spread = f"1.5 * ({q3} - {q1})"
+        below = f"{col} < {q1} - {spread}"
+        above = f"{col} > {q3} + {spread}"
+        return (
+            f"-- Rows in {column!r} outside the IQR fence [q1={q1}, q3={q3}].\n"
+            f"{not_null} AND ({below} OR {above});"
+        )
+    # No quartiles supplied: show the top of the distribution instead.
+    header = f"-- Inspect outliers in {column!r} (recompute IQR fence as needed).\n"
+    return f"{header}{not_null} ORDER BY {col} DESC LIMIT 20;"
+
+
+def orphan_fk_preview(
+    table: str,
+    column: str,
+    parent_table: str,
+    parent_column: str,
+    dialect: str,
+) -> str:
+    """Build a SELECT listing distinct child values absent from the parent column.
+
+    NULLs are excluded on both sides so ``NOT IN`` cannot be poisoned by a
+    NULL in the parent subquery.
+    """
+    child_tbl, child_col = _quote_identifier(table), _quote_identifier(column)
+    parent_tbl, parent_col = _quote_identifier(parent_table), _quote_identifier(parent_column)
+    known = f"SELECT {parent_col} FROM {parent_tbl} WHERE {parent_col} IS NOT NULL"
+    note = f"-- Distinct {table}.{column} values not present in {parent_table}.{parent_column}.\n"
+    body = f"SELECT DISTINCT {child_col} FROM {child_tbl} WHERE {child_col} IS NOT NULL "
+    return f"{note}{body}AND {child_col} NOT IN ({known});"
diff --git a/src/datasight/cli.py b/src/datasight/cli.py
@@ -496,6 +496,73 @@ def render_quality_markdown(quality_data: dict[str, Any]) -> str:  # noqa: C901
         lines.extend(["", "## Wide Tables"])
         for item in quality_data["wide_tables"]:
             lines.append(f"- `{item['table']}`: {item['reason']}")
+
+    cleanup_blocks: list[tuple[str, str]] = []
+
+    if quality_data.get("duplicate_rows"):
+        lines.extend(["", "## Whole-Row Duplicates"])
+        for item in quality_data["duplicate_rows"]:
+            lines.append(f"- `{item['table']}`: {item['duplicate_count']} duplicate row(s)")
+            if item.get("cleanup_sql"):
+                cleanup_blocks.append((f"{item['table']} (whole-row dedup)", item["cleanup_sql"]))
+    if quality_data.get("pk_duplicates"):
+        lines.extend(["", "## Primary-Key-Shaped Duplicates"])
+        for item in quality_data["pk_duplicates"]:
+            sample = ", ".join(f"{e['value']} (×{e['count']})" for e in item["examples"][:3])
+            lines.append(f"- `{item['table']}.{item['column']}`: duplicate values — {sample}")
+            if item.get("cleanup_sql"):
+                cleanup_blocks.append(
+                    (f"{item['table']}.{item['column']} (PK dedup)", item["cleanup_sql"])
+                )
+    if quality_data.get("text_flags"):
+        lines.extend(["", "## Text Cleanliness"])
+        for item in quality_data["text_flags"]:
+            lines.append(
+                f"- `{item['table']}.{item['column']}`: {item['issue']} ({item['count']} row(s))"
+            )
+            if item.get("cleanup_sql"):
+                cleanup_blocks.append(
+                    (
+                        f"{item['table']}.{item['column']} ({item['issue']})",
+                        item["cleanup_sql"],
+                    )
+                )
+    if quality_data.get("outlier_flags"):
+        lines.extend(["", "## Numeric Outliers (IQR)"])
+        for item in quality_data["outlier_flags"]:
+            lines.append(
+                f"- `{item['table']}.{item['column']}`: {item['outlier_count']} row(s) outside "
+                f"IQR fence [q1={item.get('q1')}, q3={item.get('q3')}]"
+            )
+            if item.get("cleanup_sql"):
+                cleanup_blocks.append(
+                    (f"{item['table']}.{item['column']} (outliers)", item["cleanup_sql"])
+                )
+    if quality_data.get("orphan_flags"):
+        lines.extend(["", "## Orphan Foreign-Key-Shaped Values"])
+        for item in quality_data["orphan_flags"]:
+            lines.append(
+                f"- `{item['table']}.{item['column']}` → `{item['parent_table']}.{item['parent_column']}`: "
+                f"{item['orphan_count']} orphan value(s)"
+            )
+            if item.get("cleanup_sql"):
+                cleanup_blocks.append(
+                    (
+                        f"{item['table']}.{item['column']} (orphans → {item['parent_table']})",
+                        item["cleanup_sql"],
+                    )
+                )
+
+    if cleanup_blocks:
+        lines.extend(["", "## Suggested Cleanup"])
+        for title, sql in cleanup_blocks:
+            lines.append("")
+            lines.append(f"### {title}")
+            lines.append("")
+            lines.append("```sql")
+            lines.extend(sql.splitlines())
+            lines.append("```")
+
     if quality_data["notes"]:
         lines.extend(["", "## Notes"])
         for item in quality_data["notes"]:

diff --git a/src/datasight/cli_commands/audit_report.py b/src/datasight/cli_commands/audit_report.py
@@ -104,6 +104,7 @@ async def _run_audit_report():
             validation_rules,
             declared_joins,
             project_name=Path(project_dir).name,
+            sql_dialect=settings.database.sql_dialect,
         )
 
     report_data = asyncio.run(_run_audit_report())

diff --git a/src/datasight/cli_commands/inspect.py b/src/datasight/cli_commands/inspect.py
@@ -94,7 +94,12 @@ async def _run_all():
             "profiling tables", build_dataset_overview(schema_info, runner.run_sql)
         )
         quality_data = await _run_phase(
-            "running quality checks", build_quality_overview(schema_info, runner.run_sql)
+            "running quality checks",
+            build_quality_overview(
+                schema_info,
+                runner.run_sql,
+                sql_dialect=(db_settings.sql_dialect if db_settings else "duckdb"),
+            ),
         )
         measure_data = await _run_phase(
             "discovering measures",