Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/reference/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,7 @@ datasight quality [OPTIONS]
| --- | --- |
| `--project-dir` | Project directory containing .env and config files. Default: `.`. |
| `--table` | Audit a specific table. |
| `--deep` | Run expensive detectors: whole-row and PK-shaped duplicates, text whitespace/empty-string flags, IQR-based numeric outliers, and orphan foreign-key-shaped values. |
| `--format` | Output format. Default: `table`. |
| `--output`, `-o` | Write the quality audit to a file instead of stdout. |

Expand Down
36 changes: 35 additions & 1 deletion docs/use/how-to/audit-data-quality.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,40 @@ Use it to spot:
- quick notes worth turning into follow-up questions
- temporal completeness issues when [`time_series.yaml`](../../project-setup/how-to/declare-time-series.md) is present

The default pass batches per-column null and numeric scans into one query
per table, so it stays cheap on wide schemas.

### Deeper checks: `datasight quality --deep`

Add `--deep` to run the more expensive detectors and emit previewable
cleanup SQL alongside each finding:

```bash
datasight quality --deep
datasight quality --deep --format markdown -o quality-deep.md
```

`--deep` adds:

- **Whole-row duplicates** — rows that are exact duplicates across all columns
- **Primary-key-shaped duplicates** — values appearing more than once in
any `id`, `*_id`, or `id_*` column
- **Text cleanliness** — counts of values with leading or trailing
whitespace, and counts of empty strings used in place of NULL
- **Numeric outliers (IQR)** — counts of values outside the
`[Q1 − 1.5·IQR, Q3 + 1.5·IQR]` fence (skipped on SQLite, which has no
percentile aggregate)
- **Orphan foreign-key-shaped values** — values in `<parent>_id` columns
that don't appear in `<parent>.<id>`, detected against any parent
table with exactly one ID-shaped column

Each finding includes a `cleanup_sql` field with a previewable `SELECT`
that shows the candidate rows or the cleaned-up result. Destructive forms
(`UPDATE`, `CREATE OR REPLACE TABLE`, `DROP TABLE` / `CREATE TABLE`) appear
only as comments inside the preview — datasight never auto-mutates your
tables. The CLI table and markdown outputs render the cleanup SQL in a
dedicated **Suggested Cleanup** section.

## Detect untidy column shapes

!!! warning "Experimental"
Expand Down Expand Up @@ -398,7 +432,7 @@ directory name appears in the report title.
For a thorough data quality audit, run the commands in this order:

1. **Profile** — understand the shape of the data
2. **Quality** — find nulls, range issues, date gaps, and untidy column shapes
2. **Quality** — find nulls, range issues, date gaps, and untidy column shapes (add `--deep` for duplicates, text cleanliness, outliers, and orphan-FK checks with previewable cleanup SQL)
3. **Integrity** — verify primary keys, foreign keys, and join behavior
4. **Distribution** — inspect percentiles, outliers, and temporal spikes
5. **Measures** — identify metrics and verify aggregation defaults
Expand Down
7 changes: 6 additions & 1 deletion src/datasight/audit_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,15 @@ async def build_audit_report(
validation_rules: list[dict[str, Any]] | None = None,
declared_joins: list[dict[str, Any]] | None = None,
project_name: str | None = None,
*,
sql_dialect: str = "duckdb",
deep: bool = False,
) -> dict[str, Any]:
"""Run all audit checks and assemble a composite report."""
dataset_overview = await build_dataset_overview(schema_info, run_sql)
quality = await build_quality_overview(schema_info, run_sql)
quality = await build_quality_overview(
schema_info, run_sql, sql_dialect=sql_dialect, deep=deep
)
integrity = await build_integrity_overview(schema_info, run_sql, declared_joins)
distribution = await build_distribution_overview(schema_info, run_sql, overrides)

Expand Down
130 changes: 130 additions & 0 deletions src/datasight/cleanup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
"""Emit previewable cleanup SQL for findings from ``build_quality_overview``.

Mirrors the ``tidy`` module's preview-first ethos: each function returns a
single SQL string that lets the user *see* the rows in question, never an
UPDATE/DELETE that auto-mutates the table. The returned SQL is safe to copy
into a query window or attach to a quality report.

For destructive operations (deduplication, NULLIF rewrites, TRIM rewrites)
the preview is a ``SELECT`` that shows the candidate rows or the rewritten
column alongside the original; the caller can wrap it in an
``UPDATE``/``CREATE OR REPLACE TABLE`` once they've reviewed the preview.

Dialects supported: ``duckdb``, ``sqlite``, ``postgres``. Where a dialect
lacks a feature (e.g. SQLite has no ``QUALIFY`` and no percentile
aggregates), the function falls back to portable SQL.
"""

from __future__ import annotations

from datasight.schema import _quote_identifier


def empty_string_preview(table: str, column: str, dialect: str) -> str:
    """Build a preview query listing rows whose ``column`` equals ``''``.

    The result is two lines: a SQL comment that names the column and spells
    out the ``UPDATE ... SET <col> = NULL`` a reviewer could run afterwards,
    followed by a ``SELECT *`` restricted to the empty-string rows.

    Args:
        table: Table name; quoted via ``_quote_identifier`` before use.
        column: Column name; quoted via ``_quote_identifier`` before use.
        dialect: SQL dialect name. Not used by this function.

    Returns:
        The comment line and the ``SELECT`` statement joined by a newline.
    """
    quoted_table = _quote_identifier(table)
    quoted_column = _quote_identifier(column)
    # The same predicate drives both the suggested UPDATE and the preview.
    is_empty = f"{quoted_column} = ''"
    header = (
        f"-- Rows where {column!r} is an empty string. "
        f"To fix: UPDATE {quoted_table} SET {quoted_column} = NULL WHERE {is_empty};"
    )
    query = f"SELECT * FROM {quoted_table} WHERE {is_empty};"
    return "\n".join((header, query))


def whitespace_preview(table: str, column: str, dialect: str) -> str:
    """Build a preview query for values that differ from their ``TRIM`` form.

    The result is two lines: a SQL comment carrying the suggested
    ``UPDATE ... SET <col> = TRIM(<col>)``, then a ``SELECT`` that returns the
    stored value (``original``) next to its trimmed form (``trimmed``) for
    every non-NULL value that ``TRIM`` would change.

    Args:
        table: Table name; quoted via ``_quote_identifier`` before use.
        column: Column name; quoted via ``_quote_identifier`` before use.
        dialect: SQL dialect name. Not used by this function.

    Returns:
        The comment line and the ``SELECT`` statement joined by a newline.
    """
    quoted_table = _quote_identifier(table)
    quoted_column = _quote_identifier(column)
    trimmed_expr = f"TRIM({quoted_column})"
    # True only when trimming would actually change the stored value.
    needs_trim = f"{quoted_column} <> {trimmed_expr}"
    header = (
        f"-- Rows where {column!r} has leading/trailing whitespace. "
        f"To fix: UPDATE {quoted_table} SET {quoted_column} = {trimmed_expr} "
        f"WHERE {needs_trim};"
    )
    query = (
        f"SELECT {quoted_column} AS original, {trimmed_expr} AS trimmed "
        f"FROM {quoted_table} "
        f"WHERE {quoted_column} IS NOT NULL AND {needs_trim};"
    )
    return f"{header}\n{query}"


def whole_row_dedup_preview(table: str, dialect: str) -> str:
    """Preview a deduplicated copy of the table.

    The result is two lines: a SQL comment describing how to materialize the
    deduplicated data, followed by a ``SELECT DISTINCT *`` over the table.

    For ``duckdb`` the comment suggests replacing the table in place with
    ``CREATE OR REPLACE TABLE``. For every other dialect it suggests building
    a sibling table named ``<table>_deduped`` inside a transaction.

    Args:
        table: Table name; quoted via ``_quote_identifier`` before use.
        dialect: SQL dialect name; only ``"duckdb"`` is special-cased.

    Returns:
        The comment line and the ``SELECT DISTINCT`` statement joined by a
        newline.
    """
    qt = _quote_identifier(table)
    distinct_select = f"SELECT DISTINCT * FROM {qt}"
    if dialect == "duckdb":
        materialize = f"-- To materialize: CREATE OR REPLACE TABLE {qt} AS {distinct_select};"
    else:
        # Quote the suffixed name as a whole. Appending "_deduped" to the
        # already-quoted identifier would emit something like
        # "orders"_deduped, which is not a valid identifier.
        # NOTE(review): assumes _quote_identifier wraps the name in quote
        # characters -- confirm against datasight.schema.
        deduped = _quote_identifier(f"{table}_deduped")
        materialize = (
            f"-- To materialize: BEGIN; DROP TABLE IF EXISTS {deduped}; "
            f"CREATE TABLE {deduped} AS {distinct_select}; COMMIT;"
        )
    return f"{materialize}\n{distinct_select};"


def pk_dedup_preview(table: str, pk_column: str, dialect: str) -> str:
    """Build a query that keeps a single row for each ``pk_column`` value.

    Three SQL shapes are produced depending on ``dialect``:

    * ``"duckdb"`` -- filters with ``QUALIFY ROW_NUMBER() ... = 1``.
    * ``"postgres"`` -- ranks rows in a ``ranked`` CTE (adding an ``rn``
      column) and selects ``rn = 1``.
    * anything else -- keeps rows whose ``rowid`` is the ``MIN(rowid)`` of
      their ``pk_column`` group (the SQLite form).

    Args:
        table: Table name; quoted via ``_quote_identifier`` before use.
        pk_column: Key-shaped column; quoted via ``_quote_identifier``.
        dialect: SQL dialect name selecting one of the shapes above.

    Returns:
        A SQL comment line followed by the dialect-specific ``SELECT``.
    """
    quoted_table = _quote_identifier(table)
    quoted_key = _quote_identifier(pk_column)
    headline = f"-- One canonical row per duplicate {pk_column!r} value"
    # Rows are numbered within each key value; the window orders by the key
    # itself, so which row receives number 1 is left to the engine.
    row_rank = f"ROW_NUMBER() OVER (PARTITION BY {quoted_key} ORDER BY {quoted_key})"

    if dialect == "postgres":
        cte_lines = (
            f"{headline}.",
            "WITH ranked AS (",
            f" SELECT *, {row_rank} AS rn FROM {quoted_table}",
            ") SELECT * FROM ranked WHERE rn = 1;",
        )
        return "\n".join(cte_lines)

    if dialect == "duckdb":
        return f"{headline}.\nSELECT * FROM {quoted_table} QUALIFY {row_rank} = 1;"

    # Fallback: the SQLite form, keyed on the implicit rowid.
    first_rowids = f"SELECT MIN(rowid) FROM {quoted_table} GROUP BY {quoted_key}"
    return (
        f"{headline} (SQLite uses rowid).\n"
        f"SELECT * FROM {quoted_table} WHERE rowid IN ({first_rowids});"
    )


def outlier_preview(table: str, column: str, q1: str | None, q3: str | None, dialect: str) -> str:
    """Show rows whose value falls outside the IQR fence.

    ``q1`` / ``q3`` are stringified scalars that get inlined as SQL literals
    so the preview does not have to recompute a percentile aggregate. Because
    they are spliced into the statement text, both must look like plain
    decimal literals (optional sign, digits, optional fraction and exponent).
    When either is ``None`` or anything else (``nan``, ``inf``, an empty
    string, ...), a generic "top 20 values" inspection query is returned
    instead of a fence query that would not run.

    Args:
        table: Table name; quoted via ``_quote_identifier`` before use.
        column: Numeric column; quoted via ``_quote_identifier`` before use.
        q1: First quartile as text, or ``None`` when unavailable.
        q3: Third quartile as text, or ``None`` when unavailable.
        dialect: SQL dialect name. Not used by this function.

    Returns:
        A SQL comment line followed by a ``SELECT`` statement.
    """
    import re  # function-scope import keeps this block self-contained

    qt = _quote_identifier(table)
    qc = _quote_identifier(column)

    def _is_numeric_literal(value: object) -> bool:
        """Return True when ``value`` renders as a plain decimal literal."""
        if value is None:
            return False
        pattern = r"[+-]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?"
        return re.fullmatch(pattern, str(value).strip()) is not None

    if not (_is_numeric_literal(q1) and _is_numeric_literal(q3)):
        return (
            f"-- Inspect outliers in {column!r} (recompute IQR fence as needed).\n"
            f"SELECT * FROM {qt} WHERE {qc} IS NOT NULL ORDER BY {qc} DESC LIMIT 20;"
        )
    # Fence: [q1 - 1.5 * IQR, q3 + 1.5 * IQR] with IQR = q3 - q1. The spaces
    # around "-" keep a negative q1 from forming a "--" SQL comment marker.
    return (
        f"-- Rows in {column!r} outside the IQR fence [q1={q1}, q3={q3}].\n"
        f"SELECT * FROM {qt} WHERE {qc} IS NOT NULL "
        f"AND ({qc} < {q1} - 1.5 * ({q3} - {q1}) "
        f"OR {qc} > {q3} + 1.5 * ({q3} - {q1}));"
    )


def orphan_fk_preview(
    table: str,
    column: str,
    parent_table: str,
    parent_column: str,
    dialect: str,
) -> str:
    """Build a query listing child values that have no match in the parent.

    The result is two lines: a SQL comment naming the child and parent
    columns, then a ``SELECT DISTINCT`` of non-NULL child values filtered by
    ``NOT IN`` against the parent's non-NULL values.

    Args:
        table: Child table name; quoted via ``_quote_identifier``.
        column: Child column holding the foreign-key-shaped values.
        parent_table: Parent table name; quoted via ``_quote_identifier``.
        parent_column: Parent column the child values are checked against.
        dialect: SQL dialect name. Not used by this function.

    Returns:
        The comment line and the ``SELECT`` statement joined by a newline.
    """
    child_table_sql = _quote_identifier(table)
    child_column_sql = _quote_identifier(column)
    parent_table_sql = _quote_identifier(parent_table)
    parent_column_sql = _quote_identifier(parent_column)

    # NULLs are filtered out of the subquery so NOT IN can still match.
    parent_values = (
        f"SELECT {parent_column_sql} FROM {parent_table_sql} "
        f"WHERE {parent_column_sql} IS NOT NULL"
    )
    header = (
        f"-- Distinct {table}.{column} values not present in "
        f"{parent_table}.{parent_column}."
    )
    query = (
        f"SELECT DISTINCT {child_column_sql} FROM {child_table_sql} "
        f"WHERE {child_column_sql} IS NOT NULL "
        f"AND {child_column_sql} NOT IN ({parent_values});"
    )
    return header + "\n" + query
67 changes: 67 additions & 0 deletions src/datasight/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,73 @@ def render_quality_markdown(quality_data: dict[str, Any]) -> str: # noqa: C901
lines.extend(["", "## Wide Tables"])
for item in quality_data["wide_tables"]:
lines.append(f"- `{item['table']}`: {item['reason']}")

cleanup_blocks: list[tuple[str, str]] = []

if quality_data.get("duplicate_rows"):
lines.extend(["", "## Whole-Row Duplicates"])
for item in quality_data["duplicate_rows"]:
lines.append(f"- `{item['table']}`: {item['duplicate_count']} duplicate row(s)")
if item.get("cleanup_sql"):
cleanup_blocks.append((f"{item['table']} (whole-row dedup)", item["cleanup_sql"]))
if quality_data.get("pk_duplicates"):
lines.extend(["", "## Primary-Key-Shaped Duplicates"])
for item in quality_data["pk_duplicates"]:
sample = ", ".join(f"{e['value']} (×{e['count']})" for e in item["examples"][:3])
lines.append(f"- `{item['table']}.{item['column']}`: duplicate values — {sample}")
if item.get("cleanup_sql"):
cleanup_blocks.append(
(f"{item['table']}.{item['column']} (PK dedup)", item["cleanup_sql"])
)
if quality_data.get("text_flags"):
lines.extend(["", "## Text Cleanliness"])
for item in quality_data["text_flags"]:
lines.append(
f"- `{item['table']}.{item['column']}`: {item['issue']} ({item['count']} row(s))"
)
if item.get("cleanup_sql"):
cleanup_blocks.append(
(
f"{item['table']}.{item['column']} ({item['issue']})",
item["cleanup_sql"],
)
)
if quality_data.get("outlier_flags"):
lines.extend(["", "## Numeric Outliers (IQR)"])
for item in quality_data["outlier_flags"]:
lines.append(
f"- `{item['table']}.{item['column']}`: {item['outlier_count']} row(s) outside "
f"IQR fence [q1={item.get('q1')}, q3={item.get('q3')}]"
)
if item.get("cleanup_sql"):
cleanup_blocks.append(
(f"{item['table']}.{item['column']} (outliers)", item["cleanup_sql"])
)
if quality_data.get("orphan_flags"):
lines.extend(["", "## Orphan Foreign-Key-Shaped Values"])
for item in quality_data["orphan_flags"]:
lines.append(
f"- `{item['table']}.{item['column']}` → `{item['parent_table']}.{item['parent_column']}`: "
f"{item['orphan_count']} orphan value(s)"
)
if item.get("cleanup_sql"):
cleanup_blocks.append(
(
f"{item['table']}.{item['column']} (orphans → {item['parent_table']})",
item["cleanup_sql"],
)
)

if cleanup_blocks:
lines.extend(["", "## Suggested Cleanup"])
for title, sql in cleanup_blocks:
lines.append("")
lines.append(f"### {title}")
lines.append("")
lines.append("```sql")
lines.extend(sql.splitlines())
lines.append("```")

if quality_data["notes"]:
lines.extend(["", "## Notes"])
for item in quality_data["notes"]:
Expand Down
1 change: 1 addition & 0 deletions src/datasight/cli_commands/audit_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ async def _run_audit_report():
validation_rules,
declared_joins,
project_name=Path(project_dir).name,
sql_dialect=settings.database.sql_dialect,
)

report_data = asyncio.run(_run_audit_report())
Expand Down
7 changes: 6 additions & 1 deletion src/datasight/cli_commands/inspect.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,12 @@ async def _run_all():
"profiling tables", build_dataset_overview(schema_info, runner.run_sql)
)
quality_data = await _run_phase(
"running quality checks", build_quality_overview(schema_info, runner.run_sql)
"running quality checks",
build_quality_overview(
schema_info,
runner.run_sql,
sql_dialect=(db_settings.sql_dialect if db_settings else "duckdb"),
),
)
measure_data = await _run_phase(
"discovering measures",
Expand Down
Loading
Loading