From 8306131a5f0a68a8abdb0b571f8497b40f408376 Mon Sep 17 00:00:00 2001 From: tpp Date: Fri, 3 Jul 2026 19:19:18 -0700 Subject: [PATCH] statistics: reduce TopN/buckets collected for non-predicate columns Add the global variable tidb_analyze_non_predicate_column_ratio (default 0.1, range [0,1]). During ANALYZE v2, columns that are not predicate columns collect only ratio times the configured TopN and bucket numbers (buckets floored at 1). Columns that keep the configured numbers are: - predicate columns recorded in mysql.column_stats_usage, when any exist; - otherwise the handle column and the first column of each index; - columns explicitly specified in ANALYZE TABLE ... COLUMNS. Index statistics are never reduced. The full-stats column set is decided at plan-build time and carried on AnalyzeColumnsTask to the analyze executor, so auto-analyze picks it up as well. Setting the ratio to 1 disables the reduction. Co-Authored-By: Claude Fable 5 --- pkg/executor/analyze_col.go | 6 ++ pkg/executor/analyze_col_sampling.go | 11 ++- pkg/executor/builder.go | 2 + pkg/executor/test/analyzetest/analyze_test.go | 9 ++ .../test/analyzetest/columns/BUILD.bazel | 2 +- .../columns/analyze_columns_with_test.go | 89 ++++++++++++++++++ .../options/analyze_saved_options_test.go | 6 ++ pkg/planner/cardinality/selectivity_test.go | 6 ++ .../casetest/planstats/plan_stats_test.go | 3 + pkg/planner/core/common_plans.go | 8 ++ pkg/planner/core/planbuilder.go | 93 ++++++++++++++++--- pkg/sessionctx/vardef/tidb_vars.go | 38 +++++--- pkg/sessionctx/variable/sysvar.go | 15 +++ .../handle/handletest/handle_test.go | 3 + 14 files changed, 260 insertions(+), 31 deletions(-) diff --git a/pkg/executor/analyze_col.go b/pkg/executor/analyze_col.go index f6b8227d580c9..961bd0b328367 100644 --- a/pkg/executor/analyze_col.go +++ b/pkg/executor/analyze_col.go @@ -55,6 +55,12 @@ type AnalyzeColumnsExec struct { // concurrent lookup across partition workers. 
samplingStatsConcurrency int + // fullStatsCols holds the IDs of the columns that keep the configured TopN/bucket + // numbers; the other columns only collect nonPredicateColRatio times the configured + // numbers. A nil map disables the reduction. See AnalyzeColumnsTask.FullStatsCols. + fullStatsCols map[int64]struct{} + nonPredicateColRatio float64 + memTracker *memory.Tracker } diff --git a/pkg/executor/analyze_col_sampling.go b/pkg/executor/analyze_col_sampling.go index b306cb159acb6..38428924fce9d 100644 --- a/pkg/executor/analyze_col_sampling.go +++ b/pkg/executor/analyze_col_sampling.go @@ -867,7 +867,16 @@ workLoop: failpoint.InjectCall("analyzeSamplingBuildAfterReleaseCollectorMemory", collectorMemSize, e.memTracker.BytesConsumed()) } numTopN := int(e.opts[ast.AnalyzeOptNumTopN]) + numBuckets := int(e.opts[ast.AnalyzeOptNumBuckets]) if task.isColumn { + if e.fullStatsCols != nil { + if _, ok := e.fullStatsCols[e.colsInfo[task.slicePos].ID]; !ok { + // The column is not a predicate column, so only collect + // nonPredicateColRatio times the configured TopN/bucket numbers. 
+ numTopN = int(float64(numTopN) * e.nonPredicateColRatio) + numBuckets = max(1, int(float64(numBuckets)*e.nonPredicateColRatio)) + } + } if e.tableInfo != nil && isColumnCoveredBySingleColUniqueIndex(e.tableInfo, e.colsInfo[task.slicePos].Offset) { numTopN = 0 } @@ -877,7 +886,7 @@ workLoop: numTopN = 0 } } - hist, topn, err := statistics.BuildHistAndTopN(e.ctx, int(e.opts[ast.AnalyzeOptNumBuckets]), numTopN, task.id, collector, task.tp, task.isColumn, e.memTracker) + hist, topn, err := statistics.BuildHistAndTopN(e.ctx, numBuckets, numTopN, task.id, collector, task.tp, task.isColumn, e.memTracker) if err != nil { resultCh <- err releaseCollectorMemory() diff --git a/pkg/executor/builder.go b/pkg/executor/builder.go index 7644929184311..b6d47eebf724e 100644 --- a/pkg/executor/builder.go +++ b/pkg/executor/builder.go @@ -3228,6 +3228,8 @@ func (b *executorBuilder) buildAnalyzeSamplingPushdown( schemaForVirtualColEval: schemaForVirtualColEval, baseCount: count, baseModifyCnt: modifyCount, + fullStatsCols: task.FullStatsCols, + nonPredicateColRatio: task.NonPredicateColRatio, } e.analyzePB.ColReq = &tipb.AnalyzeColumnsReq{ BucketSize: int64(opts[ast.AnalyzeOptNumBuckets]), diff --git a/pkg/executor/test/analyzetest/analyze_test.go b/pkg/executor/test/analyzetest/analyze_test.go index 770d0f88f0fbf..98ef4bdc47ac9 100644 --- a/pkg/executor/test/analyzetest/analyze_test.go +++ b/pkg/executor/test/analyzetest/analyze_test.go @@ -636,6 +636,9 @@ func TestAnalyzeColumnsAfterAnalyzeAll(t *testing.T) { tk.MustExec("use test") tk.MustExec("drop table if exists t") tk.MustExec("set @@tidb_analyze_version = 2") + // Keep the configured TopN/bucket numbers for all columns; the reduction for + // non-predicate columns is covered by TestAnalyzeNonPredicateColumnRatio. 
+ tk.MustExec("set global tidb_analyze_non_predicate_column_ratio = 1") tk.MustExec("create table t (a int, b int)") tk.MustExec("insert into t (a,b) values (1,1), (1,1), (2,2), (2,2), (3,3), (4,4)") tk.MustExec("flush stats_delta *.*") @@ -1061,6 +1064,9 @@ func TestAnalyzePartitionTableWithDynamicMode(t *testing.T) { tk.MustExec("use test") tk.MustExec("set @@session.tidb_analyze_version = 2") + // Keep the configured TopN/bucket numbers for all columns; the reduction for + // non-predicate columns is covered by TestAnalyzeNonPredicateColumnRatio. + tk.MustExec("set global tidb_analyze_non_predicate_column_ratio = 1") tk.MustExec("set @@session.tidb_stats_load_sync_wait = 20000") // to stabilise test tk.MustExec("set @@session.tidb_partition_prune_mode = 'dynamic'") createTable := `CREATE TABLE t (a int, b int, c varchar(10), d int, primary key(a), index idx(b)) @@ -1155,6 +1161,9 @@ func TestAnalyzePartitionTableStaticToDynamic(t *testing.T) { tk.MustExec("use test") tk.MustExec("set @@session.tidb_analyze_version = 2") + // Keep the configured TopN/bucket numbers for all columns; the reduction for + // non-predicate columns is covered by TestAnalyzeNonPredicateColumnRatio. 
+ tk.MustExec("set global tidb_analyze_non_predicate_column_ratio = 1") tk.MustExec("set @@session.tidb_stats_load_sync_wait = 20000") // to stabilise test tk.MustExec("set @@session.tidb_partition_prune_mode = 'static'") createTable := `CREATE TABLE t (a int, b int, c varchar(10), d int, primary key(a), index idx(b)) diff --git a/pkg/executor/test/analyzetest/columns/BUILD.bazel b/pkg/executor/test/analyzetest/columns/BUILD.bazel index eed1fc5a8fd1f..392be07b76f2b 100644 --- a/pkg/executor/test/analyzetest/columns/BUILD.bazel +++ b/pkg/executor/test/analyzetest/columns/BUILD.bazel @@ -8,7 +8,7 @@ go_test( "main_test.go", ], flaky = True, - shard_count = 6, + shard_count = 7, deps = [ "//pkg/config", "//pkg/parser/ast", diff --git a/pkg/executor/test/analyzetest/columns/analyze_columns_with_test.go b/pkg/executor/test/analyzetest/columns/analyze_columns_with_test.go index 156945fca66c1..c52587b299c02 100644 --- a/pkg/executor/test/analyzetest/columns/analyze_columns_with_test.go +++ b/pkg/executor/test/analyzetest/columns/analyze_columns_with_test.go @@ -35,6 +35,9 @@ func TestAnalyzeColumnsWithPrimaryKey(t *testing.T) { tk.MustExec("use test") tk.MustExec("drop table if exists t") tk.MustExec("set @@tidb_analyze_version = 2") + // Pin the ratio so every analyzed column collects the configured TopN/bucket + // numbers; the reduction behavior is covered by TestAnalyzeNonPredicateColumnRatio. 
+ tk.MustExec("set global tidb_analyze_non_predicate_column_ratio = 1") tk.MustExec("create table t (a int, b int, c int primary key)") statstestutil.HandleNextDDLEventWithTxn(h) tk.MustExec("insert into t values (1,1,1), (1,1,2), (2,2,3), (2,2,4), (3,3,5), (4,3,6), (5,4,7), (6,4,8), (null,null,9)") @@ -103,6 +106,9 @@ func TestAnalyzeColumnsWithIndex(t *testing.T) { tk.MustExec("use test") tk.MustExec("drop table if exists t") tk.MustExec("set @@tidb_analyze_version = 2") + // Pin the ratio so every analyzed column collects the configured TopN/bucket + // numbers; the reduction behavior is covered by TestAnalyzeNonPredicateColumnRatio. + tk.MustExec("set global tidb_analyze_non_predicate_column_ratio = 1") tk.MustExec("create table t (a int, b int, c int, d int, index idx_b_d(b, d))") statstestutil.HandleNextDDLEventWithTxn(h) tk.MustExec("insert into t values (1,1,null,1), (2,1,9,1), (1,1,8,1), (2,2,7,2), (1,3,7,3), (2,4,6,4), (1,4,6,5), (2,4,6,5), (1,5,6,5)") @@ -180,6 +186,9 @@ func TestAnalyzeColumnsWithClusteredIndex(t *testing.T) { tk.MustExec("use test") tk.MustExec("drop table if exists t") tk.MustExec("set @@tidb_analyze_version = 2") + // Pin the ratio so every analyzed column collects the configured TopN/bucket + // numbers; the reduction behavior is covered by TestAnalyzeNonPredicateColumnRatio. 
+ tk.MustExec("set global tidb_analyze_non_predicate_column_ratio = 1") tk.MustExec("create table t (a int, b int, c int, d int, primary key(b, d) clustered)") statstestutil.HandleNextDDLEventWithTxn(h) tk.MustExec("insert into t values (1,1,null,1), (2,2,9,2), (1,3,8,3), (2,4,7,4), (1,5,7,5), (2,6,6,6), (1,7,6,7), (2,8,6,8), (1,9,6,9)") @@ -257,6 +266,9 @@ func TestAnalyzeColumnsWithDynamicPartitionTable(t *testing.T) { tk.MustExec("use test") tk.MustExec("drop table if exists t") tk.MustExec("set @@tidb_analyze_version = 2") + // Pin the ratio so every analyzed column collects the configured TopN/bucket + // numbers; the reduction behavior is covered by TestAnalyzeNonPredicateColumnRatio. + tk.MustExec("set global tidb_analyze_non_predicate_column_ratio = 1") tk.MustExec("set @@tidb_partition_prune_mode = 'dynamic'") tk.MustExec("create table t (a int, b int, c int, index idx(c)) partition by range (a) (partition p0 values less than (10), partition p1 values less than maxvalue)") statstestutil.HandleNextDDLEventWithTxn(h) @@ -383,6 +395,9 @@ func TestAnalyzeColumnsWithStaticPartitionTable(t *testing.T) { tk.MustExec("use test") tk.MustExec("drop table if exists t") tk.MustExec("set @@tidb_analyze_version = 2") + // Pin the ratio so every analyzed column collects the configured TopN/bucket + // numbers; the reduction behavior is covered by TestAnalyzeNonPredicateColumnRatio. 
+ tk.MustExec("set global tidb_analyze_non_predicate_column_ratio = 1") tk.MustExec("set @@tidb_partition_prune_mode = 'static'") tk.MustExec("create table t (a int, b int, c int, index idx(c)) partition by range (a) (partition p0 values less than (10), partition p1 values less than maxvalue)") statstestutil.HandleNextDDLEventWithTxn(h) @@ -494,6 +509,9 @@ func TestAnalyzeColumnsWithVirtualColumnIndex(t *testing.T) { tk.MustExec("use test") tk.MustExec("drop table if exists t") tk.MustExec("set @@tidb_analyze_version = 2") + // Pin the ratio so every analyzed column collects the configured TopN/bucket + // numbers; the reduction behavior is covered by TestAnalyzeNonPredicateColumnRatio. + tk.MustExec("set global tidb_analyze_non_predicate_column_ratio = 1") tk.MustExec("create table t (a int, b int, c int as (b+1), index idx(c))") statstestutil.HandleNextDDLEventWithTxn(h) tk.MustExec("insert into t (a,b) values (1,1), (2,2), (3,3), (4,4), (5,4), (6,5), (7,5), (8,5), (null,null)") @@ -550,3 +568,74 @@ func TestAnalyzeColumnsWithVirtualColumnIndex(t *testing.T) { }(val) } } + +func TestAnalyzeNonPredicateColumnRatio(t *testing.T) { + store, dom := testkit.CreateMockStoreAndDomain(t) + + tk := testkit.NewTestKit(t, store) + h := dom.StatsHandle() + tk.MustExec("use test") + // The reduction is enabled by default with a ratio of 0.1. 
+ tk.MustQuery("select @@global.tidb_analyze_non_predicate_column_ratio").Check(testkit.Rows("0.1")) + tk.MustExec("set global tidb_analyze_non_predicate_column_ratio = 0.5") + defer tk.MustExec("set global tidb_analyze_non_predicate_column_ratio = default") + tk.MustExec("set @@tidb_analyze_version = 2") + + checkStatsSize := func(tblName, colName string, isIndex, numTopN, numBuckets int) { + cond := fmt.Sprintf( + "where db_name = 'test' and table_name = '%s' and column_name = '%s' and is_index = %d", + tblName, colName, isIndex, + ) + require.Len(t, tk.MustQuery("show stats_topn "+cond).Rows(), numTopN, + "unexpected topn size for %s.%s", tblName, colName) + require.Len(t, tk.MustQuery("show stats_buckets "+cond).Rows(), numBuckets, + "unexpected bucket count for %s.%s", tblName, colName) + } + prepareTable := func(tblName string) { + tk.MustExec(fmt.Sprintf("create table %s (a int, b int, c int, index ib (b))", tblName)) + statstestutil.HandleNextDDLEventWithTxn(h) + // Every column gets 8 distinct values: 1~4 appear twice and 5~8 appear once, so + // with `with 2 topn, 2 buckets` a full-stats column collects 2 TopN values and + // 2 buckets while a reduced column collects 1 TopN value and 1 bucket. + for i := 1; i <= 4; i++ { + tk.MustExec(fmt.Sprintf("insert into %s values (%[2]d, %[2]d, %[2]d), (%[2]d, %[2]d, %[2]d)", tblName, i)) + } + for i := 5; i <= 8; i++ { + tk.MustExec(fmt.Sprintf("insert into %s values (%[2]d, %[2]d, %[2]d)", tblName, i)) + } + tk.MustExec("flush stats_delta *.*") + } + + // Case 1: column a is a predicate column, so it keeps the configured numbers while + // the other columns (including the indexed column b) collect the reduced numbers. 
+ prepareTable("t1") + tk.MustExec("select * from t1 where a > 0") + require.NoError(t, h.DumpColStatsUsageToKV()) + tk.MustExec("analyze table t1 all columns with 2 topn, 2 buckets") + checkStatsSize("t1", "a", 0, 2, 2) + checkStatsSize("t1", "b", 0, 1, 1) + checkStatsSize("t1", "c", 0, 1, 1) + // Index stats always keep the configured numbers. + checkStatsSize("t1", "ib", 1, 2, 2) + + // Case 2: no predicate column has been collected for the table, so the first column + // of the index keeps the configured numbers while the other columns are reduced. + prepareTable("t2") + tk.MustExec("analyze table t2 all columns with 2 topn, 2 buckets") + checkStatsSize("t2", "a", 0, 1, 1) + checkStatsSize("t2", "b", 0, 2, 2) + checkStatsSize("t2", "c", 0, 1, 1) + + // Case 3: columns specified in ANALYZE ... COLUMNS keep the configured numbers. + prepareTable("t3") + tk.MustExec("analyze table t3 columns a, b with 2 topn, 2 buckets") + checkStatsSize("t3", "a", 0, 2, 2) + checkStatsSize("t3", "b", 0, 2, 2) + + // Case 4: setting the ratio to 1 disables the reduction. 
+ tk.MustExec("set global tidb_analyze_non_predicate_column_ratio = 1") + tk.MustExec("analyze table t1 all columns with 2 topn, 2 buckets") + checkStatsSize("t1", "a", 0, 2, 2) + checkStatsSize("t1", "b", 0, 2, 2) + checkStatsSize("t1", "c", 0, 2, 2) +} diff --git a/pkg/executor/test/analyzetest/options/analyze_saved_options_test.go b/pkg/executor/test/analyzetest/options/analyze_saved_options_test.go index 04c5db4c8c0e2..806f9b73ffb23 100644 --- a/pkg/executor/test/analyzetest/options/analyze_saved_options_test.go +++ b/pkg/executor/test/analyzetest/options/analyze_saved_options_test.go @@ -37,6 +37,9 @@ func TestSavedAnalyzeOptions(t *testing.T) { tk.MustExec(fmt.Sprintf("set global tidb_persist_analyze_options = %v", originalVal1)) }() tk.MustExec("set global tidb_persist_analyze_options = true") + // Keep the configured TopN/bucket numbers for all columns; the reduction for + // non-predicate columns is covered by TestAnalyzeNonPredicateColumnRatio. + tk.MustExec("set global tidb_analyze_non_predicate_column_ratio = 1") originalVal2 := tk.MustQuery("select @@tidb_auto_analyze_ratio").Rows()[0][0].(string) defer func() { tk.MustExec(fmt.Sprintf("set global tidb_auto_analyze_ratio = %v", originalVal2)) @@ -136,6 +139,9 @@ func TestSavedPartitionAnalyzeOptions(t *testing.T) { tk.MustExec(fmt.Sprintf("set global tidb_persist_analyze_options = %v", originalVal)) }() tk.MustExec("set global tidb_persist_analyze_options = true") + // Keep the configured TopN/bucket numbers for all columns; the reduction for + // non-predicate columns is covered by TestAnalyzeNonPredicateColumnRatio. 
+ tk.MustExec("set global tidb_analyze_non_predicate_column_ratio = 1") tk.MustExec("use test") tk.MustExec("set @@session.tidb_analyze_version = 2") diff --git a/pkg/planner/cardinality/selectivity_test.go b/pkg/planner/cardinality/selectivity_test.go index 45eaa12931484..8bcd4481a600a 100644 --- a/pkg/planner/cardinality/selectivity_test.go +++ b/pkg/planner/cardinality/selectivity_test.go @@ -856,6 +856,9 @@ func TestNewIndexWithColumnStats(t *testing.T) { store, dom := testkit.CreateMockStoreAndDomain(t) testKit := testkit.NewTestKit(t, store) testKit.MustExec("use test") + // Keep the configured TopN/bucket numbers for all columns; the reduction for + // non-predicate columns is covered by TestAnalyzeNonPredicateColumnRatio. + testKit.MustExec("set global tidb_analyze_non_predicate_column_ratio = 1") testKit.MustExec("drop table if exists t") testKit.MustExec("drop table if exists t2") testKit.MustExec("create table t(a int)") @@ -1508,6 +1511,9 @@ func testTopNAssistedEstimationInner(t *testing.T, input []string, output []outp h.Clear() tk := testkit.NewTestKit(t, store) tk.MustExec("use test") + // Keep the configured TopN/bucket numbers for all columns; the reduction for + // non-predicate columns is covered by TestAnalyzeNonPredicateColumnRatio. 
+ tk.MustExec("set global tidb_analyze_non_predicate_column_ratio = 1") tk.MustExec("drop table if exists t") tk.MustExec("set @@tidb_default_string_match_selectivity = 0") tk.MustExec("set @@tidb_stats_load_sync_wait = 3000") diff --git a/pkg/planner/core/casetest/planstats/plan_stats_test.go b/pkg/planner/core/casetest/planstats/plan_stats_test.go index f8c32c8ef3075..5a245161600e5 100644 --- a/pkg/planner/core/casetest/planstats/plan_stats_test.go +++ b/pkg/planner/core/casetest/planstats/plan_stats_test.go @@ -591,6 +591,9 @@ func TestStatsAnalyzedInDDL(t *testing.T) { testkit.RunTestUnderCascadesWithDomain(t, func(t *testing.T, testKit *testkit.TestKit, dom *domain.Domain, cascades, caller string) { testKit.MustExec("use test") testKit.MustExec("set session tidb_stats_update_during_ddl = 1") + // Keep the configured TopN/bucket numbers for all columns; the reduction for + // non-predicate columns is covered by TestAnalyzeNonPredicateColumnRatio. + testKit.MustExec("set global tidb_analyze_non_predicate_column_ratio = 1") // test normal table testKit.MustExec("create table t(a int, b int, c int, primary key(a), key idx(b))") diff --git a/pkg/planner/core/common_plans.go b/pkg/planner/core/common_plans.go index ae124a35819e5..74952868070f4 100644 --- a/pkg/planner/core/common_plans.go +++ b/pkg/planner/core/common_plans.go @@ -395,6 +395,14 @@ type AnalyzeColumnsTask struct { SkipColsInfo []*model.ColumnInfo TblInfo *model.TableInfo Indexes []*model.IndexInfo + // FullStatsCols holds the IDs of the columns that keep the configured TopN/bucket + // numbers when tidb_analyze_non_predicate_column_ratio < 1. Columns absent from the + // set only collect NonPredicateColRatio times the configured numbers. A nil map + // disables the reduction and every column keeps the configured numbers. + FullStatsCols map[int64]struct{} + // NonPredicateColRatio is the value of tidb_analyze_non_predicate_column_ratio + // captured when the plan was built. 
+ NonPredicateColRatio float64 AnalyzeInfo } diff --git a/pkg/planner/core/planbuilder.go b/pkg/planner/core/planbuilder.go index 72dc07b04ad3e..d4390e73b04a1 100644 --- a/pkg/planner/core/planbuilder.go +++ b/pkg/planner/core/planbuilder.go @@ -2302,7 +2302,9 @@ func (b *PlanBuilder) getMustAnalyzedColumns(tbl *resolve.TableNameW, cols *calc } // getPredicateColumns gets the columns used in predicates. -func (b *PlanBuilder) getPredicateColumns(tbl *resolve.TableNameW, cols *calcOnceMap) (map[int64]struct{}, error) { +// When warnEmpty is true and no predicate column has been collected for the table, +// it appends a warning about falling back to analyzing only indexed columns. +func (b *PlanBuilder) getPredicateColumns(tbl *resolve.TableNameW, cols *calcOnceMap, warnEmpty bool) (map[int64]struct{}, error) { // Already calculated in the previous call. if cols.calculated { return cols.data, nil @@ -2316,13 +2318,15 @@ func (b *PlanBuilder) getPredicateColumns(tbl *resolve.TableNameW, cols *calcOnc return nil, err } if len(colList) == 0 { - b.ctx.GetSessionVars().StmtCtx.AppendWarning( - errors.NewNoStackErrorf( - "No predicate column has been collected yet for table %s.%s, so only indexes and the columns composing the indexes will be analyzed", - tbl.Schema.L, - tbl.Name.L, - ), - ) + if warnEmpty { + b.ctx.GetSessionVars().StmtCtx.AppendWarning( + errors.NewNoStackErrorf( + "No predicate column has been collected yet for table %s.%s, so only indexes and the columns composing the indexes will be analyzed", + tbl.Schema.L, + tbl.Name.L, + ), + ) + } } else { // Some predicate columns are generated columns so we also need to add the columns that make up those generated columns. 
err := b.addColumnsWithVirtualExprs(tbl, cols, func(columns []*expression.Column) []expression.Expression { @@ -2449,7 +2453,7 @@ func (b *PlanBuilder) getColumnsBasedOnPredicateColumns( if rewriteAllStatsNeeded { return tbl.TableInfo.Columns, nil } - predicate, err := b.getPredicateColumns(tbl, predicateCols) + predicate, err := b.getPredicateColumns(tbl, predicateCols, true) if err != nil { return nil, err } @@ -2461,6 +2465,58 @@ func (b *PlanBuilder) getColumnsBasedOnPredicateColumns( return getColumnListFromSet(tbl.TableInfo.Columns, colSet), nil } +// getFullStatsColsAndRatio decides which columns keep the configured (full) TopN/bucket +// numbers when tidb_analyze_non_predicate_column_ratio is smaller than 1: +// - Predicate columns always keep the full numbers. +// - Columns explicitly specified in ANALYZE TABLE ... COLUMNS keep the full numbers. +// - When no predicate column has been collected for the table yet, the handle column and +// the first column of each index keep the full numbers, since they are the most likely +// columns to be used in future predicates. +// +// Every other column only collects ratio times the configured TopN/bucket numbers. +// It returns a nil map when the reduction is disabled (ratio >= 1), meaning every column +// keeps the configured numbers. 
+func (b *PlanBuilder) getFullStatsColsAndRatio(
+	tbl *resolve.TableNameW,
+	predicateCols *calcOnceMap,
+	specifiedCols []*model.ColumnInfo,
+) (map[int64]struct{}, float64, error) {
+	ratio := vardef.AnalyzeNonPredicateColumnRatio.Load()
+	if ratio >= 1 {
+		// Reduction disabled: a nil set makes every column keep the configured numbers.
+		return nil, 1, nil
+	}
+
+	// Once a cache is marked calculated, getPredicateColumns returns the cached set
+	// before it reaches the "No predicate column has been collected yet" warning.
+	// This function runs before the column-selection logic, so filling the shared
+	// cache here with warnEmpty=false would silently drop that warning for the later
+	// warnEmpty=true call. Reuse the shared cache only when it is already populated;
+	// otherwise do the lookup on a private scratch cache and leave the shared one
+	// untouched, at the cost of one extra predicate-column lookup.
+	lookupCache := predicateCols
+	if lookupCache == nil || !lookupCache.calculated {
+		lookupCache = &calcOnceMap{}
+	}
+	predicate, err := b.getPredicateColumns(tbl, lookupCache, false)
+	if err != nil {
+		return nil, 1, err
+	}
+
+	var fullStatsCols map[int64]struct{}
+	if len(predicate) > 0 {
+		// Predicate columns exist: they (plus the explicitly specified columns added
+		// below) are the only ones that keep the configured numbers.
+		fullStatsCols = make(map[int64]struct{}, len(predicate)+len(specifiedCols))
+		maps.Copy(fullStatsCols, predicate)
+	} else {
+		// No predicate column has been collected for the table yet, so fall back to
+		// keeping full stats for the handle column and the first column of each index.
+		tblInfo := tbl.TableInfo
+		fullStatsCols = make(map[int64]struct{}, len(tblInfo.Indices)+len(specifiedCols)+1)
+		if tblInfo.PKIsHandle {
+			if pkCol := tblInfo.GetPkColInfo(); pkCol != nil {
+				fullStatsCols[pkCol.ID] = struct{}{}
+			}
+		}
+		// Always register the extra handle ID as well; it is harmless when the task
+		// does not carry that column.
+		fullStatsCols[model.ExtraHandleID] = struct{}{}
+		for _, idx := range tblInfo.Indices {
+			// Only public indexes with at least one column contribute a leading column.
+			if idx.State != model.StatePublic || len(idx.Columns) == 0 {
+				continue
+			}
+			fullStatsCols[tblInfo.Columns[idx.Columns[0].Offset].ID] = struct{}{}
+		}
+	}
+	// Columns named in ANALYZE TABLE ... COLUMNS always keep the configured numbers.
+	for _, col := range specifiedCols {
+		fullStatsCols[col.ID] = struct{}{}
+	}
+	return fullStatsCols, ratio, nil
+}
+
 // Helper function to combine two column sets.
func combineColumnSets(sets ...map[int64]struct{}) map[int64]struct{} { result := make(map[int64]struct{}) @@ -2737,6 +2793,11 @@ func (b *PlanBuilder) buildAnalyzeFullSamplingTask( return err } + fullStatsCols, nonPredicateColRatio, err := b.getFullStatsColsAndRatio(tbl, &predicateCols, astColList) + if err != nil { + return err + } + optionsMap, colsInfoMap, err := b.genV2AnalyzeOptions(persistOpts, tbl, isAnalyzeTable, physicalIDs, astOpts, as.ColumnChoice, astColList, &predicateCols, &mustAnalyzedCols, mustAllColumns) if err != nil { return err @@ -2777,12 +2838,14 @@ func (b *PlanBuilder) buildAnalyzeFullSamplingTask( indexes, independentIndexes, specialGlobalIndexes = getModifiedIndexesInfoForAnalyze(b.ctx, tbl.TableInfo, allColumns, execColsInfo) handleCols := BuildHandleColsForAnalyze(b.ctx, tbl.TableInfo, allColumns, execColsInfo) newTask := AnalyzeColumnsTask{ - HandleCols: handleCols, - ColsInfo: execColsInfo, - AnalyzeInfo: info, - TblInfo: tbl.TableInfo, - Indexes: indexes, - SkipColsInfo: skipColsInfo, + HandleCols: handleCols, + ColsInfo: execColsInfo, + AnalyzeInfo: info, + TblInfo: tbl.TableInfo, + Indexes: indexes, + SkipColsInfo: skipColsInfo, + FullStatsCols: fullStatsCols, + NonPredicateColRatio: nonPredicateColRatio, } if newTask.HandleCols == nil { extraCol := model.NewExtraHandleColInfo() diff --git a/pkg/sessionctx/vardef/tidb_vars.go b/pkg/sessionctx/vardef/tidb_vars.go index 23ab58a02678e..0480574e30627 100644 --- a/pkg/sessionctx/vardef/tidb_vars.go +++ b/pkg/sessionctx/vardef/tidb_vars.go @@ -1171,6 +1171,11 @@ const ( // `PREDICATE`: Analyze only the columns that are used in the predicates of the query. // `ALL`: Analyze all columns in the table. TiDBAnalyzeColumnOptions = "tidb_analyze_column_options" + // TiDBAnalyzeNonPredicateColumnRatio scales down the number of TopN values and histogram buckets + // collected for columns that are not predicate columns. 
When ANALYZE collects statistics for a + // column that has never been used in query predicates, it collects only + // `ratio * (the configured TopN/bucket numbers)`. Setting it to 1 disables the reduction. + TiDBAnalyzeNonPredicateColumnRatio = "tidb_analyze_non_predicate_column_ratio" // TiDBDisableColumnTrackingTime records the last time TiDBEnableColumnTracking is set off. // It is used to invalidate the collected predicate columns after turning off TiDBEnableColumnTracking, which avoids physical deletion. // It doesn't have cache in memory, and we directly get/set the variable value from/to mysql.tidb. @@ -1676,6 +1681,7 @@ const ( DefTiDBEnableAutoAnalyze = true DefTiDBEnableAutoAnalyzePriorityQueue = true DefTiDBAnalyzeColumnOptions = "ALL" + DefTiDBAnalyzeNonPredicateColumnRatio = 0.1 DefTiDBMemOOMAction = "CANCEL" DefTiDBMaxAutoAnalyzeTime = 12 * 60 * 60 DefTiDBAutoAnalyzeConcurrency = 3 @@ -1849,20 +1855,24 @@ var ( // the value of `tidb_analyze_column_options` determines the behavior of the analyze operation. // 2. If `tidb_persist_analyze_options` is disabled, `tidb_analyze_column_options` is used directly to decide // whether to analyze all columns or just the predicate columns. 
- AnalyzeColumnOptions = atomic.NewString(DefTiDBAnalyzeColumnOptions) - GlobalLogMaxDays = atomic.NewInt32(int32(config.GetGlobalConfig().Log.File.MaxDays)) - QueryLogMaxLen = atomic.NewInt32(DefTiDBQueryLogMaxLen) - EnablePProfSQLCPU = atomic.NewBool(false) - EnableBatchDML = atomic.NewBool(false) - EnableTmpStorageOnOOM = atomic.NewBool(DefTiDBEnableTmpStorageOnOOM) - DDLReorgWorkerCounter int32 = DefTiDBDDLReorgWorkerCount - DDLReorgBatchSize int32 = DefTiDBDDLReorgBatchSize - DDLFlashbackConcurrency int32 = DefTiDBDDLFlashbackConcurrency - DDLErrorCountLimit int64 = DefTiDBDDLErrorCountLimit - DDLReorgRowFormat int64 = DefTiDBRowFormatV2 - DDLReorgMaxWriteSpeed = atomic.NewInt64(DefTiDBDDLReorgMaxWriteSpeed) - MaxDeltaSchemaCount int64 = DefTiDBMaxDeltaSchemaCount - GlobalSlowLogRateLimiter = rate.NewLimiter(rate.Inf, 1) + AnalyzeColumnOptions = atomic.NewString(DefTiDBAnalyzeColumnOptions) + // AnalyzeNonPredicateColumnRatio is a global variable that scales down the TopN/bucket numbers + // collected by ANALYZE for columns that are not predicate columns. See + // TiDBAnalyzeNonPredicateColumnRatio for the detailed behavior. 
+ AnalyzeNonPredicateColumnRatio = atomic.NewFloat64(DefTiDBAnalyzeNonPredicateColumnRatio) + GlobalLogMaxDays = atomic.NewInt32(int32(config.GetGlobalConfig().Log.File.MaxDays)) + QueryLogMaxLen = atomic.NewInt32(DefTiDBQueryLogMaxLen) + EnablePProfSQLCPU = atomic.NewBool(false) + EnableBatchDML = atomic.NewBool(false) + EnableTmpStorageOnOOM = atomic.NewBool(DefTiDBEnableTmpStorageOnOOM) + DDLReorgWorkerCounter int32 = DefTiDBDDLReorgWorkerCount + DDLReorgBatchSize int32 = DefTiDBDDLReorgBatchSize + DDLFlashbackConcurrency int32 = DefTiDBDDLFlashbackConcurrency + DDLErrorCountLimit int64 = DefTiDBDDLErrorCountLimit + DDLReorgRowFormat int64 = DefTiDBRowFormatV2 + DDLReorgMaxWriteSpeed = atomic.NewInt64(DefTiDBDDLReorgMaxWriteSpeed) + MaxDeltaSchemaCount int64 = DefTiDBMaxDeltaSchemaCount + GlobalSlowLogRateLimiter = rate.NewLimiter(rate.Inf, 1) // DDLSlowOprThreshold is the threshold for ddl slow operations, uint is millisecond. DDLSlowOprThreshold = config.GetGlobalConfig().Instance.DDLSlowOprThreshold GlobalSlowLogRules = atomic.NewPointer[slowlogrule.GlobalSlowLogRules]( diff --git a/pkg/sessionctx/variable/sysvar.go b/pkg/sessionctx/variable/sysvar.go index 714884a49b360..3e9cd88f7ab8a 100644 --- a/pkg/sessionctx/variable/sysvar.go +++ b/pkg/sessionctx/variable/sysvar.go @@ -1207,6 +1207,21 @@ var defaultSysVars = []*SysVar{ return normalizedValue, nil }, }, + { + Scope: vardef.ScopeGlobal, + Name: vardef.TiDBAnalyzeNonPredicateColumnRatio, + Value: strconv.FormatFloat(vardef.DefTiDBAnalyzeNonPredicateColumnRatio, 'f', -1, 64), + Type: vardef.TypeFloat, + MinValue: 0, + MaxValue: 1, + GetGlobal: func(_ context.Context, s *SessionVars) (string, error) { + return strconv.FormatFloat(vardef.AnalyzeNonPredicateColumnRatio.Load(), 'f', -1, 64), nil + }, + SetGlobal: func(_ context.Context, s *SessionVars, val string) error { + vardef.AnalyzeNonPredicateColumnRatio.Store(tidbOptFloat64(val, vardef.DefTiDBAnalyzeNonPredicateColumnRatio)) + return nil + }, + }, { 
Scope: vardef.ScopeGlobal, Name: vardef.TiDBEnableAutoAnalyzePriorityQueue, diff --git a/pkg/statistics/handle/handletest/handle_test.go b/pkg/statistics/handle/handletest/handle_test.go index b04b4f7b9c470..61aa54923d34f 100644 --- a/pkg/statistics/handle/handletest/handle_test.go +++ b/pkg/statistics/handle/handletest/handle_test.go @@ -925,6 +925,9 @@ func TestInitStatsLiteRecordsSynthesizedColumnStats(t *testing.T) { store, dom := testkit.CreateMockStoreAndDomain(t) tk := testkit.NewTestKit(t, store) tk.MustExec("use test") + // Keep the configured TopN/bucket numbers for all columns; the reduction for + // non-predicate columns is covered by TestAnalyzeNonPredicateColumnRatio. + tk.MustExec("set global tidb_analyze_non_predicate_column_ratio = 1") tk.MustExec("create table t(a int)") h := dom.StatsHandle()