From 2f4e1fd74f74f69d4dd7869b1ec1d3555884b75c Mon Sep 17 00:00:00 2001 From: SungJin1212 Date: Thu, 29 Aug 2024 12:19:04 +0900 Subject: [PATCH] Add query metrics to ruler query stats (#6173) --- CHANGELOG.md | 1 + docs/configuration/config-file-reference.md | 4 +- pkg/ruler/compat.go | 29 ++++++++------ pkg/ruler/compat_test.go | 16 ++++++-- pkg/ruler/manager_metrics.go | 42 ++++++++++++++++++--- pkg/ruler/manager_metrics_test.go | 12 +++++- pkg/ruler/ruler.go | 2 +- 7 files changed, 82 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9d3827e883..4a155f427e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## master / unreleased * [ENHANCEMENT] Ruler: Add new ruler metric `cortex_ruler_rule_groups_in_store` that is the total rule groups per tenant in store, which can be used to compare with `cortex_prometheus_rule_group_rules` to count the number of rule groups that are not loaded by a ruler. #5869 +* [ENHANCEMENT] Ruler: Add query statistics metrics when --ruler.query-stats-enabled=true. #6173 ## 1.18.0 in progress diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 0144dbaf31..c664fa6a96 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -4366,8 +4366,8 @@ ring: # CLI flag: -ruler.disabled-tenants [disabled_tenants: | default = ""] -# Report the wall time for ruler queries to complete as a per user metric and as -# an info level log message. +# Report query statistics for ruler queries to complete as a per user metric and +# as an info level log message. # CLI flag: -ruler.query-stats-enabled [query_stats_enabled: | default = false] diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index 21b609a4fd..6ed72c9831 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -229,19 +229,25 @@ func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Coun } } -func RecordAndReportRuleQueryMetrics(qf rules.QueryFunc, queryTime prometheus.Counter, logger log.Logger) rules.QueryFunc { - if queryTime == nil { - return qf - } +func RecordAndReportRuleQueryMetrics(qf rules.QueryFunc, userID string, evalMetrics *RuleEvalMetrics, logger log.Logger) rules.QueryFunc { + queryTime := evalMetrics.RulerQuerySeconds.WithLabelValues(userID) + querySeries := evalMetrics.RulerQuerySeries.WithLabelValues(userID) + querySample := evalMetrics.RulerQuerySamples.WithLabelValues(userID) + queryChunkBytes := evalMetrics.RulerQueryChunkBytes.WithLabelValues(userID) + queryDataBytes := evalMetrics.RulerQueryDataBytes.WithLabelValues(userID) return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) { queryStats, ctx := stats.ContextWithEmptyStats(ctx) // If we've been passed a counter we want to record the wall time spent executing this request. timer := prometheus.NewTimer(nil) + defer func() { querySeconds := timer.ObserveDuration().Seconds() queryTime.Add(querySeconds) - + querySeries.Add(float64(queryStats.FetchedSeriesCount)) + querySample.Add(float64(queryStats.FetchedSamplesCount)) + queryChunkBytes.Add(float64(queryStats.FetchedChunkBytes)) + queryDataBytes.Add(float64(queryStats.FetchedDataBytes)) // Log ruler query stats. logMessage := []interface{}{ "msg", "query stats", @@ -303,23 +309,24 @@ func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engi q = querier.NewErrorTranslateQueryableWithFn(q, WrapQueryableErrors) return func(ctx context.Context, userID string, notifier *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager { - var queryTime prometheus.Counter - if evalMetrics.RulerQuerySeconds != nil { - queryTime = evalMetrics.RulerQuerySeconds.WithLabelValues(userID) - } - failedQueries := evalMetrics.FailedQueriesVec.WithLabelValues(userID) totalQueries := evalMetrics.TotalQueriesVec.WithLabelValues(userID) totalWrites := evalMetrics.TotalWritesVec.WithLabelValues(userID) failedWrites := evalMetrics.FailedWritesVec.WithLabelValues(userID) + var queryFunc rules.QueryFunc engineQueryFunc := EngineQueryFunc(engine, q, overrides, userID, cfg.LookbackDelta) metricsQueryFunc := MetricsQueryFunc(engineQueryFunc, totalQueries, failedQueries) + if cfg.EnableQueryStats { + queryFunc = RecordAndReportRuleQueryMetrics(metricsQueryFunc, userID, evalMetrics, logger) + } else { + queryFunc = metricsQueryFunc + } return rules.NewManager(&rules.ManagerOptions{ Appendable: NewPusherAppendable(p, userID, overrides, totalWrites, failedWrites), Queryable: q, - QueryFunc: RecordAndReportRuleQueryMetrics(metricsQueryFunc, queryTime, logger), + QueryFunc: queryFunc, Context: user.InjectOrgID(ctx, userID), ExternalURL: cfg.ExternalURL.URL, NotifyFunc: SendAlerts(notifier, cfg.ExternalURL.URL.String()), diff --git a/pkg/ruler/compat_test.go b/pkg/ruler/compat_test.go index 28c261f549..3a3d663350 100644 --- a/pkg/ruler/compat_test.go +++ b/pkg/ruler/compat_test.go @@ -23,6 +23,7 @@ import ( "github.com/weaveworks/common/httpgrpc" "github.com/cortexproject/cortex/pkg/cortexpb" + "github.com/cortexproject/cortex/pkg/querier/stats" "github.com/cortexproject/cortex/pkg/util/validation" ) @@ -392,14 +393,23 @@ func TestMetricsQueryFuncErrors(t *testing.T) { } func TestRecordAndReportRuleQueryMetrics(t *testing.T) { - queryTime := prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user"}) + metrics := NewRuleEvalMetrics(Config{EnableQueryStats: true}, prometheus.DefaultRegisterer) mockFunc := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) { + queryStats := stats.FromContext(ctx) + queryStats.AddFetchedSeries(2) + queryStats.AddFetchedSamples(2) + queryStats.AddFetchedChunkBytes(10) + queryStats.AddFetchedDataBytes(14) time.Sleep(1 * time.Second) return promql.Vector{}, nil } - qf := RecordAndReportRuleQueryMetrics(mockFunc, queryTime.WithLabelValues("userID"), log.NewNopLogger()) + qf := RecordAndReportRuleQueryMetrics(mockFunc, "userID", metrics, log.NewNopLogger()) _, _ = qf(context.Background(), "test", time.Now()) - require.GreaterOrEqual(t, testutil.ToFloat64(queryTime.WithLabelValues("userID")), float64(1)) + require.GreaterOrEqual(t, testutil.ToFloat64(metrics.RulerQuerySeconds.WithLabelValues("userID")), float64(1)) + require.Equal(t, testutil.ToFloat64(metrics.RulerQuerySeries.WithLabelValues("userID")), float64(2)) + require.Equal(t, testutil.ToFloat64(metrics.RulerQuerySamples.WithLabelValues("userID")), float64(2)) + require.Equal(t, testutil.ToFloat64(metrics.RulerQueryChunkBytes.WithLabelValues("userID")), float64(10)) + require.Equal(t, testutil.ToFloat64(metrics.RulerQueryDataBytes.WithLabelValues("userID")), float64(14)) } diff --git a/pkg/ruler/manager_metrics.go b/pkg/ruler/manager_metrics.go index ef211ddd89..93acdc26b1 100644 --- a/pkg/ruler/manager_metrics.go +++ b/pkg/ruler/manager_metrics.go @@ -225,11 +225,15 @@ func (m *ManagerMetrics) Collect(out chan<- prometheus.Metric) { } type RuleEvalMetrics struct { - TotalWritesVec *prometheus.CounterVec - FailedWritesVec *prometheus.CounterVec - TotalQueriesVec *prometheus.CounterVec - FailedQueriesVec *prometheus.CounterVec - RulerQuerySeconds *prometheus.CounterVec + TotalWritesVec *prometheus.CounterVec + FailedWritesVec *prometheus.CounterVec + TotalQueriesVec *prometheus.CounterVec + FailedQueriesVec *prometheus.CounterVec + RulerQuerySeconds *prometheus.CounterVec + RulerQuerySeries *prometheus.CounterVec + RulerQuerySamples *prometheus.CounterVec + RulerQueryChunkBytes *prometheus.CounterVec + RulerQueryDataBytes *prometheus.CounterVec } func NewRuleEvalMetrics(cfg Config, reg prometheus.Registerer) *RuleEvalMetrics { @@ -256,6 +260,22 @@ func NewRuleEvalMetrics(cfg Config, reg prometheus.Registerer) *RuleEvalMetrics Name: "cortex_ruler_query_seconds_total", Help: "Total amount of wall clock time spent processing queries by the ruler.", }, []string{"user"}) + m.RulerQuerySeries = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_ruler_fetched_series_total", + Help: "Number of series fetched to execute a query by the ruler.", + }, []string{"user"}) + m.RulerQuerySamples = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_ruler_samples_total", + Help: "Number of samples fetched to execute a query by the ruler.", + }, []string{"user"}) + m.RulerQueryChunkBytes = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_ruler_fetched_chunks_bytes_total", + Help: "Size of all chunks fetched to execute a query in bytes by the ruler.", + }, []string{"user"}) + m.RulerQueryDataBytes = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_ruler_fetched_data_bytes_total", + Help: "Size of all data fetched to execute a query in bytes by the ruler.", + }, []string{"user"}) } return m @@ -270,6 +290,18 @@ func (m *RuleEvalMetrics) deletePerUserMetrics(userID string) { if m.RulerQuerySeconds != nil { m.RulerQuerySeconds.DeleteLabelValues(userID) } + if m.RulerQuerySeries != nil { + m.RulerQuerySeries.DeleteLabelValues(userID) + } + if m.RulerQuerySamples != nil { + m.RulerQuerySamples.DeleteLabelValues(userID) + } + if m.RulerQueryChunkBytes != nil { + m.RulerQueryChunkBytes.DeleteLabelValues(userID) + } + if m.RulerQueryDataBytes != nil { + m.RulerQueryDataBytes.DeleteLabelValues(userID) + } } type RuleGroupMetrics struct { diff --git a/pkg/ruler/manager_metrics_test.go b/pkg/ruler/manager_metrics_test.go index 60b68452ca..dfc9800ad5 100644 --- a/pkg/ruler/manager_metrics_test.go +++ b/pkg/ruler/manager_metrics_test.go @@ -574,8 +574,16 @@ func TestRuleEvalMetricsDeletePerUserMetrics(t *testing.T) { m.FailedQueriesVec.WithLabelValues("fake2").Add(10) m.RulerQuerySeconds.WithLabelValues("fake1").Add(10) m.RulerQuerySeconds.WithLabelValues("fake2").Add(10) - - metricNames := []string{"cortex_ruler_write_requests_total", "cortex_ruler_write_requests_failed_total", "cortex_ruler_queries_total", "cortex_ruler_queries_failed_total", "cortex_ruler_query_seconds_total"} + m.RulerQuerySeries.WithLabelValues("fake1").Add(10) + m.RulerQuerySeries.WithLabelValues("fake2").Add(10) + m.RulerQuerySamples.WithLabelValues("fake1").Add(10) + m.RulerQuerySamples.WithLabelValues("fake2").Add(10) + m.RulerQueryChunkBytes.WithLabelValues("fake1").Add(10) + m.RulerQueryChunkBytes.WithLabelValues("fake2").Add(10) + m.RulerQueryDataBytes.WithLabelValues("fake1").Add(10) + m.RulerQueryDataBytes.WithLabelValues("fake2").Add(10) + + metricNames := []string{"cortex_ruler_write_requests_total", "cortex_ruler_write_requests_failed_total", "cortex_ruler_queries_total", "cortex_ruler_queries_failed_total", "cortex_ruler_query_seconds_total", "cortex_ruler_fetched_series_total", "cortex_ruler_samples_total", "cortex_ruler_fetched_chunks_bytes_total", "cortex_ruler_fetched_data_bytes_total"} gm, err := reg.Gather() require.NoError(t, err) mfm, err := util.NewMetricFamilyMap(gm) diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go index 4f121570cc..a910896ad2 100644 --- a/pkg/ruler/ruler.go +++ b/pkg/ruler/ruler.go @@ -217,7 +217,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { f.Var(&cfg.EnabledTenants, "ruler.enabled-tenants", "Comma separated list of tenants whose rules this ruler can evaluate. If specified, only these tenants will be handled by ruler, otherwise this ruler can process rules from all tenants. Subject to sharding.") f.Var(&cfg.DisabledTenants, "ruler.disabled-tenants", "Comma separated list of tenants whose rules this ruler cannot evaluate. If specified, a ruler that would normally pick the specified tenant(s) for processing will ignore them instead. Subject to sharding.") - f.BoolVar(&cfg.EnableQueryStats, "ruler.query-stats-enabled", false, "Report the wall time for ruler queries to complete as a per user metric and as an info level log message.") + f.BoolVar(&cfg.EnableQueryStats, "ruler.query-stats-enabled", false, "Report query statistics for ruler queries to complete as a per user metric and as an info level log message.") f.BoolVar(&cfg.DisableRuleGroupLabel, "ruler.disable-rule-group-label", false, "Disable the rule_group label on exported metrics") cfg.RingCheckPeriod = 5 * time.Second