From d85bef20d95cda14a5e0eba749d9aa5d2c825004 Mon Sep 17 00:00:00 2001
From: George Krajcsovits <krajorama@users.noreply.github.com>
Date: Thu, 29 Feb 2024 15:53:47 +0100
Subject: [PATCH] feature: add native histogram support to latency metrics
 (#3737)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Note that this does not stop exposing the classic histograms. For now,
it is up to the scrape config to decide whether to keep those instead of
the native histograms, or to keep both.

Signed-off-by: György Krajcsovits <gyorgy.krajcsovits@grafana.com>
---
 cluster/channel.go       | 10 +++++++---
 cluster/delegate.go      |  9 ++++++---
 cmd/alertmanager/main.go |  9 ++++++---
 nflog/nflog.go           |  8 ++++++--
 notify/notify.go         | 11 +++++++----
 silence/silence.go       |  8 ++++++--
 6 files changed, 38 insertions(+), 17 deletions(-)

diff --git a/cluster/channel.go b/cluster/channel.go
index ba0b834cd1..5548d50819 100644
--- a/cluster/channel.go
+++ b/cluster/channel.go
@@ -70,9 +70,13 @@ func NewChannel(
 		ConstLabels: prometheus.Labels{"key": key},
 	})
 	oversizeGossipDuration := prometheus.NewHistogram(prometheus.HistogramOpts{
-		Name:        "alertmanager_oversize_gossip_message_duration_seconds",
-		Help:        "Duration of oversized gossip message requests.",
-		ConstLabels: prometheus.Labels{"key": key},
+		Name:                            "alertmanager_oversize_gossip_message_duration_seconds",
+		Help:                            "Duration of oversized gossip message requests.",
+		ConstLabels:                     prometheus.Labels{"key": key},
+		Buckets:                         prometheus.DefBuckets,
+		NativeHistogramBucketFactor:     1.1,
+		NativeHistogramMaxBucketNumber:  100,
+		NativeHistogramMinResetDuration: 1 * time.Hour,
 	})
 
 	reg.MustRegister(oversizeGossipDuration, oversizeGossipMessageFailureTotal, oversizeGossipMessageDroppedTotal, oversizeGossipMessageSentTotal)
diff --git a/cluster/delegate.go b/cluster/delegate.go
index 9957f69b91..edfda10705 100644
--- a/cluster/delegate.go
+++ b/cluster/delegate.go
@@ -104,9 +104,12 @@ func newDelegate(l log.Logger, reg prometheus.Registerer, p *Peer, retransmit in
 	}, []string{"peer"},
 	)
 	nodePingDuration := prometheus.NewHistogramVec(prometheus.HistogramOpts{
-		Name:    "alertmanager_cluster_pings_seconds",
-		Help:    "Histogram of latencies for ping messages.",
-		Buckets: []float64{.005, .01, .025, .05, .1, .25, .5},
+		Name:                            "alertmanager_cluster_pings_seconds",
+		Help:                            "Histogram of latencies for ping messages.",
+		Buckets:                         []float64{.005, .01, .025, .05, .1, .25, .5},
+		NativeHistogramBucketFactor:     1.1,
+		NativeHistogramMaxBucketNumber:  100,
+		NativeHistogramMinResetDuration: 1 * time.Hour,
 	}, []string{"peer"},
 	)
 
diff --git a/cmd/alertmanager/main.go b/cmd/alertmanager/main.go
index b2938189d5..c3e9d1b239 100644
--- a/cmd/alertmanager/main.go
+++ b/cmd/alertmanager/main.go
@@ -64,9 +64,12 @@ import (
 var (
 	requestDuration = prometheus.NewHistogramVec(
 		prometheus.HistogramOpts{
-			Name:    "alertmanager_http_request_duration_seconds",
-			Help:    "Histogram of latencies for HTTP requests.",
-			Buckets: []float64{.05, 0.1, .25, .5, .75, 1, 2, 5, 20, 60},
+			Name:                            "alertmanager_http_request_duration_seconds",
+			Help:                            "Histogram of latencies for HTTP requests.",
+			Buckets:                         []float64{.05, 0.1, .25, .5, .75, 1, 2, 5, 20, 60},
+			NativeHistogramBucketFactor:     1.1,
+			NativeHistogramMaxBucketNumber:  100,
+			NativeHistogramMinResetDuration: 1 * time.Hour,
 		},
 		[]string{"handler", "method"},
 	)
diff --git a/nflog/nflog.go b/nflog/nflog.go
index c533dd0e66..6ce12a8e1f 100644
--- a/nflog/nflog.go
+++ b/nflog/nflog.go
@@ -139,8 +139,12 @@ func newMetrics(r prometheus.Registerer) *metrics {
 		Help: "Number notification log received queries that failed.",
 	})
 	m.queryDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
-		Name: "alertmanager_nflog_query_duration_seconds",
-		Help: "Duration of notification log query evaluation.",
+		Name:                            "alertmanager_nflog_query_duration_seconds",
+		Help:                            "Duration of notification log query evaluation.",
+		Buckets:                         prometheus.DefBuckets,
+		NativeHistogramBucketFactor:     1.1,
+		NativeHistogramMaxBucketNumber:  100,
+		NativeHistogramMinResetDuration: 1 * time.Hour,
 	})
 	m.propagatedMessagesTotal = prometheus.NewCounter(prometheus.CounterOpts{
 		Name: "alertmanager_nflog_gossip_messages_propagated_total",
diff --git a/notify/notify.go b/notify/notify.go
index 1d7597c9c6..3752148f41 100644
--- a/notify/notify.go
+++ b/notify/notify.go
@@ -291,10 +291,13 @@ func NewMetrics(r prometheus.Registerer, ff featurecontrol.Flagger) *Metrics {
 			Help:      "The total number of notifications suppressed for being silenced, inhibited, outside of active time intervals or within muted time intervals.",
 		}, []string{"reason"}),
 		notificationLatencySeconds: prometheus.NewHistogramVec(prometheus.HistogramOpts{
-			Namespace: "alertmanager",
-			Name:      "notification_latency_seconds",
-			Help:      "The latency of notifications in seconds.",
-			Buckets:   []float64{1, 5, 10, 15, 20},
+			Namespace:                       "alertmanager",
+			Name:                            "notification_latency_seconds",
+			Help:                            "The latency of notifications in seconds.",
+			Buckets:                         []float64{1, 5, 10, 15, 20},
+			NativeHistogramBucketFactor:     1.1,
+			NativeHistogramMaxBucketNumber:  100,
+			NativeHistogramMinResetDuration: 1 * time.Hour,
 		}, labels),
 		ff: ff,
 	}
diff --git a/silence/silence.go b/silence/silence.go
index 710323f747..c87ab76e4d 100644
--- a/silence/silence.go
+++ b/silence/silence.go
@@ -271,8 +271,12 @@ func newMetrics(r prometheus.Registerer, s *Silences) *metrics {
 		Help: "How many silence received queries did not succeed.",
 	})
 	m.queryDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
-		Name: "alertmanager_silences_query_duration_seconds",
-		Help: "Duration of silence query evaluation.",
+		Name:                            "alertmanager_silences_query_duration_seconds",
+		Help:                            "Duration of silence query evaluation.",
+		Buckets:                         prometheus.DefBuckets,
+		NativeHistogramBucketFactor:     1.1,
+		NativeHistogramMaxBucketNumber:  100,
+		NativeHistogramMinResetDuration: 1 * time.Hour,
 	})
 	m.propagatedMessagesTotal = prometheus.NewCounter(prometheus.CounterOpts{
 		Name: "alertmanager_silences_gossip_messages_propagated_total",