From 2420f6c0005faa393feb4c268c7ce8d5792839be Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Wed, 8 Jun 2022 02:43:13 -0700 Subject: [PATCH] fix: make metrics endpoint responsive by reducing the chatter (#15055) peerOnlineCounter was making NxN calls to many peers, this can be really long and tedious if there are random servers that are going down. Instead we should calculate online peers from the point of view of "self" and return those online and offline appropriately by performing a healthcheck. --- cmd/metrics-v2.go | 8 +++--- cmd/metrics.go | 2 +- cmd/notification.go | 57 ++++++++++++++++++++++++++--------------- cmd/peer-rest-server.go | 6 +---- 4 files changed, 42 insertions(+), 31 deletions(-) diff --git a/cmd/metrics-v2.go b/cmd/metrics-v2.go index 61f470f7e..535e272fa 100644 --- a/cmd/metrics-v2.go +++ b/cmd/metrics-v2.go @@ -1347,7 +1347,7 @@ func getNodeHealthMetrics() *MetricsGroup { return } metrics = make([]Metric, 0, 16) - nodesUp, nodesDown := GetPeerOnlineCount() + nodesUp, nodesDown := globalNotificationSys.GetPeerOnlineCount() metrics = append(metrics, Metric{ Description: getNodeOnlineTotalMD(), Value: float64(nodesUp), @@ -1932,11 +1932,9 @@ func (c *minioClusterCollector) Collect(out chan<- prometheus.Metric) { } // Call peer api to fetch metrics - peerCh := globalNotificationSys.GetClusterMetrics(GlobalContext) - selfCh := ReportMetrics(GlobalContext, c.metricsGroups) wg.Add(2) - go publish(peerCh) - go publish(selfCh) + go publish(ReportMetrics(GlobalContext, c.metricsGroups)) + go publish(globalNotificationSys.GetClusterMetrics(GlobalContext)) wg.Wait() } diff --git a/cmd/metrics.go b/cmd/metrics.go index 79225d363..80cd815e3 100644 --- a/cmd/metrics.go +++ b/cmd/metrics.go @@ -110,7 +110,7 @@ func nodeHealthMetricsPrometheus(ch chan<- prometheus.Metric) { return } - nodesUp, nodesDown := GetPeerOnlineCount() + nodesUp, nodesDown := globalNotificationSys.GetPeerOnlineCount() ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( prometheus.BuildFQName(minioNamespace, "nodes", "online"), diff --git a/cmd/notification.go b/cmd/notification.go index 4566ae5ef..c697da8cb 100644 --- a/cmd/notification.go +++ b/cmd/notification.go @@ -1344,6 +1344,35 @@ func (sys *NotificationSys) restClientFromHash(s string) (client *peerRESTClient return peerClients[idx] } +// GetPeerOnlineCount gets the count of online and offline nodes. +func (sys *NotificationSys) GetPeerOnlineCount() (nodesOnline, nodesOffline int) { + nodesOnline = 1 // Self is always online. + nodesOffline = 0 + nodesOnlineIndex := make([]bool, len(sys.peerClients)) + var wg sync.WaitGroup + for idx, client := range sys.peerClients { + if client == nil { + continue + } + wg.Add(1) + go func(idx int, client *peerRESTClient) { + defer wg.Done() + nodesOnlineIndex[idx] = client.restClient.HealthCheckFn() + }(idx, client) + + } + wg.Wait() + + for _, online := range nodesOnlineIndex { + if online { + nodesOnline++ + } else { + nodesOffline++ + } + } + return +} + // NewNotificationSys - creates new notification system object. func NewNotificationSys(endpoints EndpointServerPools) *NotificationSys { // targetList/bucketRulesMap/bucketRemoteTargetRulesMap are populated by NotificationSys.Init() @@ -1358,21 +1387,6 @@ func NewNotificationSys(endpoints EndpointServerPools) *NotificationSys { } } -// GetPeerOnlineCount gets the count of online and offline nodes. -func GetPeerOnlineCount() (nodesOnline, nodesOffline int) { - nodesOnline = 1 // Self is always online. - nodesOffline = 0 - servers := globalNotificationSys.ServerInfo() - for _, s := range servers { - if s.State == string(madmin.ItemOnline) { - nodesOnline++ - continue - } - nodesOffline++ - } - return -} - type eventArgs struct { EventName event.Name BucketName string @@ -1524,7 +1538,7 @@ func (sys *NotificationSys) GetBandwidthReports(ctx context.Context, buckets ... } // GetClusterMetrics - gets the cluster metrics from all nodes excluding self. -func (sys *NotificationSys) GetClusterMetrics(ctx context.Context) chan Metric { +func (sys *NotificationSys) GetClusterMetrics(ctx context.Context) <-chan Metric { if sys == nil { return nil } @@ -1545,11 +1559,14 @@ func (sys *NotificationSys) GetClusterMetrics(ctx context.Context) chan Metric { ch := make(chan Metric) var wg sync.WaitGroup for index, err := range g.Wait() { - reqInfo := (&logger.ReqInfo{}).AppendTags("peerAddress", - sys.peerClients[index].host.String()) - ctx := logger.SetReqInfo(ctx, reqInfo) if err != nil { - logger.LogOnceIf(ctx, err, sys.peerClients[index].host.String()) + if sys.peerClients[index] != nil { + reqInfo := (&logger.ReqInfo{}).AppendTags("peerAddress", + sys.peerClients[index].host.String()) + logger.LogOnceIf(logger.SetReqInfo(ctx, reqInfo), err, sys.peerClients[index].host.String()) + } else { + logger.LogOnceIf(ctx, err, "peer-offline") + } continue } wg.Add(1) diff --git a/cmd/peer-rest-server.go b/cmd/peer-rest-server.go index ea9f01223..0a551f201 100644 --- a/cmd/peer-rest-server.go +++ b/cmd/peer-rest-server.go @@ -1174,13 +1174,9 @@ func (s *peerRESTServer) GetPeerMetrics(w http.ResponseWriter, r *http.Request) s.writeErrorResponse(w, errors.New("invalid request")) } - doneCh := make(chan struct{}) - defer close(doneCh) - enc := gob.NewEncoder(w) - ch := ReportMetrics(r.Context(), peerMetricsGroups) - for m := range ch { + for m := range ReportMetrics(r.Context(), peerMetricsGroups) { if err := enc.Encode(m); err != nil { s.writeErrorResponse(w, errors.New("Encoding metric failed: "+err.Error())) return