From a7973ed7d1e9b507d2f3982a7b433a20f5c19cc1 Mon Sep 17 00:00:00 2001 From: wyang Date: Mon, 4 Nov 2024 15:20:48 +0800 Subject: [PATCH] fix deadlock hang when broadcast to clients (#6184) fix deadlock when broadcast to clients when master thransfer leader, the old master will disconnect with all filers and volumeServers, if the cluster is a big , the broadcast messages may be more big than the max of the channel len 100, then if the KeepConnect was not listen on the channel in disconnect, it will deadlock. and the whole cluster will not serve! --- weed/server/master_grpc_server.go | 17 ++++++++++++----- weed/stats/metrics.go | 9 +++++++++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/weed/server/master_grpc_server.go b/weed/server/master_grpc_server.go index 256a4be52..dcf279e1d 100644 --- a/weed/server/master_grpc_server.go +++ b/weed/server/master_grpc_server.go @@ -4,12 +4,13 @@ import ( "context" "errors" "fmt" - "github.com/google/uuid" - "github.com/seaweedfs/seaweedfs/weed/cluster" "net" "sort" "time" + "github.com/google/uuid" + "github.com/seaweedfs/seaweedfs/weed/cluster" + "github.com/seaweedfs/seaweedfs/weed/pb" "github.com/seaweedfs/seaweedfs/weed/stats" "github.com/seaweedfs/seaweedfs/weed/storage/backend" @@ -89,7 +90,7 @@ func (ms *MasterServer) SendHeartbeat(stream master_pb.Seaweed_SendHeartbeatServ glog.V(0).Infof("unregister disconnected volume server %s:%d", dn.Ip, dn.Port) ms.UnRegisterUuids(dn.Ip, dn.Port) - if len(message.DeletedVids) > 0 || len(message.DeletedEcVids) > 0 { + if ms.Topo.IsLeader() && (len(message.DeletedVids) > 0 || len(message.DeletedEcVids) > 0) { ms.broadcastToClients(&master_pb.KeepConnectedResponse{VolumeLocation: message}) } } @@ -338,8 +339,14 @@ func (ms *MasterServer) KeepConnected(stream master_pb.Seaweed_KeepConnectedServ func (ms *MasterServer) broadcastToClients(message *master_pb.KeepConnectedResponse) { ms.clientChansLock.RLock() - for _, ch := range ms.clientChans { - ch <- message + for client, ch := range ms.clientChans { + select { + case ch <- message: + glog.V(4).Infof("send message to %s", client) + default: + stats.MasterBroadcastToFullErrorCounter.Inc() + glog.Errorf("broadcastToClients %s message full", client) + } } ms.clientChansLock.RUnlock() } diff --git a/weed/stats/metrics.go b/weed/stats/metrics.go index 93f80c1f4..9f84a3e70 100644 --- a/weed/stats/metrics.go +++ b/weed/stats/metrics.go @@ -94,6 +94,14 @@ var ( Help: "Counter of master pick for write error", }) + MasterBroadcastToFullErrorCounter = prometheus.NewCounter( + prometheus.CounterOpts{ + Namespace: Namespace, + Subsystem: "master", + Name: "broadcast_to_full", + Help: "Counter of master broadcast send to full message channel err", + }) + MasterLeaderChangeCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: Namespace, @@ -314,6 +322,7 @@ func init() { Gather.MustRegister(MasterReplicaPlacementMismatch) Gather.MustRegister(MasterVolumeLayoutWritable) Gather.MustRegister(MasterVolumeLayoutCrowded) + Gather.MustRegister(MasterBroadcastToFullErrorCounter) Gather.MustRegister(FilerRequestCounter) Gather.MustRegister(FilerHandlerCounter)