seaweedfs/weed/server/master_grpc_server.go

339 lines
10 KiB
Go
Raw Normal View History

2017-01-10 17:01:12 +08:00
package weed_server
import (
"context"
"github.com/chrislusf/seaweedfs/weed/pb"
2022-01-24 22:09:43 +08:00
"github.com/chrislusf/seaweedfs/weed/stats"
"github.com/chrislusf/seaweedfs/weed/storage/backend"
2021-09-08 07:43:54 +08:00
"github.com/chrislusf/seaweedfs/weed/util"
2017-01-13 05:42:53 +08:00
"net"
"time"
2017-01-13 05:42:53 +08:00
2018-07-29 12:02:56 +08:00
"github.com/chrislusf/raft"
2020-02-27 09:27:49 +08:00
"google.golang.org/grpc/peer"
2017-01-10 17:01:12 +08:00
"github.com/chrislusf/seaweedfs/weed/glog"
2018-05-10 14:11:54 +08:00
"github.com/chrislusf/seaweedfs/weed/pb/master_pb"
"github.com/chrislusf/seaweedfs/weed/storage/needle"
2017-01-10 17:01:12 +08:00
"github.com/chrislusf/seaweedfs/weed/topology"
)
func (ms *MasterServer) SendHeartbeat(stream master_pb.Seaweed_SendHeartbeatServer) error {
2017-01-10 17:01:12 +08:00
var dn *topology.DataNode
defer func() {
if dn != nil {
dn.Counter--
2021-09-15 01:37:06 +08:00
if dn.Counter > 0 {
2021-09-06 07:18:50 +08:00
glog.V(0).Infof("disconnect phantom volume server %s:%d remaining %d", dn.Ip, dn.Port, dn.Counter)
return
}
// if the volume server disconnects and reconnects quickly
// the unregister and register can race with each other
ms.Topo.UnRegisterDataNode(dn)
glog.V(0).Infof("unregister disconnected volume server %s:%d", dn.Ip, dn.Port)
message := &master_pb.VolumeLocation{
2020-11-12 07:10:06 +08:00
Url: dn.Url(),
PublicUrl: dn.PublicUrl,
}
for _, v := range dn.GetVolumes() {
message.DeletedVids = append(message.DeletedVids, uint32(v.Id))
}
2019-05-26 16:05:08 +08:00
for _, s := range dn.GetEcShards() {
message.DeletedVids = append(message.DeletedVids, uint32(s.VolumeId))
}
if len(message.DeletedVids) > 0 {
2021-11-06 19:07:38 +08:00
ms.broadcastToClients(&master_pb.KeepConnectedResponse{VolumeLocation: message})
}
}
}()
2017-01-10 17:01:12 +08:00
for {
heartbeat, err := stream.Recv()
2018-08-24 15:30:03 +08:00
if err != nil {
if dn != nil {
glog.Warningf("SendHeartbeat.Recv server %s:%d : %v", dn.Ip, dn.Port, err)
} else {
2019-10-26 00:44:58 +08:00
glog.Warningf("SendHeartbeat.Recv: %v", err)
}
2022-01-24 22:09:43 +08:00
stats.MasterReceivedHeartbeatCounter.WithLabelValues("error").Inc()
2018-08-24 15:30:03 +08:00
return err
}
ms.Topo.Sequence.SetMax(heartbeat.MaxFileKey)
2018-08-24 15:30:03 +08:00
if dn == nil {
dcName, rackName := ms.Topo.Configuration.Locate(heartbeat.Ip, heartbeat.DataCenter, heartbeat.Rack)
dc := ms.Topo.GetOrCreateDataCenter(dcName)
2018-08-24 15:30:03 +08:00
rack := dc.GetOrCreateRack(rackName)
dn = rack.GetOrCreateDataNode(heartbeat.Ip, int(heartbeat.Port), int(heartbeat.GrpcPort), heartbeat.PublicUrl, heartbeat.MaxVolumeCounts)
2021-09-12 05:27:57 +08:00
glog.V(0).Infof("added volume server %d: %v:%d", dn.Counter, heartbeat.GetIp(), heartbeat.GetPort())
2018-08-24 15:30:03 +08:00
if err := stream.Send(&master_pb.HeartbeatResponse{
2020-09-20 05:10:26 +08:00
VolumeSizeLimit: uint64(ms.option.VolumeSizeLimitMB) * 1024 * 1024,
2018-08-24 15:30:03 +08:00
}); err != nil {
glog.Warningf("SendHeartbeat.Send volume size to %s:%d %v", dn.Ip, dn.Port, err)
2018-08-24 15:30:03 +08:00
return err
}
2022-01-24 22:09:43 +08:00
stats.MasterReceivedHeartbeatCounter.WithLabelValues("dataNode").Inc()
dn.Counter++
2018-08-24 15:30:03 +08:00
}
2017-01-10 17:01:12 +08:00
2021-02-16 18:47:02 +08:00
dn.AdjustMaxVolumeCounts(heartbeat.MaxVolumeCounts)
2020-08-31 11:12:04 +08:00
glog.V(4).Infof("master received heartbeat %s", heartbeat.String())
2022-01-24 22:09:43 +08:00
stats.MasterReceivedHeartbeatCounter.WithLabelValues("total").Inc()
var dataCenter string
if dc := dn.GetDataCenter(); dc != nil {
dataCenter = string(dc.Id())
}
2018-08-24 15:30:03 +08:00
message := &master_pb.VolumeLocation{
2020-11-11 18:03:47 +08:00
Url: dn.Url(),
PublicUrl: dn.PublicUrl,
DataCenter: dataCenter,
2018-08-24 15:30:03 +08:00
}
2022-01-24 22:09:43 +08:00
if len(heartbeat.NewVolumes) > 0 {
stats.FilerRequestCounter.WithLabelValues("newVolumes").Inc()
}
if len(heartbeat.DeletedVolumes) > 0 {
stats.FilerRequestCounter.WithLabelValues("deletedVolumes").Inc()
}
2019-04-21 02:35:20 +08:00
if len(heartbeat.NewVolumes) > 0 || len(heartbeat.DeletedVolumes) > 0 {
// process delta volume ids if exists for fast volume id updates
2019-04-30 11:22:19 +08:00
for _, volInfo := range heartbeat.NewVolumes {
2019-04-21 02:35:20 +08:00
message.NewVids = append(message.NewVids, volInfo.Id)
}
2019-04-30 11:22:19 +08:00
for _, volInfo := range heartbeat.DeletedVolumes {
2019-04-21 02:35:20 +08:00
message.DeletedVids = append(message.DeletedVids, volInfo.Id)
}
// update master internal volume layouts
ms.Topo.IncrementalSyncDataNodeRegistration(heartbeat.NewVolumes, heartbeat.DeletedVolumes, dn)
}
2019-06-06 04:32:33 +08:00
if len(heartbeat.Volumes) > 0 || heartbeat.HasNoVolumes {
// process heartbeat.Volumes
2022-01-24 22:09:43 +08:00
stats.MasterReceivedHeartbeatCounter.WithLabelValues("Volumes").Inc()
newVolumes, deletedVolumes := ms.Topo.SyncDataNodeRegistration(heartbeat.Volumes, dn)
for _, v := range newVolumes {
2019-04-30 11:22:19 +08:00
glog.V(0).Infof("master see new volume %d from %s", uint32(v.Id), dn.Url())
message.NewVids = append(message.NewVids, uint32(v.Id))
}
for _, v := range deletedVolumes {
2019-04-21 02:35:20 +08:00
glog.V(0).Infof("master see deleted volume %d from %s", uint32(v.Id), dn.Url())
message.DeletedVids = append(message.DeletedVids, uint32(v.Id))
}
}
if len(heartbeat.NewEcShards) > 0 || len(heartbeat.DeletedEcShards) > 0 {
2022-01-24 22:09:43 +08:00
stats.MasterReceivedHeartbeatCounter.WithLabelValues("newEcShards").Inc()
// update master internal volume layouts
ms.Topo.IncrementalSyncDataNodeEcShards(heartbeat.NewEcShards, heartbeat.DeletedEcShards, dn)
for _, s := range heartbeat.NewEcShards {
message.NewVids = append(message.NewVids, s.Id)
}
for _, s := range heartbeat.DeletedEcShards {
if dn.HasVolumesById(needle.VolumeId(s.Id)) {
continue
}
message.DeletedVids = append(message.DeletedVids, s.Id)
}
}
2019-06-06 04:32:33 +08:00
if len(heartbeat.EcShards) > 0 || heartbeat.HasNoEcShards {
2022-01-24 22:09:43 +08:00
stats.MasterReceivedHeartbeatCounter.WithLabelValues("ecShards").Inc()
glog.V(4).Infof("master received ec shards from %s: %+v", dn.Url(), heartbeat.EcShards)
newShards, deletedShards := ms.Topo.SyncDataNodeEcShards(heartbeat.EcShards, dn)
2019-05-24 14:47:49 +08:00
// broadcast the ec vid changes to master clients
for _, s := range newShards {
message.NewVids = append(message.NewVids, uint32(s.VolumeId))
2019-05-24 14:47:49 +08:00
}
for _, s := range deletedShards {
if dn.HasVolumesById(s.VolumeId) {
continue
}
message.DeletedVids = append(message.DeletedVids, uint32(s.VolumeId))
2019-05-24 14:47:49 +08:00
}
2018-08-24 15:30:03 +08:00
}
if len(message.NewVids) > 0 || len(message.DeletedVids) > 0 {
2021-11-06 19:07:38 +08:00
ms.broadcastToClients(&master_pb.KeepConnectedResponse{VolumeLocation: message})
2017-01-10 17:01:12 +08:00
}
// tell the volume servers about the leader
newLeader, err := ms.Topo.Leader()
if err != nil {
glog.Warningf("SendHeartbeat find leader: %v", err)
return err
}
if err := stream.Send(&master_pb.HeartbeatResponse{
Leader: string(newLeader),
}); err != nil {
glog.Warningf("SendHeartbeat.Send response to to %s:%d %v", dn.Ip, dn.Port, err)
return err
}
2017-01-10 17:01:12 +08:00
}
}
// KeepConnected keep a stream gRPC call to the master. Used by clients to know the master is up.
2018-07-28 16:30:03 +08:00
// And clients gets the up-to-date list of volume locations
func (ms *MasterServer) KeepConnected(stream master_pb.Seaweed_KeepConnectedServer) error {
2021-08-14 20:03:45 +08:00
req, recvErr := stream.Recv()
if recvErr != nil {
return recvErr
}
2018-07-28 16:30:03 +08:00
if !ms.Topo.IsLeader() {
return ms.informNewLeader(stream)
2018-07-28 16:30:03 +08:00
}
peerAddress := pb.ServerAddress(req.ClientAddress)
// buffer by 1 so we don't end up getting stuck writing to stopChan forever
stopChan := make(chan bool, 1)
clientName, messageChan := ms.addClient(req.ClientType, peerAddress)
2021-11-06 19:07:38 +08:00
for _, update := range ms.Cluster.AddClusterNode(req.ClientType, peerAddress, req.Version) {
ms.broadcastToClients(update)
}
defer func() {
2021-11-06 19:07:38 +08:00
for _, update := range ms.Cluster.RemoveClusterNode(req.ClientType, peerAddress) {
ms.broadcastToClients(update)
}
ms.deleteClient(clientName)
}()
2018-07-28 16:17:35 +08:00
for _, message := range ms.Topo.ToVolumeLocations() {
2021-11-06 09:11:40 +08:00
if sendErr := stream.Send(&master_pb.KeepConnectedResponse{VolumeLocation: message}); sendErr != nil {
2021-08-14 20:03:45 +08:00
return sendErr
2018-07-28 16:17:35 +08:00
}
}
go func() {
for {
_, err := stream.Recv()
if err != nil {
glog.V(2).Infof("- client %v: %v", clientName, err)
2021-06-12 17:52:41 +08:00
close(stopChan)
return
}
}
}()
ticker := time.NewTicker(5 * time.Second)
for {
select {
case message := <-messageChan:
2021-11-06 19:07:38 +08:00
if err := stream.Send(message); err != nil {
glog.V(0).Infof("=> client %v: %+v", clientName, message)
return err
}
case <-ticker.C:
if !ms.Topo.IsLeader() {
2022-01-24 23:13:07 +08:00
stats.MasterRaftIsleader.Set(0)
return ms.informNewLeader(stream)
2022-01-24 23:13:07 +08:00
} else {
stats.MasterRaftIsleader.Set(1)
}
case <-stopChan:
return nil
}
}
}
2021-11-06 19:07:38 +08:00
// broadcastToClients delivers message to the channel of every registered
// client while holding the read lock on the client channel map. Each send is
// a blocking channel send, so it waits if a client's channel buffer is full.
func (ms *MasterServer) broadcastToClients(message *master_pb.KeepConnectedResponse) {
	ms.clientChansLock.RLock()
	defer ms.clientChansLock.RUnlock()

	for _, clientChan := range ms.clientChans {
		clientChan <- message
	}
}
func (ms *MasterServer) informNewLeader(stream master_pb.Seaweed_KeepConnectedServer) error {
leader, err := ms.Topo.Leader()
if err != nil {
2019-07-31 17:09:04 +08:00
glog.Errorf("topo leader: %v", err)
return raft.NotLeaderError
}
2021-11-06 09:11:40 +08:00
if err := stream.Send(&master_pb.KeepConnectedResponse{
VolumeLocation: &master_pb.VolumeLocation{
Leader: string(leader),
},
}); err != nil {
return err
}
return nil
}
2021-11-06 19:07:38 +08:00
func (ms *MasterServer) addClient(clientType string, clientAddress pb.ServerAddress) (clientName string, messageChan chan *master_pb.KeepConnectedResponse) {
clientName = clientType + "@" + string(clientAddress)
glog.V(0).Infof("+ client %v", clientName)
// we buffer this because otherwise we end up in a potential deadlock where
// the KeepConnected loop is no longer listening on this channel but we're
// trying to send to it in SendHeartbeat and so we can't lock the
// clientChansLock to remove the channel and we're stuck writing to it
// 100 is probably overkill
2021-11-06 19:07:38 +08:00
messageChan = make(chan *master_pb.KeepConnectedResponse, 100)
ms.clientChansLock.Lock()
ms.clientChans[clientName] = messageChan
ms.clientChansLock.Unlock()
return
}
// deleteClient removes the named client's channel from the client channel
// map under the write lock.
func (ms *MasterServer) deleteClient(clientName string) {
	glog.V(0).Infof("- client %v", clientName)

	ms.clientChansLock.Lock()
	defer ms.clientChansLock.Unlock()
	delete(ms.clientChans, clientName)
}
func findClientAddress(ctx context.Context, grpcPort uint32) string {
// fmt.Printf("FromContext %+v\n", ctx)
pr, ok := peer.FromContext(ctx)
if !ok {
glog.Error("failed to get peer from ctx")
return ""
}
if pr.Addr == net.Addr(nil) {
glog.Error("failed to get peer address")
return ""
}
if grpcPort == 0 {
return pr.Addr.String()
}
if tcpAddr, ok := pr.Addr.(*net.TCPAddr); ok {
externalIP := tcpAddr.IP
2021-09-08 07:43:54 +08:00
return util.JoinHostPort(externalIP.String(), int(grpcPort))
}
return pr.Addr.String()
}
func (ms *MasterServer) GetMasterConfiguration(ctx context.Context, req *master_pb.GetMasterConfigurationRequest) (*master_pb.GetMasterConfigurationResponse, error) {
// tell the volume servers about the leader
leader, _ := ms.Topo.Leader()
resp := &master_pb.GetMasterConfigurationResponse{
MetricsAddress: ms.option.MetricsAddress,
MetricsIntervalSeconds: uint32(ms.option.MetricsIntervalSec),
StorageBackends: backend.ToPbStorageBackends(),
DefaultReplication: ms.option.DefaultReplicaPlacement,
2021-08-13 08:54:34 +08:00
VolumeSizeLimitMB: uint32(ms.option.VolumeSizeLimitMB),
VolumePreallocate: ms.option.VolumePreallocate,
Leader: string(leader),
}
return resp, nil
}