seaweedfs/weed/topology/topology.go

438 lines
12 KiB
Go
Raw Normal View History

2012-08-24 13:33:37 +08:00
package topology
import (
2022-04-04 20:51:51 +08:00
"encoding/json"
"errors"
2019-04-22 01:14:17 +08:00
"fmt"
2024-08-30 00:52:21 +08:00
"math/rand/v2"
2019-05-24 14:34:29 +08:00
"sync"
"time"
"github.com/seaweedfs/seaweedfs/weed/pb"
"github.com/seaweedfs/seaweedfs/weed/storage/types"
2022-05-16 10:41:18 +08:00
backoff "github.com/cenkalti/backoff/v4"
2022-04-04 20:51:51 +08:00
hashicorpRaft "github.com/hashicorp/raft"
"github.com/seaweedfs/raft"
2019-12-24 04:48:20 +08:00
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/sequence"
"github.com/seaweedfs/seaweedfs/weed/stats"
"github.com/seaweedfs/seaweedfs/weed/storage"
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
"github.com/seaweedfs/seaweedfs/weed/util"
2012-08-24 13:33:37 +08:00
)
type Topology struct {
vacuumLockCounter int64
NodeImpl
2019-05-24 14:34:29 +08:00
collectionMap *util.ConcurrentReadMap
ecShardMap map[needle.VolumeId]*EcShardLocations
ecShardMapLock sync.RWMutex
pulse int64
volumeSizeLimit uint64
replicationAsMin bool
isDisableVacuum bool
Sequence sequence.Sequencer
2012-09-19 05:05:12 +08:00
chanFullVolumes chan storage.VolumeInfo
chanCrowdedVolumes chan storage.VolumeInfo
Configuration *Configuration
RaftServer raft.Server
RaftServerAccessLock sync.RWMutex
HashicorpRaft *hashicorpRaft.Raft
barrierLock sync.Mutex
barrierDone bool
UuidAccessLock sync.RWMutex
UuidMap map[string][]string
2012-08-31 16:35:11 +08:00
}
func NewTopology(id string, seq sequence.Sequencer, volumeSizeLimit uint64, pulse int, replicationAsMin bool) *Topology {
t := &Topology{}
t.id = NodeId(id)
t.nodeType = "Topology"
t.NodeImpl.value = t
2021-02-16 18:47:02 +08:00
t.diskUsages = newDiskUsages()
t.children = make(map[NodeId]Node)
t.collectionMap = util.NewConcurrentReadMap()
2019-05-24 14:34:29 +08:00
t.ecShardMap = make(map[needle.VolumeId]*EcShardLocations)
t.pulse = int64(pulse)
t.volumeSizeLimit = volumeSizeLimit
t.replicationAsMin = replicationAsMin
t.Sequence = seq
t.chanFullVolumes = make(chan storage.VolumeInfo)
t.chanCrowdedVolumes = make(chan storage.VolumeInfo)
2017-02-13 13:58:44 +08:00
t.Configuration = &Configuration{}
2017-02-13 13:58:44 +08:00
return t
}
2023-06-10 00:42:48 +08:00
// IsChildLocked reports whether the topology itself, or any data center, rack
// or data node below it, is currently locked. When it returns true the error
// names the first locked node that was found.
func (t *Topology) IsChildLocked() (bool, error) {
	if t.IsLocked() {
		return true, errors.New("topology is locked")
	}
	for _, dc := range t.Children() {
		if dc.IsLocked() {
			return true, fmt.Errorf("topology child %s is locked", dc.String())
		}
		for _, rack := range dc.Children() {
			if rack.IsLocked() {
				return true, fmt.Errorf("dc %s child %s is locked", dc.String(), rack.String())
			}
			for _, node := range rack.Children() {
				if node.IsLocked() {
					return true, fmt.Errorf("rack %s child %s is locked", rack.String(), node.Id())
				}
			}
		}
	}
	return false, nil
}
func (t *Topology) IsLeader() bool {
t.RaftServerAccessLock.RLock()
defer t.RaftServerAccessLock.RUnlock()
2019-02-15 16:09:48 +08:00
if t.RaftServer != nil {
if t.RaftServer.State() == raft.Leader {
return true
}
if leader, err := t.Leader(); err == nil {
if pb.ServerAddress(t.RaftServer.Name()) == leader {
return true
}
}
2022-04-04 20:51:51 +08:00
} else if t.HashicorpRaft != nil {
if t.HashicorpRaft.State() == hashicorpRaft.Leader {
return true
}
2019-01-29 02:36:16 +08:00
}
return false
}
// IsLeaderAndCanRead reports whether this master is the leader and may serve
// reads. With seaweedfs/raft that is the same as IsLeader; with hashicorp raft
// the barrier (DoBarrier) must also have succeeded. It is false when neither
// raft implementation is set.
//
// NOTE(review): RaftServer and HashicorpRaft are read here without taking
// RaftServerAccessLock — confirm this is intentional.
func (t *Topology) IsLeaderAndCanRead() bool {
	switch {
	case t.RaftServer != nil:
		return t.IsLeader()
	case t.HashicorpRaft != nil:
		return t.IsLeader() && t.DoBarrier()
	default:
		return false
	}
}
// DoBarrier issues a hashicorp raft barrier with a 2 minute timeout and
// returns true once one has succeeded. The success is cached in barrierDone,
// so later calls return immediately until BarrierReset clears it. A failed
// barrier is logged and reported as false.
//
// It dereferences t.HashicorpRaft, so callers must ensure it is non-nil.
func (t *Topology) DoBarrier() bool {
	t.barrierLock.Lock()
	defer t.barrierLock.Unlock()

	if t.barrierDone {
		return true
	}

	glog.V(0).Infof("raft do barrier")
	if err := t.HashicorpRaft.Barrier(2 * time.Minute).Error(); err != nil {
		glog.Errorf("failed to wait for barrier, error %s", err)
		return false
	}

	t.barrierDone = true
	glog.V(0).Infof("raft do barrier success")
	return true
}
// BarrierReset clears the cached barrier result so that the next DoBarrier
// call issues a fresh raft barrier.
func (t *Topology) BarrierReset() {
	t.barrierLock.Lock()
	t.barrierDone = false
	t.barrierLock.Unlock()
}
// Leader resolves the current leader address, retrying MaybeLeader with an
// exponential backoff (100ms initial interval, 20s maximum elapsed time).
//
// An empty address without an error from MaybeLeader counts as "leader not
// selected yet" and is retried. If the retries end with that condition, the
// returned address is empty and err is the "leader not selected yet" error.
func (t *Topology) Leader() (l pb.ServerAddress, err error) {
	errNoLeaderYet := errors.New("leader not selected yet")

	// probe is a single attempt; it turns "no error but no leader" into an
	// error so that the backoff keeps retrying.
	probe := func() (pb.ServerAddress, error) {
		addr, probeErr := t.MaybeLeader()
		if probeErr == nil && addr == "" {
			return addr, errNoLeaderYet
		}
		return addr, probeErr
	}

	retryPolicy := backoff.NewExponentialBackOff()
	retryPolicy.InitialInterval = 100 * time.Millisecond
	retryPolicy.MaxElapsedTime = 20 * time.Second

	l, err = backoff.RetryWithData(probe, retryPolicy)
	if err == errNoLeaderYet {
		l = ""
	}
	return l, err
}
// MaybeLeader returns the leader address as currently known by whichever raft
// implementation is set, without retrying. The address may be empty when no
// leader has been elected. It returns an error when neither raft
// implementation is set.
func (t *Topology) MaybeLeader() (l pb.ServerAddress, err error) {
	t.RaftServerAccessLock.RLock()
	defer t.RaftServerAccessLock.RUnlock()

	switch {
	case t.RaftServer != nil:
		return pb.ServerAddress(t.RaftServer.Leader()), nil
	case t.HashicorpRaft != nil:
		return pb.ServerAddress(t.HashicorpRaft.Leader()), nil
	default:
		return "", errors.New("Raft Server not ready yet!")
	}
}
2019-06-06 14:20:26 +08:00
func (t *Topology) Lookup(collection string, vid needle.VolumeId) (dataNodes []*DataNode) {
2020-08-11 11:42:27 +08:00
// maybe an issue if lots of collections?
2013-11-12 18:21:22 +08:00
if collection == "" {
2016-05-31 03:30:26 +08:00
for _, c := range t.collectionMap.Items() {
if list := c.(*Collection).Lookup(vid); list != nil {
return list
}
}
2013-11-12 18:21:22 +08:00
} else {
2016-05-31 03:30:26 +08:00
if c, ok := t.collectionMap.Find(collection); ok {
return c.(*Collection).Lookup(vid)
2013-11-12 18:21:22 +08:00
}
}
2019-06-06 14:20:26 +08:00
if locations, found := t.LookupEcShards(vid); found {
for _, loc := range locations.Locations {
dataNodes = append(dataNodes, loc...)
}
return dataNodes
}
return nil
}
2019-04-19 12:43:36 +08:00
func (t *Topology) NextVolumeId() (needle.VolumeId, error) {
if !t.IsLeaderAndCanRead() {
return 0, fmt.Errorf("as leader can not read yet")
}
vid := t.GetMaxVolumeId()
next := vid.Next()
t.RaftServerAccessLock.RLock()
defer t.RaftServerAccessLock.RUnlock()
2022-04-04 20:51:51 +08:00
if t.RaftServer != nil {
if _, err := t.RaftServer.Do(NewMaxVolumeIdCommand(next)); err != nil {
return 0, err
}
} else if t.HashicorpRaft != nil {
b, err := json.Marshal(NewMaxVolumeIdCommand(next))
if err != nil {
return 0, fmt.Errorf("failed marshal NewMaxVolumeIdCommand: %+v", err)
}
if future := t.HashicorpRaft.Apply(b, time.Second); future.Error() != nil {
return 0, future.Error()
}
}
return next, nil
}
func (t *Topology) PickForWrite(requestedCount uint64, option *VolumeGrowOption, volumeLayout *VolumeLayout) (fileId string, count uint64, volumeLocationList *VolumeLocationList, shouldGrow bool, err error) {
var vid needle.VolumeId
vid, count, volumeLocationList, shouldGrow, err = volumeLayout.PickForWrite(requestedCount, option)
2019-04-22 01:14:17 +08:00
if err != nil {
return "", 0, nil, shouldGrow, fmt.Errorf("failed to find writable volumes for collection:%s replication:%s ttl:%s error: %v", option.Collection, option.ReplicaPlacement.String(), option.Ttl.String(), err)
2019-04-22 01:14:17 +08:00
}
if volumeLocationList == nil || volumeLocationList.Length() == 0 {
return "", 0, nil, shouldGrow, fmt.Errorf("%s available for collection:%s replication:%s ttl:%s", noWritableVolumes, option.Collection, option.ReplicaPlacement.String(), option.Ttl.String())
}
nextFileId := t.Sequence.NextFileId(requestedCount)
fileId = needle.NewFileId(vid, nextFileId, rand.Uint32()).String()
return fileId, count, volumeLocationList, shouldGrow, nil
}
2021-02-16 18:47:02 +08:00
func (t *Topology) GetVolumeLayout(collectionName string, rp *super_block.ReplicaPlacement, ttl *needle.TTL, diskType types.DiskType) *VolumeLayout {
return t.collectionMap.Get(collectionName, func() interface{} {
return NewCollection(collectionName, t.volumeSizeLimit, t.replicationAsMin)
2020-12-14 03:59:32 +08:00
}).(*Collection).GetOrCreateVolumeLayout(rp, ttl, diskType)
2012-09-14 16:17:13 +08:00
}
2019-05-31 00:27:23 +08:00
func (t *Topology) ListCollections(includeNormalVolumes, includeEcVolumes bool) (ret []string) {
2019-05-31 00:17:58 +08:00
mapOfCollections := make(map[string]bool)
for _, c := range t.collectionMap.Items() {
2019-05-31 00:17:58 +08:00
mapOfCollections[c.(*Collection).Name] = true
}
2019-05-31 00:27:23 +08:00
if includeEcVolumes {
t.ecShardMapLock.RLock()
for _, ecVolumeLocation := range t.ecShardMap {
mapOfCollections[ecVolumeLocation.Collection] = true
}
t.ecShardMapLock.RUnlock()
2019-05-31 00:17:58 +08:00
}
for k := range mapOfCollections {
2019-05-31 00:17:58 +08:00
ret = append(ret, k)
}
return ret
}
2016-05-31 03:30:26 +08:00
// FindCollection looks up a collection by name without creating it. The
// boolean reports whether the collection exists.
func (t *Topology) FindCollection(collectionName string) (*Collection, bool) {
	if item, ok := t.collectionMap.Find(collectionName); ok {
		return item.(*Collection), true
	}
	return nil, false
}
func (t *Topology) DeleteCollection(collectionName string) {
2016-05-31 03:30:26 +08:00
t.collectionMap.Delete(collectionName)
}
2021-02-16 18:47:02 +08:00
func (t *Topology) DeleteLayout(collectionName string, rp *super_block.ReplicaPlacement, ttl *needle.TTL, diskType types.DiskType) {
collection, found := t.FindCollection(collectionName)
if !found {
return
}
2020-12-14 03:59:32 +08:00
collection.DeleteVolumeLayout(rp, ttl, diskType)
2020-12-13 20:14:50 +08:00
if len(collection.storageType2VolumeLayout.Items()) == 0 {
t.DeleteCollection(collectionName)
}
}
func (t *Topology) RegisterVolumeLayout(v storage.VolumeInfo, dn *DataNode) {
2021-02-16 18:47:02 +08:00
diskType := types.ToDiskType(v.DiskType)
2020-12-14 03:59:32 +08:00
vl := t.GetVolumeLayout(v.Collection, v.ReplicaPlacement, v.Ttl, diskType)
vl.RegisterVolume(&v, dn)
vl.EnsureCorrectWritables(&v)
}
func (t *Topology) UnRegisterVolumeLayout(v storage.VolumeInfo, dn *DataNode) {
glog.Infof("removing volume info: %+v from %v", v, dn.id)
if v.ReplicaPlacement.GetCopyCount() > 1 {
stats.MasterReplicaPlacementMismatch.WithLabelValues(v.Collection, v.Id.String()).Set(0)
}
2021-02-16 18:47:02 +08:00
diskType := types.ToDiskType(v.DiskType)
2020-12-14 03:59:32 +08:00
volumeLayout := t.GetVolumeLayout(v.Collection, v.ReplicaPlacement, v.Ttl, diskType)
volumeLayout.UnRegisterVolume(&v, dn)
if volumeLayout.isEmpty() {
2020-12-14 03:59:32 +08:00
t.DeleteLayout(v.Collection, v.ReplicaPlacement, v.Ttl, diskType)
}
}
// DataCenterExists reports whether a data center named dcName is already part
// of the topology. An empty name is treated as "no constraint" and reports
// true.
//
// The check is read-only. The previous implementation went through
// GetOrCreateDataCenter, which created the data center as a side effect and
// therefore could never report false.
func (t *Topology) DataCenterExists(dcName string) bool {
	if dcName == "" {
		return true
	}

	t.RLock()
	defer t.RUnlock()
	for _, child := range t.children {
		if string(child.Id()) == dcName {
			return true
		}
	}
	return false
}
// GetDataCenter returns the data center named dcName, or nil when the
// topology has no such data center. It never creates one.
func (t *Topology) GetDataCenter(dcName string) (dc *DataCenter) {
	t.RLock()
	defer t.RUnlock()

	for _, child := range t.children {
		if candidate := child.(*DataCenter); string(candidate.Id()) == dcName {
			return candidate
		}
	}
	// Return nil on a miss. The loop used to assign into the named result on
	// every iteration, so a miss leaked whichever data center was visited last.
	return nil
}
func (t *Topology) GetOrCreateDataCenter(dcName string) *DataCenter {
t.Lock()
defer t.Unlock()
for _, c := range t.children {
dc := c.(*DataCenter)
if string(dc.Id()) == dcName {
return dc
}
}
dc := NewDataCenter(dcName)
2022-09-11 02:26:19 +08:00
t.doLinkChildNode(dc)
return dc
}
2018-06-25 15:01:53 +08:00
func (t *Topology) SyncDataNodeRegistration(volumes []*master_pb.VolumeInformationMessage, dn *DataNode) (newVolumes, deletedVolumes []storage.VolumeInfo) {
2019-05-23 15:04:24 +08:00
// convert into in memory struct storage.VolumeInfo
2018-06-25 15:01:53 +08:00
var volumeInfos []storage.VolumeInfo
for _, v := range volumes {
if vi, err := storage.NewVolumeInfo(v); err == nil {
volumeInfos = append(volumeInfos, vi)
} else {
glog.V(0).Infof("Fail to convert joined volume information: %v", err)
}
}
2019-05-23 15:04:24 +08:00
// find out the delta volumes
var changedVolumes []storage.VolumeInfo
newVolumes, deletedVolumes, changedVolumes = dn.UpdateVolumes(volumeInfos)
2019-04-21 02:35:20 +08:00
for _, v := range newVolumes {
2018-06-25 15:01:53 +08:00
t.RegisterVolumeLayout(v, dn)
}
for _, v := range deletedVolumes {
t.UnRegisterVolumeLayout(v, dn)
}
for _, v := range changedVolumes {
2021-02-16 18:47:02 +08:00
diskType := types.ToDiskType(v.DiskType)
2020-12-14 03:59:32 +08:00
vl := t.GetVolumeLayout(v.Collection, v.ReplicaPlacement, v.Ttl, diskType)
adding locking to avoid nil VolumeLocationList fix panic: runtime error: invalid memory address or nil pointer dereference Oct 22 00:53:44 bedb-master1 weed[8055]: [signal SIGSEGV: segmentation violation code=0x1 addr=0x8 pc=0x17658da] Oct 22 00:53:44 bedb-master1 weed[8055]: goroutine 310 [running]: Oct 22 00:53:44 bedb-master1 weed[8055]: github.com/chrislusf/seaweedfs/weed/topology.(*VolumeLocationList).Length(...) Oct 22 00:53:44 bedb-master1 weed[8055]: #011/root/seaweedfs/weed/topology/volume_location_list.go:35 Oct 22 00:53:44 bedb-master1 weed[8055]: github.com/chrislusf/seaweedfs/weed/topology.(*VolumeLayout).enoughCopies(...) Oct 22 00:53:44 bedb-master1 weed[8055]: #011/root/seaweedfs/weed/topology/volume_layout.go:376 Oct 22 00:53:44 bedb-master1 weed[8055]: github.com/chrislusf/seaweedfs/weed/topology.(*VolumeLayout).ensureCorrectWritables(0xc000111d50, 0xc000b55438) Oct 22 00:53:44 bedb-master1 weed[8055]: #011/root/seaweedfs/weed/topology/volume_layout.go:202 +0x5a Oct 22 00:53:44 bedb-master1 weed[8055]: github.com/chrislusf/seaweedfs/weed/topology.(*Topology).SyncDataNodeRegistration(0xc00042ac60, 0xc001454d30, 0x1, 0x1, 0xc0005fc000, 0xc00135de40, 0x4, 0xc00135de50, 0x10, 0x10d, ...) 
Oct 22 00:53:44 bedb-master1 weed[8055]: #011/root/seaweedfs/weed/topology/topology.go:224 +0x616 Oct 22 00:53:44 bedb-master1 weed[8055]: github.com/chrislusf/seaweedfs/weed/server.(*MasterServer).SendHeartbeat(0xc000162700, 0x23b97c0, 0xc000ae2c90, 0x0, 0x0) Oct 22 00:53:44 bedb-master1 weed[8055]: #011/root/seaweedfs/weed/server/master_grpc_server.go:106 +0x325 Oct 22 00:53:44 bedb-master1 weed[8055]: github.com/chrislusf/seaweedfs/weed/pb/master_pb._Seaweed_SendHeartbeat_Handler(0x1f8e7c0, 0xc000162700, 0x23b0a60, 0xc00024b440, 0x3172c38, 0xc000ab7100) Oct 22 00:53:44 bedb-master1 weed[8055]: #011/root/seaweedfs/weed/pb/master_pb/master.pb.go:4250 +0xad Oct 22 00:53:44 bedb-master1 weed[8055]: google.golang.org/grpc.(*Server).processStreamingRPC(0xc0001f31e0, 0x23bb800, 0xc000ac5500, 0xc000ab7100, 0xc0001fea80, 0x311fec0, 0x0, 0x0, 0x0) Oct 22 00:53:44 bedb-master1 weed[8055]: #011/root/go/pkg/mod/google.golang.org/grpc@v1.29.1/server.go:1329 +0xcd8 Oct 22 00:53:44 bedb-master1 weed[8055]: google.golang.org/grpc.(*Server).handleStream(0xc0001f31e0, 0x23bb800, 0xc000ac5500, 0xc000ab7100, 0x0) Oct 22 00:53:44 bedb-master1 weed[8055]: #011/root/go/pkg/mod/google.golang.org/grpc@v1.29.1/server.go:1409 +0xc5c Oct 22 00:53:44 bedb-master1 weed[8055]: google.golang.org/grpc.(*Server).serveStreams.func1.1(0xc0001ce8b0, 0xc0001f31e0, 0x23bb800, 0xc000ac5500, 0xc000ab7100) Oct 22 00:53:44 bedb-master1 weed[8055]: #011/root/go/pkg/mod/google.golang.org/grpc@v1.29.1/server.go:746 +0xa5 Oct 22 00:53:44 bedb-master1 weed[8055]: created by google.golang.org/grpc.(*Server).serveStreams.func1 Oct 22 00:53:44 bedb-master1 weed[8055]: #011/root/go/pkg/mod/google.golang.org/grpc@v1.29.1/server.go:744 +0xa5 Oct 22 00:53:44 bedb-master1 systemd[1]: weedmaster.service: Main process exited, code=exited, status=2/INVALIDARGUMENT Oct 22 00:53:44 bedb-master1 systemd[1]: weedmaster.service: Failed with result 'exit-code'.
2020-10-22 14:15:48 +08:00
vl.EnsureCorrectWritables(&v)
}
return
2018-06-25 15:01:53 +08:00
}
2019-04-21 02:35:20 +08:00
func (t *Topology) IncrementalSyncDataNodeRegistration(newVolumes, deletedVolumes []*master_pb.VolumeShortInformationMessage, dn *DataNode) {
var newVis, oldVis []storage.VolumeInfo
for _, v := range newVolumes {
vi, err := storage.NewVolumeInfoFromShort(v)
if err != nil {
glog.V(0).Infof("NewVolumeInfoFromShort %v: %v", v, err)
continue
}
newVis = append(newVis, vi)
}
for _, v := range deletedVolumes {
vi, err := storage.NewVolumeInfoFromShort(v)
if err != nil {
glog.V(0).Infof("NewVolumeInfoFromShort %v: %v", v, err)
continue
}
oldVis = append(oldVis, vi)
}
dn.DeltaUpdateVolumes(newVis, oldVis)
2019-04-21 14:53:37 +08:00
for _, vi := range newVis {
t.RegisterVolumeLayout(vi, dn)
}
for _, vi := range oldVis {
t.UnRegisterVolumeLayout(vi, dn)
}
2019-04-21 02:35:20 +08:00
return
}
2022-04-07 15:18:28 +08:00
2022-05-02 14:16:29 +08:00
func (t *Topology) DataNodeRegistration(dcName, rackName string, dn *DataNode) {
if dn.Parent() != nil {
2022-04-07 15:18:28 +08:00
return
}
// registration to topo
dc := t.GetOrCreateDataCenter(dcName)
rack := dc.GetOrCreateRack(rackName)
rack.LinkChildNode(dn)
glog.Infof("[%s] reLink To topo ", dn.Id())
2022-05-02 14:16:29 +08:00
}
// DisableVacuum logs the request and sets the topology's isDisableVacuum flag.
//
// NOTE(review): the flag is written without synchronization here — confirm
// how readers access it.
func (t *Topology) DisableVacuum() {
	glog.V(0).Infof("DisableVacuum")
	t.isDisableVacuum = true
}
// EnableVacuum logs the request and clears the topology's isDisableVacuum
// flag.
//
// NOTE(review): the flag is written without synchronization here — confirm
// how readers access it.
func (t *Topology) EnableVacuum() {
	glog.V(0).Infof("EnableVacuum")
	t.isDisableVacuum = false
}