Improve EC shards rebalancing logic across racks (#6270)

Improve EC shards rebalancing logic across racks.

  - Favor target shards with fewer preexisting shards, to ensure a fair distribution.
  - Randomize selection when multiple possible target shards are available.
  - Add logic to account for replication settings when selecting target shards (currently disabled).
This commit is contained in:
Lisandro Pin 2024-11-21 17:46:24 +01:00 committed by GitHub
parent e56327e3b0
commit ca499de1cb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 274 additions and 45 deletions

View File

@ -3,6 +3,7 @@ package shell
import (
"context"
"fmt"
"math/rand"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/operation"
@ -473,16 +474,19 @@ func balanceEcShardsAcrossRacks(commandEnv *CommandEnv, allEcNodes []*EcNode, ra
return nil
}
func doBalanceEcShardsAcrossRacks(commandEnv *CommandEnv, collection string, vid needle.VolumeId, locations []*EcNode, racks map[RackId]*EcRack, applyBalancing bool) error {
// countShardsByRack builds a rack -> shard count map for volume vid.
//
// For every node in locations it looks up the node's shards of vid with
// findEcVolumeShards and hands the node's rack (as a plain string) together
// with that node's ShardIdCount to groupByCount, whose result is returned.
func countShardsByRack(vid needle.VolumeId, locations []*EcNode) map[string]int {
	shardsHeldBy := func(node *EcNode) (string, int) {
		held := findEcVolumeShards(node, vid).ShardIdCount()
		return string(node.rack), held
	}
	return groupByCount(locations, shardsHeldBy)
}
func doBalanceEcShardsAcrossRacks(commandEnv *CommandEnv, collection string, vid needle.VolumeId, locations []*EcNode, racks map[RackId]*EcRack, applyBalancing bool) error {
// calculate average number of shards an ec rack should have for one volume
averageShardsPerEcRack := ceilDivide(erasure_coding.TotalShardsCount, len(racks))
// see the volume's shards are in how many racks, and how many in each rack
rackToShardCount := groupByCount(locations, func(ecNode *EcNode) (id string, count int) {
shardBits := findEcVolumeShards(ecNode, vid)
return string(ecNode.rack), shardBits.ShardIdCount()
})
rackToShardCount := countShardsByRack(vid, locations)
rackEcNodesWithVid := groupBy(locations, func(ecNode *EcNode) string {
return string(ecNode.rack)
})
@ -490,16 +494,18 @@ func doBalanceEcShardsAcrossRacks(commandEnv *CommandEnv, collection string, vid
// ecShardsToMove = select overflown ec shards from racks with ec shard counts > averageShardsPerEcRack
ecShardsToMove := make(map[erasure_coding.ShardId]*EcNode)
for rackId, count := range rackToShardCount {
if count > averageShardsPerEcRack {
possibleEcNodes := rackEcNodesWithVid[rackId]
for shardId, ecNode := range pickNEcShardsToMoveFrom(possibleEcNodes, vid, count-averageShardsPerEcRack) {
ecShardsToMove[shardId] = ecNode
}
if count <= averageShardsPerEcRack {
continue
}
possibleEcNodes := rackEcNodesWithVid[rackId]
for shardId, ecNode := range pickNEcShardsToMoveFrom(possibleEcNodes, vid, count-averageShardsPerEcRack) {
ecShardsToMove[shardId] = ecNode
}
}
for shardId, ecNode := range ecShardsToMove {
rackId := pickOneRack(racks, rackToShardCount, averageShardsPerEcRack)
// TODO: consider volume replica info when balancing racks
rackId := pickRackToBalanceShardsInto(racks, rackToShardCount, nil, averageShardsPerEcRack)
if rackId == "" {
fmt.Printf("ec shard %d.%d at %s can not find a destination rack\n", vid, shardId, ecNode.info.Id)
continue
@ -521,23 +527,44 @@ func doBalanceEcShardsAcrossRacks(commandEnv *CommandEnv, collection string, vid
return nil
}
func pickOneRack(rackToEcNodes map[RackId]*EcRack, rackToShardCount map[string]int, averageShardsPerEcRack int) RackId {
// TODO later may need to add some randomness
for rackId, rack := range rackToEcNodes {
if rackToShardCount[string(rackId)] >= averageShardsPerEcRack {
continue
func pickRackToBalanceShardsInto(rackToEcNodes map[RackId]*EcRack, rackToShardCount map[string]int, replicaPlacement *super_block.ReplicaPlacement, averageShardsPerEcRack int) RackId {
targets := []RackId{}
targetShards := -1
for _, shards := range rackToShardCount {
if shards > targetShards {
targetShards = shards
}
if rack.freeEcSlot <= 0 {
continue
}
return rackId
}
return ""
for rackId, rack := range rackToEcNodes {
shards := rackToShardCount[string(rackId)]
if rack.freeEcSlot <= 0 {
// No EC shards slots left :(
continue
}
if replicaPlacement != nil && shards >= replicaPlacement.DiffRackCount {
// Don't select racks with more EC shards for the target volume than the replication limit.
continue
}
if shards >= averageShardsPerEcRack {
// Keep EC shards across racks as balanced as possible.
continue
}
if shards < targetShards {
// Favor racks with fewer shards, to ensure a uniform distribution.
targets = nil
targetShards = shards
}
if shards == targetShards {
targets = append(targets, rackId)
}
}
if len(targets) == 0 {
return ""
}
return targets[rand.Intn(len(targets))]
}
func balanceEcShardsWithinRacks(commandEnv *CommandEnv, allEcNodes []*EcNode, racks map[RackId]*EcRack, collection string, applyBalancing bool) error {
@ -774,6 +801,7 @@ func collectVolumeIdToEcNodes(allEcNodes []*EcNode, collection string) map[needl
return vidLocations
}
// TODO: EC volumes have no replica placement info :( Maybe rely on the master's default?
func volumeIdToReplicaPlacement(vid needle.VolumeId, nodes []*EcNode) (*super_block.ReplicaPlacement, error) {
for _, ecNode := range nodes {
for _, diskInfo := range ecNode.info.DiskInfos {
@ -789,6 +817,21 @@ func volumeIdToReplicaPlacement(vid needle.VolumeId, nodes []*EcNode) (*super_bl
return nil, fmt.Errorf("failed to resolve replica placement for volume ID %d", vid)
}
// getDefaultReplicaPlacement asks the master for its configuration and parses
// the DefaultReplication field of the response into a ReplicaPlacement.
//
// Any error from the master client call is returned unchanged with a nil
// placement; otherwise the result (and error) of
// super_block.NewReplicaPlacementFromString is returned as-is.
func getDefaultReplicaPlacement(commandEnv *CommandEnv) (*super_block.ReplicaPlacement, error) {
	var config *master_pb.GetMasterConfigurationResponse

	// The callback keeps its own error variable; only the response is
	// captured from the enclosing scope.
	fetchConfig := func(client master_pb.SeaweedClient) error {
		var rpcErr error
		config, rpcErr = client.GetMasterConfiguration(context.Background(), &master_pb.GetMasterConfigurationRequest{})
		return rpcErr
	}

	if err := commandEnv.MasterClient.WithClient(false, fetchConfig); err != nil {
		return nil, err
	}
	return super_block.NewReplicaPlacementFromString(config.DefaultReplication)
}
func EcBalance(commandEnv *CommandEnv, collections []string, dc string, applyBalancing bool) (err error) {
if len(collections) == 0 {
return fmt.Errorf("no collections to balance")

View File

@ -10,12 +10,16 @@ import (
"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
)
// Fixture topologies used by the tests in this file. Each one is produced by
// running parseOutput over one of the topoData* strings, and is evaluated once
// as a package-level initializer rather than inside every test.
var (
// topology1 is parsed from topoData.
topology1 = parseOutput(topoData)
// topology2 is parsed from topoData2.
topology2 = parseOutput(topoData2)
// topologyEc is parsed from topoDataEc.
topologyEc = parseOutput(topoDataEc)
)
func TestEcDistribution(t *testing.T) {
topologyInfo := parseOutput(topoData)
// find out all volume servers with one slot left.
ecNodes, totalFreeEcSlots := collectEcVolumeServersByDc(topologyInfo, "")
ecNodes, totalFreeEcSlots := collectEcVolumeServersByDc(topology1, "")
sortEcNodesByFreeslotsDescending(ecNodes)
@ -34,23 +38,20 @@ func TestEcDistribution(t *testing.T) {
}
func TestVolumeIdToReplicaPlacement(t *testing.T) {
topo1 := parseOutput(topoData)
topo2 := parseOutput(topoData2)
testCases := []struct {
topology *master_pb.TopologyInfo
vid string
want string
wantErr string
}{
{topo1, "", "", "failed to resolve replica placement for volume ID 0"},
{topo1, "0", "", "failed to resolve replica placement for volume ID 0"},
{topo1, "1", "100", ""},
{topo1, "296", "100", ""},
{topo2, "", "", "failed to resolve replica placement for volume ID 0"},
{topo2, "19012", "", "failed to resolve replica placement for volume ID 19012"},
{topo2, "6271", "002", ""},
{topo2, "17932", "002", ""},
{topology1, "", "", "failed to resolve replica placement for volume ID 0"},
{topology1, "0", "", "failed to resolve replica placement for volume ID 0"},
{topology1, "1", "100", ""},
{topology1, "296", "100", ""},
{topology2, "", "", "failed to resolve replica placement for volume ID 0"},
{topology2, "19012", "", "failed to resolve replica placement for volume ID 19012"},
{topology2, "6271", "002", ""},
{topology2, "17932", "002", ""},
}
for _, tc := range testCases {
@ -59,29 +60,74 @@ func TestVolumeIdToReplicaPlacement(t *testing.T) {
got, gotErr := volumeIdToReplicaPlacement(vid, ecNodes)
if tc.wantErr == "" && gotErr != nil {
t.Errorf("expected no error for volume '%s', got '%s'", tc.vid, gotErr.Error())
t.Errorf("expected no error for volume %q, got %q", tc.vid, gotErr.Error())
continue
}
if tc.wantErr != "" {
if gotErr == nil {
t.Errorf("got no error for volume '%s', expected '%s'", tc.vid, tc.wantErr)
t.Errorf("got no error for volume %q, expected %q", tc.vid, tc.wantErr)
continue
}
if gotErr.Error() != tc.wantErr {
t.Errorf("expected error '%s' for volume '%s', got '%s'", tc.wantErr, tc.vid, gotErr.Error())
t.Errorf("expected error %q for volume %q, got %q", tc.wantErr, tc.vid, gotErr.Error())
continue
}
}
if got == nil {
if tc.want != "" {
t.Errorf("expected replica placement '%s' for volume '%s', got nil", tc.want, tc.vid)
t.Errorf("expected replica placement %q for volume %q, got nil", tc.want, tc.vid)
}
continue
}
want, _ := super_block.NewReplicaPlacementFromString(tc.want)
if !got.Equals(want) {
t.Errorf("got replica placement '%s' for volune '%s', want '%s'", got.String(), tc.vid, want.String())
t.Errorf("got replica placement %q for volune %q, want %q", got.String(), tc.vid, want.String())
}
}
}
// TestPickRackToBalanceShardsInto runs pickRackToBalanceShardsInto against the
// EC fixture topology for a set of volume IDs and verifies that the rack it
// returns is among the racks listed as acceptable for that volume.
//
// The picker chooses randomly among equally good racks, so each case lists a
// set of allowed answers instead of a single expected value.
func TestPickRackToBalanceShardsInto(t *testing.T) {
	everyRack := []string{"rack1", "rack2", "rack3", "rack4", "rack5", "rack6"}

	cases := []struct {
		topology  *master_pb.TopologyInfo
		vid       string
		wantOneOf []string
	}{
		// Non-EC volumes. We don't care about these, but the function should return all racks as a safeguard.
		{topologyEc, "", everyRack},
		{topologyEc, "6225", everyRack},
		{topologyEc, "6226", everyRack},
		{topologyEc, "6241", everyRack},
		{topologyEc, "6242", everyRack},
		// EC volumes.
		{topologyEc, "9577", []string{"rack1", "rack2", "rack3"}},
		{topologyEc, "10457", []string{"rack1"}},
		{topologyEc, "12737", []string{"rack2"}},
		{topologyEc, "14322", []string{"rack3"}},
	}

	for _, tc := range cases {
		// Parse errors and the free-slot total are intentionally discarded here.
		vid, _ := needle.NewVolumeId(tc.vid)
		ecNodes, _ := collectEcVolumeServersByDc(tc.topology, "")
		racks := collectRacks(ecNodes)

		// Every EC node of the topology is treated as a candidate location.
		shardsPerRack := countShardsByRack(vid, ecNodes)
		averagePerRack := ceilDivide(erasure_coding.TotalShardsCount, len(racks))

		got := pickRackToBalanceShardsInto(racks, shardsPerRack, nil, averagePerRack)
		if got == "" && len(tc.wantOneOf) == 0 {
			// No rack expected and none returned.
			continue
		}

		matched := false
		for i := 0; i < len(tc.wantOneOf) && !matched; i++ {
			matched = tc.wantOneOf[i] == string(got)
		}
		if matched {
			continue
		}
		t.Errorf("expected one of %v for volume %q, got %q", tc.wantOneOf, tc.vid, got)
	}
}

View File

@ -2,15 +2,18 @@ package shell
import (
_ "embed"
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
"github.com/seaweedfs/seaweedfs/weed/storage/types"
"github.com/stretchr/testify/assert"
//"google.golang.org/protobuf/proto"
"github.com/golang/protobuf/proto"
"strconv"
"strings"
"testing"
"github.com/golang/protobuf/proto"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
)
@ -127,3 +130,6 @@ var topoData string
// topoData2 holds the contents of volume.list2.txt, embedded at build time.
//
//go:embed volume.list2.txt
var topoData2 string
// topoDataEc holds the contents of volume.ecshards.txt, embedded at build time.
//
//go:embed volume.ecshards.txt
var topoDataEc string

View File

@ -0,0 +1,134 @@
Topology volumeSizeLimit:1024 MB hdd(volume:15900/25063 active:15900 free:9163 remote:0)
DataCenter DefaultDataCenter hdd(volume:15900/25063 active:15900 free:9163 remote:0)
Rack rack1 hdd(volume:15900/25063 active:15900 free:9163 remote:0)
DataNode 172.19.0.10:8702 hdd(volume:7/2225 active:7 free:2225 remote:0)
Disk hdd(volume:7/2232 active:7 free:2225 remote:0)
volume id:6225 size:24404408 file_count:275 replica_placement:2 version:3 compact_revision:2 modified_at_second:1664897660
volume id:6226 size:20871152 file_count:258 replica_placement:2 version:3 compact_revision:2 modified_at_second:1664888660
volume id:6241 size:34861224 file_count:274 replica_placement:2 version:3 compact_revision:1 modified_at_second:1664909248
volume id:6242 size:40460472 file_count:236 replica_placement:2 version:3 compact_revision:1 modified_at_second:1664906607
ec volume id:12737 collection:s3qldata shards:[3]
ec volume id:14322 collection:s3qldata shards:[5]
ec volume id:9577 collection:s3qldata shards:[11]
Disk hdd total size:1737345132344 file_count:533580 deleted_file:10764 deleted_bytes:22028207276
DataNode 172.19.0.10:8702 total size:1737345132344 file_count:533580 deleted_file:10764 deleted_bytes:22028207276
Rack rack1 total size:17676186754616 file_count:5439969 deleted_file:127907 deleted_bytes:251707271029
Rack rack2 hdd(volume:3/25063 active:3 free:25060 remote:0)
DataNode 172.19.0.13:8701 hdd(volume:3/2187 active:3 free:2184 remote:0)
Disk hdd(volume:3/2187 active:3 free:2184 remote:0)
volume id:6241 size:34861256 file_count:275 delete_count:1 replica_placement:2 version:3 compact_revision:1 modified_at_second:1664909248
ec volume id:10457 collection:s3qldata shards:[12]
ec volume id:14322 collection:s3qldata shards:[10]
ec volume id:9577 collection:s3qldata shards:[10]
Disk hdd total size:1695600546816 file_count:521054 deleted_file:9961 deleted_bytes:21063702677
DataNode 172.19.0.13:8701 total size:1695600546816 file_count:521054 deleted_file:9961 deleted_bytes:21063702677
Rack rack2 total size:17676186754616 file_count:5439969 deleted_file:127907 deleted_bytes:251707271029
Rack rack3 hdd(volume:3/25063 active:28 free:25060 remote:0)
DataNode 172.19.0.14:8711 hdd(volume:3/1627 active:3 free:1624 remote:0)
Disk hdd(volume:3/1627 active:3 free:1624 remote:0)
ec volume id:10457 collection:s3qldata shards:[3]
ec volume id:12737 collection:s3qldata shards:[6]
ec volume id:9577 collection:s3qldata shards:[5]
Disk hdd total size:1050933775360 file_count:323231 deleted_file:8245 deleted_bytes:15595720358
DataNode 172.19.0.14:8711 total size:1050933775360 file_count:323231 deleted_file:8245 deleted_bytes:15595720358
Rack rack3 total size:17676186754616 file_count:5439969 deleted_file:127907 deleted_bytes:251707271029
Rack rack4 hdd(volume:10/25063 active:4 free:25053 remote:0)
DataNode 172.19.0.16:8704 hdd(volume:10/2174 active:4 free:2164 remote:0)
Disk hdd(volume:4/2174 active:4 free:2170 remote:0)
ec volume id:10457 collection:s3qldata shards:[0 13]
ec volume id:12737 collection:s3qldata shards:[1]
ec volume id:14322 collection:s3qldata shards:[7]
ec volume id:9577 collection:s3qldata shards:[2]
Disk hdd total size:1653215155776 file_count:507914 deleted_file:11402 deleted_bytes:22641676340
DataNode 172.19.0.16:8704 total size:1653215155776 file_count:507914 deleted_file:11402 deleted_bytes:22641676340
DataNode 172.19.0.17:8703 hdd(volume:6/2214 active:6 free:2208 remote:0)
Disk hdd(volume:6/2214 active:6 free:2208 remote:0)
volume id:6226 size:20871152 file_count:258 replica_placement:2 version:3 compact_revision:2 modified_at_second:1664888660
volume id:6241 size:34861256 file_count:275 delete_count:1 replica_placement:2 version:3 compact_revision:1 modified_at_second:1664909248
ec volume id:10457 collection:s3qldata shards:[11]
ec volume id:12737 collection:s3qldata shards:[5]
ec volume id:14322 collection:s3qldata shards:[2 9]
ec volume id:9577 collection:s3qldata shards:[0]
Disk hdd total size:1715724688456 file_count:526901 deleted_file:10854 deleted_bytes:22441405472
DataNode 172.19.0.17:8703 total size:1715724688456 file_count:526901 deleted_file:10854 deleted_bytes:22441405472
Rack rack4 total size:17676186754616 file_count:5439969 deleted_file:127907 deleted_bytes:251707271029
Rack rack5 hdd(volume:20/25063 active:20 free:25043 remote:0)
DataNode 172.19.0.19:8700 hdd(volume:6/2132 active:6 free:2126 remote:0)
Disk hdd(volume:6/2132 active:6 free:2126 remote:0)
volume id:6242 size:40460472 file_count:236 replica_placement:2 version:3 compact_revision:1 modified_at_second:1664906607
volume id:6225 size:24398232 file_count:274 replica_placement:2 version:3 compact_revision:2 modified_at_second:1664897660
ec volume id:10457 collection:s3qldata shards:[8]
ec volume id:12737 collection:s3qldata shards:[13]
ec volume id:14322 collection:s3qldata shards:[8]
ec volume id:9577 collection:s3qldata shards:[12]
Disk hdd total size:1635328083064 file_count:504512 deleted_file:11567 deleted_bytes:23202472281
DataNode 172.19.0.19:8700 total size:1635328083064 file_count:504512 deleted_file:11567 deleted_bytes:23202472281
DataNode 172.19.0.20:8706 hdd(volume:4/2153 active:4 free:2149 remote:0)
Disk hdd(volume:6/2153 active:1497 free:656 remote:0)
ec volume id:10457 collection:s3qldata shards:[1]
ec volume id:12737 collection:s3qldata shards:[7]
ec volume id:14322 collection:s3qldata shards:[5 11 12]
ec volume id:9577 collection:s3qldata shards:[1]
Disk hdd total size:1662887597128 file_count:510323 deleted_file:10919 deleted_bytes:22504428853
DataNode 172.19.0.20:8706 total size:1662887597128 file_count:510323 deleted_file:10919 deleted_bytes:22504428853
DataNode 172.19.0.21:8710 hdd(volume:6/2184 active:6 free:2178 remote:0)
Disk hdd(volume:6/2184 active:6 free:2178 remote:0)
volume id:6225 size:24398232 file_count:274 replica_placement:2 version:3 compact_revision:2 modified_at_second:1664897660
volume id:6242 size:40460472 file_count:236 replica_placement:2 version:3 compact_revision:1 modified_at_second:1664906607
ec volume id:10457 collection:s3qldata shards:[9]
ec volume id:12737 collection:s3qldata shards:[4]
ec volume id:14322 collection:s3qldata shards:[11]
ec volume id:9577 collection:s3qldata shards:[3]
Disk hdd total size:1685060737528 file_count:517231 deleted_file:10635 deleted_bytes:22306836236
DataNode 172.19.0.21:8710 total size:1685060737528 file_count:517231 deleted_file:10635 deleted_bytes:22306836236
DataNode 172.19.0.3:8708 hdd(volume:4/961 active:4 free:957 remote:0)
Disk hdd(volume:4/961 active:4 free:957 remote:0)
ec volume id:10457 collection:s3qldata shards:[4]
ec volume id:12737 collection:s3qldata shards:[10]
ec volume id:14322 collection:s3qldata shards:[3]
ec volume id:9577 collection:s3qldata shards:[7]
Disk hdd total size:377523488192 file_count:119577 deleted_file:5368 deleted_bytes:8596766559
DataNode 172.19.0.3:8708 total size:377523488192 file_count:119577 deleted_file:5368 deleted_bytes:8596766559
Rack rack5 total size:17676186754616 file_count:5439969 deleted_file:127907 deleted_bytes:251707271029
Rack rack6 hdd(volume:18/25063 active:18 free:25045 remote:0)
DataNode 172.19.0.4:8707 hdd(volume:4/958 active:4 free:954 remote:0)
Disk hdd(volume:4/958 active:4 free:954 remote:0)
ec volume id:10457 collection:s3qldata shards:[6]
ec volume id:12737 collection:s3qldata shards:[9]
ec volume id:14322 collection:s3qldata shards:[4]
ec volume id:9577 collection:s3qldata shards:[9]
Disk hdd total size:378345005760 file_count:119036 deleted_file:5301 deleted_bytes:8574028334
DataNode 172.19.0.4:8707 total size:378345005760 file_count:119036 deleted_file:5301 deleted_bytes:8574028334
DataNode 172.19.0.5:8705 hdd(volume:3/983 active:3 free:980 remote:0)
Disk hdd(volume:3/983 active:3 free:980 remote:0)
ec volume id:10457 collection:s3qldata shards:[5]
ec volume id:12737 collection:s3qldata shards:[8]
ec volume id:9577 collection:s3qldata shards:[6]
Disk hdd total size:404653451288 file_count:126527 deleted_file:4789 deleted_bytes:8145619860
DataNode 172.19.0.5:8705 total size:404653451288 file_count:126527 deleted_file:4789 deleted_bytes:8145619860
DataNode 172.19.0.6:8713 hdd(volume:2/970 active:2 free:968 remote:0)
Disk hdd(volume:2/970 active:2 free:968 remote:0)
ec volume id:12737 collection:s3qldata shards:[11]
ec volume id:9577 collection:s3qldata shards:[8]
Disk hdd total size:401028073512 file_count:125448 deleted_file:4891 deleted_bytes:7914078769
DataNode 172.19.0.6:8713 total size:401028073512 file_count:125448 deleted_file:4891 deleted_bytes:7914078769
DataNode 172.19.0.8:8709 hdd(volume:5/2144 active:5 free:2139 remote:0)
Disk hdd(volume:5/2144 active:5 free:2139 remote:0)
volume id:6226 size:20871152 file_count:258 replica_placement:2 version:3 compact_revision:2 modified_at_second:1664888660
ec volume id:10457 collection:s3qldata shards:[2]
ec volume id:12737 collection:s3qldata shards:[2 12]
ec volume id:14322 collection:s3qldata shards:[1 13]
ec volume id:9577 collection:s3qldata shards:[13]
Disk hdd total size:1648662273096 file_count:507133 deleted_file:11386 deleted_bytes:23141702025
DataNode 172.19.0.8:8709 total size:1648662273096 file_count:507133 deleted_file:11386 deleted_bytes:23141702025
DataNode 172.19.0.9:8712 hdd(volume:4/2144 active:4 free:2140 remote:0)
Disk hdd(volume:4/2144 active:4 free:2140 remote:0)
ec volume id:10457 collection:s3qldata shards:[7]
ec volume id:12737 collection:s3qldata shards:[0]
ec volume id:14322 collection:s3qldata shards:[0 6]
ec volume id:9577 collection:s3qldata shards:[4]
Disk hdd total size:1629878746296 file_count:497502 deleted_file:11825 deleted_bytes:23550625989
DataNode 172.19.0.9:8712 total size:1629878746296 file_count:497502 deleted_file:11825 deleted_bytes:23550625989
Rack rack6 total size:17676186754616 file_count:5439969 deleted_file:127907 deleted_bytes:251707271029
DataCenter DefaultDataCenter total size:17676186754616 file_count:5439969 deleted_file:127907 deleted_bytes:251707271029
total size:17676186754616 file_count:5439969 deleted_file:127907 deleted_bytes:251707271029