Display details upon failures to re-balance EC shards across racks. (#6299)

Lisandro Pin 2024-11-28 17:42:41 +01:00 committed by GitHub
parent 559a1fd0f4
commit 9a741a61b1
2 changed files with 18 additions and 12 deletions


@@ -507,16 +507,17 @@ func doBalanceEcShardsAcrossRacks(commandEnv *CommandEnv, collection string, vid
 	for shardId, ecNode := range ecShardsToMove {
 		// TODO: consider volume replica info when balancing racks
-		rackId := pickRackToBalanceShardsInto(racks, rackToShardCount, nil, averageShardsPerEcRack)
-		if rackId == "" {
-			fmt.Printf("ec shard %d.%d at %s can not find a destination rack\n", vid, shardId, ecNode.info.Id)
+		rackId, err := pickRackToBalanceShardsInto(racks, rackToShardCount, nil, averageShardsPerEcRack)
+		if err != nil {
+			fmt.Printf("ec shard %d.%d at %s can not find a destination rack:\n%s\n", vid, shardId, ecNode.info.Id, err.Error())
 			continue
 		}
 		var possibleDestinationEcNodes []*EcNode
 		for _, n := range racks[rackId].ecNodes {
 			possibleDestinationEcNodes = append(possibleDestinationEcNodes, n)
 		}
-		err := pickOneEcNodeAndMoveOneShard(commandEnv, averageShardsPerEcRack, ecNode, collection, vid, shardId, possibleDestinationEcNodes, applyBalancing)
+		err = pickOneEcNodeAndMoveOneShard(commandEnv, averageShardsPerEcRack, ecNode, collection, vid, shardId, possibleDestinationEcNodes, applyBalancing)
 		if err != nil {
 			return err
 		}
@@ -529,8 +530,7 @@ func doBalanceEcShardsAcrossRacks(commandEnv *CommandEnv, collection string, vid
 	return nil
 }
 
-// TOOD: Return an error with details upon failure to resolve a destination rack.
-func pickRackToBalanceShardsInto(rackToEcNodes map[RackId]*EcRack, rackToShardCount map[string]int, replicaPlacement *super_block.ReplicaPlacement, averageShardsPerEcRack int) RackId {
+func pickRackToBalanceShardsInto(rackToEcNodes map[RackId]*EcRack, rackToShardCount map[string]int, replicaPlacement *super_block.ReplicaPlacement, averageShardsPerEcRack int) (RackId, error) {
 	targets := []RackId{}
 	targetShards := -1
 	for _, shards := range rackToShardCount {
@@ -539,19 +539,20 @@ func pickRackToBalanceShardsInto(rackToEcNodes map[RackId]*EcRack, rackToShardCo
 		}
 	}
+	details := ""
 	for rackId, rack := range rackToEcNodes {
 		shards := rackToShardCount[string(rackId)]
 		if rack.freeEcSlot <= 0 {
-			// No EC shards slots left :(
+			details += fmt.Sprintf(" Skipped %s because it has no free slots\n", rackId)
 			continue
 		}
 		if replicaPlacement != nil && shards >= replicaPlacement.DiffRackCount {
-			// Don't select racks with more EC shards for the target volume than the replicaton limit.
+			details += fmt.Sprintf(" Skipped %s because shards %d >= replica placement limit for other racks (%d)\n", rackId, shards, replicaPlacement.DiffRackCount)
 			continue
 		}
 		if shards >= averageShardsPerEcRack {
-			// Keep EC shards across racks as balanced as possible.
+			details += fmt.Sprintf(" Skipped %s because shards %d >= averageShards (%d)\n", rackId, shards, averageShardsPerEcRack)
 			continue
 		}
 		if shards < targetShards {
@@ -565,9 +566,9 @@ func pickRackToBalanceShardsInto(rackToEcNodes map[RackId]*EcRack, rackToShardCo
 	}
 	if len(targets) == 0 {
-		return ""
+		return "", errors.New(details)
 	}
-	return targets[rand.IntN(len(targets))]
+	return targets[rand.IntN(len(targets))], nil
 }
 
 func balanceEcShardsWithinRacks(commandEnv *CommandEnv, allEcNodes []*EcNode, racks map[RackId]*EcRack, collection string, applyBalancing bool) error {
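
For readers skimming the diff, the change above boils down to one pattern: every skipped rack appends a human-readable reason to a details string, and if no candidate remains, those reasons come back to the caller as the error. Below is a minimal, self-contained sketch of that pattern with hypothetical names (pickRack, shardsPerRack); it deliberately omits the free-slot and replica-placement checks of the real helper.

// Sketch only: a simplified stand-in for pickRackToBalanceShardsInto,
// showing how skip reasons are accumulated and surfaced as an error.
package main

import (
	"errors"
	"fmt"
	"math/rand/v2"
)

func pickRack(shardsPerRack map[string]int, averageShardsPerRack int) (string, error) {
	details := ""
	candidates := []string{}
	for rack, shards := range shardsPerRack {
		if shards >= averageShardsPerRack {
			// Mirror of the real helper: record why this rack was skipped.
			details += fmt.Sprintf(" Skipped %s because shards %d >= averageShards (%d)\n", rack, shards, averageShardsPerRack)
			continue
		}
		candidates = append(candidates, rack)
	}
	if len(candidates) == 0 {
		// No destination rack; hand the collected reasons back to the caller.
		return "", errors.New(details)
	}
	return candidates[rand.IntN(len(candidates))], nil
}

func main() {
	if rack, err := pickRack(map[string]int{"rack1": 5, "rack2": 7}, 4); err != nil {
		fmt.Printf("can not find a destination rack:\n%s", err.Error())
	} else {
		fmt.Println("picked", rack)
	}
}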


@@ -122,7 +122,12 @@ func TestPickRackToBalanceShardsInto(t *testing.T) {
 		rackToShardCount := countShardsByRack(vid, locations)
 		averageShardsPerEcRack := ceilDivide(erasure_coding.TotalShardsCount, len(racks))
 
-		got := pickRackToBalanceShardsInto(racks, rackToShardCount, nil, averageShardsPerEcRack)
+		got, gotErr := pickRackToBalanceShardsInto(racks, rackToShardCount, nil, averageShardsPerEcRack)
+		if gotErr != nil {
+			t.Errorf("volume %q: %s", tc.vid, gotErr.Error())
+			continue
+		}
 		if string(got) == "" && len(tc.wantOneOf) == 0 {
 			continue
 		}
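
Beyond the updated happy-path assertion above, a natural follow-up (hypothetical, not part of this commit) would be a test that drives the no-candidate path and checks that the returned error actually carries the skip details. A sketch against the simplified pickRack helper from the earlier example, assuming the standard testing and strings packages are imported:

func TestPickRackNoCandidates(t *testing.T) {
	// Every rack already holds at least the average number of shards,
	// so pickRack should fail and explain why each rack was skipped.
	_, err := pickRack(map[string]int{"rack1": 4, "rack2": 6}, 4)
	if err == nil {
		t.Fatal("expected an error when no rack qualifies")
	}
	for _, rack := range []string{"rack1", "rack2"} {
		if !strings.Contains(err.Error(), "Skipped "+rack) {
			t.Errorf("error should mention %s, got: %q", rack, err.Error())
		}
	}
}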