avoid ListBuckets returning quorum errors when node is down (#10555)

Also, revamp the way ListBuckets works and make a few portions
of the healing logic parallel

- walk objects for healing disks in parallel
- collect the list of buckets in parallel across drives
- provide a consistent view for listBuckets()
This commit is contained in:
Harshavardhana
2020-09-24 09:53:38 -07:00
committed by GitHub
parent d778d034e7
commit ca989eb0b3
8 changed files with 78 additions and 126 deletions

View File

@@ -153,11 +153,22 @@ func monitorLocalDisksAndHeal(ctx context.Context, z *erasureZones, bgSeq *healS
for _, ep := range endpoints {
logger.Info("Healing disk '%s' on %s zone", ep, humanize.Ordinal(i+1))
if err := healErasureSet(ctx, setIndex, z.zones[i].sets[setIndex], z.zones[i].setDriveCount); err != nil {
buckets, err := z.ListBucketsHeal(ctx)
if err != nil {
logger.LogIf(ctx, err)
continue
}
if len(buckets) > 0 {
disks := z.zones[i].sets[setIndex].getLoadBalancedDisks()
if err := healErasureSet(ctx, setIndex, buckets, disks, z.zones[i].setDriveCount); err != nil {
logger.LogIf(ctx, err)
continue
}
}
logger.Info("Healing disk '%s' on %s zone complete", ep, humanize.Ordinal(i+1))
// Only upon success pop the healed disk.
globalBackgroundHealState.popHealLocalDisks(ep)
}