diff --git a/cmd/background-newdisks-heal-ops.go b/cmd/background-newdisks-heal-ops.go index 1250cb603..a9d5e55e0 100644 --- a/cmd/background-newdisks-heal-ops.go +++ b/cmd/background-newdisks-heal-ops.go @@ -88,6 +88,8 @@ type healingTracker struct { ItemsSkipped uint64 BytesSkipped uint64 + + RetryAttempts uint64 // Add future tracking capabilities // Be sure that they are included in toHealingDisk } @@ -382,6 +384,8 @@ func getLocalDisksToHeal() (disksToHeal Endpoints) { var newDiskHealingTimeout = newDynamicTimeout(30*time.Second, 10*time.Second) +var errRetryHealing = errors.New("some items failed to heal, we will retry healing this drive again") + func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint) error { poolIdx, setIdx := endpoint.PoolIdx, endpoint.SetIdx disk := getStorageViaEndpoint(endpoint) @@ -451,8 +455,27 @@ func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint return err } - healingLogEvent(ctx, "Healing of drive '%s' is finished (healed: %d, skipped: %d, failed: %d).", disk, tracker.ItemsHealed, tracker.ItemsSkipped, tracker.ItemsFailed) + // if objects have failed healing, we attempt a retry to heal the drive upto 3 times before giving up. + if tracker.ItemsFailed > 0 && tracker.RetryAttempts < 4 { + tracker.RetryAttempts++ + bugLogIf(ctx, tracker.update(ctx)) + healingLogEvent(ctx, "Healing of drive '%s' is incomplete, retrying %s time (healed: %d, skipped: %d, failed: %d).", disk, + humanize.Ordinal(int(tracker.RetryAttempts)), tracker.ItemsHealed, tracker.ItemsSkipped, tracker.ItemsFailed) + return errRetryHealing + } + + if tracker.ItemsFailed > 0 { + healingLogEvent(ctx, "Healing of drive '%s' is incomplete, retried %d times (healed: %d, skipped: %d, failed: %d).", disk, + tracker.RetryAttempts-1, tracker.ItemsHealed, tracker.ItemsSkipped, tracker.ItemsFailed) + } else { + if tracker.RetryAttempts > 0 { + healingLogEvent(ctx, "Healing of drive '%s' is complete, retried %d times (healed: %d, skipped: %d).", disk, + tracker.RetryAttempts-1, tracker.ItemsHealed, tracker.ItemsSkipped) + } else { + healingLogEvent(ctx, "Healing of drive '%s' is finished (healed: %d, skipped: %d).", disk, tracker.ItemsHealed, tracker.ItemsSkipped) + } + } if serverDebugLog { tracker.printTo(os.Stdout) fmt.Printf("\n") @@ -524,7 +547,7 @@ func monitorLocalDisksAndHeal(ctx context.Context, z *erasureServerPools) { if err := healFreshDisk(ctx, z, disk); err != nil { globalBackgroundHealState.setDiskHealingStatus(disk, false) timedout := OperationTimedOut{} - if !errors.Is(err, context.Canceled) && !errors.As(err, &timedout) { + if !errors.Is(err, context.Canceled) && !errors.As(err, &timedout) && !errors.Is(err, errRetryHealing) { printEndpointError(disk, err, false) } return diff --git a/cmd/background-newdisks-heal-ops_gen.go b/cmd/background-newdisks-heal-ops_gen.go index 2cb507813..52350eb39 100644 --- a/cmd/background-newdisks-heal-ops_gen.go +++ b/cmd/background-newdisks-heal-ops_gen.go @@ -200,6 +200,12 @@ func (z *healingTracker) DecodeMsg(dc *msgp.Reader) (err error) { err = msgp.WrapError(err, "BytesSkipped") return } + case "RetryAttempts": + z.RetryAttempts, err = dc.ReadUint64() + if err != nil { + err = msgp.WrapError(err, "RetryAttempts") + return + } default: err = dc.Skip() if err != nil { @@ -213,9 +219,9 @@ func (z *healingTracker) DecodeMsg(dc *msgp.Reader) (err error) { // EncodeMsg implements msgp.Encodable func (z *healingTracker) EncodeMsg(en *msgp.Writer) (err error) { - // map header, size 25 + // map header, size 26 // write "ID" - err = en.Append(0xde, 0x0, 0x19, 0xa2, 0x49, 0x44) + err = en.Append(0xde, 0x0, 0x1a, 0xa2, 0x49, 0x44) if err != nil { return } @@ -478,15 +484,25 @@ func (z *healingTracker) EncodeMsg(en *msgp.Writer) (err error) { err = msgp.WrapError(err, "BytesSkipped") return } + // write "RetryAttempts" + err = en.Append(0xad, 0x52, 0x65, 0x74, 0x72, 0x79, 0x41, 0x74, 0x74, 0x65, 0x6d, 0x70, 0x74, 0x73) + if err != nil { + return + } + err = en.WriteUint64(z.RetryAttempts) + if err != nil { + err = msgp.WrapError(err, "RetryAttempts") + return + } return } // MarshalMsg implements msgp.Marshaler func (z *healingTracker) MarshalMsg(b []byte) (o []byte, err error) { o = msgp.Require(b, z.Msgsize()) - // map header, size 25 + // map header, size 26 // string "ID" - o = append(o, 0xde, 0x0, 0x19, 0xa2, 0x49, 0x44) + o = append(o, 0xde, 0x0, 0x1a, 0xa2, 0x49, 0x44) o = msgp.AppendString(o, z.ID) // string "PoolIndex" o = append(o, 0xa9, 0x50, 0x6f, 0x6f, 0x6c, 0x49, 0x6e, 0x64, 0x65, 0x78) @@ -566,6 +582,9 @@ func (z *healingTracker) MarshalMsg(b []byte) (o []byte, err error) { // string "BytesSkipped" o = append(o, 0xac, 0x42, 0x79, 0x74, 0x65, 0x73, 0x53, 0x6b, 0x69, 0x70, 0x70, 0x65, 0x64) o = msgp.AppendUint64(o, z.BytesSkipped) + // string "RetryAttempts" + o = append(o, 0xad, 0x52, 0x65, 0x74, 0x72, 0x79, 0x41, 0x74, 0x74, 0x65, 0x6d, 0x70, 0x74, 0x73) + o = msgp.AppendUint64(o, z.RetryAttempts) return } @@ -763,6 +782,12 @@ func (z *healingTracker) UnmarshalMsg(bts []byte) (o []byte, err error) { err = msgp.WrapError(err, "BytesSkipped") return } + case "RetryAttempts": + z.RetryAttempts, bts, err = msgp.ReadUint64Bytes(bts) + if err != nil { + err = msgp.WrapError(err, "RetryAttempts") + return + } default: bts, err = msgp.Skip(bts) if err != nil { @@ -785,6 +810,6 @@ func (z *healingTracker) Msgsize() (s int) { for za0002 := range z.HealedBuckets { s += msgp.StringPrefixSize + len(z.HealedBuckets[za0002]) } - s += 7 + msgp.StringPrefixSize + len(z.HealID) + 13 + msgp.Uint64Size + 13 + msgp.Uint64Size + s += 7 + msgp.StringPrefixSize + len(z.HealID) + 13 + msgp.Uint64Size + 13 + msgp.Uint64Size + 14 + msgp.Uint64Size return }