Add grace period and retry logic for Fabric Card firmware upload
After an IOM reboot, the Redfish API responds before inter-IOM services are fully initialized. This causes the Fabric Card upload to fail with "Failed to send update package to other IOM; Couldn't connect to server".

Two fixes:
1. After _wait_for_iom_online confirms the IOM is back, wait an additional 60s for services to fully initialize before proceeding.
2. Retry the Fabric Card firmware upload up to 3 times with 60s between attempts, covering cases where services are still starting up.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -166,6 +166,8 @@ def _update_iom_fw(password: str, ip: str, iom: str, fw_path: str) -> bool:
|
||||
time.sleep(30) # allow time for the IOM to begin shutting down before polling
|
||||
if _wait_for_iom_online(password, ip):
|
||||
ok(f"{iom} is back online.")
|
||||
info("Allowing IOM services to fully initialize before next step...")
|
||||
time.sleep(60)
|
||||
else:
|
||||
warn(f"{iom} did not respond within 5 minutes — proceeding anyway.")
|
||||
return True
|
||||
@@ -179,10 +181,25 @@ def _update_fabric_fw(password: str, ip: str, iom: str, fw_path: str) -> bool:
|
||||
After the update: restart fabric card, then restart IOM.
|
||||
"""
|
||||
sz = os.path.getsize(fw_path)
|
||||
info(f"Uploading Fabric Card firmware ({sz // 1024} KB) to {iom} at {ip}...")
|
||||
|
||||
# Retry the upload — after an IOM reboot the inter-IOM services can take
|
||||
# time to initialize, causing the first upload attempt to fail with
|
||||
# "Failed to send update package to other IOM".
|
||||
MAX_UPLOAD_ATTEMPTS = 3
|
||||
ok_flag, data = False, ""
|
||||
for attempt in range(1, MAX_UPLOAD_ATTEMPTS + 1):
|
||||
info(f"Uploading Fabric Card firmware ({sz // 1024} KB) to {iom} at {ip}"
|
||||
+ (f" (attempt {attempt}/{MAX_UPLOAD_ATTEMPTS})" if attempt > 1 else "") + "...")
|
||||
ok_flag, data = _redfish_upload_firmware(password, ip, fw_path)
|
||||
if ok_flag:
|
||||
break
|
||||
if attempt < MAX_UPLOAD_ATTEMPTS:
|
||||
warn(f"Upload failed: {data}")
|
||||
info("Waiting 60s for IOM services to finish initializing...")
|
||||
time.sleep(60)
|
||||
|
||||
if not ok_flag:
|
||||
error(f"Upload failed: {data}")
|
||||
error(f"Upload failed after {MAX_UPLOAD_ATTEMPTS} attempts: {data}")
|
||||
return False
|
||||
ok("Firmware file uploaded.")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user