Add grace period and retry logic for Fabric Card firmware upload
After an IOM reboot, the Redfish API responds before inter-IOM services are fully initialized. This causes the Fabric Card upload to fail with "Failed to send update package to other IOM; Couldn't connect to server".

Two fixes:
1. After `_wait_for_iom_online` confirms the IOM is back, wait an additional 60s for services to fully initialize before proceeding.
2. Retry the Fabric Card firmware upload up to 3 times with 60s between attempts, covering cases where services are still starting up.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -166,6 +166,8 @@ def _update_iom_fw(password: str, ip: str, iom: str, fw_path: str) -> bool:
|
|||||||
time.sleep(30) # allow time for the IOM to begin shutting down before polling
|
time.sleep(30) # allow time for the IOM to begin shutting down before polling
|
||||||
if _wait_for_iom_online(password, ip):
|
if _wait_for_iom_online(password, ip):
|
||||||
ok(f"{iom} is back online.")
|
ok(f"{iom} is back online.")
|
||||||
|
info("Allowing IOM services to fully initialize before next step...")
|
||||||
|
time.sleep(60)
|
||||||
else:
|
else:
|
||||||
warn(f"{iom} did not respond within 5 minutes — proceeding anyway.")
|
warn(f"{iom} did not respond within 5 minutes — proceeding anyway.")
|
||||||
return True
|
return True
|
||||||
@@ -179,10 +181,25 @@ def _update_fabric_fw(password: str, ip: str, iom: str, fw_path: str) -> bool:
|
|||||||
After the update: restart fabric card, then restart IOM.
|
After the update: restart fabric card, then restart IOM.
|
||||||
"""
|
"""
|
||||||
sz = os.path.getsize(fw_path)
|
sz = os.path.getsize(fw_path)
|
||||||
info(f"Uploading Fabric Card firmware ({sz // 1024} KB) to {iom} at {ip}...")
|
|
||||||
|
# Retry the upload — after an IOM reboot the inter-IOM services can take
|
||||||
|
# time to initialize, causing the first upload attempt to fail with
|
||||||
|
# "Failed to send update package to other IOM".
|
||||||
|
MAX_UPLOAD_ATTEMPTS = 3
|
||||||
|
ok_flag, data = False, ""
|
||||||
|
for attempt in range(1, MAX_UPLOAD_ATTEMPTS + 1):
|
||||||
|
info(f"Uploading Fabric Card firmware ({sz // 1024} KB) to {iom} at {ip}"
|
||||||
|
+ (f" (attempt {attempt}/{MAX_UPLOAD_ATTEMPTS})" if attempt > 1 else "") + "...")
|
||||||
ok_flag, data = _redfish_upload_firmware(password, ip, fw_path)
|
ok_flag, data = _redfish_upload_firmware(password, ip, fw_path)
|
||||||
|
if ok_flag:
|
||||||
|
break
|
||||||
|
if attempt < MAX_UPLOAD_ATTEMPTS:
|
||||||
|
warn(f"Upload failed: {data}")
|
||||||
|
info("Waiting 60s for IOM services to finish initializing...")
|
||||||
|
time.sleep(60)
|
||||||
|
|
||||||
if not ok_flag:
|
if not ok_flag:
|
||||||
error(f"Upload failed: {data}")
|
error(f"Upload failed after {MAX_UPLOAD_ATTEMPTS} attempts: {data}")
|
||||||
return False
|
return False
|
||||||
ok("Firmware file uploaded.")
|
ok("Firmware file uploaded.")
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user