Add grace period and retry logic for Fabric Card firmware upload

After an IOM reboot, the Redfish API starts responding before the inter-IOM
services are fully initialized. A Fabric Card upload started in that window
fails with
"Failed to send update package to other IOM; Couldn't connect to server".

Two fixes:
1. After _wait_for_iom_online confirms the IOM is back, wait an additional
   60s for services to fully initialize before proceeding.
2. Attempt the Fabric Card firmware upload up to 3 times in total (the
   initial try plus up to 2 retries), waiting 60s between attempts, to
   cover cases where services are still starting up.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-16 13:50:35 -04:00
parent 5c3a0b0ed6
commit b617f48829

View File

@@ -166,6 +166,8 @@ def _update_iom_fw(password: str, ip: str, iom: str, fw_path: str) -> bool:
time.sleep(30) # allow time for the IOM to begin shutting down before polling time.sleep(30) # allow time for the IOM to begin shutting down before polling
if _wait_for_iom_online(password, ip): if _wait_for_iom_online(password, ip):
ok(f"{iom} is back online.") ok(f"{iom} is back online.")
info("Allowing IOM services to fully initialize before next step...")
time.sleep(60)
else: else:
warn(f"{iom} did not respond within 5 minutes — proceeding anyway.") warn(f"{iom} did not respond within 5 minutes — proceeding anyway.")
return True return True
@@ -179,10 +181,25 @@ def _update_fabric_fw(password: str, ip: str, iom: str, fw_path: str) -> bool:
After the update: restart fabric card, then restart IOM. After the update: restart fabric card, then restart IOM.
""" """
sz = os.path.getsize(fw_path) sz = os.path.getsize(fw_path)
info(f"Uploading Fabric Card firmware ({sz // 1024} KB) to {iom} at {ip}...")
ok_flag, data = _redfish_upload_firmware(password, ip, fw_path) # Retry the upload — after an IOM reboot the inter-IOM services can take
# time to initialize, causing the first upload attempt to fail with
# "Failed to send update package to other IOM".
MAX_UPLOAD_ATTEMPTS = 3
ok_flag, data = False, ""
for attempt in range(1, MAX_UPLOAD_ATTEMPTS + 1):
info(f"Uploading Fabric Card firmware ({sz // 1024} KB) to {iom} at {ip}"
+ (f" (attempt {attempt}/{MAX_UPLOAD_ATTEMPTS})" if attempt > 1 else "") + "...")
ok_flag, data = _redfish_upload_firmware(password, ip, fw_path)
if ok_flag:
break
if attempt < MAX_UPLOAD_ATTEMPTS:
warn(f"Upload failed: {data}")
info("Waiting 60s for IOM services to finish initializing...")
time.sleep(60)
if not ok_flag: if not ok_flag:
error(f"Upload failed: {data}") error(f"Upload failed after {MAX_UPLOAD_ATTEMPTS} attempts: {data}")
return False return False
ok("Firmware file uploaded.") ok("Firmware file uploaded.")