diff --git a/modules/workflow_firmware.py b/modules/workflow_firmware.py index ac944be..af9e846 100644 --- a/modules/workflow_firmware.py +++ b/modules/workflow_firmware.py @@ -166,6 +166,8 @@ def _update_iom_fw(password: str, ip: str, iom: str, fw_path: str) -> bool: time.sleep(30) # allow time for the IOM to begin shutting down before polling if _wait_for_iom_online(password, ip): ok(f"{iom} is back online.") + info("Allowing IOM services to fully initialize before next step...") + time.sleep(60) else: warn(f"{iom} did not respond within 5 minutes — proceeding anyway.") return True @@ -179,10 +181,25 @@ def _update_fabric_fw(password: str, ip: str, iom: str, fw_path: str) -> bool: After the update: restart fabric card, then restart IOM. """ sz = os.path.getsize(fw_path) - info(f"Uploading Fabric Card firmware ({sz // 1024} KB) to {iom} at {ip}...") - ok_flag, data = _redfish_upload_firmware(password, ip, fw_path) + + # Retry the upload — after an IOM reboot the inter-IOM services can take + # time to initialize, causing the first upload attempt to fail with + # "Failed to send update package to other IOM". + MAX_UPLOAD_ATTEMPTS = 3 + ok_flag, data = False, "" + for attempt in range(1, MAX_UPLOAD_ATTEMPTS + 1): + info(f"Uploading Fabric Card firmware ({sz // 1024} KB) to {iom} at {ip}" + + (f" (attempt {attempt}/{MAX_UPLOAD_ATTEMPTS})" if attempt > 1 else "") + "...") + ok_flag, data = _redfish_upload_firmware(password, ip, fw_path) + if ok_flag: + break + if attempt < MAX_UPLOAD_ATTEMPTS: + warn(f"Upload failed: {data}") + info("Waiting 60s for IOM services to finish initializing...") + time.sleep(60) + if not ok_flag: - error(f"Upload failed: {data}") + error(f"Upload failed after {MAX_UPLOAD_ATTEMPTS} attempts: {data}") return False ok("Firmware file uploaded.")