ci(studio-mac): retry composer.wait_for after change-password redirect

Mac Studio UI / Chat UI Tests on commit 81534ddd timed out 60s into
composer.wait_for(state='visible') right after the change-password
form submit (run 25552964008 / job 75005076366). This is the same
renderer-crash-kills-the-context pattern that --single-process
Chromium exposes on the macos-14 free runner.

Make the wait robust against both failure modes (composer still
suspending, page object dead from renderer crash):

1. Settle the network with wait_for_load_state('networkidle', 30s)
   before looking for the textarea, so the post-submit React
   redirect has a chance to land.

2. Wrap composer.wait_for in a 2-attempt loop. On first failure,
   dump page.url + page_errors + console_errors counts + first
   message of each, screenshot, then either spawn a fresh page
   in the same context (if page.is_closed()) or page.goto(BASE)
   with wait_until='domcontentloaded'.

3. If both attempts fail, raise the original exception so CI
   still sees a meaningful TimeoutError / TargetClosedError with
   the recovery diagnostics already on stdout.

The same hardening is applied to playwright_extra_ui.py, which has
the same change-password -> composer pattern.
This commit is contained in:
Daniel Han 2026-05-08 11:39:38 +00:00
parent 420f588205
commit 3274f72039
2 changed files with 116 additions and 2 deletions

View file

@ -296,8 +296,74 @@ with sync_playwright() as p:
# 2. Chat surface mounts, default model surface is visible.
# ─────────────────────────────────────────────────────
step("wait for composer to mount")
# The change-password POST resolves async and the React router
# rebuilds the tree (login form -> chat shell) on success. On
# macos-14 free runners under --single-process Chromium, the
# rebuild is heavy enough under software rendering that one of
# two things happens if we race straight into wait_for():
#   (a) the composer textarea is still suspending and we burn
#       the 60s ceiling waiting for it to mount, or
#   (b) the renderer crashes mid-mount, which under
#       --single-process takes the entire context down (next
#       Playwright call returns TargetClosedError).
# Defend against both: settle network first, then attempt
# wait_for with one recovery cycle on failure.
try:
    page.wait_for_load_state("networkidle", timeout = 30_000)
except Exception:
    pass # best-effort -- proceed even if network never idles
composer = page.locator('textarea[aria-label="Message input"]')
# Deliberately no bare composer.wait_for() here: an unguarded wait
# ahead of the loop would raise on the first timeout / closed target
# and skip the diagnostics and the recovery cycle entirely.
last_err: Exception | None = None
for _attempt in range(2):
    try:
        composer.wait_for(state = "visible", timeout = 60_000)
        last_err = None
        break
    except Exception as e:
        # Remember the failure so it can be re-raised if the second
        # attempt fails too.
        last_err = e
        # Reading page.url may itself raise once the page is gone.
        try:
            cur_url = page.url
        except Exception:
            cur_url = "<page closed>"
        print(
            f"[ui] composer.wait_for attempt {_attempt + 1} failed: "
            f"{type(e).__name__}: {str(e)[:200]}; page.url={cur_url}; "
            f"page_errors={len(page_errors)} console_errors={len(console_errors)}",
            flush = True,
        )
        if console_errors:
            print(f"[ui] first console.error: {console_errors[0][:200]!r}", flush = True)
        if page_errors:
            print(f"[ui] first pageerror: {page_errors[0][:200]!r}", flush = True)
        # Screenshot is best-effort; it cannot succeed on a dead page.
        try:
            shoot(f"03-composer-wait-attempt-{_attempt + 1}-fail")
        except Exception:
            pass
        if _attempt == 0:
            # Recovery: re-navigate. If the page died (renderer
            # gone under --single-process) we open a fresh page in
            # the same context so the auth state in localStorage
            # survives; in either case we then goto BASE to force
            # a clean re-render.
            # NOTE(review): a fresh page object will not carry event
            # listeners registered on the old page -- confirm whether
            # page_errors / console_errors need re-attaching.
            try:
                if page.is_closed():
                    page = ctx.new_page()
                    page.set_default_timeout(60_000)
                page.goto(BASE, wait_until = "domcontentloaded", timeout = 60_000)
                try:
                    page.wait_for_load_state("networkidle", timeout = 30_000)
                except Exception:
                    pass # best-effort, same as the initial settle
                # Re-resolve the locator against the (possibly new) page.
                composer = page.locator('textarea[aria-label="Message input"]')
            except Exception as recover_err:
                # A failed recovery is only logged; the second attempt
                # still runs and surfaces the real error if it fails.
                print(
                    f"[ui] recovery navigation failed: "
                    f"{type(recover_err).__name__}: {str(recover_err)[:200]}",
                    flush = True,
                )
# Both attempts failed: surface the last exception so CI sees the
# real TimeoutError / TargetClosedError after the diagnostics above.
if last_err is not None:
    raise last_err
shoot("03-chat-loaded")
# Pull the auth token now -- /api/models/list and

View file

@ -200,8 +200,56 @@ with sync_playwright() as p:
pw_field.fill(NEW, timeout = 60_000)
page.fill("#confirm-password", NEW, timeout = 60_000)
page.locator('button[type="submit"]').click()
# Same defense-in-depth as playwright_chat_ui.py: settle network,
# then wait_for with one recovery cycle. The post-submit React
# re-render can either leave the composer suspending or crash the
# renderer outright under --single-process Chromium on macos-14.
try:
    page.wait_for_load_state("networkidle", timeout = 30_000)
except Exception:
    pass # best-effort -- proceed even if network never idles
composer = page.locator('textarea[aria-label="Message input"]')
# Deliberately no bare composer.wait_for() here: an unguarded wait
# ahead of the loop would raise on the first timeout / closed target
# and skip the diagnostics and the recovery cycle entirely.
last_err: Exception | None = None
for _attempt in range(2):
    try:
        composer.wait_for(state = "visible", timeout = 60_000)
        last_err = None
        break
    except Exception as e:
        # Remember the failure so it can be re-raised if the second
        # attempt fails too.
        last_err = e
        # Reading page.url may itself raise once the page is gone.
        try:
            cur_url = page.url
        except Exception:
            cur_url = "<page closed>"
        print(
            f"[extra-ui] composer.wait_for attempt {_attempt + 1} failed: "
            f"{type(e).__name__}: {str(e)[:200]}; page.url={cur_url}; "
            f"page_errors={len(page_errors)}",
            flush = True,
        )
        # Screenshot is best-effort; it cannot succeed on a dead page.
        try:
            shoot(f"01-composer-wait-attempt-{_attempt + 1}-fail")
        except Exception:
            pass
        if _attempt == 0:
            # Recovery: open a fresh page in the same context if the
            # old one is closed, then goto BASE to force a clean
            # re-render before the second attempt.
            # NOTE(review): a fresh page object will not carry event
            # listeners registered on the old page -- confirm whether
            # page_errors needs re-attaching.
            try:
                if page.is_closed():
                    page = ctx.new_page()
                    page.set_default_timeout(60_000)
                page.goto(BASE, wait_until = "domcontentloaded", timeout = 60_000)
                try:
                    page.wait_for_load_state("networkidle", timeout = 30_000)
                except Exception:
                    pass # best-effort, same as the initial settle
                # Re-resolve the locator against the (possibly new) page.
                composer = page.locator('textarea[aria-label="Message input"]')
            except Exception as recover_err:
                # A failed recovery is only logged; the second attempt
                # still runs and surfaces the real error if it fails.
                print(
                    f"[extra-ui] recovery navigation failed: "
                    f"{type(recover_err).__name__}: {str(recover_err)[:200]}",
                    flush = True,
                )
# Both attempts failed: surface the last exception so CI sees the
# real TimeoutError / TargetClosedError after the diagnostics above.
if last_err is not None:
    raise last_err
shoot("01-chat-loaded")
token = page.evaluate("() => localStorage.getItem('unsloth_auth_token')")