From 66316aba38adea609a7d9635fe572d33aa95bc2d Mon Sep 17 00:00:00 2001 From: iamtoruk Date: Sat, 9 May 2026 14:27:48 -0700 Subject: [PATCH 1/2] Fix menubar stuck loading with non-blocking pipe I/O and watchdog Replace blocking availableData drain with non-blocking POSIX read that respects Task cancellation. Handle EINTR from child SIGCHLD, close pipe fds after drain to prevent deadlock on oversized output, and escalate SIGTERM to SIGKILL after 0.5s grace period. Add 60-second loading watchdog as safety net that auto-clears stuck state on each refresh loop tick. Fixes #282 --- mac/Sources/CodeBurnMenubar/AppStore.swift | 19 ++++- mac/Sources/CodeBurnMenubar/CodeBurnApp.swift | 1 + .../CodeBurnMenubar/Data/DataClient.swift | 83 ++++++++++--------- 3 files changed, 65 insertions(+), 38 deletions(-) diff --git a/mac/Sources/CodeBurnMenubar/AppStore.swift b/mac/Sources/CodeBurnMenubar/AppStore.swift index 1d7ad6a..f0c65a6 100644 --- a/mac/Sources/CodeBurnMenubar/AppStore.swift +++ b/mac/Sources/CodeBurnMenubar/AppStore.swift @@ -27,6 +27,7 @@ final class AppStore { var currency: String = "USD" var isLoading: Bool { loadingCount > 0 } private var loadingCount: Int = 0 + private var loadingStartedAt: Date? var lastError: String? var subscription: SubscriptionUsage? var subscriptionError: String? 
@@ -131,9 +132,21 @@ final class AppStore { func resetLoadingState() { loadingCount = 0 + loadingStartedAt = nil inFlightKeys.removeAll() } + private let loadingWatchdogSeconds: TimeInterval = 60 + + @discardableResult + func clearStaleLoadingIfNeeded() -> Bool { + guard isLoading, let started = loadingStartedAt, + Date().timeIntervalSince(started) > loadingWatchdogSeconds else { return false } + NSLog("CodeBurn: loading stuck for %ds — auto-clearing", Int(Date().timeIntervalSince(started))) + resetLoadingState() + return true + } + private func invalidateStaleDayCache() { let formatter = DateFormatter() formatter.dateFormat = "yyyy-MM-dd" @@ -157,6 +170,7 @@ final class AppStore { inFlightKeys.insert(key) let didShowLoading = showLoading || cache[key] == nil if didShowLoading { + if loadingCount == 0 { loadingStartedAt = Date() } loadingCount += 1 } // Diagnostic anchor: if this key has been empty for a long time (the @@ -172,7 +186,10 @@ final class AppStore { } defer { inFlightKeys.remove(key) - if didShowLoading { loadingCount = max(loadingCount - 1, 0) } + if didShowLoading { + loadingCount = max(loadingCount - 1, 0) + if loadingCount == 0 { loadingStartedAt = nil } + } } do { let fresh = try await DataClient.fetch(period: key.period, provider: key.provider, includeOptimize: includeOptimize) diff --git a/mac/Sources/CodeBurnMenubar/CodeBurnApp.swift b/mac/Sources/CodeBurnMenubar/CodeBurnApp.swift index 899f273..f7e57a0 100644 --- a/mac/Sources/CodeBurnMenubar/CodeBurnApp.swift +++ b/mac/Sources/CodeBurnMenubar/CodeBurnApp.swift @@ -259,6 +259,7 @@ final class AppDelegate: NSObject, NSApplicationDelegate, NSPopoverDelegate { } while !Task.isCancelled { guard let self else { return } + self.store.clearStaleLoadingIfNeeded() // Skip the loop's tick if a wake / manual / distributed- // notification refresh just ran. 
Without this gate, every // wake produced two refreshes (forceRefresh from the wake diff --git a/mac/Sources/CodeBurnMenubar/Data/DataClient.swift b/mac/Sources/CodeBurnMenubar/Data/DataClient.swift index e18a03a..4b0083c 100644 --- a/mac/Sources/CodeBurnMenubar/Data/DataClient.swift +++ b/mac/Sources/CodeBurnMenubar/Data/DataClient.swift @@ -61,41 +61,27 @@ struct DataClient { throw DataClientError.spawn(error.localizedDescription) } - // Wall-clock timeout: if the CLI hangs (parser stuck, disk stall), kill it. - // Log when this fires so a recurring stuck-popover state has an actual - // diagnostic — historically users saw "Loading..." forever with no signal - // about what failed; the only way to debug was to read process state at - // the wrong time. The log line names the subcommand so we can correlate - // with a specific period/provider combination. let timeoutTask = Task.detached(priority: .utility) { try? await Task.sleep(nanoseconds: spawnTimeoutSeconds * 1_000_000_000) if process.isRunning { NSLog("CodeBurn: CLI subprocess timed out after %llus for %@ — terminating", spawnTimeoutSeconds, subcommand.joined(separator: " ")) - process.terminate() + terminateWithEscalation(process) } } defer { timeoutTask.cancel() } - // If the caller cancels its Task (rapid period/provider tab clicks - // cancel switchTask in AppStore), terminate the in-flight subprocess. - // Without this the cancelled Task returns immediately but the spawned - // CLI keeps running to completion, piling up zombie codeburn processes - // on rapid UI interactions. We hold a strong reference to the Process - // in the cancellation handler so the closure can find it even if the - // surrounding scope has gone async. + let outHandle = outPipe.fileHandleForReading + let errHandle = errPipe.fileHandleForReading let (out, err) = await withTaskCancellationHandler { - // Drain both pipes concurrently so a large stderr can't deadlock stdout - // (the child blocks on write once the pipe buffer fills). 
`drain` - // also enforces a byte cap. - async let stdoutData = drain(outPipe.fileHandleForReading, limit: maxPayloadBytes) - async let stderrData = drain(errPipe.fileHandleForReading, limit: maxStderrBytes) + async let stdoutData = drain(outHandle, limit: maxPayloadBytes) + async let stderrData = drain(errHandle, limit: maxStderrBytes) return await (stdoutData, stderrData) } onCancel: { - if process.isRunning { - process.terminate() - } + terminateWithEscalation(process) } + try? outHandle.close() + try? errHandle.close() process.waitUntilExit() if out.count >= maxPayloadBytes { @@ -106,22 +92,45 @@ struct DataClient { return ProcessResult(stdout: out, stderr: stderrString, exitCode: process.terminationStatus) } - /// Pulls bytes off a pipe until EOF or `limit`. Intentionally uses `availableData`, which - /// returns empty on EOF -- no blocking once the child exits. + private static func terminateWithEscalation(_ process: Process) { + guard process.isRunning else { return } + process.terminate() + let pid = process.processIdentifier + DispatchQueue.global(qos: .utility).asyncAfter(deadline: .now() + 0.5) { + if process.isRunning { kill(pid, SIGKILL) } + } + } + private static func drain(_ handle: FileHandle, limit: Int) async -> Data { - await Task.detached(priority: .utility) { - var buffer = Data() - while buffer.count < limit { - let chunk = handle.availableData - if chunk.isEmpty { break } - let remaining = limit - buffer.count - if chunk.count > remaining { - buffer.append(chunk.prefix(remaining)) - break - } - buffer.append(chunk) + let fd = handle.fileDescriptor + let flags = Darwin.fcntl(fd, F_GETFL) + if flags >= 0 { + _ = Darwin.fcntl(fd, F_SETFL, flags | O_NONBLOCK) + } else { + NSLog("CodeBurn: fcntl F_GETFL failed on fd %d, drain may block", fd) + } + + var buffer = Data() + var chunk = [UInt8](repeating: 0, count: 65_536) + + while buffer.count < limit && !Task.isCancelled { + let toRead = min(chunk.count, limit - buffer.count) + let n = 
chunk.withUnsafeMutableBufferPointer { ptr in + Darwin.read(fd, ptr.baseAddress!, toRead) } - return buffer - }.value + if n > 0 { + buffer.append(contentsOf: chunk.prefix(n)) + } else if n == 0 { + break + } else if errno == EAGAIN || errno == EWOULDBLOCK { + try? await Task.sleep(nanoseconds: 5_000_000) + } else if errno == EINTR { + continue + } else { + NSLog("CodeBurn: drain read() failed on fd %d: errno %d", fd, errno) + break + } + } + return buffer } } From d79deefaae79e0d5a16abbd1f700962909b90b7c Mon Sep 17 00:00:00 2001 From: iamtoruk Date: Sun, 10 May 2026 03:30:56 -0700 Subject: [PATCH 2/2] Fix menubar refresh recovery deadlock --- mac/Sources/CodeBurnMenubar/AppStore.swift | 72 ++++++++++++++----- mac/Sources/CodeBurnMenubar/CodeBurnApp.swift | 37 +++++++++- .../Views/MenuBarContent.swift | 11 ++- 3 files changed, 97 insertions(+), 23 deletions(-) diff --git a/mac/Sources/CodeBurnMenubar/AppStore.swift b/mac/Sources/CodeBurnMenubar/AppStore.swift index f0c65a6..498f54c 100644 --- a/mac/Sources/CodeBurnMenubar/AppStore.swift +++ b/mac/Sources/CodeBurnMenubar/AppStore.swift @@ -25,10 +25,14 @@ final class AppStore { } var showingAccentPicker: Bool = false var currency: String = "USD" - var isLoading: Bool { loadingCount > 0 } - private var loadingCount: Int = 0 - private var loadingStartedAt: Date? - var lastError: String? + var isLoading: Bool { loadingCountsByKey.values.contains { $0 > 0 } } + var isCurrentKeyLoading: Bool { loadingCountsByKey[currentKey, default: 0] > 0 } + var hasAttemptedCurrentKeyLoad: Bool { attemptedKeys.contains(currentKey) } + var lastError: String? { lastErrorByKey[currentKey] } + private var loadingCountsByKey: [PayloadCacheKey: Int] = [:] + private var loadingStartedAtByKey: [PayloadCacheKey: Date] = [:] + private var attemptedKeys: Set<PayloadCacheKey> = [] + private var lastErrorByKey: [PayloadCacheKey: String] = [:] var subscription: SubscriptionUsage? var subscriptionError: String? 
var subscriptionLoadState: SubscriptionLoadState = ClaudeCredentialStore.isBootstrapCompleted ? .loading : .notBootstrapped @@ -131,8 +135,8 @@ final class AppStore { private var inFlightKeys: Set<PayloadCacheKey> = [] func resetLoadingState() { - loadingCount = 0 - loadingStartedAt = nil + loadingCountsByKey.removeAll() + loadingStartedAtByKey.removeAll() inFlightKeys.removeAll() } @@ -140,13 +144,42 @@ final class AppStore { @discardableResult func clearStaleLoadingIfNeeded() -> Bool { - guard isLoading, let started = loadingStartedAt, - Date().timeIntervalSince(started) > loadingWatchdogSeconds else { return false } - NSLog("CodeBurn: loading stuck for %ds — auto-clearing", Int(Date().timeIntervalSince(started))) - resetLoadingState() + let now = Date() + let staleEntries = loadingStartedAtByKey.filter { + now.timeIntervalSince($0.value) > loadingWatchdogSeconds + } + guard !staleEntries.isEmpty else { return false } + + for (key, started) in staleEntries { + NSLog("CodeBurn: loading stuck for %ds on %@/%@ — auto-clearing", + Int(now.timeIntervalSince(started)), key.period.rawValue, key.provider.rawValue) + loadingCountsByKey[key] = nil + loadingStartedAtByKey[key] = nil + inFlightKeys.remove(key) + if cache[key] == nil { + lastErrorByKey[key] = "Refresh took longer than expected. CodeBurn will keep retrying in the background." 
+ } + } return true } + private func beginLoading(for key: PayloadCacheKey) { + if loadingCountsByKey[key, default: 0] == 0 { + loadingStartedAtByKey[key] = Date() + } + loadingCountsByKey[key, default: 0] += 1 + } + + private func finishLoading(for key: PayloadCacheKey) { + guard let count = loadingCountsByKey[key], count > 0 else { return } + if count == 1 { + loadingCountsByKey[key] = nil + loadingStartedAtByKey[key] = nil + } else { + loadingCountsByKey[key] = count - 1 + } + } + private func invalidateStaleDayCache() { let formatter = DateFormatter() formatter.dateFormat = "yyyy-MM-dd" @@ -168,10 +201,11 @@ final class AppStore { if !force, cache[key]?.isFresh == true { return } if !force, inFlightKeys.contains(key) { return } inFlightKeys.insert(key) + attemptedKeys.insert(key) + lastErrorByKey[key] = nil let didShowLoading = showLoading || cache[key] == nil if didShowLoading { - if loadingCount == 0 { loadingStartedAt = Date() } - loadingCount += 1 + beginLoading(for: key) } // Diagnostic anchor: if this key has been empty for a long time (the // popover would currently be showing "Loading..."), log how stale the @@ -187,8 +221,7 @@ final class AppStore { defer { inFlightKeys.remove(key) if didShowLoading { - loadingCount = max(loadingCount - 1, 0) - if loadingCount == 0 { loadingStartedAt = nil } + finishLoading(for: key) } } do { @@ -211,7 +244,7 @@ final class AppStore { } cache[key] = CachedPayload(payload: fresh, fetchedAt: Date()) lastSuccessByKey[key] = Date() - lastError = nil + lastErrorByKey[key] = nil } catch { if Task.isCancelled { return } NSLog("CodeBurn: fetch failed for \(key.period.rawValue)/\(key.provider.rawValue): \(error)") @@ -222,14 +255,14 @@ final class AppStore { if cacheDate != cacheDateAtStart { return } cache[key] = CachedPayload(payload: fallback, fetchedAt: Date()) lastSuccessByKey[key] = Date() - lastError = nil + lastErrorByKey[key] = nil return } catch { if Task.isCancelled { return } NSLog("CodeBurn: fallback fetch also 
failed: \(error)") } } - lastError = String(describing: error) + lastErrorByKey[key] = String(describing: error) } let allKey = PayloadCacheKey(period: selectedPeriod, provider: .all) @@ -249,7 +282,10 @@ final class AppStore { // Same day-rollover guard as refresh(): drop yesterday's payload if // the calendar rolled over during the fetch. if cacheDate != cacheDateAtStart { return } - cache[PayloadCacheKey(period: period, provider: .all)] = CachedPayload(payload: fresh, fetchedAt: Date()) + let key = PayloadCacheKey(period: period, provider: .all) + cache[key] = CachedPayload(payload: fresh, fetchedAt: Date()) + lastSuccessByKey[key] = Date() + lastErrorByKey[key] = nil } catch { NSLog("CodeBurn: quiet refresh failed for \(period.rawValue): \(error)") } diff --git a/mac/Sources/CodeBurnMenubar/CodeBurnApp.swift b/mac/Sources/CodeBurnMenubar/CodeBurnApp.swift index f7e57a0..5868258 100644 --- a/mac/Sources/CodeBurnMenubar/CodeBurnApp.swift +++ b/mac/Sources/CodeBurnMenubar/CodeBurnApp.swift @@ -5,6 +5,7 @@ import Observation private let refreshIntervalSeconds: UInt64 = 30 private let nanosPerSecond: UInt64 = 1_000_000_000 private let refreshIntervalNanos: UInt64 = refreshIntervalSeconds * nanosPerSecond +private let forceRefreshWatchdogSeconds: TimeInterval = 90 private let statusItemWidth: CGFloat = NSStatusItem.variableLength private let popoverWidth: CGFloat = 360 private let popoverHeight: CGFloat = 660 @@ -36,6 +37,8 @@ final class AppDelegate: NSObject, NSApplicationDelegate, NSPopoverDelegate { private var pendingRefreshWork: DispatchWorkItem? private var refreshLoopTask: Task? private var forceRefreshTask: Task? + private var forceRefreshStartedAt: Date? + private var forceRefreshGeneration: UInt64 = 0 func applicationWillFinishLaunching(_ notification: Notification) { // Set accessory policy before the app's focus chain forms. 
On macOS Tahoe @@ -90,6 +93,8 @@ final class AppDelegate: NSObject, NSApplicationDelegate, NSPopoverDelegate { Task { @MainActor in self?.forceRefreshTask?.cancel() self?.forceRefreshTask = nil + self?.forceRefreshStartedAt = nil + self?.forceRefreshGeneration &+= 1 self?.refreshLoopTask?.cancel() self?.refreshLoopTask = nil } @@ -208,17 +213,42 @@ final class AppDelegate: NSObject, NSApplicationDelegate, NSPopoverDelegate { private var lastRefreshTime: Date = .distantPast + @discardableResult + private func clearStaleForceRefreshIfNeeded(now: Date = Date()) -> Bool { + if let started = forceRefreshStartedAt, forceRefreshTask != nil { + let elapsed = now.timeIntervalSince(started) + guard elapsed > forceRefreshWatchdogSeconds else { return false } + NSLog("CodeBurn: force refresh stuck for %ds — cancelling and restarting", Int(elapsed)) + forceRefreshTask?.cancel() + forceRefreshTask = nil + forceRefreshStartedAt = nil + forceRefreshGeneration &+= 1 + store.resetLoadingState() + return true + } + return false + } + private func forceRefresh() { let now = Date() + _ = clearStaleForceRefreshIfNeeded(now: now) guard now.timeIntervalSince(lastRefreshTime) > 5 else { return } lastRefreshTime = now + forceRefreshStartedAt = now + forceRefreshGeneration &+= 1 + let generation = forceRefreshGeneration - forceRefreshTask?.cancel() forceRefreshTask = Task { async let main: Void = store.refresh(includeOptimize: false, force: true, showLoading: true) async let today: Void = store.refreshQuietly(period: .today) _ = await (main, today) refreshStatusButton() + await MainActor.run { [weak self] in + guard let self, self.forceRefreshGeneration == generation else { return } + self.forceRefreshTask = nil + self.forceRefreshStartedAt = nil + self.lastRefreshTime = Date() + } } } @@ -259,13 +289,14 @@ final class AppDelegate: NSObject, NSApplicationDelegate, NSPopoverDelegate { } while !Task.isCancelled { guard let self else { return } - self.store.clearStaleLoadingIfNeeded() + let 
clearedStaleForceRefresh = self.clearStaleForceRefreshIfNeeded() + let clearedStaleLoading = self.store.clearStaleLoadingIfNeeded() // Skip the loop's tick if a wake / manual / distributed- // notification refresh just ran. Without this gate, every // wake produced two refreshes (forceRefresh from the wake // observer plus the loop's natural tick). let sinceLast = Date().timeIntervalSince(self.lastRefreshTime) - if sinceLast >= 5 { + if self.forceRefreshTask == nil && (clearedStaleForceRefresh || clearedStaleLoading || sinceLast >= 5) { if self.store.selectedPeriod != .today || self.store.selectedProvider != .all { async let quiet: Void = self.store.refreshQuietly(period: .today) async let main: Void = self.store.refresh(includeOptimize: false, force: true) diff --git a/mac/Sources/CodeBurnMenubar/Views/MenuBarContent.swift b/mac/Sources/CodeBurnMenubar/Views/MenuBarContent.swift index fe96215..fbf3dd9 100644 --- a/mac/Sources/CodeBurnMenubar/Views/MenuBarContent.swift +++ b/mac/Sources/CodeBurnMenubar/Views/MenuBarContent.swift @@ -47,7 +47,10 @@ struct MenuBarContent: View { // error, etc.), surface a retry card instead of leaving the // user stuck on a perpetual "Loading..." spinner. if !store.hasCachedData { - if let err = store.lastError, !store.isLoading { + if store.isCurrentKeyLoading || !store.hasAttemptedCurrentKeyLoad { + BurnLoadingOverlay(periodLabel: store.selectedPeriod.rawValue) + .transition(.opacity) + } else if let err = store.lastError { FetchErrorOverlay( error: err, periodLabel: store.selectedPeriod.rawValue, @@ -55,7 +58,11 @@ struct MenuBarContent: View { ) .transition(.opacity) } else { - BurnLoadingOverlay(periodLabel: store.selectedPeriod.rawValue) + FetchErrorOverlay( + error: "The last refresh stopped before returning data. 
CodeBurn will keep retrying, or you can retry now.", + periodLabel: store.selectedPeriod.rawValue, + retry: { Task { await store.refresh(includeOptimize: false, force: true, showLoading: true) } } + ) .transition(.opacity) } }