fix(kanban): recover and self-heal stuck agent runs
A run could get stranded at 'running' in the UI after a crash/disconnect/restart, with no way to clear it. Root cause was a race: the SSE history replay re-asserted a stale `running` status that beat the poll's settled status, leaving the run showing "Running" + the settle error at once. Server (runs.ts / runner.ts / index.ts): - reconcile() on every read force-settles any 'running' run with no live runner, so the board self-heals on the next poll (≤3s) — no restart needed. - forceSettle() emits a persisted `status` event so an open/reconnecting SSE stream replays the terminal state last, not a stale `running`. - Startup orphan-reconciliation now also emits that event (was the gap that let the replay re-assert `running` after a server restart). - Idle watchdog (10min): a silent pi is settled as 'failed' instead of hanging forever; SIGKILL escalation (20s) reaps wedged processes. - stop() now recovers: active→abort, orphaned-but-running→force-stop (the Stop button clears wedged runs instead of 409'ing). - start() catch force-settles 'failed' so a spawn failure never orphans a half-created 'running' row. Client (useOrchestrator.ts): - patchRun refuses to un-settle a terminal run, dropping stale replayed status as a belt-and-suspenders guard against any such race.
This commit is contained in:
@@ -4,6 +4,7 @@ import {
|
||||
type AgentRun,
|
||||
type DiffResult,
|
||||
type MergeResult,
|
||||
type RunStatus,
|
||||
} from '../../lib/orchestratorApi';
|
||||
|
||||
/**
|
||||
@@ -64,6 +65,10 @@ export interface UseOrchestrator {
|
||||
activeByCard: Map<string, { running: boolean; bevy: boolean; runId: string }>;
|
||||
}
|
||||
|
||||
/** Terminal states a run never legitimately leaves once reached. Used by
|
||||
* `patchRun` to reject stale history replays that would un-settle a run. */
|
||||
const TERMINAL_STATUS: ReadonlySet<RunStatus> = new Set<RunStatus>([
  'completed',
  'failed',
  'stopped',
]);
|
||||
|
||||
export function useOrchestrator(): UseOrchestrator {
|
||||
const [runs, setRuns] = useState<AgentRun[]>([]);
|
||||
const [bevyRunning, setBevyRunning] = useState<Set<string>>(new Set());
|
||||
@@ -144,9 +149,25 @@ export function useOrchestrator(): UseOrchestrator {
|
||||
setRuns((prev) => prev.filter((r) => r.id !== runId));
|
||||
}, []);
|
||||
|
||||
/**
 * Apply a partial update to a run (status/summary/etc., from stream events).
 *
 * Guards against stale history replays un-settling a run: a terminal run can
 * never legitimately return to `running`/`queued`, so a patch whose `status`
 * is non-terminal (e.g. a reconnect re-emitting an old `running` status) is
 * dropped in full when the run is already terminal.
 *
 * @param runId - id of the run to update; unknown ids leave state untouched.
 * @param patch - fields to shallow-merge over the current run.
 */
const patchRun = useCallback((runId: string, patch: Partial<AgentRun>) => {
  // This must be the ONLY state update here: an unguarded `setRuns` queued
  // ahead of it would apply the stale status first and defeat the check below.
  setRuns((prev) => {
    const cur = prev.find((r) => r.id === runId);
    // Unknown run: nothing to merge. Returning `prev` itself (same reference)
    // lets React skip the re-render.
    if (!cur) return prev;

    const unsettles =
      patch.status !== undefined &&
      TERMINAL_STATUS.has(cur.status) &&
      !TERMINAL_STATUS.has(patch.status);
    if (unsettles) return prev; // ignore stale status re-assertion

    return prev.map((r) => (r.id === runId ? { ...r, ...patch } : r));
  });
}, []);
|
||||
|
||||
/** Reflect a Bevy playtest lifecycle change (from stream events). */
|
||||
|
||||
Reference in New Issue
Block a user