Files
Space-Game/apps/docs/src/components/kanban/useOrchestrator.ts
francy51 408bdb6dd7 fix(kanban): recover and self-heal stuck agent runs
A run could get stranded at 'running' in the UI after a crash/disconnect/
restart, with no way to clear it. Root cause was a race: the SSE history
replay re-asserted a stale `running` status that beat the poll's settled
status, leaving the run showing "Running" + the settle error at once.

Server (runs.ts / runner.ts / index.ts):
- reconcile() on every read force-settles any 'running' run with no live
  runner, so the board self-heals on the next poll (≤3s) — no restart needed.
- forceSettle() emits a persisted `status` event so an open/reconnecting
  SSE stream replays the terminal state last, not a stale `running`.
- Startup orphan-reconciliation now also emits that event (was the gap that
  let the replay re-assert `running` after a server restart).
- Idle watchdog (10min): a silent pi is settled as 'failed' instead of
  hanging forever; SIGKILL escalation (20s) reaps wedged processes.
- stop() now recovers: active→abort, orphaned-but-running→force-stop
  (the Stop button clears wedged runs instead of 409'ing).
- start() catch force-settles 'failed' so a spawn failure never orphans a
  half-created 'running' row.

Client (useOrchestrator.ts):
- patchRun refuses to un-settle a terminal run, dropping stale replayed
  status as a belt-and-suspenders guard against any such race.
EOF && echo "" && git log --oneline -3
2026-06-17 18:53:44 -04:00

292 lines
9.8 KiB
TypeScript

import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
import {
orchestratorApi,
type AgentRun,
type DiffResult,
type MergeResult,
type RunStatus,
} from '../../lib/orchestratorApi';
/**
 * Contract of the shared run registry used by the implementation board.
 *
 * The registry owns only the lightweight, board-level slice of orchestrator
 * state: the list of runs, the Bevy-playtest flags, and an index of active runs
 * derived from them. The streaming event log is intentionally kept out of it —
 * that lives in `useRunStream`, scoped to the run console (`AgentRunBar`), so a
 * burst of agent events re-renders the console alone and not the board page or
 * the card modal. When a stream observes a lifecycle change (status / bevy /
 * done) it reports it back through `patchRun` / `reflectBevy`, which keeps the
 * board's active indicators accurate.
 *
 * The object handed back by the hook is memoized on its state and callbacks:
 * its identity changes when registry state changes, not on every render.
 */
export interface UseOrchestrator {
  // ── Registry state ────────────────────────────────────────────────────────

  /** Every known run, newest first; suitable for a global activity view. */
  runs: AgentRun[];
  /** Loading flag for the run list. */
  loading: boolean;
  /** Message from the last failed load, or `null` when there is none. */
  error: string | null;
  /** Fetch the run list from the server and load it into the registry. */
  reload: () => Promise<void>;

  // ── Per-card queries ──────────────────────────────────────────────────────

  /** The card's running run if it has one, otherwise its most recent run. */
  runForCard: (cardId: string) => AgentRun | undefined;
  /** Whether some run is currently working on the given card. */
  isRunning: (cardId: string) => boolean;
  /**
   * Index of active runs keyed by card id. Memoized: it is rebuilt only when
   * the run list or the Bevy flag set changes, not on every streamed event, so
   * memoized card components can read their flags without re-rendering on
   * stream noise.
   */
  activeByCard: Map<string, { running: boolean; bevy: boolean; runId: string }>;

  // ── Run lifecycle ─────────────────────────────────────────────────────────

  /** Start a run for a card; resolves with the newly created run. */
  start: (input: { cardId: string; prompt?: string; refineRunId?: string }) => Promise<AgentRun>;
  /** Deliver a steer or follow-up message to an active run. */
  message: (runId: string, text: string, mode: 'steer' | 'followUp') => Promise<void>;
  /** Ask the server to stop an active run. */
  stop: (runId: string) => Promise<void>;
  /** Drop a settled run from the UI (this also reclaims its worktree). */
  remove: (runId: string) => Promise<void>;

  // ── Stream reflectors ─────────────────────────────────────────────────────

  /** Merge a partial update into a run record (called by stream reflectors). */
  patchRun: (runId: string, patch: Partial<AgentRun>) => void;
  /** Record a Bevy playtest starting or ending (called by stream reflectors). */
  reflectBevy: (runId: string, running: boolean) => void;

  // ── Review ────────────────────────────────────────────────────────────────

  /** Retrieve the diff of a run's branch against main. */
  getDiff: (runId: string) => Promise<DiffResult>;
  /** Merge a run's branch into the main worktree. */
  mergeRun: (runId: string) => Promise<MergeResult>;

  // ── Bevy playtest ─────────────────────────────────────────────────────────

  /** Launch a Bevy playtest inside a run's worktree. */
  startBevy: (runId: string) => Promise<void>;
  /** Shut down a run's Bevy playtest. */
  stopBevy: (runId: string) => Promise<void>;
  /** Whether the registry currently has a Bevy playtest flagged for the run. */
  bevyIsRunning: (runId: string) => boolean;
  /** Re-read a run's Bevy status from the server (the truth after a reconnect). */
  refreshBevyStatus: (runId: string) => Promise<void>;
}
/**
 * Statuses that mark a run as settled. A run that has reached one of these
 * never legitimately goes back to a non-terminal status, which is what lets
 * `patchRun` discard a stale history replay that would otherwise un-settle it.
 */
const TERMINAL_STATUS: ReadonlySet<RunStatus> = new Set<RunStatus>([
  'completed',
  'failed',
  'stopped',
]);
/**
 * Board-level run registry hook.
 *
 * Holds the run list and the set of runs flagged as having a Bevy playtest,
 * loads the list through `orchestratorApi.listRuns()`, and keeps it fresh with
 * a background poll (every 3s while any run is `running`, every 10s otherwise).
 * Stream consumers push lifecycle changes in through `patchRun` / `reflectBevy`.
 *
 * Every state updater returns the previous value untouched when an update would
 * change nothing. React skips the re-render for an identical state value, so
 * no-op stream events neither re-render the board nor invalidate the memoized
 * `activeByCard` index or the returned object.
 *
 * @returns A memoized registry object; its identity changes only when one of
 *   its state values or callbacks changes.
 */
export function useOrchestrator(): UseOrchestrator {
  const [runs, setRuns] = useState<AgentRun[]>([]);
  const [bevyRunning, setBevyRunning] = useState<Set<string>>(new Set());
  const [loading, setLoading] = useState(true);
  const [error, setError] = useState<string | null>(null);

  // Signature of the last loaded run list, so the background poll can skip
  // state updates (and re-renders) when nothing actually changed.
  const lastSig = useRef('');
  // Polls can overlap and resolve out of order. `issuedSeq` numbers each
  // request; `appliedSeq` remembers the newest one whose response was applied,
  // so a slower, older response cannot overwrite fresher data. Only responses
  // older than one already applied are dropped, so a server slower than the
  // poll interval still gets its responses applied in order.
  const issuedSeq = useRef(0);
  const appliedSeq = useRef(0);

  const reload = useCallback(async () => {
    issuedSeq.current += 1;
    const seq = issuedSeq.current;
    try {
      const { runs: list } = await orchestratorApi.listRuns();
      if (seq < appliedSeq.current) return; // superseded by a newer response
      appliedSeq.current = seq;
      // Clear the error only once a load has succeeded. Clearing it up front
      // would make the error flicker off and on at every poll tick while the
      // server is unreachable.
      setError(null);
      // Only the fields that affect what the UI renders; identities/order are
      // stable from the server (newest-first), so this is a reliable change check.
      const sig = list
        .map((r) => `${r.id}|${r.status}|${r.finishedAt ?? ''}|${r.summary ?? ''}|${r.commitSha ?? ''}`)
        .join('\n');
      if (sig !== lastSig.current) {
        lastSig.current = sig;
        setRuns(list);
      }
    } catch (e) {
      // A stale failure must not mask a newer successful load.
      if (seq < appliedSeq.current) return;
      setError(e instanceof Error ? e.message : 'Failed to load runs');
    } finally {
      setLoading(false);
    }
  }, []);

  // Initial load.
  useEffect(() => {
    void reload();
  }, [reload]);

  // Background poll keeps run status fresh even when no card modal is open.
  // Poll faster while any run is active so a collapsed card's running indicator
  // turns over promptly when it settles. (The live event stream, when a modal is
  // open, is the primary updater; this is a liveness backstop.)
  const anyRunning = useMemo(() => runs.some((r) => r.status === 'running'), [runs]);
  useEffect(() => {
    const ms = anyRunning ? 3_000 : 10_000;
    const id = setInterval(() => {
      void reload();
    }, ms);
    return () => clearInterval(id);
  }, [reload, anyRunning]);

  /** Insert a run at the head of the list, replacing any entry with the same id. */
  const upsertRun = useCallback((run: AgentRun) => {
    setRuns((prev) => [run, ...prev.filter((r) => r.id !== run.id)]);
  }, []);

  const start = useCallback(
    async (input: { cardId: string; prompt?: string; refineRunId?: string }) => {
      const { run } = await orchestratorApi.startRun(input);
      upsertRun(run);
      return run;
    },
    [upsertRun],
  );

  const message = useCallback(
    async (runId: string, text: string, mode: 'steer' | 'followUp') => {
      await orchestratorApi.messageRun(runId, text, mode);
    },
    [],
  );

  // No local state change here: the settled status arrives via the stream
  // reflectors or the next poll.
  const stop = useCallback(async (runId: string) => {
    await orchestratorApi.stopRun(runId);
  }, []);

  const remove = useCallback(async (runId: string) => {
    await orchestratorApi.deleteRun(runId);
    setRuns((prev) => prev.filter((r) => r.id !== runId));
  }, []);

  /**
   * Apply a partial update to a run (status/summary/etc., from stream events).
   *
   * Guards against stale history replays un-settling a run: a terminal run can
   * never legitimately return to `running`/`queued`, so such a patch (e.g. a
   * reconnect re-emitting an old `running` status) is dropped. Patches for an
   * unknown run id, and patches whose fields all already match, leave the list
   * identity untouched so they cause no re-render.
   */
  const patchRun = useCallback((runId: string, patch: Partial<AgentRun>) => {
    setRuns((prev) => {
      const cur = prev.find((r) => r.id === runId);
      if (!cur) return prev; // nothing to patch
      if (
        TERMINAL_STATUS.has(cur.status) &&
        patch.status !== undefined &&
        !TERMINAL_STATUS.has(patch.status)
      ) {
        return prev; // ignore stale status re-assertion
      }
      const keys = Object.keys(patch) as (keyof AgentRun)[];
      if (keys.every((k) => Object.is(cur[k], patch[k]))) {
        return prev; // no field actually changes
      }
      return prev.map((r) => (r.id === runId ? { ...r, ...patch } : r));
    });
  }, []);

  /**
   * Reflect a Bevy playtest lifecycle change (from stream events). Keeps the
   * existing Set when the flag already has the requested value.
   */
  const reflectBevy = useCallback((runId: string, running: boolean) => {
    setBevyRunning((prev) => {
      if (prev.has(runId) === running) return prev;
      const next = new Set(prev);
      if (running) next.add(runId);
      else next.delete(runId);
      return next;
    });
  }, []);

  const runForCard = useCallback(
    (cardId: string) => {
      const forCard = runs.filter((r) => r.cardId === cardId);
      const active = forCard.find((r) => r.status === 'running');
      return active ?? forCard[0];
    },
    [runs],
  );

  const isRunning = useCallback(
    (cardId: string) => runs.some((r) => r.cardId === cardId && r.status === 'running'),
    [runs],
  );

  const getDiff = useCallback((runId: string) => orchestratorApi.getDiff(runId), []);
  const mergeRun = useCallback((runId: string) => orchestratorApi.mergeRun(runId), []);

  const startBevy = useCallback(
    async (runId: string) => {
      await orchestratorApi.startBevy(runId);
      reflectBevy(runId, true);
    },
    [reflectBevy],
  );

  const stopBevy = useCallback(
    async (runId: string) => {
      await orchestratorApi.stopBevy(runId);
      // Optimistically clear; the `end` event reconciles.
      reflectBevy(runId, false);
    },
    [reflectBevy],
  );

  const bevyIsRunning = useCallback((runId: string) => bevyRunning.has(runId), [bevyRunning]);

  /**
   * Card-id index of active runs. Recomputed only when `runs` or `bevyRunning`
   * changes — NOT on every streamed event — so memoized consumers stay stable.
   *
   * `runs` is newest-first, and only the first running run seen per card is
   * kept. If a card ever has more than one `running` row, the index therefore
   * points at the same (newest) run that `runForCard` returns.
   */
  const activeByCard = useMemo(() => {
    const m = new Map<string, { running: boolean; bevy: boolean; runId: string }>();
    for (const r of runs) {
      if (r.status === 'running' && !m.has(r.cardId)) {
        m.set(r.cardId, { running: true, bevy: bevyRunning.has(r.id), runId: r.id });
      }
    }
    return m;
  }, [runs, bevyRunning]);

  const refreshBevyStatus = useCallback(
    async (runId: string) => {
      try {
        const { running } = await orchestratorApi.bevyStatus(runId);
        reflectBevy(runId, running);
      } catch {
        /* server unavailable — keep current state */
      }
    },
    [reflectBevy],
  );

  // Stable identity: re-created only when registry state changes, so consumers
  // (and the per-run stream effect) don't churn on unrelated renders.
  return useMemo<UseOrchestrator>(
    () => ({
      runForCard,
      isRunning,
      runs,
      reload,
      loading,
      error,
      start,
      message,
      stop,
      remove,
      patchRun,
      reflectBevy,
      getDiff,
      mergeRun,
      startBevy,
      stopBevy,
      bevyIsRunning,
      refreshBevyStatus,
      activeByCard,
    }),
    [
      runForCard,
      isRunning,
      runs,
      reload,
      loading,
      error,
      start,
      message,
      stop,
      remove,
      patchRun,
      reflectBevy,
      getDiff,
      mergeRun,
      startBevy,
      stopBevy,
      bevyIsRunning,
      refreshBevyStatus,
      activeByCard,
    ],
  );
}