|
|
@@ -0,0 +1,2999 @@
|
|
|
+import { feature } from 'bun:bundle'
|
|
|
+import { randomUUID } from 'crypto'
|
|
|
+import { hostname, tmpdir } from 'os'
|
|
|
+import { basename, join, resolve } from 'path'
|
|
|
+import { getRemoteSessionUrl } from '../constants/product.js'
|
|
|
+import { shutdownDatadog } from '../services/analytics/datadog.js'
|
|
|
+import { shutdown1PEventLogging } from '../services/analytics/firstPartyEventLogger.js'
|
|
|
+import { checkGate_CACHED_OR_BLOCKING } from '../services/analytics/growthbook.js'
|
|
|
+import {
|
|
|
+ type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
|
|
+ logEvent,
|
|
|
+ logEventAsync,
|
|
|
+} from '../services/analytics/index.js'
|
|
|
+import { isInBundledMode } from '../utils/bundledMode.js'
|
|
|
+import { logForDebugging } from '../utils/debug.js'
|
|
|
+import { logForDiagnosticsNoPII } from '../utils/diagLogs.js'
|
|
|
+import { isEnvTruthy, isInProtectedNamespace } from '../utils/envUtils.js'
|
|
|
+import { errorMessage } from '../utils/errors.js'
|
|
|
+import { truncateToWidth } from '../utils/format.js'
|
|
|
+import { logError } from '../utils/log.js'
|
|
|
+import { sleep } from '../utils/sleep.js'
|
|
|
+import { createAgentWorktree, removeAgentWorktree } from '../utils/worktree.js'
|
|
|
+import {
|
|
|
+ BridgeFatalError,
|
|
|
+ createBridgeApiClient,
|
|
|
+ isExpiredErrorType,
|
|
|
+ isSuppressible403,
|
|
|
+ validateBridgeId,
|
|
|
+} from './bridgeApi.js'
|
|
|
+import { formatDuration } from './bridgeStatusUtil.js'
|
|
|
+import { createBridgeLogger } from './bridgeUI.js'
|
|
|
+import { createCapacityWake } from './capacityWake.js'
|
|
|
+import { describeAxiosError } from './debugUtils.js'
|
|
|
+import { createTokenRefreshScheduler } from './jwtUtils.js'
|
|
|
+import { getPollIntervalConfig } from './pollConfig.js'
|
|
|
+import { toCompatSessionId, toInfraSessionId } from './sessionIdCompat.js'
|
|
|
+import { createSessionSpawner, safeFilenameId } from './sessionRunner.js'
|
|
|
+import { getTrustedDeviceToken } from './trustedDevice.js'
|
|
|
+import {
|
|
|
+ BRIDGE_LOGIN_ERROR,
|
|
|
+ type BridgeApiClient,
|
|
|
+ type BridgeConfig,
|
|
|
+ type BridgeLogger,
|
|
|
+ DEFAULT_SESSION_TIMEOUT_MS,
|
|
|
+ type SessionDoneStatus,
|
|
|
+ type SessionHandle,
|
|
|
+ type SessionSpawner,
|
|
|
+ type SessionSpawnOpts,
|
|
|
+ type SpawnMode,
|
|
|
+} from './types.js'
|
|
|
+import {
|
|
|
+ buildCCRv2SdkUrl,
|
|
|
+ buildSdkUrl,
|
|
|
+ decodeWorkSecret,
|
|
|
+ registerWorker,
|
|
|
+ sameSessionId,
|
|
|
+} from './workSecret.js'
|
|
|
+
|
|
|
/**
 * Retry/backoff tuning for the bridge poll loop. Every value is a duration in
 * milliseconds.
 *
 * Two parallel budgets are configured (`conn*` and `general*`), each with an
 * initial delay, a cap, and a give-up threshold.
 * NOTE(review): the conn/general split presumably maps to connection errors
 * vs. every other poll error — confirm against the poll loop's error handling.
 */
export type BackoffConfig = {
  /** First delay of the connection backoff. */
  connInitialMs: number
  /** Upper bound of the connection backoff. */
  connCapMs: number
  /** Connection error budget before giving up. */
  connGiveUpMs: number
  /** First delay of the general backoff. */
  generalInitialMs: number
  /** Upper bound of the general backoff. */
  generalCapMs: number
  /** General error budget before giving up. */
  generalGiveUpMs: number
  /** SIGTERM→SIGKILL grace period on shutdown. Default 30s. */
  shutdownGraceMs?: number
  /** stopWorkWithRetry base delay (1s/2s/4s backoff). Default 1000ms. */
  stopWorkBaseDelayMs?: number
}

/**
 * Backoff settings runBridgeLoop falls back to when the caller does not pass
 * a `backoffConfig`. The optional fields are intentionally left unset here.
 */
const DEFAULT_BACKOFF: BackoffConfig = {
  // Connection budget: 2s initial, 2 minute cap, give up after 10 minutes.
  connInitialMs: 2 * 1_000,
  connCapMs: 2 * 60_000,
  connGiveUpMs: 10 * 60_000,
  // General budget: 500ms initial, 30s cap, give up after 10 minutes.
  generalInitialMs: 500,
  generalCapMs: 30 * 1_000,
  generalGiveUpMs: 10 * 60_000,
}
|
|
|
+
|
|
|
/** How often (ms) the live status display is re-rendered by the status ticker. */
const STATUS_UPDATE_INTERVAL_MS = 1000

/**
 * Default session count for spawn modes.
 * NOTE(review): not referenced in the visible part of this file — presumably
 * the default for --spawn / --capacity; confirm at the use site.
 */
const SPAWN_SESSIONS_DEFAULT = 32
|
|
|
+
|
|
|
/**
 * Reports whether the multi-session spawn modes (--spawn / --capacity /
 * --create-session-in-dir) are enabled for this user.
 *
 * Backed by the GrowthBook gate `tengu_ccr_bridge_multi_session`, the sibling
 * of tengu_ccr_bridge_multi_environment (multiple envs per host:dir); this
 * gate allows multiple sessions per environment. Rollout is staged through
 * targeting rules: ants first, then gradual external.
 *
 * The blocking gate check is used on purpose: a stale disk-cache miss must not
 * unfairly deny access. A cached `true` still resolves instantly; only a cold
 * start awaits the server fetch, which also seeds the disk cache for next time.
 *
 * @returns Promise resolving to the gate value.
 */
async function isMultiSessionSpawnEnabled(): Promise<boolean> {
  const gateName = 'tengu_ccr_bridge_multi_session'
  return checkGate_CACHED_OR_BLOCKING(gateName)
}
|
|
|
+
|
|
|
/**
 * Returns the threshold for detecting system sleep/wake in the poll loop.
 *
 * The threshold must exceed the largest backoff cap — otherwise normal backoff
 * delays trigger false sleep detection (resetting the error budget
 * indefinitely). It is therefore 2× the larger of the connection and general
 * backoff caps. For configurations where the connection cap is the larger one
 * (including DEFAULT_BACKOFF) this equals 2× the connection cap, matching the
 * pattern in WebSocketTransport and replBridge.
 *
 * @param backoff Backoff configuration in effect for the poll loop.
 * @returns Sleep-detection threshold in milliseconds.
 */
function pollSleepDetectionThresholdMs(backoff: BackoffConfig): number {
  // Consider both caps: a custom config may set generalCapMs above connCapMs.
  const largestCapMs = Math.max(backoff.connCapMs, backoff.generalCapMs)
  return largestCapMs * 2
}
|
|
|
+
|
|
|
/**
 * Computes the arguments that have to come before any CLI flags when a child
 * claude process is spawned.
 *
 * - Compiled binary: process.execPath is the claude binary itself, so flags go
 *   straight to it and nothing needs to be prepended.
 * - npm install (node running cli.js): process.execPath is the node runtime,
 *   so the script path must be the first argument. Otherwise node treats
 *   --sdk-url as one of its own options and exits with
 *   "bad option: --sdk-url". See anthropics/claude-code#28334.
 *
 * @returns `[process.argv[1]]` when running unbundled with a script path,
 *   otherwise an empty array.
 */
function spawnScriptArgs(): string[] {
  const scriptPath = process.argv[1]
  if (!isInBundledMode() && scriptPath) {
    return [scriptPath]
  }
  return []
}
|
|
|
+
|
|
|
/**
 * Spawns a session without letting a synchronous spawn failure propagate.
 *
 * @param spawner Spawner used to start the session.
 * @param opts Spawn options forwarded unchanged to `spawner.spawn`.
 * @param dir Directory forwarded unchanged to `spawner.spawn`.
 * @returns The session handle on success; when `spawner.spawn` throws, the
 *   error's message as a string (the failure is also reported via logError).
 */
function safeSpawn(
  spawner: SessionSpawner,
  opts: SessionSpawnOpts,
  dir: string,
): SessionHandle | string {
  let spawned: SessionHandle
  try {
    spawned = spawner.spawn(opts, dir)
  } catch (spawnErr) {
    const reason = errorMessage(spawnErr)
    logError(new Error(`Session spawn failed: ${reason}`))
    return reason
  }
  return spawned
}
|
|
|
+
|
|
|
+export async function runBridgeLoop(
|
|
|
+ config: BridgeConfig,
|
|
|
+ environmentId: string,
|
|
|
+ environmentSecret: string,
|
|
|
+ api: BridgeApiClient,
|
|
|
+ spawner: SessionSpawner,
|
|
|
+ logger: BridgeLogger,
|
|
|
+ signal: AbortSignal,
|
|
|
+ backoffConfig: BackoffConfig = DEFAULT_BACKOFF,
|
|
|
+ initialSessionId?: string,
|
|
|
+ getAccessToken?: () => string | undefined | Promise<string | undefined>,
|
|
|
+): Promise<void> {
|
|
|
+ // Local abort controller so that onSessionDone can stop the poll loop.
|
|
|
+ // Linked to the incoming signal so external aborts also work.
|
|
|
+ const controller = new AbortController()
|
|
|
+ if (signal.aborted) {
|
|
|
+ controller.abort()
|
|
|
+ } else {
|
|
|
+ signal.addEventListener('abort', () => controller.abort(), { once: true })
|
|
|
+ }
|
|
|
+ const loopSignal = controller.signal
|
|
|
+
|
|
|
+ const activeSessions = new Map<string, SessionHandle>()
|
|
|
+ const sessionStartTimes = new Map<string, number>()
|
|
|
+ const sessionWorkIds = new Map<string, string>()
|
|
|
+ // Compat-surface ID (session_*) computed once at spawn and cached so
|
|
|
+ // cleanup and status-update ticks use the same key regardless of whether
|
|
|
+ // the tengu_bridge_repl_v2_cse_shim_enabled gate flips mid-session.
|
|
|
+ const sessionCompatIds = new Map<string, string>()
|
|
|
+ // Session ingress JWTs for heartbeat auth, keyed by sessionId.
|
|
|
+ // Stored separately from handle.accessToken because the token refresh
|
|
|
+ // scheduler overwrites that field with the OAuth token (~3h55m in).
|
|
|
+ const sessionIngressTokens = new Map<string, string>()
|
|
|
+ const sessionTimers = new Map<string, ReturnType<typeof setTimeout>>()
|
|
|
+ const completedWorkIds = new Set<string>()
|
|
|
+ const sessionWorktrees = new Map<
|
|
|
+ string,
|
|
|
+ {
|
|
|
+ worktreePath: string
|
|
|
+ worktreeBranch?: string
|
|
|
+ gitRoot?: string
|
|
|
+ hookBased?: boolean
|
|
|
+ }
|
|
|
+ >()
|
|
|
+ // Track sessions killed by the timeout watchdog so onSessionDone can
|
|
|
+ // distinguish them from server-initiated or shutdown interrupts.
|
|
|
+ const timedOutSessions = new Set<string>()
|
|
|
+ // Sessions that already have a title (server-set or bridge-derived) so
|
|
|
+ // onFirstUserMessage doesn't clobber a user-assigned --name / web rename.
|
|
|
+ // Keyed by compatSessionId to match logger.setSessionTitle's key.
|
|
|
+ const titledSessions = new Set<string>()
|
|
|
+ // Signal to wake the at-capacity sleep early when a session completes,
|
|
|
+ // so the bridge can immediately accept new work.
|
|
|
+ const capacityWake = createCapacityWake(loopSignal)
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Heartbeat all active work items.
|
|
|
+ * Returns 'ok' if at least one heartbeat succeeded, 'auth_failed' if any
|
|
|
+ * got a 401/403 (JWT expired — re-queued via reconnectSession so the next
|
|
|
+ * poll delivers fresh work), or 'failed' if all failed for other reasons.
|
|
|
+ */
|
|
|
+ async function heartbeatActiveWorkItems(): Promise<
|
|
|
+ 'ok' | 'auth_failed' | 'fatal' | 'failed'
|
|
|
+ > {
|
|
|
+ let anySuccess = false
|
|
|
+ let anyFatal = false
|
|
|
+ const authFailedSessions: string[] = []
|
|
|
+ for (const [sessionId] of activeSessions) {
|
|
|
+ const workId = sessionWorkIds.get(sessionId)
|
|
|
+ const ingressToken = sessionIngressTokens.get(sessionId)
|
|
|
+ if (!workId || !ingressToken) {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ try {
|
|
|
+ await api.heartbeatWork(environmentId, workId, ingressToken)
|
|
|
+ anySuccess = true
|
|
|
+ } catch (err) {
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:heartbeat] Failed for sessionId=${sessionId} workId=${workId}: ${errorMessage(err)}`,
|
|
|
+ )
|
|
|
+ if (err instanceof BridgeFatalError) {
|
|
|
+ logEvent('tengu_bridge_heartbeat_error', {
|
|
|
+ status:
|
|
|
+ err.status as unknown as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
|
|
+ error_type: (err.status === 401 || err.status === 403
|
|
|
+ ? 'auth_failed'
|
|
|
+ : 'fatal') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
|
|
+ })
|
|
|
+ if (err.status === 401 || err.status === 403) {
|
|
|
+ authFailedSessions.push(sessionId)
|
|
|
+ } else {
|
|
|
+ // 404/410 = environment expired or deleted — no point retrying
|
|
|
+ anyFatal = true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ // JWT expired → trigger server-side re-dispatch. Without this, work stays
|
|
|
+ // ACK'd out of the Redis PEL and poll returns empty forever (CC-1263).
|
|
|
+ // The existingHandle path below delivers the fresh token to the child.
|
|
|
+ // sessionId is already in the format /bridge/reconnect expects: it comes
|
|
|
+ // from work.data.id, which matches the server's EnvironmentInstance store
|
|
|
+ // (cse_* under the compat gate, session_* otherwise).
|
|
|
+ for (const sessionId of authFailedSessions) {
|
|
|
+ logger.logVerbose(
|
|
|
+ `Session ${sessionId} token expired — re-queuing via bridge/reconnect`,
|
|
|
+ )
|
|
|
+ try {
|
|
|
+ await api.reconnectSession(environmentId, sessionId)
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:heartbeat] Re-queued sessionId=${sessionId} via bridge/reconnect`,
|
|
|
+ )
|
|
|
+ } catch (err) {
|
|
|
+ logger.logError(
|
|
|
+ `Failed to refresh session ${sessionId} token: ${errorMessage(err)}`,
|
|
|
+ )
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:heartbeat] reconnectSession(${sessionId}) failed: ${errorMessage(err)}`,
|
|
|
+ { level: 'error' },
|
|
|
+ )
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (anyFatal) {
|
|
|
+ return 'fatal'
|
|
|
+ }
|
|
|
+ if (authFailedSessions.length > 0) {
|
|
|
+ return 'auth_failed'
|
|
|
+ }
|
|
|
+ return anySuccess ? 'ok' : 'failed'
|
|
|
+ }
|
|
|
+
|
|
|
+ // Sessions spawned with CCR v2 env vars. v2 children cannot use OAuth
|
|
|
+ // tokens (CCR worker endpoints validate the JWT's session_id claim,
|
|
|
+ // register_worker.go:32), so onRefresh triggers server re-dispatch
|
|
|
+ // instead — the next poll delivers fresh work with a new JWT via the
|
|
|
+ // existingHandle path below.
|
|
|
+ const v2Sessions = new Set<string>()
|
|
|
+
|
|
|
+ // Proactive token refresh: schedules a timer 5min before the session
|
|
|
+ // ingress JWT expires. v1 delivers OAuth directly; v2 calls
|
|
|
+ // reconnectSession to trigger server re-dispatch (CC-1263: without
|
|
|
+ // this, v2 daemon sessions silently die at ~5h since the server does
|
|
|
+ // not auto-re-dispatch ACK'd work on lease expiry).
|
|
|
+ const tokenRefresh = getAccessToken
|
|
|
+ ? createTokenRefreshScheduler({
|
|
|
+ getAccessToken,
|
|
|
+ onRefresh: (sessionId, oauthToken) => {
|
|
|
+ const handle = activeSessions.get(sessionId)
|
|
|
+ if (!handle) {
|
|
|
+ return
|
|
|
+ }
|
|
|
+ if (v2Sessions.has(sessionId)) {
|
|
|
+ logger.logVerbose(
|
|
|
+ `Refreshing session ${sessionId} token via bridge/reconnect`,
|
|
|
+ )
|
|
|
+ void api
|
|
|
+ .reconnectSession(environmentId, sessionId)
|
|
|
+ .catch((err: unknown) => {
|
|
|
+ logger.logError(
|
|
|
+ `Failed to refresh session ${sessionId} token: ${errorMessage(err)}`,
|
|
|
+ )
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:token] reconnectSession(${sessionId}) failed: ${errorMessage(err)}`,
|
|
|
+ { level: 'error' },
|
|
|
+ )
|
|
|
+ })
|
|
|
+ } else {
|
|
|
+ handle.updateAccessToken(oauthToken)
|
|
|
+ }
|
|
|
+ },
|
|
|
+ label: 'bridge',
|
|
|
+ })
|
|
|
+ : null
|
|
|
+ const loopStartTime = Date.now()
|
|
|
+ // Track all in-flight cleanup promises (stopWork, worktree removal) so
|
|
|
+ // the shutdown sequence can await them before process.exit().
|
|
|
+ const pendingCleanups = new Set<Promise<unknown>>()
|
|
|
+ function trackCleanup(p: Promise<unknown>): void {
|
|
|
+ pendingCleanups.add(p)
|
|
|
+ void p.finally(() => pendingCleanups.delete(p))
|
|
|
+ }
|
|
|
+ let connBackoff = 0
|
|
|
+ let generalBackoff = 0
|
|
|
+ let connErrorStart: number | null = null
|
|
|
+ let generalErrorStart: number | null = null
|
|
|
+ let lastPollErrorTime: number | null = null
|
|
|
+ let statusUpdateTimer: ReturnType<typeof setInterval> | null = null
|
|
|
+ // Set by BridgeFatalError and give-up paths so the shutdown block can
|
|
|
+ // skip the resume message (resume is impossible after env expiry/auth
|
|
|
+ // failure/sustained connection errors).
|
|
|
+ let fatalExit = false
|
|
|
+
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:work] Starting poll loop spawnMode=${config.spawnMode} maxSessions=${config.maxSessions} environmentId=${environmentId}`,
|
|
|
+ )
|
|
|
+ logForDiagnosticsNoPII('info', 'bridge_loop_started', {
|
|
|
+ max_sessions: config.maxSessions,
|
|
|
+ spawn_mode: config.spawnMode,
|
|
|
+ })
|
|
|
+
|
|
|
+ // For ant users, show where session debug logs will land so they can tail them.
|
|
|
+ // sessionRunner.ts uses the same base path. File appears once a session spawns.
|
|
|
+ if (process.env.USER_TYPE === 'ant') {
|
|
|
+ let debugGlob: string
|
|
|
+ if (config.debugFile) {
|
|
|
+ const ext = config.debugFile.lastIndexOf('.')
|
|
|
+ debugGlob =
|
|
|
+ ext > 0
|
|
|
+ ? `${config.debugFile.slice(0, ext)}-*${config.debugFile.slice(ext)}`
|
|
|
+ : `${config.debugFile}-*`
|
|
|
+ } else {
|
|
|
+ debugGlob = join(tmpdir(), 'claude', 'bridge-session-*.log')
|
|
|
+ }
|
|
|
+ logger.setDebugLogPath(debugGlob)
|
|
|
+ }
|
|
|
+
|
|
|
+ logger.printBanner(config, environmentId)
|
|
|
+
|
|
|
+ // Seed the logger's session count + spawn mode before any render. Without
|
|
|
+ // this, setAttached() below renders with the logger's default sessionMax=1,
|
|
|
+ // showing "Capacity: 0/1" until the status ticker kicks in (which is gated
|
|
|
+ // by !initialSessionId and only starts after the poll loop picks up work).
|
|
|
+ logger.updateSessionCount(0, config.maxSessions, config.spawnMode)
|
|
|
+
|
|
|
+ // If an initial session was pre-created, show its URL from the start so
|
|
|
+ // the user can click through immediately (matching /remote-control behavior).
|
|
|
+ if (initialSessionId) {
|
|
|
+ logger.setAttached(initialSessionId)
|
|
|
+ }
|
|
|
+
|
|
|
+ /** Refresh the inline status display. Shows idle or active depending on state. */
|
|
|
+ function updateStatusDisplay(): void {
|
|
|
+ // Push the session count (no-op when maxSessions === 1) so the
|
|
|
+ // next renderStatusLine tick shows the current count.
|
|
|
+ logger.updateSessionCount(
|
|
|
+ activeSessions.size,
|
|
|
+ config.maxSessions,
|
|
|
+ config.spawnMode,
|
|
|
+ )
|
|
|
+
|
|
|
+ // Push per-session activity into the multi-session display.
|
|
|
+ for (const [sid, handle] of activeSessions) {
|
|
|
+ const act = handle.currentActivity
|
|
|
+ if (act) {
|
|
|
+ logger.updateSessionActivity(sessionCompatIds.get(sid) ?? sid, act)
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if (activeSessions.size === 0) {
|
|
|
+ logger.updateIdleStatus()
|
|
|
+ return
|
|
|
+ }
|
|
|
+
|
|
|
+ // Show the most recently started session that is still actively working.
|
|
|
+ // Sessions whose current activity is 'result' or 'error' are between
|
|
|
+ // turns — the CLI emitted its result but the process stays alive waiting
|
|
|
+ // for the next user message. Skip updating so the status line keeps
|
|
|
+ // whatever state it had (Attached / session title).
|
|
|
+ const [sessionId, handle] = [...activeSessions.entries()].pop()!
|
|
|
+ const startTime = sessionStartTimes.get(sessionId)
|
|
|
+ if (!startTime) return
|
|
|
+
|
|
|
+ const activity = handle.currentActivity
|
|
|
+ if (!activity || activity.type === 'result' || activity.type === 'error') {
|
|
|
+ // Session is between turns — keep current status (Attached/titled).
|
|
|
+ // In multi-session mode, still refresh so bullet-list activities stay current.
|
|
|
+ if (config.maxSessions > 1) logger.refreshDisplay()
|
|
|
+ return
|
|
|
+ }
|
|
|
+
|
|
|
+ const elapsed = formatDuration(Date.now() - startTime)
|
|
|
+
|
|
|
+ // Build trail from recent tool activities (last 5)
|
|
|
+ const trail = handle.activities
|
|
|
+ .filter(a => a.type === 'tool_start')
|
|
|
+ .slice(-5)
|
|
|
+ .map(a => a.summary)
|
|
|
+
|
|
|
+ logger.updateSessionStatus(sessionId, elapsed, activity, trail)
|
|
|
+ }
|
|
|
+
|
|
|
+ /** Start the status display update ticker. */
|
|
|
+ function startStatusUpdates(): void {
|
|
|
+ stopStatusUpdates()
|
|
|
+ // Call immediately so the first transition (e.g. Connecting → Ready)
|
|
|
+ // happens without delay, avoiding concurrent timer races.
|
|
|
+ updateStatusDisplay()
|
|
|
+ statusUpdateTimer = setInterval(
|
|
|
+ updateStatusDisplay,
|
|
|
+ STATUS_UPDATE_INTERVAL_MS,
|
|
|
+ )
|
|
|
+ }
|
|
|
+
|
|
|
+ /** Stop the status display update ticker. */
|
|
|
+ function stopStatusUpdates(): void {
|
|
|
+ if (statusUpdateTimer) {
|
|
|
+ clearInterval(statusUpdateTimer)
|
|
|
+ statusUpdateTimer = null
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ function onSessionDone(
|
|
|
+ sessionId: string,
|
|
|
+ startTime: number,
|
|
|
+ handle: SessionHandle,
|
|
|
+ ): (status: SessionDoneStatus) => void {
|
|
|
+ return (rawStatus: SessionDoneStatus): void => {
|
|
|
+ const workId = sessionWorkIds.get(sessionId)
|
|
|
+ activeSessions.delete(sessionId)
|
|
|
+ sessionStartTimes.delete(sessionId)
|
|
|
+ sessionWorkIds.delete(sessionId)
|
|
|
+ sessionIngressTokens.delete(sessionId)
|
|
|
+ const compatId = sessionCompatIds.get(sessionId) ?? sessionId
|
|
|
+ sessionCompatIds.delete(sessionId)
|
|
|
+ logger.removeSession(compatId)
|
|
|
+ titledSessions.delete(compatId)
|
|
|
+ v2Sessions.delete(sessionId)
|
|
|
+ // Clear per-session timeout timer
|
|
|
+ const timer = sessionTimers.get(sessionId)
|
|
|
+ if (timer) {
|
|
|
+ clearTimeout(timer)
|
|
|
+ sessionTimers.delete(sessionId)
|
|
|
+ }
|
|
|
+ // Clear token refresh timer
|
|
|
+ tokenRefresh?.cancel(sessionId)
|
|
|
+ // Wake the at-capacity sleep so the bridge can accept new work immediately
|
|
|
+ capacityWake.wake()
|
|
|
+
|
|
|
+ // If the session was killed by the timeout watchdog, treat it as a
|
|
|
+ // failed session (not a server/shutdown interrupt) so we still call
|
|
|
+ // stopWork and archiveSession below.
|
|
|
+ const wasTimedOut = timedOutSessions.delete(sessionId)
|
|
|
+ const status: SessionDoneStatus =
|
|
|
+ wasTimedOut && rawStatus === 'interrupted' ? 'failed' : rawStatus
|
|
|
+ const durationMs = Date.now() - startTime
|
|
|
+
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:session] sessionId=${sessionId} workId=${workId ?? 'unknown'} exited status=${status} duration=${formatDuration(durationMs)}`,
|
|
|
+ )
|
|
|
+ logEvent('tengu_bridge_session_done', {
|
|
|
+ status:
|
|
|
+ status as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
|
|
+ duration_ms: durationMs,
|
|
|
+ })
|
|
|
+ logForDiagnosticsNoPII('info', 'bridge_session_done', {
|
|
|
+ status,
|
|
|
+ duration_ms: durationMs,
|
|
|
+ })
|
|
|
+
|
|
|
+ // Clear the status display before printing final log
|
|
|
+ logger.clearStatus()
|
|
|
+ stopStatusUpdates()
|
|
|
+
|
|
|
+ // Build error message from stderr if available
|
|
|
+ const stderrSummary =
|
|
|
+ handle.lastStderr.length > 0 ? handle.lastStderr.join('\n') : undefined
|
|
|
+ let failureMessage: string | undefined
|
|
|
+
|
|
|
+ switch (status) {
|
|
|
+ case 'completed':
|
|
|
+ logger.logSessionComplete(sessionId, durationMs)
|
|
|
+ break
|
|
|
+ case 'failed':
|
|
|
+ // Skip failure log during shutdown — the child exits non-zero when
|
|
|
+ // killed, which is expected and not a real failure.
|
|
|
+ // Also skip for timeout-killed sessions — the timeout watchdog
|
|
|
+ // already logged a clear timeout message.
|
|
|
+ if (!wasTimedOut && !loopSignal.aborted) {
|
|
|
+ failureMessage = stderrSummary ?? 'Process exited with error'
|
|
|
+ logger.logSessionFailed(sessionId, failureMessage)
|
|
|
+ logError(new Error(`Bridge session failed: ${failureMessage}`))
|
|
|
+ }
|
|
|
+ break
|
|
|
+ case 'interrupted':
|
|
|
+ logger.logVerbose(`Session ${sessionId} interrupted`)
|
|
|
+ break
|
|
|
+ }
|
|
|
+
|
|
|
+ // Notify the server that this work item is done. Skip for interrupted
|
|
|
+ // sessions — interrupts are either server-initiated (the server already
|
|
|
+ // knows) or caused by bridge shutdown (which calls stopWork() separately).
|
|
|
+ if (status !== 'interrupted' && workId) {
|
|
|
+ trackCleanup(
|
|
|
+ stopWorkWithRetry(
|
|
|
+ api,
|
|
|
+ environmentId,
|
|
|
+ workId,
|
|
|
+ logger,
|
|
|
+ backoffConfig.stopWorkBaseDelayMs,
|
|
|
+ ),
|
|
|
+ )
|
|
|
+ completedWorkIds.add(workId)
|
|
|
+ }
|
|
|
+
|
|
|
+ // Clean up worktree if one was created for this session
|
|
|
+ const wt = sessionWorktrees.get(sessionId)
|
|
|
+ if (wt) {
|
|
|
+ sessionWorktrees.delete(sessionId)
|
|
|
+ trackCleanup(
|
|
|
+ removeAgentWorktree(
|
|
|
+ wt.worktreePath,
|
|
|
+ wt.worktreeBranch,
|
|
|
+ wt.gitRoot,
|
|
|
+ wt.hookBased,
|
|
|
+ ).catch((err: unknown) =>
|
|
|
+ logger.logVerbose(
|
|
|
+ `Failed to remove worktree ${wt.worktreePath}: ${errorMessage(err)}`,
|
|
|
+ ),
|
|
|
+ ),
|
|
|
+ )
|
|
|
+ }
|
|
|
+
|
|
|
+ // Lifecycle decision: in multi-session mode, keep the bridge running
|
|
|
+ // after a session completes. In single-session mode, abort the poll
|
|
|
+ // loop so the bridge exits cleanly.
|
|
|
+ if (status !== 'interrupted' && !loopSignal.aborted) {
|
|
|
+ if (config.spawnMode !== 'single-session') {
|
|
|
+ // Multi-session: archive the completed session so it doesn't linger
|
|
|
+ // as stale in the web UI. archiveSession is idempotent (409 if already
|
|
|
+ // archived), so double-archiving at shutdown is safe.
|
|
|
+ // sessionId arrived as cse_* from the work poll (infrastructure-layer
|
|
|
+ // tag). archiveSession hits /v1/sessions/{id}/archive which is the
|
|
|
+ // compat surface and validates TagSession (session_*). Re-tag — same
|
|
|
+ // UUID underneath.
|
|
|
+ trackCleanup(
|
|
|
+ api
|
|
|
+ .archiveSession(compatId)
|
|
|
+ .catch((err: unknown) =>
|
|
|
+ logger.logVerbose(
|
|
|
+ `Failed to archive session ${sessionId}: ${errorMessage(err)}`,
|
|
|
+ ),
|
|
|
+ ),
|
|
|
+ )
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:session] Session ${status}, returning to idle (multi-session mode)`,
|
|
|
+ )
|
|
|
+ } else {
|
|
|
+ // Single-session: coupled lifecycle — tear down environment
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:session] Session ${status}, aborting poll loop to tear down environment`,
|
|
|
+ )
|
|
|
+ controller.abort()
|
|
|
+ return
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if (!loopSignal.aborted) {
|
|
|
+ startStatusUpdates()
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // Start the idle status display immediately — unless we have a pre-created
|
|
|
+ // session, in which case setAttached() already set up the display and the
|
|
|
+ // poll loop will start status updates when it picks up the session.
|
|
|
+ if (!initialSessionId) {
|
|
|
+ startStatusUpdates()
|
|
|
+ }
|
|
|
+
|
|
|
+ while (!loopSignal.aborted) {
|
|
|
+ // Fetched once per iteration — the GrowthBook cache refreshes every
|
|
|
+ // 5 min, so a loop running at the at-capacity rate picks up config
|
|
|
+ // changes within one sleep cycle.
|
|
|
+ const pollConfig = getPollIntervalConfig()
|
|
|
+
|
|
|
+ try {
|
|
|
+ const work = await api.pollForWork(
|
|
|
+ environmentId,
|
|
|
+ environmentSecret,
|
|
|
+ loopSignal,
|
|
|
+ pollConfig.reclaim_older_than_ms,
|
|
|
+ )
|
|
|
+
|
|
|
+ // Log reconnection if we were previously disconnected
|
|
|
+ const wasDisconnected =
|
|
|
+ connErrorStart !== null || generalErrorStart !== null
|
|
|
+ if (wasDisconnected) {
|
|
|
+ const disconnectedMs =
|
|
|
+ Date.now() - (connErrorStart ?? generalErrorStart ?? Date.now())
|
|
|
+ logger.logReconnected(disconnectedMs)
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:poll] Reconnected after ${formatDuration(disconnectedMs)}`,
|
|
|
+ )
|
|
|
+ logEvent('tengu_bridge_reconnected', {
|
|
|
+ disconnected_ms: disconnectedMs,
|
|
|
+ })
|
|
|
+ }
|
|
|
+
|
|
|
+ connBackoff = 0
|
|
|
+ generalBackoff = 0
|
|
|
+ connErrorStart = null
|
|
|
+ generalErrorStart = null
|
|
|
+ lastPollErrorTime = null
|
|
|
+
|
|
|
+ // Null response = no work available in the queue.
|
|
|
+ // Add a minimum delay to avoid hammering the server.
|
|
|
+ if (!work) {
|
|
|
+ // Use live check (not a snapshot) since sessions can end during poll.
|
|
|
+ const atCap = activeSessions.size >= config.maxSessions
|
|
|
+ if (atCap) {
|
|
|
+ const atCapMs = pollConfig.multisession_poll_interval_ms_at_capacity
|
|
|
+ // Heartbeat loops WITHOUT polling. When at-capacity polling is also
|
|
|
+ // enabled (atCapMs > 0), the loop tracks a deadline and breaks out
|
|
|
+ // to poll at that interval — heartbeat and poll compose instead of
|
|
|
+ // one suppressing the other. We break out to poll when:
|
|
|
+ // - Poll deadline reached (atCapMs > 0 only)
|
|
|
+ // - Auth fails (JWT expired → poll refreshes tokens)
|
|
|
+ // - Capacity wake fires (session ended → poll for new work)
|
|
|
+ // - Loop aborted (shutdown)
|
|
|
+ if (pollConfig.non_exclusive_heartbeat_interval_ms > 0) {
|
|
|
+ logEvent('tengu_bridge_heartbeat_mode_entered', {
|
|
|
+ active_sessions: activeSessions.size,
|
|
|
+ heartbeat_interval_ms:
|
|
|
+ pollConfig.non_exclusive_heartbeat_interval_ms,
|
|
|
+ })
|
|
|
+ // Deadline computed once at entry — GB updates to atCapMs don't
|
|
|
+ // shift an in-flight deadline (next entry picks up the new value).
|
|
|
+ const pollDeadline = atCapMs > 0 ? Date.now() + atCapMs : null
|
|
|
+ let hbResult: 'ok' | 'auth_failed' | 'fatal' | 'failed' = 'ok'
|
|
|
+ let hbCycles = 0
|
|
|
+ while (
|
|
|
+ !loopSignal.aborted &&
|
|
|
+ activeSessions.size >= config.maxSessions &&
|
|
|
+ (pollDeadline === null || Date.now() < pollDeadline)
|
|
|
+ ) {
|
|
|
+ // Re-read config each cycle so GrowthBook updates take effect
|
|
|
+ const hbConfig = getPollIntervalConfig()
|
|
|
+ if (hbConfig.non_exclusive_heartbeat_interval_ms <= 0) break
|
|
|
+
|
|
|
+ // Capture capacity signal BEFORE the async heartbeat call so
|
|
|
+ // a session ending during the HTTP request is caught by the
|
|
|
+ // subsequent sleep (instead of being lost to a replaced controller).
|
|
|
+ const cap = capacityWake.signal()
|
|
|
+
|
|
|
+ hbResult = await heartbeatActiveWorkItems()
|
|
|
+ if (hbResult === 'auth_failed' || hbResult === 'fatal') {
|
|
|
+ cap.cleanup()
|
|
|
+ break
|
|
|
+ }
|
|
|
+
|
|
|
+ hbCycles++
|
|
|
+ await sleep(
|
|
|
+ hbConfig.non_exclusive_heartbeat_interval_ms,
|
|
|
+ cap.signal,
|
|
|
+ )
|
|
|
+ cap.cleanup()
|
|
|
+ }
|
|
|
+
|
|
|
+ // Determine exit reason for telemetry
|
|
|
+ const exitReason =
|
|
|
+ hbResult === 'auth_failed' || hbResult === 'fatal'
|
|
|
+ ? hbResult
|
|
|
+ : loopSignal.aborted
|
|
|
+ ? 'shutdown'
|
|
|
+ : activeSessions.size < config.maxSessions
|
|
|
+ ? 'capacity_changed'
|
|
|
+ : pollDeadline !== null && Date.now() >= pollDeadline
|
|
|
+ ? 'poll_due'
|
|
|
+ : 'config_disabled'
|
|
|
+ logEvent('tengu_bridge_heartbeat_mode_exited', {
|
|
|
+ reason:
|
|
|
+ exitReason as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
|
|
+ heartbeat_cycles: hbCycles,
|
|
|
+ active_sessions: activeSessions.size,
|
|
|
+ })
|
|
|
+ if (exitReason === 'poll_due') {
|
|
|
+ // bridgeApi throttles empty-poll logs (EMPTY_POLL_LOG_INTERVAL=100)
|
|
|
+ // so the once-per-10min poll_due poll is invisible at counter=2.
|
|
|
+ // Log it here so verification runs see both endpoints in the debug log.
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:poll] Heartbeat poll_due after ${hbCycles} cycles — falling through to pollForWork`,
|
|
|
+ )
|
|
|
+ }
|
|
|
+
|
|
|
+ // On auth_failed or fatal, sleep before polling to avoid a tight
|
|
|
+ // poll+heartbeat loop. Auth_failed: heartbeatActiveWorkItems
|
|
|
+ // already called reconnectSession — the sleep gives the server
|
|
|
+ // time to propagate the re-queue. Fatal (404/410): may be a
|
|
|
+ // single work item GCd while the environment is still valid.
|
|
|
+ // Use atCapMs if enabled, else the heartbeat interval as a floor
|
|
|
+ // (guaranteed > 0 here) so heartbeat-only configs don't tight-loop.
|
|
|
+ if (hbResult === 'auth_failed' || hbResult === 'fatal') {
|
|
|
+ const cap = capacityWake.signal()
|
|
|
+ await sleep(
|
|
|
+ atCapMs > 0
|
|
|
+ ? atCapMs
|
|
|
+ : pollConfig.non_exclusive_heartbeat_interval_ms,
|
|
|
+ cap.signal,
|
|
|
+ )
|
|
|
+ cap.cleanup()
|
|
|
+ }
|
|
|
+ } else if (atCapMs > 0) {
|
|
|
+ // Heartbeat disabled: slow poll as liveness signal.
|
|
|
+ const cap = capacityWake.signal()
|
|
|
+ await sleep(atCapMs, cap.signal)
|
|
|
+ cap.cleanup()
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ const interval =
|
|
|
+ activeSessions.size > 0
|
|
|
+ ? pollConfig.multisession_poll_interval_ms_partial_capacity
|
|
|
+ : pollConfig.multisession_poll_interval_ms_not_at_capacity
|
|
|
+ await sleep(interval, loopSignal)
|
|
|
+ }
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ // At capacity — we polled to keep the heartbeat alive, but cannot
|
|
|
+ // accept new work right now. We still enter the switch below so that
|
|
|
+ // token refreshes for existing sessions are processed (the case
|
|
|
+ // 'session' handler checks for existing sessions before the inner
|
|
|
+ // capacity guard).
|
|
|
+ const atCapacityBeforeSwitch = activeSessions.size >= config.maxSessions
|
|
|
+
|
|
|
+ // Skip work items that have already been completed and stopped.
|
|
|
+ // The server may re-deliver stale work before processing our stop
|
|
|
+ // request, which would otherwise cause a duplicate session spawn.
|
|
|
+ if (completedWorkIds.has(work.id)) {
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:work] Skipping already-completed workId=${work.id}`,
|
|
|
+ )
|
|
|
+ // Respect capacity throttle — without a sleep here, persistent stale
|
|
|
+ // redeliveries would tight-loop at poll-request speed (the !work
|
|
|
+ // branch above is the only sleep, and work != null skips it).
|
|
|
+ if (atCapacityBeforeSwitch) {
|
|
|
+ const cap = capacityWake.signal()
|
|
|
+ if (pollConfig.non_exclusive_heartbeat_interval_ms > 0) {
|
|
|
+ await heartbeatActiveWorkItems()
|
|
|
+ await sleep(
|
|
|
+ pollConfig.non_exclusive_heartbeat_interval_ms,
|
|
|
+ cap.signal,
|
|
|
+ )
|
|
|
+ } else if (pollConfig.multisession_poll_interval_ms_at_capacity > 0) {
|
|
|
+ await sleep(
|
|
|
+ pollConfig.multisession_poll_interval_ms_at_capacity,
|
|
|
+ cap.signal,
|
|
|
+ )
|
|
|
+ }
|
|
|
+ cap.cleanup()
|
|
|
+ } else {
|
|
|
+ await sleep(1000, loopSignal)
|
|
|
+ }
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ // Decode the work secret for session spawning and to extract the JWT
|
|
|
+ // used for the ack call below.
|
|
|
+ let secret
|
|
|
+ try {
|
|
|
+ secret = decodeWorkSecret(work.secret)
|
|
|
+ } catch (err) {
|
|
|
+ const errMsg = errorMessage(err)
|
|
|
+ logger.logError(
|
|
|
+ `Failed to decode work secret for workId=${work.id}: ${errMsg}`,
|
|
|
+ )
|
|
|
+ logEvent('tengu_bridge_work_secret_failed', {})
|
|
|
+ // Can't ack (needs the JWT we failed to decode). stopWork uses OAuth,
|
|
|
+ // so it's callable here — prevents XAUTOCLAIM from re-delivering this
|
|
|
+ // poisoned item every reclaim_older_than_ms cycle.
|
|
|
+ completedWorkIds.add(work.id)
|
|
|
+ trackCleanup(
|
|
|
+ stopWorkWithRetry(
|
|
|
+ api,
|
|
|
+ environmentId,
|
|
|
+ work.id,
|
|
|
+ logger,
|
|
|
+ backoffConfig.stopWorkBaseDelayMs,
|
|
|
+ ),
|
|
|
+ )
|
|
|
+ // Respect capacity throttle before retrying — without a sleep here,
|
|
|
+ // repeated decode failures at capacity would tight-loop at
|
|
|
+ // poll-request speed (work != null skips the !work sleep above).
|
|
|
+ if (atCapacityBeforeSwitch) {
|
|
|
+ const cap = capacityWake.signal()
|
|
|
+ if (pollConfig.non_exclusive_heartbeat_interval_ms > 0) {
|
|
|
+ await heartbeatActiveWorkItems()
|
|
|
+ await sleep(
|
|
|
+ pollConfig.non_exclusive_heartbeat_interval_ms,
|
|
|
+ cap.signal,
|
|
|
+ )
|
|
|
+ } else if (pollConfig.multisession_poll_interval_ms_at_capacity > 0) {
|
|
|
+ await sleep(
|
|
|
+ pollConfig.multisession_poll_interval_ms_at_capacity,
|
|
|
+ cap.signal,
|
|
|
+ )
|
|
|
+ }
|
|
|
+ cap.cleanup()
|
|
|
+ }
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ // Explicitly acknowledge after committing to handle the work — NOT
|
|
|
+ // before. The at-capacity guard inside case 'session' can break
|
|
|
+ // without spawning; acking there would permanently lose the work.
|
|
|
+ // Ack failures are non-fatal: server re-delivers, and existingHandle
|
|
|
+ // / completedWorkIds paths handle the dedup.
|
|
|
+ const ackWork = async (): Promise<void> => {
|
|
|
+ logForDebugging(`[bridge:work] Acknowledging workId=${work.id}`)
|
|
|
+ try {
|
|
|
+ await api.acknowledgeWork(
|
|
|
+ environmentId,
|
|
|
+ work.id,
|
|
|
+ secret.session_ingress_token,
|
|
|
+ )
|
|
|
+ } catch (err) {
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:work] Acknowledge failed workId=${work.id}: ${errorMessage(err)}`,
|
|
|
+ )
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ const workType: string = work.data.type
|
|
|
+ switch (work.data.type) {
|
|
|
+ case 'healthcheck':
|
|
|
+ await ackWork()
|
|
|
+ logForDebugging('[bridge:work] Healthcheck received')
|
|
|
+ logger.logVerbose('Healthcheck received')
|
|
|
+ break
|
|
|
+ case 'session': {
|
|
|
+ const sessionId = work.data.id
|
|
|
+ try {
|
|
|
+ validateBridgeId(sessionId, 'session_id')
|
|
|
+ } catch {
|
|
|
+ await ackWork()
|
|
|
+ logger.logError(`Invalid session_id received: ${sessionId}`)
|
|
|
+ break
|
|
|
+ }
|
|
|
+
|
|
|
+ // If the session is already running, deliver the fresh token so
|
|
|
+ // the child process can reconnect its WebSocket with the new
|
|
|
+ // session ingress token. This handles the case where the server
|
|
|
+ // re-dispatches work for an existing session after the WS drops.
|
|
|
+ const existingHandle = activeSessions.get(sessionId)
|
|
|
+ if (existingHandle) {
|
|
|
+ existingHandle.updateAccessToken(secret.session_ingress_token)
|
|
|
+ sessionIngressTokens.set(sessionId, secret.session_ingress_token)
|
|
|
+ sessionWorkIds.set(sessionId, work.id)
|
|
|
+ // Re-schedule next refresh from the fresh JWT's expiry. onRefresh
|
|
|
+ // branches on v2Sessions so both v1 and v2 are safe here.
|
|
|
+ tokenRefresh?.schedule(sessionId, secret.session_ingress_token)
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:work] Updated access token for existing sessionId=${sessionId} workId=${work.id}`,
|
|
|
+ )
|
|
|
+ await ackWork()
|
|
|
+ break
|
|
|
+ }
|
|
|
+
|
|
|
+ // At capacity — token refresh for existing sessions is handled
|
|
|
+ // above, but we cannot spawn new ones. The post-switch capacity
|
|
|
+ // sleep will throttle the loop; just break here.
|
|
|
+ if (activeSessions.size >= config.maxSessions) {
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:work] At capacity (${activeSessions.size}/${config.maxSessions}), cannot spawn new session for workId=${work.id}`,
|
|
|
+ )
|
|
|
+ break
|
|
|
+ }
|
|
|
+
|
|
|
+ await ackWork()
|
|
|
+ const spawnStartTime = Date.now()
|
|
|
+
|
|
|
+ // CCR v2 path: register this bridge as the session worker, get the
|
|
|
+ // epoch, and point the child at /v1/code/sessions/{id}. The child
|
|
|
+ // already has the full v2 client (SSETransport + CCRClient) — same
|
|
|
+ // code path environment-manager launches in containers.
|
|
|
+ //
|
|
|
+ // v1 path: Session-Ingress WebSocket. Uses config.sessionIngressUrl
|
|
|
+ // (not secret.api_base_url, which may point to a remote proxy tunnel
|
|
|
+ // that doesn't know about locally-created sessions).
|
|
|
+ let sdkUrl: string
|
|
|
+ let useCcrV2 = false
|
|
|
+ let workerEpoch: number | undefined
|
|
|
+ // Server decides per-session via the work secret; env var is the
|
|
|
+ // ant-dev override (e.g. forcing v2 before the server flag is on).
|
|
|
+ if (
|
|
|
+ secret.use_code_sessions === true ||
|
|
|
+ isEnvTruthy(process.env.CLAUDE_BRIDGE_USE_CCR_V2)
|
|
|
+ ) {
|
|
|
+ sdkUrl = buildCCRv2SdkUrl(config.apiBaseUrl, sessionId)
|
|
|
+ // Retry once on transient failure (network blip, 500) before
|
|
|
+ // permanently giving up and killing the session.
|
|
|
+ for (let attempt = 1; attempt <= 2; attempt++) {
|
|
|
+ try {
|
|
|
+ workerEpoch = await registerWorker(
|
|
|
+ sdkUrl,
|
|
|
+ secret.session_ingress_token,
|
|
|
+ )
|
|
|
+ useCcrV2 = true
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:session] CCR v2: registered worker sessionId=${sessionId} epoch=${workerEpoch} attempt=${attempt}`,
|
|
|
+ )
|
|
|
+ break
|
|
|
+ } catch (err) {
|
|
|
+ const errMsg = errorMessage(err)
|
|
|
+ if (attempt < 2) {
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:session] CCR v2: registerWorker attempt ${attempt} failed, retrying: ${errMsg}`,
|
|
|
+ )
|
|
|
+ await sleep(2_000, loopSignal)
|
|
|
+ if (loopSignal.aborted) break
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ logger.logError(
|
|
|
+ `CCR v2 worker registration failed for session ${sessionId}: ${errMsg}`,
|
|
|
+ )
|
|
|
+ logError(new Error(`registerWorker failed: ${errMsg}`))
|
|
|
+ completedWorkIds.add(work.id)
|
|
|
+ trackCleanup(
|
|
|
+ stopWorkWithRetry(
|
|
|
+ api,
|
|
|
+ environmentId,
|
|
|
+ work.id,
|
|
|
+ logger,
|
|
|
+ backoffConfig.stopWorkBaseDelayMs,
|
|
|
+ ),
|
|
|
+ )
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (!useCcrV2) break
|
|
|
+ } else {
|
|
|
+ sdkUrl = buildSdkUrl(config.sessionIngressUrl, sessionId)
|
|
|
+ }
|
|
|
+
|
|
|
+ // In worktree mode, on-demand sessions get an isolated git worktree
|
|
|
+ // so concurrent sessions don't interfere with each other's file
|
|
|
+ // changes. The pre-created initial session (if any) runs in
|
|
|
+ // config.dir so the user's first session lands in the directory they
|
|
|
+ // invoked `rc` from — matching the old single-session UX.
|
|
|
+ // In same-dir and single-session modes, all sessions share config.dir.
|
|
|
+ // Capture spawnMode before the await below — the `w` key handler
|
|
|
+ // mutates config.spawnMode directly, and createAgentWorktree can
|
|
|
+ // take 1-2s, so reading config.spawnMode after the await can
|
|
|
+ // produce contradictory analytics (spawn_mode:'same-dir', in_worktree:true).
|
|
|
+ const spawnModeAtDecision = config.spawnMode
|
|
|
+ let sessionDir = config.dir
|
|
|
+ let worktreeCreateMs = 0
|
|
|
+ if (
|
|
|
+ spawnModeAtDecision === 'worktree' &&
|
|
|
+ (initialSessionId === undefined ||
|
|
|
+ !sameSessionId(sessionId, initialSessionId))
|
|
|
+ ) {
|
|
|
+ const wtStart = Date.now()
|
|
|
+ try {
|
|
|
+ const wt = await createAgentWorktree(
|
|
|
+ `bridge-${safeFilenameId(sessionId)}`,
|
|
|
+ )
|
|
|
+ worktreeCreateMs = Date.now() - wtStart
|
|
|
+ sessionWorktrees.set(sessionId, {
|
|
|
+ worktreePath: wt.worktreePath,
|
|
|
+ worktreeBranch: wt.worktreeBranch,
|
|
|
+ gitRoot: wt.gitRoot,
|
|
|
+ hookBased: wt.hookBased,
|
|
|
+ })
|
|
|
+ sessionDir = wt.worktreePath
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:session] Created worktree for sessionId=${sessionId} at ${wt.worktreePath}`,
|
|
|
+ )
|
|
|
+ } catch (err) {
|
|
|
+ const errMsg = errorMessage(err)
|
|
|
+ logger.logError(
|
|
|
+ `Failed to create worktree for session ${sessionId}: ${errMsg}`,
|
|
|
+ )
|
|
|
+ logError(new Error(`Worktree creation failed: ${errMsg}`))
|
|
|
+ completedWorkIds.add(work.id)
|
|
|
+ trackCleanup(
|
|
|
+ stopWorkWithRetry(
|
|
|
+ api,
|
|
|
+ environmentId,
|
|
|
+ work.id,
|
|
|
+ logger,
|
|
|
+ backoffConfig.stopWorkBaseDelayMs,
|
|
|
+ ),
|
|
|
+ )
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:session] Spawning sessionId=${sessionId} sdkUrl=${sdkUrl}`,
|
|
|
+ )
|
|
|
+
|
|
|
+ // compat-surface session_* form for logger/Sessions-API calls.
|
|
|
+ // Work poll returns cse_* under v2 compat; convert before spawn so
|
|
|
+ // the onFirstUserMessage callback can close over it.
|
|
|
+ const compatSessionId = toCompatSessionId(sessionId)
|
|
|
+
|
|
|
+ const spawnResult = safeSpawn(
|
|
|
+ spawner,
|
|
|
+ {
|
|
|
+ sessionId,
|
|
|
+ sdkUrl,
|
|
|
+ accessToken: secret.session_ingress_token,
|
|
|
+ useCcrV2,
|
|
|
+ workerEpoch,
|
|
|
+ onFirstUserMessage: text => {
|
|
|
+ // Server-set titles (--name, web rename) win. fetchSessionTitle
|
|
|
+ // runs concurrently; if it already populated titledSessions,
|
|
|
+ // skip. If it hasn't resolved yet, the derived title sticks —
|
|
|
+ // acceptable since the server had no title at spawn time.
|
|
|
+ if (titledSessions.has(compatSessionId)) return
|
|
|
+ titledSessions.add(compatSessionId)
|
|
|
+ const title = deriveSessionTitle(text)
|
|
|
+ logger.setSessionTitle(compatSessionId, title)
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:title] derived title for ${compatSessionId}: ${title}`,
|
|
|
+ )
|
|
|
+ void import('./createSession.js')
|
|
|
+ .then(({ updateBridgeSessionTitle }) =>
|
|
|
+ updateBridgeSessionTitle(compatSessionId, title, {
|
|
|
+ baseUrl: config.apiBaseUrl,
|
|
|
+ }),
|
|
|
+ )
|
|
|
+ .catch(err =>
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:title] failed to update title for ${compatSessionId}: ${err}`,
|
|
|
+ { level: 'error' },
|
|
|
+ ),
|
|
|
+ )
|
|
|
+ },
|
|
|
+ },
|
|
|
+ sessionDir,
|
|
|
+ )
|
|
|
+ if (typeof spawnResult === 'string') {
|
|
|
+ logger.logError(
|
|
|
+ `Failed to spawn session ${sessionId}: ${spawnResult}`,
|
|
|
+ )
|
|
|
+ // Clean up worktree if one was created for this session
|
|
|
+ const wt = sessionWorktrees.get(sessionId)
|
|
|
+ if (wt) {
|
|
|
+ sessionWorktrees.delete(sessionId)
|
|
|
+ trackCleanup(
|
|
|
+ removeAgentWorktree(
|
|
|
+ wt.worktreePath,
|
|
|
+ wt.worktreeBranch,
|
|
|
+ wt.gitRoot,
|
|
|
+ wt.hookBased,
|
|
|
+ ).catch((err: unknown) =>
|
|
|
+ logger.logVerbose(
|
|
|
+ `Failed to remove worktree ${wt.worktreePath}: ${errorMessage(err)}`,
|
|
|
+ ),
|
|
|
+ ),
|
|
|
+ )
|
|
|
+ }
|
|
|
+ completedWorkIds.add(work.id)
|
|
|
+ trackCleanup(
|
|
|
+ stopWorkWithRetry(
|
|
|
+ api,
|
|
|
+ environmentId,
|
|
|
+ work.id,
|
|
|
+ logger,
|
|
|
+ backoffConfig.stopWorkBaseDelayMs,
|
|
|
+ ),
|
|
|
+ )
|
|
|
+ break
|
|
|
+ }
|
|
|
+ const handle = spawnResult
|
|
|
+
|
|
|
+ const spawnDurationMs = Date.now() - spawnStartTime
|
|
|
+ logEvent('tengu_bridge_session_started', {
|
|
|
+ active_sessions: activeSessions.size,
|
|
|
+ spawn_mode:
|
|
|
+ spawnModeAtDecision as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
|
|
+ in_worktree: sessionWorktrees.has(sessionId),
|
|
|
+ spawn_duration_ms: spawnDurationMs,
|
|
|
+ worktree_create_ms: worktreeCreateMs,
|
|
|
+ inProtectedNamespace: isInProtectedNamespace(),
|
|
|
+ })
|
|
|
+ logForDiagnosticsNoPII('info', 'bridge_session_started', {
|
|
|
+ spawn_mode: spawnModeAtDecision,
|
|
|
+ in_worktree: sessionWorktrees.has(sessionId),
|
|
|
+ spawn_duration_ms: spawnDurationMs,
|
|
|
+ worktree_create_ms: worktreeCreateMs,
|
|
|
+ })
|
|
|
+
|
|
|
+ activeSessions.set(sessionId, handle)
|
|
|
+ sessionWorkIds.set(sessionId, work.id)
|
|
|
+ sessionIngressTokens.set(sessionId, secret.session_ingress_token)
|
|
|
+ sessionCompatIds.set(sessionId, compatSessionId)
|
|
|
+
|
|
|
+ const startTime = Date.now()
|
|
|
+ sessionStartTimes.set(sessionId, startTime)
|
|
|
+
|
|
|
+ // Use a generic prompt description since we no longer get startup_context
|
|
|
+ logger.logSessionStart(sessionId, `Session ${sessionId}`)
|
|
|
+
|
|
|
+ // Compute the actual debug file path (mirrors sessionRunner.ts logic)
|
|
|
+ const safeId = safeFilenameId(sessionId)
|
|
|
+ let sessionDebugFile: string | undefined
|
|
|
+ if (config.debugFile) {
|
|
|
+ const ext = config.debugFile.lastIndexOf('.')
|
|
|
+ if (ext > 0) {
|
|
|
+ sessionDebugFile = `${config.debugFile.slice(0, ext)}-${safeId}${config.debugFile.slice(ext)}`
|
|
|
+ } else {
|
|
|
+ sessionDebugFile = `${config.debugFile}-${safeId}`
|
|
|
+ }
|
|
|
+ } else if (config.verbose || process.env.USER_TYPE === 'ant') {
|
|
|
+ sessionDebugFile = join(
|
|
|
+ tmpdir(),
|
|
|
+ 'claude',
|
|
|
+ `bridge-session-${safeId}.log`,
|
|
|
+ )
|
|
|
+ }
|
|
|
+
|
|
|
+ if (sessionDebugFile) {
|
|
|
+ logger.logVerbose(`Debug log: ${sessionDebugFile}`)
|
|
|
+ }
|
|
|
+
|
|
|
+ // Register in the sessions Map before starting status updates so the
|
|
|
+ // first render tick shows the correct count and bullet list in sync.
|
|
|
+ logger.addSession(
|
|
|
+ compatSessionId,
|
|
|
+ getRemoteSessionUrl(compatSessionId, config.sessionIngressUrl),
|
|
|
+ )
|
|
|
+
|
|
|
+ // Start live status updates and transition to "Attached" state.
|
|
|
+ startStatusUpdates()
|
|
|
+ logger.setAttached(compatSessionId)
|
|
|
+
|
|
|
+ // One-shot title fetch. If the session already has a title (set via
|
|
|
+ // --name, web rename, or /remote-control), display it and mark as
|
|
|
+ // titled so the first-user-message fallback doesn't overwrite it.
|
|
|
+ // Otherwise onFirstUserMessage derives one from the first prompt.
|
|
|
+ void fetchSessionTitle(compatSessionId, config.apiBaseUrl)
|
|
|
+ .then(title => {
|
|
|
+ if (title && activeSessions.has(sessionId)) {
|
|
|
+ titledSessions.add(compatSessionId)
|
|
|
+ logger.setSessionTitle(compatSessionId, title)
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:title] server title for ${compatSessionId}: ${title}`,
|
|
|
+ )
|
|
|
+ }
|
|
|
+ })
|
|
|
+ .catch(err =>
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:title] failed to fetch title for ${compatSessionId}: ${err}`,
|
|
|
+ { level: 'error' },
|
|
|
+ ),
|
|
|
+ )
|
|
|
+
|
|
|
+ // Start per-session timeout watchdog
|
|
|
+ const timeoutMs =
|
|
|
+ config.sessionTimeoutMs ?? DEFAULT_SESSION_TIMEOUT_MS
|
|
|
+ if (timeoutMs > 0) {
|
|
|
+ const timer = setTimeout(
|
|
|
+ onSessionTimeout,
|
|
|
+ timeoutMs,
|
|
|
+ sessionId,
|
|
|
+ timeoutMs,
|
|
|
+ logger,
|
|
|
+ timedOutSessions,
|
|
|
+ handle,
|
|
|
+ )
|
|
|
+ sessionTimers.set(sessionId, timer)
|
|
|
+ }
|
|
|
+
|
|
|
+ // Schedule proactive token refresh before the JWT expires.
|
|
|
+ // onRefresh branches on v2Sessions: v1 delivers OAuth to the
|
|
|
+ // child, v2 triggers server re-dispatch via reconnectSession.
|
|
|
+ if (useCcrV2) {
|
|
|
+ v2Sessions.add(sessionId)
|
|
|
+ }
|
|
|
+ tokenRefresh?.schedule(sessionId, secret.session_ingress_token)
|
|
|
+
|
|
|
+ void handle.done.then(onSessionDone(sessionId, startTime, handle))
|
|
|
+ break
|
|
|
+ }
|
|
|
+ default:
|
|
|
+ await ackWork()
|
|
|
+ // Gracefully ignore unknown work types. The backend may send new
|
|
|
+ // types before the bridge client is updated.
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:work] Unknown work type: ${workType}, skipping`,
|
|
|
+ )
|
|
|
+ break
|
|
|
+ }
|
|
|
+
|
|
|
+ // When at capacity, throttle the loop. The switch above still runs so
|
|
|
+ // existing-session token refreshes are processed, but we sleep here
|
|
|
+ // to avoid busy-looping. Include the capacity wake signal so the
|
|
|
+ // sleep is interrupted immediately when a session completes.
|
|
|
+ if (atCapacityBeforeSwitch) {
|
|
|
+ const cap = capacityWake.signal()
|
|
|
+ if (pollConfig.non_exclusive_heartbeat_interval_ms > 0) {
|
|
|
+ await heartbeatActiveWorkItems()
|
|
|
+ await sleep(
|
|
|
+ pollConfig.non_exclusive_heartbeat_interval_ms,
|
|
|
+ cap.signal,
|
|
|
+ )
|
|
|
+ } else if (pollConfig.multisession_poll_interval_ms_at_capacity > 0) {
|
|
|
+ await sleep(
|
|
|
+ pollConfig.multisession_poll_interval_ms_at_capacity,
|
|
|
+ cap.signal,
|
|
|
+ )
|
|
|
+ }
|
|
|
+ cap.cleanup()
|
|
|
+ }
|
|
|
+ } catch (err) {
|
|
|
+ if (loopSignal.aborted) {
|
|
|
+ break
|
|
|
+ }
|
|
|
+
|
|
|
+ // Fatal errors (401/403) — no point retrying, auth won't fix itself
|
|
|
+ if (err instanceof BridgeFatalError) {
|
|
|
+ fatalExit = true
|
|
|
+ // Server-enforced expiry gets a clean status message, not an error
|
|
|
+ if (isExpiredErrorType(err.errorType)) {
|
|
|
+ logger.logStatus(err.message)
|
|
|
+ } else if (isSuppressible403(err)) {
|
|
|
+ // Cosmetic 403 errors (e.g., external_poll_sessions scope,
|
|
|
+ // environments:manage permission) — don't show to user
|
|
|
+ logForDebugging(`[bridge:work] Suppressed 403 error: ${err.message}`)
|
|
|
+ } else {
|
|
|
+ logger.logError(err.message)
|
|
|
+ logError(err)
|
|
|
+ }
|
|
|
+ logEvent('tengu_bridge_fatal_error', {
|
|
|
+ status: err.status,
|
|
|
+ error_type:
|
|
|
+ err.errorType as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
|
|
+ })
|
|
|
+ logForDiagnosticsNoPII(
|
|
|
+ isExpiredErrorType(err.errorType) ? 'info' : 'error',
|
|
|
+ 'bridge_fatal_error',
|
|
|
+ { status: err.status, error_type: err.errorType },
|
|
|
+ )
|
|
|
+ break
|
|
|
+ }
|
|
|
+
|
|
|
+ const errMsg = describeAxiosError(err)
|
|
|
+
|
|
|
+ if (isConnectionError(err) || isServerError(err)) {
|
|
|
+ const now = Date.now()
|
|
|
+
|
|
|
+ // Detect system sleep/wake: if the gap since the last poll error
|
|
|
+ // greatly exceeds the expected backoff, the machine likely slept.
|
|
|
+ // Reset error tracking so the bridge retries with a fresh budget.
|
|
|
+ if (
|
|
|
+ lastPollErrorTime !== null &&
|
|
|
+ now - lastPollErrorTime > pollSleepDetectionThresholdMs(backoffConfig)
|
|
|
+ ) {
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:work] Detected system sleep (${Math.round((now - lastPollErrorTime) / 1000)}s gap), resetting error budget`,
|
|
|
+ )
|
|
|
+ logForDiagnosticsNoPII('info', 'bridge_poll_sleep_detected', {
|
|
|
+ gapMs: now - lastPollErrorTime,
|
|
|
+ })
|
|
|
+ connErrorStart = null
|
|
|
+ connBackoff = 0
|
|
|
+ generalErrorStart = null
|
|
|
+ generalBackoff = 0
|
|
|
+ }
|
|
|
+ lastPollErrorTime = now
|
|
|
+
|
|
|
+ if (!connErrorStart) {
|
|
|
+ connErrorStart = now
|
|
|
+ }
|
|
|
+ const elapsed = now - connErrorStart
|
|
|
+ if (elapsed >= backoffConfig.connGiveUpMs) {
|
|
|
+ logger.logError(
|
|
|
+ `Server unreachable for ${Math.round(elapsed / 60_000)} minutes, giving up.`,
|
|
|
+ )
|
|
|
+ logEvent('tengu_bridge_poll_give_up', {
|
|
|
+ error_type:
|
|
|
+ 'connection' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
|
|
+ elapsed_ms: elapsed,
|
|
|
+ })
|
|
|
+ logForDiagnosticsNoPII('error', 'bridge_poll_give_up', {
|
|
|
+ error_type: 'connection',
|
|
|
+ elapsed_ms: elapsed,
|
|
|
+ })
|
|
|
+ fatalExit = true
|
|
|
+ break
|
|
|
+ }
|
|
|
+
|
|
|
+ // Reset the other track when switching error types
|
|
|
+ generalErrorStart = null
|
|
|
+ generalBackoff = 0
|
|
|
+
|
|
|
+ connBackoff = connBackoff
|
|
|
+ ? Math.min(connBackoff * 2, backoffConfig.connCapMs)
|
|
|
+ : backoffConfig.connInitialMs
|
|
|
+ const delay = addJitter(connBackoff)
|
|
|
+ logger.logVerbose(
|
|
|
+ `Connection error, retrying in ${formatDelay(delay)} (${Math.round(elapsed / 1000)}s elapsed): ${errMsg}`,
|
|
|
+ )
|
|
|
+ logger.updateReconnectingStatus(
|
|
|
+ formatDelay(delay),
|
|
|
+ formatDuration(elapsed),
|
|
|
+ )
|
|
|
+ // The poll_due heartbeat-loop exit leaves a healthy lease exposed to
|
|
|
+ // this backoff path. Heartbeat before each sleep so /poll outages
|
|
|
+ // (the VerifyEnvironmentSecretAuth DB path heartbeat was introduced
|
|
|
+ // to avoid) don't kill the 300s lease TTL. No-op when activeSessions
|
|
|
+ // is empty or heartbeat is disabled.
|
|
|
+ if (getPollIntervalConfig().non_exclusive_heartbeat_interval_ms > 0) {
|
|
|
+ await heartbeatActiveWorkItems()
|
|
|
+ }
|
|
|
+ await sleep(delay, loopSignal)
|
|
|
+ } else {
|
|
|
+ const now = Date.now()
|
|
|
+
|
|
|
+ // Sleep detection for general errors (same logic as connection errors)
|
|
|
+ if (
|
|
|
+ lastPollErrorTime !== null &&
|
|
|
+ now - lastPollErrorTime > pollSleepDetectionThresholdMs(backoffConfig)
|
|
|
+ ) {
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:work] Detected system sleep (${Math.round((now - lastPollErrorTime) / 1000)}s gap), resetting error budget`,
|
|
|
+ )
|
|
|
+ logForDiagnosticsNoPII('info', 'bridge_poll_sleep_detected', {
|
|
|
+ gapMs: now - lastPollErrorTime,
|
|
|
+ })
|
|
|
+ connErrorStart = null
|
|
|
+ connBackoff = 0
|
|
|
+ generalErrorStart = null
|
|
|
+ generalBackoff = 0
|
|
|
+ }
|
|
|
+ lastPollErrorTime = now
|
|
|
+
|
|
|
+ if (!generalErrorStart) {
|
|
|
+ generalErrorStart = now
|
|
|
+ }
|
|
|
+ const elapsed = now - generalErrorStart
|
|
|
+ if (elapsed >= backoffConfig.generalGiveUpMs) {
|
|
|
+ logger.logError(
|
|
|
+ `Persistent errors for ${Math.round(elapsed / 60_000)} minutes, giving up.`,
|
|
|
+ )
|
|
|
+ logEvent('tengu_bridge_poll_give_up', {
|
|
|
+ error_type:
|
|
|
+ 'general' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
|
|
+ elapsed_ms: elapsed,
|
|
|
+ })
|
|
|
+ logForDiagnosticsNoPII('error', 'bridge_poll_give_up', {
|
|
|
+ error_type: 'general',
|
|
|
+ elapsed_ms: elapsed,
|
|
|
+ })
|
|
|
+ fatalExit = true
|
|
|
+ break
|
|
|
+ }
|
|
|
+
|
|
|
+ // Reset the other track when switching error types
|
|
|
+ connErrorStart = null
|
|
|
+ connBackoff = 0
|
|
|
+
|
|
|
+ generalBackoff = generalBackoff
|
|
|
+ ? Math.min(generalBackoff * 2, backoffConfig.generalCapMs)
|
|
|
+ : backoffConfig.generalInitialMs
|
|
|
+ const delay = addJitter(generalBackoff)
|
|
|
+ logger.logVerbose(
|
|
|
+ `Poll failed, retrying in ${formatDelay(delay)} (${Math.round(elapsed / 1000)}s elapsed): ${errMsg}`,
|
|
|
+ )
|
|
|
+ logger.updateReconnectingStatus(
|
|
|
+ formatDelay(delay),
|
|
|
+ formatDuration(elapsed),
|
|
|
+ )
|
|
|
+ if (getPollIntervalConfig().non_exclusive_heartbeat_interval_ms > 0) {
|
|
|
+ await heartbeatActiveWorkItems()
|
|
|
+ }
|
|
|
+ await sleep(delay, loopSignal)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // Clean up
|
|
|
+ stopStatusUpdates()
|
|
|
+ logger.clearStatus()
|
|
|
+
|
|
|
+ const loopDurationMs = Date.now() - loopStartTime
|
|
|
+ logEvent('tengu_bridge_shutdown', {
|
|
|
+ active_sessions: activeSessions.size,
|
|
|
+ loop_duration_ms: loopDurationMs,
|
|
|
+ })
|
|
|
+ logForDiagnosticsNoPII('info', 'bridge_shutdown', {
|
|
|
+ active_sessions: activeSessions.size,
|
|
|
+ loop_duration_ms: loopDurationMs,
|
|
|
+ })
|
|
|
+
|
|
|
+ // Graceful shutdown: kill active sessions, report them as interrupted,
|
|
|
+ // archive sessions, then deregister the environment so the web UI shows
|
|
|
+ // the bridge as offline.
|
|
|
+
|
|
|
+ // Collect all session IDs to archive on exit. This includes:
|
|
|
+ // 1. Active sessions (snapshot before killing — onSessionDone clears maps)
|
|
|
+ // 2. The initial auto-created session (may never have had work dispatched)
|
|
|
+ // api.archiveSession is idempotent (409 if already archived), so
|
|
|
+ // double-archiving is safe.
|
|
|
+ const sessionsToArchive = new Set(activeSessions.keys())
|
|
|
+ if (initialSessionId) {
|
|
|
+ sessionsToArchive.add(initialSessionId)
|
|
|
+ }
|
|
|
+ // Snapshot before killing — onSessionDone clears sessionCompatIds.
|
|
|
+ const compatIdSnapshot = new Map(sessionCompatIds)
|
|
|
+
|
|
|
+ if (activeSessions.size > 0) {
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:shutdown] Shutting down ${activeSessions.size} active session(s)`,
|
|
|
+ )
|
|
|
+ logger.logStatus(
|
|
|
+ `Shutting down ${activeSessions.size} active session(s)\u2026`,
|
|
|
+ )
|
|
|
+
|
|
|
+ // Snapshot work IDs before killing — onSessionDone clears the maps when
|
|
|
+ // each child exits, so we need a copy for the stopWork calls below.
|
|
|
+ const shutdownWorkIds = new Map(sessionWorkIds)
|
|
|
+
|
|
|
+ for (const [sessionId, handle] of activeSessions.entries()) {
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:shutdown] Sending SIGTERM to sessionId=${sessionId}`,
|
|
|
+ )
|
|
|
+ handle.kill()
|
|
|
+ }
|
|
|
+
|
|
|
+ const timeout = new AbortController()
|
|
|
+ await Promise.race([
|
|
|
+ Promise.allSettled([...activeSessions.values()].map(h => h.done)),
|
|
|
+ sleep(backoffConfig.shutdownGraceMs ?? 30_000, timeout.signal),
|
|
|
+ ])
|
|
|
+ timeout.abort()
|
|
|
+
|
|
|
+ // SIGKILL any processes that didn't respond to SIGTERM within the grace window
|
|
|
+ for (const [sid, handle] of activeSessions.entries()) {
|
|
|
+ logForDebugging(`[bridge:shutdown] Force-killing stuck sessionId=${sid}`)
|
|
|
+ handle.forceKill()
|
|
|
+ }
|
|
|
+
|
|
|
+ // Clear any remaining session timeout and refresh timers
|
|
|
+ for (const timer of sessionTimers.values()) {
|
|
|
+ clearTimeout(timer)
|
|
|
+ }
|
|
|
+ sessionTimers.clear()
|
|
|
+ tokenRefresh?.cancelAll()
|
|
|
+
|
|
|
+ // Clean up any remaining worktrees from active sessions.
|
|
|
+ // Snapshot and clear the map first so onSessionDone (which may fire
|
|
|
+ // during the await below when handle.done resolves) won't try to
|
|
|
+ // remove the same worktrees again.
|
|
|
+ if (sessionWorktrees.size > 0) {
|
|
|
+ const remainingWorktrees = [...sessionWorktrees.values()]
|
|
|
+ sessionWorktrees.clear()
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:shutdown] Cleaning up ${remainingWorktrees.length} worktree(s)`,
|
|
|
+ )
|
|
|
+ await Promise.allSettled(
|
|
|
+ remainingWorktrees.map(wt =>
|
|
|
+ removeAgentWorktree(
|
|
|
+ wt.worktreePath,
|
|
|
+ wt.worktreeBranch,
|
|
|
+ wt.gitRoot,
|
|
|
+ wt.hookBased,
|
|
|
+ ),
|
|
|
+ ),
|
|
|
+ )
|
|
|
+ }
|
|
|
+
|
|
|
+ // Stop all active work items so the server knows they're done
|
|
|
+ await Promise.allSettled(
|
|
|
+ [...shutdownWorkIds.entries()].map(([sessionId, workId]) => {
|
|
|
+ return api
|
|
|
+ .stopWork(environmentId, workId, true)
|
|
|
+ .catch(err =>
|
|
|
+ logger.logVerbose(
|
|
|
+ `Failed to stop work ${workId} for session ${sessionId}: ${errorMessage(err)}`,
|
|
|
+ ),
|
|
|
+ )
|
|
|
+ }),
|
|
|
+ )
|
|
|
+ }
|
|
|
+
|
|
|
+ // Ensure all in-flight cleanup (stopWork, worktree removal) from
|
|
|
+ // onSessionDone completes before deregistering — otherwise
|
|
|
+ // process.exit() can kill them mid-flight.
|
|
|
+ if (pendingCleanups.size > 0) {
|
|
|
+ await Promise.allSettled([...pendingCleanups])
|
|
|
+ }
|
|
|
+
|
|
|
+ // In single-session mode with a known session, leave the session and
|
|
|
+ // environment alive so `claude remote-control --session-id=<id>` can resume.
|
|
|
+ // The backend GCs stale environments via a 4h TTL (BRIDGE_LAST_POLL_TTL).
|
|
|
+ // Archiving the session or deregistering the environment would make the
|
|
|
+ // printed resume command a lie — deregister deletes Firestore + Redis stream.
|
|
|
+ // Skip when the loop exited fatally (env expired, auth failed, give-up) —
|
|
|
+ // resume is impossible in those cases and the message would contradict the
|
|
|
+ // error already printed.
|
|
|
+ // feature('KAIROS') gate: --session-id is ant-only; without the gate,
|
|
|
+ // revert to the pre-PR behavior (archive + deregister on every shutdown).
|
|
|
+ if (
|
|
|
+ feature('KAIROS') &&
|
|
|
+ config.spawnMode === 'single-session' &&
|
|
|
+ initialSessionId &&
|
|
|
+ !fatalExit
|
|
|
+ ) {
|
|
|
+ logger.logStatus(
|
|
|
+ `Resume this session by running \`claude remote-control --continue\``,
|
|
|
+ )
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:shutdown] Skipping archive+deregister to allow resume of session ${initialSessionId}`,
|
|
|
+ )
|
|
|
+ return
|
|
|
+ }
|
|
|
+
|
|
|
+ // Archive all known sessions so they don't linger as idle/running on the
|
|
|
+ // server after the bridge goes offline.
|
|
|
+ if (sessionsToArchive.size > 0) {
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:shutdown] Archiving ${sessionsToArchive.size} session(s)`,
|
|
|
+ )
|
|
|
+ await Promise.allSettled(
|
|
|
+ [...sessionsToArchive].map(sessionId =>
|
|
|
+ api
|
|
|
+ .archiveSession(
|
|
|
+ compatIdSnapshot.get(sessionId) ?? toCompatSessionId(sessionId),
|
|
|
+ )
|
|
|
+ .catch(err =>
|
|
|
+ logger.logVerbose(
|
|
|
+ `Failed to archive session ${sessionId}: ${errorMessage(err)}`,
|
|
|
+ ),
|
|
|
+ ),
|
|
|
+ ),
|
|
|
+ )
|
|
|
+ }
|
|
|
+
|
|
|
+ // Deregister the environment so the web UI shows the bridge as offline
|
|
|
+ // and the Redis stream is cleaned up.
|
|
|
+ try {
|
|
|
+ await api.deregisterEnvironment(environmentId)
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:shutdown] Environment deregistered, bridge offline`,
|
|
|
+ )
|
|
|
+ logger.logVerbose('Environment deregistered.')
|
|
|
+ } catch (err) {
|
|
|
+ logger.logVerbose(`Failed to deregister environment: ${errorMessage(err)}`)
|
|
|
+ }
|
|
|
+
|
|
|
+ // Clear the crash-recovery pointer — the env is gone, pointer would be
|
|
|
+ // stale. The early return above (resumable SIGINT shutdown) skips this,
|
|
|
+ // leaving the pointer as a backup for the printed --session-id hint.
|
|
|
+ const { clearBridgePointer } = await import('./bridgePointer.js')
|
|
|
+ await clearBridgePointer(config.dir)
|
|
|
+
|
|
|
+ logger.logVerbose('Environment offline.')
|
|
|
+}
|
|
|
+
|
|
|
/** Error `code` values that are classified as connection-level failures. */
const CONNECTION_ERROR_CODES = new Set([
  'ECONNREFUSED',
  'ECONNRESET',
  'ETIMEDOUT',
  'ENETUNREACH',
  'EHOSTUNREACH',
])

/**
 * Report whether a thrown value carries a string `code` property that is one
 * of the entries in CONNECTION_ERROR_CODES.
 *
 * @param err - Any caught value; non-objects and objects without a string
 *   `code` are never treated as connection errors.
 * @returns `true` only when `err.code` is a listed connection error code.
 */
export function isConnectionError(err: unknown): boolean {
  // Anything that is not a non-null object with a `code` key cannot match.
  if (!err || typeof err !== 'object' || !('code' in err)) {
    return false
  }
  const { code } = err
  return typeof code === 'string' && CONNECTION_ERROR_CODES.has(code)
}
|
|
|
+
|
|
|
/**
 * Detect HTTP 5xx errors from axios, which are identified here by the error
 * object's `code` being exactly `'ERR_BAD_RESPONSE'`.
 *
 * @param err - Any caught value.
 * @returns `true` only for a non-null object whose `code` is the string
 *   `'ERR_BAD_RESPONSE'`.
 */
export function isServerError(err: unknown): boolean {
  if (typeof err !== 'object' || err === null) {
    return false
  }
  // Strict equality against a string literal already implies `code` is a string.
  return 'code' in err && err.code === 'ERR_BAD_RESPONSE'
}
|
|
|
+
|
|
|
/**
 * Apply random jitter of up to ±25% to a delay.
 *
 * @param ms - Base delay in milliseconds.
 * @returns The jittered delay, clamped so it is never negative.
 */
function addJitter(ms: number): number {
  // Math.random() is in [0, 1), so this factor lies in [-1, 1).
  const swing = 2 * Math.random() - 1
  const jittered = ms + ms * 0.25 * swing
  return Math.max(0, jittered)
}
|
|
|
+
|
|
|
/**
 * Render a delay for log/status output: seconds with one decimal place once
 * the delay reaches a full second, whole milliseconds below that.
 *
 * The unit is chosen from the *rounded* millisecond value so a fractional
 * delay just under one second (e.g. 999.6, which jittered delays can produce)
 * is shown as "1.0s" rather than the misleading "1000ms".
 *
 * @param ms - Delay in milliseconds; may be fractional.
 * @returns A short human-readable string such as "250ms" or "1.5s".
 */
function formatDelay(ms: number): string {
  const wholeMs = Math.round(ms)
  return wholeMs >= 1000 ? `${(ms / 1000).toFixed(1)}s` : `${wholeMs}ms`
}
|
|
|
+
|
|
|
/**
 * Tell the server a work item has ended, retrying on failure so the item does
 * not linger server-side as a zombie.
 *
 * Makes up to 3 attempts. Between attempts it waits `baseDelayMs`, then
 * `2 * baseDelayMs` (each passed through addJitter), so there are at most two
 * waits. A BridgeFatalError ends the retries immediately because auth and
 * permission failures won't be fixed by retrying. This function logs failures
 * rather than rethrowing errors from `api.stopWork`.
 *
 * @param api - Bridge API client used to issue the stopWork call.
 * @param environmentId - Environment the work item belongs to.
 * @param workId - Work item to stop.
 * @param logger - Receives verbose retry notices and final error messages.
 * @param baseDelayMs - Wait before the second attempt; doubled for the third.
 */
async function stopWorkWithRetry(
  api: BridgeApiClient,
  environmentId: string,
  workId: string,
  logger: BridgeLogger,
  baseDelayMs = 1000,
): Promise<void> {
  const maxTries = 3
  let tryNo = 0

  while (tryNo < maxTries) {
    tryNo += 1
    try {
      await api.stopWork(environmentId, workId, false)
      logForDebugging(
        `[bridge:work] stopWork succeeded for workId=${workId} on attempt ${tryNo}/${maxTries}`,
      )
      return
    } catch (err) {
      // Auth/permission errors won't be fixed by retrying
      if (err instanceof BridgeFatalError) {
        if (isSuppressible403(err)) {
          logForDebugging(
            `[bridge:work] Suppressed stopWork 403 for ${workId}: ${err.message}`,
          )
        } else {
          logger.logError(`Failed to stop work ${workId}: ${err.message}`)
        }
        logForDiagnosticsNoPII('error', 'bridge_stop_work_failed', {
          attempts: tryNo,
          fatal: true,
        })
        return
      }

      const reason = errorMessage(err)

      // Out of attempts: report the final failure and stop.
      if (tryNo >= maxTries) {
        logger.logError(
          `Failed to stop work ${workId} after ${maxTries} attempts: ${reason}`,
        )
        logForDiagnosticsNoPII('error', 'bridge_stop_work_failed', {
          attempts: maxTries,
        })
        return
      }

      // Exponential backoff: the base delay doubles with each failed attempt.
      const waitMs = addJitter(baseDelayMs * 2 ** (tryNo - 1))
      logger.logVerbose(
        `Failed to stop work ${workId} (attempt ${tryNo}/${maxTries}), retrying in ${formatDelay(waitMs)}: ${reason}`,
      )
      await sleep(waitMs)
    }
  }
}
|
|
|
+
|
|
|
/**
 * Handle a session that exceeded its time limit: write a debug log line, emit
 * the `tengu_bridge_session_timeout` event, report the session as failed
 * through the logger, record its id in `timedOutSessions`, and finally kill
 * the session handle.
 *
 * @param sessionId - Id of the session that timed out.
 * @param timeoutMs - The limit that was exceeded, in milliseconds.
 * @param logger - Receives the session-failed report.
 * @param timedOutSessions - Set the session id is added to (mutated).
 * @param handle - Handle of the running session; `kill()` is called on it.
 */
function onSessionTimeout(
  sessionId: string,
  timeoutMs: number,
  logger: BridgeLogger,
  timedOutSessions: Set<string>,
  handle: SessionHandle,
): void {
  const limit = formatDuration(timeoutMs)

  logForDebugging(
    `[bridge:session] sessionId=${sessionId} timed out after ${limit}`,
  )
  logEvent('tengu_bridge_session_timeout', { timeout_ms: timeoutMs })
  logger.logSessionFailed(sessionId, `Session timed out after ${limit}`)

  // The id is recorded before the handle is killed.
  timedOutSessions.add(sessionId)
  handle.kill()
}
|
|
|
+
|
|
|
/**
 * Result of parsing the `claude remote-control` command-line arguments.
 * When `error` is set the remaining fields hold whatever had been collected
 * up to the point parsing stopped.
 */
export type ParsedArgs = {
  /** True when --verbose or -v was given. */
  verbose: boolean
  /** Set by --sandbox, cleared by --no-sandbox; false when neither is given. */
  sandbox: boolean
  /** --debug-file path, resolved to an absolute path. */
  debugFile?: string
  /** --session-timeout value (given in seconds) converted to milliseconds. */
  sessionTimeoutMs?: number
  /** Raw --permission-mode value; not validated by the parser. */
  permissionMode?: string
  /** --name value. */
  name?: string
  /** Value passed to --spawn (if any); undefined if no --spawn flag was given. */
  spawnMode: SpawnMode | undefined
  /** Value passed to --capacity (if any); undefined if no --capacity flag was given. */
  capacity: number | undefined
  /** --[no-]create-session-in-dir override; undefined = use default (on). */
  createSessionInDir: boolean | undefined
  /** Resume an existing session instead of creating a new one. */
  sessionId?: string
  /** Resume the last session in this directory (reads bridge-pointer.json). */
  continueSession: boolean
  /** True when --help or -h was given. */
  help: boolean
  /** Message describing why parsing failed; absent on success. */
  error?: string
}
|
|
|
+
|
|
|
/** Values the user may pass to --spawn, in the order shown in error messages. */
const SPAWN_FLAG_VALUES = ['session', 'same-dir', 'worktree'] as const

/**
 * Translate a raw --spawn value into a spawn mode.
 *
 * @param raw - Text supplied to --spawn, or undefined when it was missing.
 * @returns 'single-session' for "session", the value itself for "same-dir"
 *   and "worktree"; for anything else, an error message listing the accepted
 *   values.
 */
function parseSpawnValue(raw: string | undefined): SpawnMode | string {
  switch (raw) {
    case 'session':
      // The CLI spelling differs from the internal mode name.
      return 'single-session'
    case 'same-dir':
    case 'worktree':
      return raw
    default: {
      const shown = raw ?? '<missing>'
      return `--spawn requires one of: ${SPAWN_FLAG_VALUES.join(', ')} (got: ${shown})`
    }
  }
}
|
|
|
+
|
|
|
/**
 * Parse a raw --capacity value.
 *
 * Only a plain decimal integer (optional leading "+", surrounding whitespace
 * ignored) that is at least 1 and a safe integer is accepted. `parseInt` on
 * its own would silently accept inputs such as "4x" (as 4) or "2.5" (as 2),
 * so the text is checked against a digits-only pattern first.
 *
 * @param raw - Text supplied to --capacity, or undefined when it was missing.
 * @returns The capacity as a number, or an error message string when the
 *   value is missing or not a positive integer.
 */
function parseCapacityValue(raw: string | undefined): number | string {
  const text = raw?.trim() ?? ''
  const n = /^\+?\d+$/.test(text) ? Number.parseInt(text, 10) : NaN
  if (!Number.isSafeInteger(n) || n < 1) {
    return `--capacity requires a positive integer (got: ${raw ?? '<missing>'})`
  }
  return n
}
|
|
|
+
|
|
|
/**
 * Parse `claude remote-control` command-line arguments.
 *
 * Failures are reported through the returned `error` field rather than
 * thrown: on the first unknown argument, invalid value, or invalid flag
 * combination, parsing stops and the values collected so far are returned
 * together with the message. --help does not short-circuit; later arguments
 * are still parsed.
 *
 * Repeated flags take the last value, except --spawn and --capacity, which
 * may each be given only once. --session-id, --continue and -c are
 * recognised only when feature('KAIROS') is on; otherwise they are reported
 * as unknown arguments.
 *
 * --session-timeout must parse to a non-negative number of seconds. A
 * non-numeric or negative value is reported as an error instead of being
 * stored as NaN or a negative timeout.
 *
 * @param args - Arguments following the `remote-control` subcommand.
 * @returns The parsed flags; `error` is set when parsing failed.
 */
export function parseArgs(args: string[]): ParsedArgs {
  let verbose = false
  let sandbox = false
  let debugFile: string | undefined
  let sessionTimeoutMs: number | undefined
  let permissionMode: string | undefined
  let name: string | undefined
  let help = false
  let spawnMode: SpawnMode | undefined
  let capacity: number | undefined
  let createSessionInDir: boolean | undefined
  let sessionId: string | undefined
  let continueSession = false

  for (let i = 0; i < args.length; i++) {
    const arg = args[i]!
    if (arg === '--help' || arg === '-h') {
      help = true
    } else if (arg === '--verbose' || arg === '-v') {
      verbose = true
    } else if (arg === '--sandbox') {
      sandbox = true
    } else if (arg === '--no-sandbox') {
      sandbox = false
    } else if (arg === '--debug-file' && i + 1 < args.length) {
      debugFile = resolve(args[++i]!)
    } else if (arg.startsWith('--debug-file=')) {
      debugFile = resolve(arg.slice('--debug-file='.length))
    } else if (arg === '--session-timeout' && i + 1 < args.length) {
      const v = parseSessionTimeoutMs(args[++i]!)
      if (typeof v === 'number') sessionTimeoutMs = v
      else return makeError(v)
    } else if (arg.startsWith('--session-timeout=')) {
      const v = parseSessionTimeoutMs(arg.slice('--session-timeout='.length))
      if (typeof v === 'number') sessionTimeoutMs = v
      else return makeError(v)
    } else if (arg === '--permission-mode' && i + 1 < args.length) {
      permissionMode = args[++i]!
    } else if (arg.startsWith('--permission-mode=')) {
      permissionMode = arg.slice('--permission-mode='.length)
    } else if (arg === '--name' && i + 1 < args.length) {
      name = args[++i]!
    } else if (arg.startsWith('--name=')) {
      name = arg.slice('--name='.length)
    } else if (
      feature('KAIROS') &&
      arg === '--session-id' &&
      i + 1 < args.length
    ) {
      sessionId = args[++i]!
      if (!sessionId) {
        return makeError('--session-id requires a value')
      }
    } else if (feature('KAIROS') && arg.startsWith('--session-id=')) {
      sessionId = arg.slice('--session-id='.length)
      if (!sessionId) {
        return makeError('--session-id requires a value')
      }
    } else if (feature('KAIROS') && (arg === '--continue' || arg === '-c')) {
      continueSession = true
    } else if (arg === '--spawn' || arg.startsWith('--spawn=')) {
      if (spawnMode !== undefined) {
        return makeError('--spawn may only be specified once')
      }
      // With the space-separated form the next argument is consumed as the
      // value; it is undefined when --spawn is the last argument.
      const raw = arg.startsWith('--spawn=')
        ? arg.slice('--spawn='.length)
        : args[++i]
      const v = parseSpawnValue(raw)
      if (v === 'single-session' || v === 'same-dir' || v === 'worktree') {
        spawnMode = v
      } else {
        return makeError(v)
      }
    } else if (arg === '--capacity' || arg.startsWith('--capacity=')) {
      if (capacity !== undefined) {
        return makeError('--capacity may only be specified once')
      }
      const raw = arg.startsWith('--capacity=')
        ? arg.slice('--capacity='.length)
        : args[++i]
      const v = parseCapacityValue(raw)
      if (typeof v === 'number') capacity = v
      else return makeError(v)
    } else if (arg === '--create-session-in-dir') {
      createSessionInDir = true
    } else if (arg === '--no-create-session-in-dir') {
      createSessionInDir = false
    } else {
      return makeError(
        `Unknown argument: ${arg}\nRun 'claude remote-control --help' for usage.`,
      )
    }
  }

  // Note: gate check for --spawn/--capacity/--create-session-in-dir is in bridgeMain
  // (gate-aware error). Flag cross-validation happens here.

  // --capacity only makes sense for multi-session modes.
  if (spawnMode === 'single-session' && capacity !== undefined) {
    return makeError(
      `--capacity cannot be used with --spawn=session (single-session mode has fixed capacity 1).`,
    )
  }

  // --session-id / --continue resume a specific session on its original
  // environment; incompatible with spawn-related flags (which configure
  // fresh session creation), and mutually exclusive with each other.
  if (
    (sessionId || continueSession) &&
    (spawnMode !== undefined ||
      capacity !== undefined ||
      createSessionInDir !== undefined)
  ) {
    return makeError(
      `--session-id and --continue cannot be used with --spawn, --capacity, or --create-session-in-dir.`,
    )
  }
  if (sessionId && continueSession) {
    return makeError(`--session-id and --continue cannot be used together.`)
  }

  return {
    verbose,
    sandbox,
    debugFile,
    sessionTimeoutMs,
    permissionMode,
    name,
    spawnMode,
    capacity,
    createSessionInDir,
    sessionId,
    continueSession,
    help,
  }

  /**
   * Convert a raw --session-timeout value (seconds) to milliseconds, or
   * return an error message when it is not a non-negative number.
   */
  function parseSessionTimeoutMs(raw: string): number | string {
    const seconds = parseInt(raw, 10)
    if (Number.isNaN(seconds) || seconds < 0) {
      return `--session-timeout requires a non-negative number of seconds (got: ${raw})`
    }
    return seconds * 1000
  }

  /** Snapshot the values parsed so far and attach the failure message. */
  function makeError(error: string): ParsedArgs {
    return {
      verbose,
      sandbox,
      debugFile,
      sessionTimeoutMs,
      permissionMode,
      name,
      spawnMode,
      capacity,
      createSessionInDir,
      sessionId,
      continueSession,
      help,
      error,
    }
  }
}
|
|
|
+
|
|
|
/**
 * Print the `claude remote-control` help text to stdout via console.log.
 *
 * The multi-session options, the server description paragraph and the
 * worktree note are included only when isMultiSessionSpawnEnabled() resolves
 * truthy. The --continue / --session-id entries are included only when
 * feature('KAIROS') is on.
 */
async function printHelp(): Promise<void> {
  // Use EXTERNAL_PERMISSION_MODES for help text — internal modes (bubble)
  // are ant-only and auto is feature-gated; they're still accepted by validation.
  const { EXTERNAL_PERMISSION_MODES } = await import('../types/permissions.js')
  const modes = EXTERNAL_PERMISSION_MODES.join(', ')
  const showServer = await isMultiSessionSpawnEnabled()
  // Each optional section is either a newline-terminated snippet or the empty
  // string, so it can be spliced into the template below on its own line.
  const serverOptions = showServer
    ? ` --spawn <mode> Spawn mode: same-dir, worktree, session
 (default: same-dir)
 --capacity <N> Max concurrent sessions in worktree or
 same-dir mode (default: ${SPAWN_SESSIONS_DEFAULT})
 --[no-]create-session-in-dir Pre-create a session in the current
 directory; in worktree mode this session
 stays in cwd while on-demand sessions get
 isolated worktrees (default: on)
`
    : ''
  const serverDescription = showServer
    ? `
 Remote Control runs as a persistent server that accepts multiple concurrent
 sessions in the current directory. One session is pre-created on start so
 you have somewhere to type immediately. Use --spawn=worktree to isolate
 each on-demand session in its own git worktree, or --spawn=session for
 the classic single-session mode (exits when that session ends). Press 'w'
 during runtime to toggle between same-dir and worktree.
`
    : ''
  const serverNote = showServer
    ? ` - Worktree mode requires a git repository or WorktreeCreate/WorktreeRemove hooks
`
    : ''
  const help = `
Remote Control - Connect your local environment to claude.ai/code

USAGE
 claude remote-control [options]
OPTIONS
 --name <name> Name for the session (shown in claude.ai/code)
${
  feature('KAIROS')
    ? ` -c, --continue Resume the last session in this directory
 --session-id <id> Resume a specific session by ID (cannot be
 used with spawn flags or --continue)
`
    : ''
} --permission-mode <mode> Permission mode for spawned sessions
 (${modes})
 --debug-file <path> Write debug logs to file
 -v, --verbose Enable verbose output
 -h, --help Show this help
${serverOptions}
DESCRIPTION
 Remote Control allows you to control sessions on your local device from
 claude.ai/code (https://claude.ai/code). Run this command in the
 directory you want to work in, then connect from the Claude app or web.
${serverDescription}
NOTES
 - You must be logged in with a Claude account that has a subscription
 - Run \`claude\` first in the directory to accept the workspace trust dialog
${serverNote}`
  // biome-ignore lint/suspicious/noConsole: intentional help output
  console.log(help)
}
|
|
|
+
|
|
|
/** Width limit passed to truncateToWidth when deriving a session title. */
const TITLE_MAX_LEN = 80

/**
 * Derive a single-line session title from a user message.
 *
 * Every run of whitespace (newlines and tabs included) becomes one space so
 * the title cannot break the single-line status display; the result is
 * trimmed and then truncated to TITLE_MAX_LEN via truncateToWidth.
 *
 * @param text - Raw user message text.
 * @returns The flattened, truncated title.
 */
function deriveSessionTitle(text: string): string {
  const singleLine = text.split(/\s+/).join(' ').trim()
  return truncateToWidth(singleLine, TITLE_MAX_LEN)
}
|
|
|
+
|
|
|
/**
 * One-shot fetch of a session's title via GET /v1/sessions/{id}.
 *
 * Goes through `getBridgeSession` from createSession.ts (ccr-byoc headers +
 * org UUID) instead of the environments-level bridgeApi client, whose headers
 * make the Sessions API return 404.
 *
 * @param compatSessionId - Session id in the form the Sessions API expects.
 * @param baseUrl - API base URL forwarded to getBridgeSession.
 * @returns The session's title, or undefined when no session came back or its
 *   title is empty — the caller then falls back to deriving a title from the
 *   first user message. NOTE(review): errors are not caught here; this
 *   presumably relies on getBridgeSession yielding a falsy session on fetch
 *   failure — confirm.
 */
async function fetchSessionTitle(
  compatSessionId: string,
  baseUrl: string,
): Promise<string | undefined> {
  const createSession = await import('./createSession.js')
  const session = await createSession.getBridgeSession(compatSessionId, {
    baseUrl,
  })
  const title = session?.title
  // Treat an empty title the same as a missing one.
  return title ? title : undefined
}
|
|
|
+
|
|
|
+export async function bridgeMain(args: string[]): Promise<void> {
|
|
|
+ const parsed = parseArgs(args)
|
|
|
+
|
|
|
+ if (parsed.help) {
|
|
|
+ await printHelp()
|
|
|
+ return
|
|
|
+ }
|
|
|
+ if (parsed.error) {
|
|
|
+ // biome-ignore lint/suspicious/noConsole: intentional error output
|
|
|
+ console.error(`Error: ${parsed.error}`)
|
|
|
+ // eslint-disable-next-line custom-rules/no-process-exit
|
|
|
+ process.exit(1)
|
|
|
+ }
|
|
|
+
|
|
|
+ const {
|
|
|
+ verbose,
|
|
|
+ sandbox,
|
|
|
+ debugFile,
|
|
|
+ sessionTimeoutMs,
|
|
|
+ permissionMode,
|
|
|
+ name,
|
|
|
+ spawnMode: parsedSpawnMode,
|
|
|
+ capacity: parsedCapacity,
|
|
|
+ createSessionInDir: parsedCreateSessionInDir,
|
|
|
+ sessionId: parsedSessionId,
|
|
|
+ continueSession,
|
|
|
+ } = parsed
|
|
|
+ // Mutable so --continue can set it from the pointer file. The #20460
|
|
|
+ // resume flow below then treats it the same as an explicit --session-id.
|
|
|
+ let resumeSessionId = parsedSessionId
|
|
|
+ // When --continue found a pointer, this is the directory it came from
|
|
|
+ // (may be a worktree sibling, not `dir`). On resume-flow deterministic
|
|
|
+ // failure, clear THIS file so --continue doesn't keep hitting the same
|
|
|
+ // dead session. Undefined for explicit --session-id (leaves pointer alone).
|
|
|
+ let resumePointerDir: string | undefined
|
|
|
+
|
|
|
+ const usedMultiSessionFeature =
|
|
|
+ parsedSpawnMode !== undefined ||
|
|
|
+ parsedCapacity !== undefined ||
|
|
|
+ parsedCreateSessionInDir !== undefined
|
|
|
+
|
|
|
+ // Validate permission mode early so the user gets an error before
|
|
|
+ // the bridge starts polling for work.
|
|
|
+ if (permissionMode !== undefined) {
|
|
|
+ const { PERMISSION_MODES } = await import('../types/permissions.js')
|
|
|
+ const valid: readonly string[] = PERMISSION_MODES
|
|
|
+ if (!valid.includes(permissionMode)) {
|
|
|
+ // biome-ignore lint/suspicious/noConsole: intentional error output
|
|
|
+ console.error(
|
|
|
+ `Error: Invalid permission mode '${permissionMode}'. Valid modes: ${valid.join(', ')}`,
|
|
|
+ )
|
|
|
+ // eslint-disable-next-line custom-rules/no-process-exit
|
|
|
+ process.exit(1)
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ const dir = resolve('.')
|
|
|
+
|
|
|
+ // The bridge fast-path bypasses init.ts, so we must enable config reading
|
|
|
+ // before any code that transitively calls getGlobalConfig()
|
|
|
+ const { enableConfigs, checkHasTrustDialogAccepted } = await import(
|
|
|
+ '../utils/config.js'
|
|
|
+ )
|
|
|
+ enableConfigs()
|
|
|
+
|
|
|
+ // Initialize analytics and error reporting sinks. The bridge bypasses the
|
|
|
+ // setup() init flow, so we call initSinks() directly to attach sinks here.
|
|
|
+ const { initSinks } = await import('../utils/sinks.js')
|
|
|
+ initSinks()
|
|
|
+
|
|
|
+ // Gate-aware validation: --spawn / --capacity / --create-session-in-dir require
|
|
|
+ // the multi-session gate. parseArgs has already validated flag combinations;
|
|
|
+ // here we only check the gate since that requires an async GrowthBook call.
|
|
|
+ // Runs after enableConfigs() (GrowthBook cache reads global config) and after
|
|
|
+ // initSinks() so the denial event can be enqueued.
|
|
|
+ const multiSessionEnabled = await isMultiSessionSpawnEnabled()
|
|
|
+ if (usedMultiSessionFeature && !multiSessionEnabled) {
|
|
|
+ await logEventAsync('tengu_bridge_multi_session_denied', {
|
|
|
+ used_spawn: parsedSpawnMode !== undefined,
|
|
|
+ used_capacity: parsedCapacity !== undefined,
|
|
|
+ used_create_session_in_dir: parsedCreateSessionInDir !== undefined,
|
|
|
+ })
|
|
|
+ // logEventAsync only enqueues — process.exit() discards buffered events.
|
|
|
+ // Flush explicitly, capped at 500ms to match gracefulShutdown.ts.
|
|
|
+ // (sleep() doesn't unref its timer, but process.exit() follows immediately
|
|
|
+ // so the ref'd timer can't delay shutdown.)
|
|
|
+ await Promise.race([
|
|
|
+ Promise.all([shutdown1PEventLogging(), shutdownDatadog()]),
|
|
|
+ sleep(500, undefined, { unref: true }),
|
|
|
+ ]).catch(() => {})
|
|
|
+ // biome-ignore lint/suspicious/noConsole: intentional error output
|
|
|
+ console.error(
|
|
|
+ 'Error: Multi-session Remote Control is not enabled for your account yet.',
|
|
|
+ )
|
|
|
+ // eslint-disable-next-line custom-rules/no-process-exit
|
|
|
+ process.exit(1)
|
|
|
+ }
|
|
|
+
|
|
|
+ // Set the bootstrap CWD so that trust checks, project config lookups, and
|
|
|
+ // git utilities (getBranch, getRemoteUrl) resolve against the correct path.
|
|
|
+ const { setOriginalCwd, setCwdState } = await import('../bootstrap/state.js')
|
|
|
+ setOriginalCwd(dir)
|
|
|
+ setCwdState(dir)
|
|
|
+
|
|
|
+ // The bridge bypasses main.tsx (which renders the interactive TrustDialog via showSetupScreens),
|
|
|
+ // so we must verify trust was previously established by a normal `claude` session.
|
|
|
+ if (!checkHasTrustDialogAccepted()) {
|
|
|
+ // biome-ignore lint/suspicious/noConsole:: intentional console output
|
|
|
+ console.error(
|
|
|
+ `Error: Workspace not trusted. Please run \`claude\` in ${dir} first to review and accept the workspace trust dialog.`,
|
|
|
+ )
|
|
|
+ // eslint-disable-next-line custom-rules/no-process-exit
|
|
|
+ process.exit(1)
|
|
|
+ }
|
|
|
+
|
|
|
+ // Resolve auth
|
|
|
+ const { clearOAuthTokenCache, checkAndRefreshOAuthTokenIfNeeded } =
|
|
|
+ await import('../utils/auth.js')
|
|
|
+ const { getBridgeAccessToken, getBridgeBaseUrl } = await import(
|
|
|
+ './bridgeConfig.js'
|
|
|
+ )
|
|
|
+
|
|
|
+ const bridgeToken = getBridgeAccessToken()
|
|
|
+ if (!bridgeToken) {
|
|
|
+ // biome-ignore lint/suspicious/noConsole:: intentional console output
|
|
|
+ console.error(BRIDGE_LOGIN_ERROR)
|
|
|
+ // eslint-disable-next-line custom-rules/no-process-exit
|
|
|
+ process.exit(1)
|
|
|
+ }
|
|
|
+
|
|
|
+ // First-time remote dialog — explain what bridge does and get consent
|
|
|
+ const {
|
|
|
+ getGlobalConfig,
|
|
|
+ saveGlobalConfig,
|
|
|
+ getCurrentProjectConfig,
|
|
|
+ saveCurrentProjectConfig,
|
|
|
+ } = await import('../utils/config.js')
|
|
|
+ if (!getGlobalConfig().remoteDialogSeen) {
|
|
|
+ const readline = await import('readline')
|
|
|
+ const rl = readline.createInterface({
|
|
|
+ input: process.stdin,
|
|
|
+ output: process.stdout,
|
|
|
+ })
|
|
|
+ // biome-ignore lint/suspicious/noConsole:: intentional console output
|
|
|
+ console.log(
|
|
|
+ '\nRemote Control lets you access this CLI session from the web (claude.ai/code)\nor the Claude app, so you can pick up where you left off on any device.\n\nYou can disconnect remote access anytime by running /remote-control again.\n',
|
|
|
+ )
|
|
|
+ const answer = await new Promise<string>(resolve => {
|
|
|
+ rl.question('Enable Remote Control? (y/n) ', resolve)
|
|
|
+ })
|
|
|
+ rl.close()
|
|
|
+ saveGlobalConfig(current => {
|
|
|
+ if (current.remoteDialogSeen) return current
|
|
|
+ return { ...current, remoteDialogSeen: true }
|
|
|
+ })
|
|
|
+ if (answer.toLowerCase() !== 'y' && answer.toLowerCase() !== 'yes') {
|
|
|
+ // eslint-disable-next-line custom-rules/no-process-exit
|
|
|
+ process.exit(0)
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // --continue: resolve the most recent session from the crash-recovery
|
|
|
+ // pointer and chain into the #20460 --session-id flow. Worktree-aware:
|
|
|
+ // checks current dir first (fast path, zero exec), then fans out to git
|
|
|
+ // worktree siblings if that misses — the REPL bridge writes to
|
|
|
+ // getOriginalCwd() which EnterWorktreeTool/activeWorktreeSession can
|
|
|
+ // point at a worktree while the user's shell is at the repo root.
|
|
|
+ // KAIROS-gated at parseArgs — continueSession is always false in external
|
|
|
+ // builds, so this block tree-shakes.
|
|
|
+ if (feature('KAIROS') && continueSession) {
|
|
|
+ const { readBridgePointerAcrossWorktrees } = await import(
|
|
|
+ './bridgePointer.js'
|
|
|
+ )
|
|
|
+ const found = await readBridgePointerAcrossWorktrees(dir)
|
|
|
+ if (!found) {
|
|
|
+ // biome-ignore lint/suspicious/noConsole: intentional error output
|
|
|
+ console.error(
|
|
|
+ `Error: No recent session found in this directory or its worktrees. Run \`claude remote-control\` to start a new one.`,
|
|
|
+ )
|
|
|
+ // eslint-disable-next-line custom-rules/no-process-exit
|
|
|
+ process.exit(1)
|
|
|
+ }
|
|
|
+ const { pointer, dir: pointerDir } = found
|
|
|
+ const ageMin = Math.round(pointer.ageMs / 60_000)
|
|
|
+ const ageStr = ageMin < 60 ? `${ageMin}m` : `${Math.round(ageMin / 60)}h`
|
|
|
+ const fromWt = pointerDir !== dir ? ` from worktree ${pointerDir}` : ''
|
|
|
+ // biome-ignore lint/suspicious/noConsole: intentional info output
|
|
|
+ console.error(
|
|
|
+ `Resuming session ${pointer.sessionId} (${ageStr} ago)${fromWt}\u2026`,
|
|
|
+ )
|
|
|
+ resumeSessionId = pointer.sessionId
|
|
|
+ // Track where the pointer came from so the #20460 exit(1) paths below
|
|
|
+ // clear the RIGHT file on deterministic failure — otherwise --continue
|
|
|
+ // would keep hitting the same dead session. May be a worktree sibling.
|
|
|
+ resumePointerDir = pointerDir
|
|
|
+ }
|
|
|
+
|
|
|
+ // In production, baseUrl is the Anthropic API (from OAuth config).
|
|
|
+ // CLAUDE_BRIDGE_BASE_URL overrides this for ant local dev only.
|
|
|
+ const baseUrl = getBridgeBaseUrl()
|
|
|
+
|
|
|
+ // For non-localhost targets, require HTTPS to protect credentials.
|
|
|
+ if (
|
|
|
+ baseUrl.startsWith('http://') &&
|
|
|
+ !baseUrl.includes('localhost') &&
|
|
|
+ !baseUrl.includes('127.0.0.1')
|
|
|
+ ) {
|
|
|
+ // biome-ignore lint/suspicious/noConsole:: intentional console output
|
|
|
+ console.error(
|
|
|
+ 'Error: Remote Control base URL uses HTTP. Only HTTPS or localhost HTTP is allowed.',
|
|
|
+ )
|
|
|
+ // eslint-disable-next-line custom-rules/no-process-exit
|
|
|
+ process.exit(1)
|
|
|
+ }
|
|
|
+
|
|
|
+ // Session ingress URL for WebSocket connections. In production this is the
|
|
|
+ // same as baseUrl (Envoy routes /v1/session_ingress/* to session-ingress).
|
|
|
+ // Locally, session-ingress runs on a different port (9413) than the
|
|
|
+ // contain-provide-api (8211), so CLAUDE_BRIDGE_SESSION_INGRESS_URL must be
|
|
|
+ // set explicitly. Ant-only, matching CLAUDE_BRIDGE_BASE_URL.
|
|
|
+ const sessionIngressUrl =
|
|
|
+ process.env.USER_TYPE === 'ant' &&
|
|
|
+ process.env.CLAUDE_BRIDGE_SESSION_INGRESS_URL
|
|
|
+ ? process.env.CLAUDE_BRIDGE_SESSION_INGRESS_URL
|
|
|
+ : baseUrl
|
|
|
+
|
|
|
+ const { getBranch, getRemoteUrl, findGitRoot } = await import(
|
|
|
+ '../utils/git.js'
|
|
|
+ )
|
|
|
+
|
|
|
+ // Precheck worktree availability for the first-run dialog and the `w`
|
|
|
+ // toggle. Unconditional so we know upfront whether worktree is an option.
|
|
|
+ const { hasWorktreeCreateHook } = await import('../utils/hooks.js')
|
|
|
+ const worktreeAvailable = hasWorktreeCreateHook() || findGitRoot(dir) !== null
|
|
|
+
|
|
|
+ // Load saved per-project spawn-mode preference. Gated by multiSessionEnabled
|
|
|
+ // so a GrowthBook rollback cleanly reverts users to single-session —
|
|
|
+ // otherwise a saved pref would silently re-enable multi-session behavior
|
|
|
+ // (worktree isolation, 32 max sessions, w toggle) despite the gate being off.
|
|
|
+ // Also guard against a stale worktree pref left over from when this dir WAS
|
|
|
+ // a git repo (or the user copied config) — clear it on disk so the warning
|
|
|
+ // doesn't repeat on every launch.
|
|
|
+ let savedSpawnMode = multiSessionEnabled
|
|
|
+ ? getCurrentProjectConfig().remoteControlSpawnMode
|
|
|
+ : undefined
|
|
|
+ if (savedSpawnMode === 'worktree' && !worktreeAvailable) {
|
|
|
+ // biome-ignore lint/suspicious/noConsole: intentional warning output
|
|
|
+ console.error(
|
|
|
+ 'Warning: Saved spawn mode is worktree but this directory is not a git repository. Falling back to same-dir.',
|
|
|
+ )
|
|
|
+ savedSpawnMode = undefined
|
|
|
+ saveCurrentProjectConfig(current => {
|
|
|
+ if (current.remoteControlSpawnMode === undefined) return current
|
|
|
+ return { ...current, remoteControlSpawnMode: undefined }
|
|
|
+ })
|
|
|
+ }
|
|
|
+
|
|
|
+ // First-run spawn-mode choice: ask once per project when the choice is
|
|
|
+ // meaningful (gate on, both modes available, no explicit override, not
|
|
|
+ // resuming). Saves to ProjectConfig so subsequent runs skip this.
|
|
|
+ if (
|
|
|
+ multiSessionEnabled &&
|
|
|
+ !savedSpawnMode &&
|
|
|
+ worktreeAvailable &&
|
|
|
+ parsedSpawnMode === undefined &&
|
|
|
+ !resumeSessionId &&
|
|
|
+ process.stdin.isTTY
|
|
|
+ ) {
|
|
|
+ const readline = await import('readline')
|
|
|
+ const rl = readline.createInterface({
|
|
|
+ input: process.stdin,
|
|
|
+ output: process.stdout,
|
|
|
+ })
|
|
|
+ // biome-ignore lint/suspicious/noConsole: intentional dialog output
|
|
|
+ console.log(
|
|
|
+ `\nClaude Remote Control is launching in spawn mode which lets you create new sessions in this project from Claude Code on Web or your Mobile app. Learn more here: https://code.claude.com/docs/en/remote-control\n\n` +
|
|
|
+ `Spawn mode for this project:\n` +
|
|
|
+ ` [1] same-dir \u2014 sessions share the current directory (default)\n` +
|
|
|
+ ` [2] worktree \u2014 each session gets an isolated git worktree\n\n` +
|
|
|
+ `This can be changed later or explicitly set with --spawn=same-dir or --spawn=worktree.\n`,
|
|
|
+ )
|
|
|
+ const answer = await new Promise<string>(resolve => {
|
|
|
+ rl.question('Choose [1/2] (default: 1): ', resolve)
|
|
|
+ })
|
|
|
+ rl.close()
|
|
|
+ const chosen: 'same-dir' | 'worktree' =
|
|
|
+ answer.trim() === '2' ? 'worktree' : 'same-dir'
|
|
|
+ savedSpawnMode = chosen
|
|
|
+ logEvent('tengu_bridge_spawn_mode_chosen', {
|
|
|
+ spawn_mode:
|
|
|
+ chosen as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
|
|
+ })
|
|
|
+ saveCurrentProjectConfig(current => {
|
|
|
+ if (current.remoteControlSpawnMode === chosen) return current
|
|
|
+ return { ...current, remoteControlSpawnMode: chosen }
|
|
|
+ })
|
|
|
+ }
|
|
|
+
|
|
|
+ // Determine effective spawn mode.
|
|
|
+ // Precedence: resume > explicit --spawn > saved project pref > gate default
|
|
|
+ // - resuming via --continue / --session-id: always single-session (resume
|
|
|
+ // targets one specific session in its original directory)
|
|
|
+ // - explicit --spawn flag: use that value directly (does not persist)
|
|
|
+ // - saved ProjectConfig.remoteControlSpawnMode: set by first-run dialog or `w`
|
|
|
+ // - default with gate on: same-dir (persistent multi-session, shared cwd)
|
|
|
+ // - default with gate off: single-session (unchanged legacy behavior)
|
|
|
+ // Track how spawn mode was determined, for rollout analytics.
|
|
|
+ type SpawnModeSource = 'resume' | 'flag' | 'saved' | 'gate_default'
|
|
|
+ let spawnModeSource: SpawnModeSource
|
|
|
+ let spawnMode: SpawnMode
|
|
|
+ if (resumeSessionId) {
|
|
|
+ spawnMode = 'single-session'
|
|
|
+ spawnModeSource = 'resume'
|
|
|
+ } else if (parsedSpawnMode !== undefined) {
|
|
|
+ spawnMode = parsedSpawnMode
|
|
|
+ spawnModeSource = 'flag'
|
|
|
+ } else if (savedSpawnMode !== undefined) {
|
|
|
+ spawnMode = savedSpawnMode
|
|
|
+ spawnModeSource = 'saved'
|
|
|
+ } else {
|
|
|
+ spawnMode = multiSessionEnabled ? 'same-dir' : 'single-session'
|
|
|
+ spawnModeSource = 'gate_default'
|
|
|
+ }
|
|
|
+ const maxSessions =
|
|
|
+ spawnMode === 'single-session'
|
|
|
+ ? 1
|
|
|
+ : (parsedCapacity ?? SPAWN_SESSIONS_DEFAULT)
|
|
|
+ // Pre-create an empty session on start so the user has somewhere to type
|
|
|
+ // immediately, running in the current directory (exempted from worktree
|
|
|
+ // creation in the spawn loop). On by default; --no-create-session-in-dir
|
|
|
+ // opts out for a pure on-demand server where every session is isolated.
|
|
|
+ // The effectiveResumeSessionId guard at the creation site handles the
|
|
|
+ // resume case (skip creation when resume succeeded; fall through to
|
|
|
+ // fresh creation on env-mismatch fallback).
|
|
|
+ const preCreateSession = parsedCreateSessionInDir ?? true
|
|
|
+
|
|
|
+ // Without --continue: a leftover pointer means the previous run didn't
|
|
|
+ // shut down cleanly (crash, kill -9, terminal closed). Clear it so the
|
|
|
+ // stale env doesn't linger past its relevance. Runs in all modes
|
|
|
+ // (clearBridgePointer is a no-op when no file exists) — covers the
|
|
|
+ // gate-transition case where a user crashed in single-session mode then
|
|
|
+ // starts fresh in worktree mode. Only single-session mode writes new
|
|
|
+ // pointers.
|
|
|
+ if (!resumeSessionId) {
|
|
|
+ const { clearBridgePointer } = await import('./bridgePointer.js')
|
|
|
+ await clearBridgePointer(dir)
|
|
|
+ }
|
|
|
+
|
|
|
+ // Worktree mode requires either git or WorktreeCreate/WorktreeRemove hooks.
|
|
|
+ // Only reachable via explicit --spawn=worktree (default is same-dir);
|
|
|
+ // saved worktree pref was already guarded above.
|
|
|
+ if (spawnMode === 'worktree' && !worktreeAvailable) {
|
|
|
+ // biome-ignore lint/suspicious/noConsole: intentional error output
|
|
|
+ console.error(
|
|
|
+ `Error: Worktree mode requires a git repository or WorktreeCreate hooks configured. Use --spawn=session for single-session mode.`,
|
|
|
+ )
|
|
|
+ // eslint-disable-next-line custom-rules/no-process-exit
|
|
|
+ process.exit(1)
|
|
|
+ }
|
|
|
+
|
|
|
+ const branch = await getBranch()
|
|
|
+ const gitRepoUrl = await getRemoteUrl()
|
|
|
+ const machineName = hostname()
|
|
|
+ const bridgeId = randomUUID()
|
|
|
+
|
|
|
+ const { handleOAuth401Error } = await import('../utils/auth.js')
|
|
|
+ const api = createBridgeApiClient({
|
|
|
+ baseUrl,
|
|
|
+ getAccessToken: getBridgeAccessToken,
|
|
|
+ runnerVersion: MACRO.VERSION,
|
|
|
+ onDebug: logForDebugging,
|
|
|
+ onAuth401: handleOAuth401Error,
|
|
|
+ getTrustedDeviceToken,
|
|
|
+ })
|
|
|
+
|
|
|
+ // When resuming a session via --session-id, fetch it to learn its
|
|
|
+ // environment_id and reuse that for registration (idempotent on the
|
|
|
+ // backend). Left undefined otherwise — the backend rejects
|
|
|
+ // client-generated UUIDs and will allocate a fresh environment.
|
|
|
+ // feature('KAIROS') gate: --session-id is ant-only; parseArgs already
|
|
|
+ // rejects the flag when the gate is off, so resumeSessionId is always
|
|
|
+ // undefined here in external builds — this guard is for tree-shaking.
|
|
|
+ let reuseEnvironmentId: string | undefined
|
|
|
+ if (feature('KAIROS') && resumeSessionId) {
|
|
|
+ try {
|
|
|
+ validateBridgeId(resumeSessionId, 'sessionId')
|
|
|
+ } catch {
|
|
|
+ // biome-ignore lint/suspicious/noConsole: intentional error output
|
|
|
+ console.error(
|
|
|
+ `Error: Invalid session ID "${resumeSessionId}". Session IDs must not contain unsafe characters.`,
|
|
|
+ )
|
|
|
+ // eslint-disable-next-line custom-rules/no-process-exit
|
|
|
+ process.exit(1)
|
|
|
+ }
|
|
|
+ // Proactively refresh the OAuth token — getBridgeSession uses raw axios
|
|
|
+ // without the withOAuthRetry 401-refresh logic. An expired-but-present
|
|
|
+ // token would otherwise produce a misleading "not found" error.
|
|
|
+ await checkAndRefreshOAuthTokenIfNeeded()
|
|
|
+ clearOAuthTokenCache()
|
|
|
+ const { getBridgeSession } = await import('./createSession.js')
|
|
|
+ const session = await getBridgeSession(resumeSessionId, {
|
|
|
+ baseUrl,
|
|
|
+ getAccessToken: getBridgeAccessToken,
|
|
|
+ })
|
|
|
+ if (!session) {
|
|
|
+ // Session gone on server → pointer is stale. Clear it so the user
|
|
|
+ // isn't re-prompted next launch. (Explicit --session-id leaves the
|
|
|
+ // pointer alone — it's an independent file they may not even have.)
|
|
|
+ // resumePointerDir may be a worktree sibling — clear THAT file.
|
|
|
+ if (resumePointerDir) {
|
|
|
+ const { clearBridgePointer } = await import('./bridgePointer.js')
|
|
|
+ await clearBridgePointer(resumePointerDir)
|
|
|
+ }
|
|
|
+ // biome-ignore lint/suspicious/noConsole: intentional error output
|
|
|
+ console.error(
|
|
|
+ `Error: Session ${resumeSessionId} not found. It may have been archived or expired, or your login may have lapsed (run \`claude /login\`).`,
|
|
|
+ )
|
|
|
+ // eslint-disable-next-line custom-rules/no-process-exit
|
|
|
+ process.exit(1)
|
|
|
+ }
|
|
|
+ if (!session.environment_id) {
|
|
|
+ if (resumePointerDir) {
|
|
|
+ const { clearBridgePointer } = await import('./bridgePointer.js')
|
|
|
+ await clearBridgePointer(resumePointerDir)
|
|
|
+ }
|
|
|
+ // biome-ignore lint/suspicious/noConsole: intentional error output
|
|
|
+ console.error(
|
|
|
+ `Error: Session ${resumeSessionId} has no environment_id. It may never have been attached to a bridge.`,
|
|
|
+ )
|
|
|
+ // eslint-disable-next-line custom-rules/no-process-exit
|
|
|
+ process.exit(1)
|
|
|
+ }
|
|
|
+ reuseEnvironmentId = session.environment_id
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:init] Resuming session ${resumeSessionId} on environment ${reuseEnvironmentId}`,
|
|
|
+ )
|
|
|
+ }
|
|
|
+
|
|
|
+ const config: BridgeConfig = {
|
|
|
+ dir,
|
|
|
+ machineName,
|
|
|
+ branch,
|
|
|
+ gitRepoUrl,
|
|
|
+ maxSessions,
|
|
|
+ spawnMode,
|
|
|
+ verbose,
|
|
|
+ sandbox,
|
|
|
+ bridgeId,
|
|
|
+ workerType: 'claude_code',
|
|
|
+ environmentId: randomUUID(),
|
|
|
+ reuseEnvironmentId,
|
|
|
+ apiBaseUrl: baseUrl,
|
|
|
+ sessionIngressUrl,
|
|
|
+ debugFile,
|
|
|
+ sessionTimeoutMs,
|
|
|
+ }
|
|
|
+
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:init] bridgeId=${bridgeId}${reuseEnvironmentId ? ` reuseEnvironmentId=${reuseEnvironmentId}` : ''} dir=${dir} branch=${branch} gitRepoUrl=${gitRepoUrl} machine=${machineName}`,
|
|
|
+ )
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:init] apiBaseUrl=${baseUrl} sessionIngressUrl=${sessionIngressUrl}`,
|
|
|
+ )
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:init] sandbox=${sandbox}${debugFile ? ` debugFile=${debugFile}` : ''}`,
|
|
|
+ )
|
|
|
+
|
|
|
+ // Register the bridge environment before entering the poll loop.
|
|
|
+ let environmentId: string
|
|
|
+ let environmentSecret: string
|
|
|
+ try {
|
|
|
+ const reg = await api.registerBridgeEnvironment(config)
|
|
|
+ environmentId = reg.environment_id
|
|
|
+ environmentSecret = reg.environment_secret
|
|
|
+ } catch (err) {
|
|
|
+ logEvent('tengu_bridge_registration_failed', {
|
|
|
+ status: err instanceof BridgeFatalError ? err.status : undefined,
|
|
|
+ })
|
|
|
+ // Registration failures are fatal — print a clean message instead of a stack trace.
|
|
|
+ // biome-ignore lint/suspicious/noConsole:: intentional console output
|
|
|
+ console.error(
|
|
|
+ err instanceof BridgeFatalError && err.status === 404
|
|
|
+ ? 'Remote Control environments are not available for your account.'
|
|
|
+ : `Error: ${errorMessage(err)}`,
|
|
|
+ )
|
|
|
+ // eslint-disable-next-line custom-rules/no-process-exit
|
|
|
+ process.exit(1)
|
|
|
+ }
|
|
|
+
|
|
|
+ // Tracks whether the --session-id resume flow completed successfully.
|
|
|
+ // Used below to skip fresh session creation and seed initialSessionId.
|
|
|
+ // Cleared on env mismatch so we gracefully fall back to a new session.
|
|
|
+ let effectiveResumeSessionId: string | undefined
|
|
|
+ if (feature('KAIROS') && resumeSessionId) {
|
|
|
+ if (reuseEnvironmentId && environmentId !== reuseEnvironmentId) {
|
|
|
+ // Backend returned a different environment_id — the original env
|
|
|
+ // expired or was reaped. Reconnect won't work against the new env
|
|
|
+ // (session is bound to the old one). Log to sentry for visibility
|
|
|
+ // and fall through to fresh session creation on the new env.
|
|
|
+ logError(
|
|
|
+ new Error(
|
|
|
+ `Bridge resume env mismatch: requested ${reuseEnvironmentId}, backend returned ${environmentId}. Falling back to fresh session.`,
|
|
|
+ ),
|
|
|
+ )
|
|
|
+ // biome-ignore lint/suspicious/noConsole: intentional warning output
|
|
|
+ console.warn(
|
|
|
+ `Warning: Could not resume session ${resumeSessionId} — its environment has expired. Creating a fresh session instead.`,
|
|
|
+ )
|
|
|
+ // Don't deregister — we're going to use this new environment.
|
|
|
+ // effectiveResumeSessionId stays undefined → fresh session path below.
|
|
|
+ } else {
|
|
|
+ // Force-stop any stale worker instances for this session and re-queue
|
|
|
+ // it so our poll loop picks it up. Must happen after registration so
|
|
|
+ // the backend knows a live worker exists for the environment.
|
|
|
+ //
|
|
|
+ // The pointer stores a session_* ID but /bridge/reconnect looks
|
|
|
+ // sessions up by their infra tag (cse_*) when ccr_v2_compat_enabled
|
|
|
+ // is on. Try both; the conversion is a no-op if already cse_*.
|
|
|
+ const infraResumeId = toInfraSessionId(resumeSessionId)
|
|
|
+ const reconnectCandidates =
|
|
|
+ infraResumeId === resumeSessionId
|
|
|
+ ? [resumeSessionId]
|
|
|
+ : [resumeSessionId, infraResumeId]
|
|
|
+ let reconnected = false
|
|
|
+ let lastReconnectErr: unknown
|
|
|
+ for (const candidateId of reconnectCandidates) {
|
|
|
+ try {
|
|
|
+ await api.reconnectSession(environmentId, candidateId)
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:init] Session ${candidateId} re-queued via bridge/reconnect`,
|
|
|
+ )
|
|
|
+ effectiveResumeSessionId = resumeSessionId
|
|
|
+ reconnected = true
|
|
|
+ break
|
|
|
+ } catch (err) {
|
|
|
+ lastReconnectErr = err
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:init] reconnectSession(${candidateId}) failed: ${errorMessage(err)}`,
|
|
|
+ )
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (!reconnected) {
|
|
|
+ const err = lastReconnectErr
|
|
|
+
|
|
|
+ // Do NOT deregister on transient reconnect failure — at this point
|
|
|
+ // environmentId IS the session's own environment. Deregistering
|
|
|
+ // would make retry impossible. The backend's 4h TTL cleans up.
|
|
|
+ const isFatal = err instanceof BridgeFatalError
|
|
|
+ // Clear pointer only on fatal reconnect failure. Transient failures
|
|
|
+ // ("try running the same command again") should keep the pointer so
|
|
|
+ // next launch re-prompts — that IS the retry mechanism.
|
|
|
+ if (resumePointerDir && isFatal) {
|
|
|
+ const { clearBridgePointer } = await import('./bridgePointer.js')
|
|
|
+ await clearBridgePointer(resumePointerDir)
|
|
|
+ }
|
|
|
+ // biome-ignore lint/suspicious/noConsole: intentional error output
|
|
|
+ console.error(
|
|
|
+ isFatal
|
|
|
+ ? `Error: ${errorMessage(err)}`
|
|
|
+ : `Error: Failed to reconnect session ${resumeSessionId}: ${errorMessage(err)}\nThe session may still be resumable — try running the same command again.`,
|
|
|
+ )
|
|
|
+ // eslint-disable-next-line custom-rules/no-process-exit
|
|
|
+ process.exit(1)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:init] Registered, server environmentId=${environmentId}`,
|
|
|
+ )
|
|
|
+ const startupPollConfig = getPollIntervalConfig()
|
|
|
+ logEvent('tengu_bridge_started', {
|
|
|
+ max_sessions: config.maxSessions,
|
|
|
+ has_debug_file: !!config.debugFile,
|
|
|
+ sandbox: config.sandbox,
|
|
|
+ verbose: config.verbose,
|
|
|
+ heartbeat_interval_ms:
|
|
|
+ startupPollConfig.non_exclusive_heartbeat_interval_ms,
|
|
|
+ spawn_mode:
|
|
|
+ config.spawnMode as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
|
|
+ spawn_mode_source:
|
|
|
+ spawnModeSource as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
|
|
+ multi_session_gate: multiSessionEnabled,
|
|
|
+ pre_create_session: preCreateSession,
|
|
|
+ worktree_available: worktreeAvailable,
|
|
|
+ })
|
|
|
+ logForDiagnosticsNoPII('info', 'bridge_started', {
|
|
|
+ max_sessions: config.maxSessions,
|
|
|
+ sandbox: config.sandbox,
|
|
|
+ spawn_mode: config.spawnMode,
|
|
|
+ })
|
|
|
+
|
|
|
+ const spawner = createSessionSpawner({
|
|
|
+ execPath: process.execPath,
|
|
|
+ scriptArgs: spawnScriptArgs(),
|
|
|
+ env: process.env,
|
|
|
+ verbose,
|
|
|
+ sandbox,
|
|
|
+ debugFile,
|
|
|
+ permissionMode,
|
|
|
+ onDebug: logForDebugging,
|
|
|
+ onActivity: (sessionId, activity) => {
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:activity] sessionId=${sessionId} ${activity.type} ${activity.summary}`,
|
|
|
+ )
|
|
|
+ },
|
|
|
+ onPermissionRequest: (sessionId, request, _accessToken) => {
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:perm] sessionId=${sessionId} tool=${request.request.tool_name} request_id=${request.request_id} (not auto-approving)`,
|
|
|
+ )
|
|
|
+ },
|
|
|
+ })
|
|
|
+
|
|
|
+ const logger = createBridgeLogger({ verbose })
|
|
|
+ const { parseGitHubRepository } = await import('../utils/detectRepository.js')
|
|
|
+ const ownerRepo = gitRepoUrl ? parseGitHubRepository(gitRepoUrl) : null
|
|
|
+ // Use the repo name from the parsed owner/repo, or fall back to the dir basename
|
|
|
+ const repoName = ownerRepo ? ownerRepo.split('/').pop()! : basename(dir)
|
|
|
+ logger.setRepoInfo(repoName, branch)
|
|
|
+
|
|
|
+ // `w` toggle is available iff we're in a multi-session mode AND worktree
|
|
|
+ // is a valid option. When unavailable, the mode suffix and hint are hidden.
|
|
|
+ const toggleAvailable = spawnMode !== 'single-session' && worktreeAvailable
|
|
|
+ if (toggleAvailable) {
|
|
|
+ // Safe cast: spawnMode is not single-session (checked above), and the
|
|
|
+ // saved-worktree-in-non-git guard + exit check above ensure worktree
|
|
|
+ // is only reached when available.
|
|
|
+ logger.setSpawnModeDisplay(spawnMode as 'same-dir' | 'worktree')
|
|
|
+ }
|
|
|
+
|
|
|
+ // Listen for keys: space toggles QR code, w toggles spawn mode
|
|
|
+ const onStdinData = (data: Buffer): void => {
|
|
|
+ if (data[0] === 0x03 || data[0] === 0x04) {
|
|
|
+ // Ctrl+C / Ctrl+D — trigger graceful shutdown
|
|
|
+ process.emit('SIGINT')
|
|
|
+ return
|
|
|
+ }
|
|
|
+ if (data[0] === 0x20 /* space */) {
|
|
|
+ logger.toggleQr()
|
|
|
+ return
|
|
|
+ }
|
|
|
+ if (data[0] === 0x77 /* 'w' */) {
|
|
|
+ if (!toggleAvailable) return
|
|
|
+ const newMode: 'same-dir' | 'worktree' =
|
|
|
+ config.spawnMode === 'same-dir' ? 'worktree' : 'same-dir'
|
|
|
+ config.spawnMode = newMode
|
|
|
+ logEvent('tengu_bridge_spawn_mode_toggled', {
|
|
|
+ spawn_mode:
|
|
|
+ newMode as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
|
|
+ })
|
|
|
+ logger.logStatus(
|
|
|
+ newMode === 'worktree'
|
|
|
+ ? 'Spawn mode: worktree (new sessions get isolated git worktrees)'
|
|
|
+ : 'Spawn mode: same-dir (new sessions share the current directory)',
|
|
|
+ )
|
|
|
+ logger.setSpawnModeDisplay(newMode)
|
|
|
+ logger.refreshDisplay()
|
|
|
+ saveCurrentProjectConfig(current => {
|
|
|
+ if (current.remoteControlSpawnMode === newMode) return current
|
|
|
+ return { ...current, remoteControlSpawnMode: newMode }
|
|
|
+ })
|
|
|
+ return
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (process.stdin.isTTY) {
|
|
|
+ process.stdin.setRawMode(true)
|
|
|
+ process.stdin.resume()
|
|
|
+ process.stdin.on('data', onStdinData)
|
|
|
+ }
|
|
|
+
|
|
|
+ const controller = new AbortController()
|
|
|
+ const onSigint = (): void => {
|
|
|
+ logForDebugging('[bridge:shutdown] SIGINT received, shutting down')
|
|
|
+ controller.abort()
|
|
|
+ }
|
|
|
+ const onSigterm = (): void => {
|
|
|
+ logForDebugging('[bridge:shutdown] SIGTERM received, shutting down')
|
|
|
+ controller.abort()
|
|
|
+ }
|
|
|
+ process.on('SIGINT', onSigint)
|
|
|
+ process.on('SIGTERM', onSigterm)
|
|
|
+
|
|
|
+ // Auto-create an empty session so the user has somewhere to type
|
|
|
+ // immediately (matching /remote-control behavior). Controlled by
|
|
|
+ // preCreateSession: on by default; --no-create-session-in-dir opts out.
|
|
|
+ // When a --session-id resume succeeded, skip creation entirely — the
|
|
|
+ // session already exists and bridge/reconnect has re-queued it.
|
|
|
+ // When resume was requested but failed on env mismatch, effectiveResumeSessionId
|
|
|
+ // is undefined, so we fall through to fresh session creation (honoring the
|
|
|
+ // "Creating a fresh session instead" warning printed above).
|
|
|
+ let initialSessionId: string | null =
|
|
|
+ feature('KAIROS') && effectiveResumeSessionId
|
|
|
+ ? effectiveResumeSessionId
|
|
|
+ : null
|
|
|
+ if (preCreateSession && !(feature('KAIROS') && effectiveResumeSessionId)) {
|
|
|
+ const { createBridgeSession } = await import('./createSession.js')
|
|
|
+ try {
|
|
|
+ initialSessionId = await createBridgeSession({
|
|
|
+ environmentId,
|
|
|
+ title: name,
|
|
|
+ events: [],
|
|
|
+ gitRepoUrl,
|
|
|
+ branch,
|
|
|
+ signal: controller.signal,
|
|
|
+ baseUrl,
|
|
|
+ getAccessToken: getBridgeAccessToken,
|
|
|
+ permissionMode,
|
|
|
+ })
|
|
|
+ if (initialSessionId) {
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:init] Created initial session ${initialSessionId}`,
|
|
|
+ )
|
|
|
+ }
|
|
|
+ } catch (err) {
|
|
|
+ logForDebugging(
|
|
|
+ `[bridge:init] Session creation failed (non-fatal): ${errorMessage(err)}`,
|
|
|
+ )
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // Crash-recovery pointer: write immediately so kill -9 at any point
|
|
|
+ // after this leaves a recoverable trail. Covers both fresh sessions and
|
|
|
+ // resumed ones (so a second crash after resume is still recoverable).
|
|
|
+ // Cleared when runBridgeLoop falls through to archive+deregister; left in
|
|
|
+ // place on the SIGINT resumable-shutdown return (backup for when the user
|
|
|
+ // closes the terminal before copying the printed --session-id hint).
|
|
|
+ // Refreshed hourly so a 5h+ session that crashes still has a fresh
|
|
|
+ // pointer (staleness checks file mtime, backend TTL is rolling-from-poll).
|
|
|
+ let pointerRefreshTimer: ReturnType<typeof setInterval> | null = null
|
|
|
+ // Single-session only: --continue forces single-session mode on resume,
|
|
|
+ // so a pointer written in multi-session mode would contradict the user's
|
|
|
+ // config when they try to resume. The resumable-shutdown path is also
|
|
|
+ // gated to single-session (line ~1254) so the pointer would be orphaned.
|
|
|
+ if (initialSessionId && spawnMode === 'single-session') {
|
|
|
+ const { writeBridgePointer } = await import('./bridgePointer.js')
|
|
|
+ const pointerPayload = {
|
|
|
+ sessionId: initialSessionId,
|
|
|
+ environmentId,
|
|
|
+ source: 'standalone' as const,
|
|
|
+ }
|
|
|
+ await writeBridgePointer(config.dir, pointerPayload)
|
|
|
+ pointerRefreshTimer = setInterval(
|
|
|
+ writeBridgePointer,
|
|
|
+ 60 * 60 * 1000,
|
|
|
+ config.dir,
|
|
|
+ pointerPayload,
|
|
|
+ )
|
|
|
+ // Don't let the interval keep the process alive on its own.
|
|
|
+ pointerRefreshTimer.unref?.()
|
|
|
+ }
|
|
|
+
|
|
|
+ try {
|
|
|
+ await runBridgeLoop(
|
|
|
+ config,
|
|
|
+ environmentId,
|
|
|
+ environmentSecret,
|
|
|
+ api,
|
|
|
+ spawner,
|
|
|
+ logger,
|
|
|
+ controller.signal,
|
|
|
+ undefined,
|
|
|
+ initialSessionId ?? undefined,
|
|
|
+ async () => {
|
|
|
+ // Clear the memoized OAuth token cache so we re-read from secure
|
|
|
+ // storage, picking up tokens refreshed by child processes.
|
|
|
+ clearOAuthTokenCache()
|
|
|
+ // Proactively refresh the token if it's expired on disk too.
|
|
|
+ await checkAndRefreshOAuthTokenIfNeeded()
|
|
|
+ return getBridgeAccessToken()
|
|
|
+ },
|
|
|
+ )
|
|
|
+ } finally {
|
|
|
+ if (pointerRefreshTimer !== null) {
|
|
|
+ clearInterval(pointerRefreshTimer)
|
|
|
+ }
|
|
|
+ process.off('SIGINT', onSigint)
|
|
|
+ process.off('SIGTERM', onSigterm)
|
|
|
+ process.stdin.off('data', onStdinData)
|
|
|
+ if (process.stdin.isTTY) {
|
|
|
+ process.stdin.setRawMode(false)
|
|
|
+ }
|
|
|
+ process.stdin.pause()
|
|
|
+ }
|
|
|
+
|
|
|
+ // The bridge bypasses init.ts (and its graceful shutdown handler), so we
|
|
|
+ // must exit explicitly.
|
|
|
+ // eslint-disable-next-line custom-rules/no-process-exit
|
|
|
+ process.exit(0)
|
|
|
+}
|
|
|
+
|
|
|
+// ─── Headless bridge (daemon worker) ────────────────────────────────────────
|
|
|
+
|
|
|
/**
 * Error type raised by runBridgeHeadless for configuration problems that a
 * retry cannot fix: workspace trust not accepted, worktree mode requested
 * where it is unavailable, or a non-local plain-HTTP base URL.
 *
 * The daemon worker is expected to catch this and exit with
 * EXIT_CODE_PERMANENT, so the supervisor parks the worker rather than
 * respawning it on backoff.
 */
export class BridgeHeadlessPermanentError extends Error {
  /**
   * @param message Human-readable description of the permanent failure.
   */
  constructor(message: string) {
    super(message)
    // Give the error a stable, recognisable name instead of the default "Error".
    this.name = 'BridgeHeadlessPermanentError'
  }
}
|
|
|
+
|
|
|
/** Caller-supplied options for runBridgeHeadless. */
export type HeadlessBridgeOpts = {
  /** Directory the bridge runs in; the worker chdir()s into it on start. */
  dir: string
  /** Optional title for the session pre-created at startup. */
  name?: string
  /** How sessions are spawned; 'worktree' needs a git repo or a WorktreeCreate hook. */
  spawnMode: 'same-dir' | 'worktree'
  /** Forwarded as the bridge config's `maxSessions`. */
  capacity: number
  /** Optional permission mode handed to the spawner and to session pre-creation. */
  permissionMode?: string
  /** Sandbox flag forwarded to the bridge config and the session spawner. */
  sandbox: boolean
  /** Optional session timeout (ms) forwarded to the bridge config. */
  sessionTimeoutMs?: number
  /** When true, an initial session is created before entering the poll loop. */
  createSessionOnStart: boolean
  /** Returns the current access token, or undefined when none is available. */
  getAccessToken: () => string | undefined
  /** 401 handler handed to the bridge API client. */
  onAuth401: (failedToken: string) => Promise<boolean>
  /** Line-oriented log sink; receives API debug output and all logger lines. */
  log: (line: string) => void
}
|
|
|
+
|
|
|
/**
 * Headless (non-interactive) bridge entrypoint used by the `remoteControl`
 * daemon worker.
 *
 * This is the linear subset of bridgeMain(): it shows no readline dialogs,
 * installs no stdin key handlers, draws no TUI and never calls
 * process.exit(). Configuration is supplied by the caller (daemon.json),
 * auth arrives over IPC (the supervisor's AuthManager), and every log line
 * goes through `opts.log` to the worker's stdout pipe.
 *
 * @param opts   Caller-provided configuration, auth callbacks and log sink.
 * @param signal Abort signal forwarded to session pre-creation and to the
 *               poll loop.
 * @returns Resolves when runBridgeLoop returns, i.e. once `signal` has
 *          aborted and the poll loop has torn down.
 * @throws BridgeHeadlessPermanentError when the workspace is untrusted, the
 *         base URL is non-local plain HTTP, or worktree mode is requested
 *         without a git repository or WorktreeCreate hook.
 * @throws Error (treated as transient by the worker) when no access token is
 *         available or environment registration fails.
 */
export async function runBridgeHeadless(
  opts: HeadlessBridgeOpts,
  signal: AbortSignal,
): Promise<void> {
  const { dir, log } = opts

  // The worker starts out in the supervisor's CWD. Move into the target
  // directory before anything else so the git helpers used further down
  // (getBranch/getRemoteUrl), which consult the bootstrap CWD state set on
  // the next lines, resolve against the intended repository.
  process.chdir(dir)
  const { setOriginalCwd, setCwdState } = await import('../bootstrap/state.js')
  setOriginalCwd(dir)
  setCwdState(dir)

  const { checkHasTrustDialogAccepted, enableConfigs } = await import(
    '../utils/config.js'
  )
  enableConfigs()
  const { initSinks } = await import('../utils/sinks.js')
  initSinks()

  // Permanent: only the user can fix this, by accepting the trust dialog.
  const workspaceTrusted = checkHasTrustDialogAccepted()
  if (!workspaceTrusted) {
    throw new BridgeHeadlessPermanentError(
      `Workspace not trusted: ${dir}. Run \`claude\` in that directory first to accept the trust dialog.`,
    )
  }

  // Transient: the supervisor's AuthManager may have a token on the next cycle.
  const startupToken = opts.getAccessToken()
  if (!startupToken) {
    throw new Error(BRIDGE_LOGIN_ERROR)
  }

  const { getBridgeBaseUrl } = await import('./bridgeConfig.js')
  const baseUrl = getBridgeBaseUrl()
  // Plain HTTP is tolerated only when the URL mentions a loopback host.
  const usesPlainHttp = baseUrl.startsWith('http://')
  const mentionsLoopback =
    baseUrl.includes('localhost') || baseUrl.includes('127.0.0.1')
  if (usesPlainHttp && !mentionsLoopback) {
    throw new BridgeHeadlessPermanentError(
      'Remote Control base URL uses HTTP. Only HTTPS or localhost HTTP is allowed.',
    )
  }

  // The session-ingress override is honoured only when USER_TYPE is 'ant';
  // in every other case ingress goes to the regular base URL.
  const ingressOverride =
    process.env.USER_TYPE === 'ant'
      ? process.env.CLAUDE_BRIDGE_SESSION_INGRESS_URL
      : undefined
  const sessionIngressUrl = ingressOverride ? ingressOverride : baseUrl

  const { getBranch, getRemoteUrl, findGitRoot } = await import(
    '../utils/git.js'
  )
  const { hasWorktreeCreateHook } = await import('../utils/hooks.js')

  // Worktree mode needs either a WorktreeCreate hook or a git root for `dir`.
  // The probes run only when worktree mode was actually requested.
  if (
    opts.spawnMode === 'worktree' &&
    !hasWorktreeCreateHook() &&
    findGitRoot(dir) === null
  ) {
    throw new BridgeHeadlessPermanentError(
      `Worktree mode requires a git repository or WorktreeCreate hooks. Directory ${dir} has neither.`,
    )
  }

  const branch = await getBranch()
  const gitRepoUrl = await getRemoteUrl()

  const config: BridgeConfig = {
    dir,
    machineName: hostname(),
    branch,
    gitRepoUrl,
    maxSessions: opts.capacity,
    spawnMode: opts.spawnMode,
    verbose: false,
    sandbox: opts.sandbox,
    bridgeId: randomUUID(),
    workerType: 'claude_code',
    environmentId: randomUUID(),
    apiBaseUrl: baseUrl,
    sessionIngressUrl,
    sessionTimeoutMs: opts.sessionTimeoutMs,
  }

  const api = createBridgeApiClient({
    baseUrl,
    getAccessToken: opts.getAccessToken,
    runnerVersion: MACRO.VERSION,
    onDebug: log,
    onAuth401: opts.onAuth401,
    getTrustedDeviceToken,
  })

  // Register the environment. Any failure is rethrown as a plain Error so
  // the supervisor treats it as transient and retries on backoff.
  const registerEnvironment = async () => {
    try {
      return await api.registerBridgeEnvironment(config)
    } catch (err) {
      throw new Error(`Bridge registration failed: ${errorMessage(err)}`)
    }
  }
  const {
    environment_id: environmentId,
    environment_secret: environmentSecret,
  } = await registerEnvironment()

  const spawner = createSessionSpawner({
    execPath: process.execPath,
    scriptArgs: spawnScriptArgs(),
    env: process.env,
    verbose: false,
    sandbox: opts.sandbox,
    permissionMode: opts.permissionMode,
    onDebug: log,
  })

  const logger = createHeadlessBridgeLogger(log)
  logger.printBanner(config, environmentId)

  // Optionally pre-create a session. This is best-effort: a failure is
  // logged and the bridge carries on without an initial session.
  let initialSessionId: string | undefined
  if (opts.createSessionOnStart) {
    const { createBridgeSession } = await import('./createSession.js')
    try {
      const createdId = await createBridgeSession({
        environmentId,
        title: opts.name,
        events: [],
        gitRepoUrl,
        branch,
        signal,
        baseUrl,
        getAccessToken: opts.getAccessToken,
        permissionMode: opts.permissionMode,
      })
      if (createdId) {
        initialSessionId = createdId
        log(`created initial session ${createdId}`)
      }
    } catch (err) {
      log(`session pre-creation failed (non-fatal): ${errorMessage(err)}`)
    }
  }

  // Token provider for the poll loop: always asks the caller for the
  // current token rather than caching the one seen at startup.
  const currentAccessToken = async () => opts.getAccessToken()

  await runBridgeLoop(
    config,
    environmentId,
    environmentSecret,
    api,
    spawner,
    logger,
    signal,
    undefined,
    initialSessionId,
    currentAccessToken,
  )
}
|
|
|
+
|
|
|
/**
 * Builds a BridgeLogger for headless mode that funnels every message into a
 * single line-oriented `log` function.
 *
 * Session lifecycle, status, verbose, error and reconnect events each become
 * one log line. All display-only hooks (status updates, QR toggle, repo
 * info, and so on) are no-ops.
 *
 * @param log Sink that receives each formatted line.
 * @returns A BridgeLogger backed solely by `log`.
 */
function createHeadlessBridgeLogger(log: (s: string) => void): BridgeLogger {
  // One shared do-nothing function for every display-only hook.
  const ignore = (): void => {}

  const adapter: BridgeLogger = {
    // --- events that produce a log line ---
    printBanner(cfg, envId) {
      log(
        `registered environmentId=${envId} dir=${cfg.dir} spawnMode=${cfg.spawnMode} capacity=${cfg.maxSessions}`,
      )
    },
    logSessionStart(id, _prompt) {
      log(`session start ${id}`)
    },
    logSessionComplete(id, ms) {
      log(`session complete ${id} (${ms}ms)`)
    },
    logSessionFailed(id, err) {
      log(`session failed ${id}: ${err}`)
    },
    // Status and verbose lines are passed to the sink unmodified.
    logStatus: log,
    logVerbose: log,
    logError(s) {
      log(`error: ${s}`)
    },
    logReconnected(ms) {
      log(`reconnected after ${ms}ms`)
    },
    addSession(id, _url) {
      log(`session attached ${id}`)
    },
    removeSession(id) {
      log(`session detached ${id}`)
    },

    // --- display-only hooks: nothing to render in headless mode ---
    updateIdleStatus: ignore,
    updateReconnectingStatus: ignore,
    updateSessionStatus: ignore,
    updateSessionActivity: ignore,
    updateSessionCount: ignore,
    updateFailedStatus: ignore,
    setSpawnModeDisplay: ignore,
    setRepoInfo: ignore,
    setDebugLogPath: ignore,
    setAttached: ignore,
    setSessionTitle: ignore,
    clearStatus: ignore,
    toggleQr: ignore,
    refreshDisplay: ignore,
  }
  return adapter
}
|