| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
77717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762
27722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762
77727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763
277327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496349734983499350035013502350335043505350635073508350935103511351235133514351535163517351835193520352135223523352435253526352735283529353035313532353335343535353635373538353935403541354235433544354535463547354835493550355135523553355435553556355735583559356035613562356335643565356635673568356935703571357235733574357535763577357835793580358135823583358435853586358735883589359035913592359335943595359635973598359936003601360236033604360536063607360836093610361136123613361436153616361736183619362036213622362336243625362636273628362936303631363236333634363536363637363836393640364136423643364436453646364736483649 |
- /**
- * Tool dispatch. Every security decision from plan §2 is enforced HERE,
- * before any executor method is called.
- *
- * Enforcement order, every call:
- * 1. Kill switch (`adapter.isDisabled()`).
- * 2. TCC gate (`adapter.ensureOsPermissions()`). `request_access` is
- * exempted — it threads the ungranted state to the renderer so the
- * user can grant TCC perms from inside the approval dialog.
- * 3. Tool-specific gates (see dispatch table) — ANY exception in a gate
- * returns a tool error, executor never called.
- * 4. Executor call.
- *
- * For input actions (click/type/key/scroll/drag/move_mouse) the tool-specific
- * gates are, in order:
- * a. `prepareForAction` — hide every non-allowlisted app, then defocus us
- * (battle-tested pre-action sequence from the Vercept acquisition).
- * Sub-gated via `hideBeforeAction`. After this runs the screenshot is
- * TRUE (what the model sees IS what's at each pixel) and we are not
- * keyboard-focused.
- * b. Frontmost gate — branched by actionKind:
- * mouse: frontmost ∈ allowlist ∪ {hostBundleId, Finder} → pass.
- * hostBundleId passes because the executor's
- * `withClickThrough` bracket makes us click-through.
- * keyboard: frontmost ∈ allowlist ∪ {Finder} → pass.
- * hostBundleId → ERROR (safety net — defocus should have
- * moved us off; if it didn't, typing would go into our
- * own chat box).
- * After step (a) this gate fires RARELY — only when something popped
- * up between prepare and action, or the 5-try hide loop gave up.
- * Checked FRESH on every call, not cached across calls.
- *
- * For click variants only, AFTER the above gates but BEFORE the executor call:
- * c. Pixel-validation staleness check (sub-gated).
- */
- import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js";
- import { randomUUID } from "node:crypto";
- import { getDefaultTierForApp, getDeniedCategoryForApp, isPolicyDenied } from "./deniedApps.js";
- import type {
- ComputerExecutor,
- DisplayGeometry,
- InstalledApp,
- ScreenshotResult,
- } from "./executor.js";
- import { isSystemKeyCombo } from "./keyBlocklist.js";
- import { validateClickTarget } from "./pixelCompare.js";
- import { SENTINEL_BUNDLE_IDS } from "./sentinelApps.js";
- import type {
- AppGrant,
- ComputerUseHostAdapter,
- ComputerUseOverrides,
- CoordinateMode,
- CuAppPermTier,
- CuGrantFlags,
- CuPermissionRequest,
- CuSubGates,
- CuTeachPermissionRequest,
- Logger,
- ResolvedAppRequest,
- TeachStepRequest,
- } from "./types.js";
- /**
- * Bundle identifier used to recognise Finder as the frontmost app.
- *
- * Finder is never hidden by the hide loop (hiding Finder kills the Desktop),
- * so it's always a valid frontmost.
- */
- const FINDER_BUNDLE_ID = "com.apple.finder";
- /**
- * Categorical error classes for the cu_tool_call telemetry event. Never
- * free text — error messages may contain file paths / app content (PII).
- *
- * `errorResult` attaches one of these values as `telemetry.error_kind` on a
- * failed tool result. Members without a trailing note are not described
- * further in this part of the file; `"other"` is presumably the catch-all
- * (TODO confirm against the emitters).
- */
- export type CuErrorKind =
- | "allowlist_empty"
- | "tcc_not_granted"
- | "cu_lock_held"
- | "teach_mode_conflict"
- | "teach_mode_not_active"
- | "executor_threw"
- | "capture_failed"
- | "app_denied" // no longer emitted (tiered model replaced hard-deny); kept for schema compat
- | "bad_args" // malformed tool args (type/shape/range/unknown value)
- | "app_not_granted" // target app not in session allowlist (distinct from allowlist_empty)
- | "tier_insufficient" // app in allowlist but at a tier too low for the action
- | "feature_unavailable" // tool callable but session not wired for it
- | "state_conflict" // wrong state for action (call sequence, mouse already held)
- | "grant_flag_required" // action needs a grant flag (systemKeyCombos, clipboard*) from request_access
- | "display_error" // display enumeration failed (platform)
- | "other";
- /**
- * Telemetry payload piggybacked on the result — populated by handlers,
- * consumed and stripped by the host wrapper (serverDef.ts) before the
- * result goes to the SDK. Same pattern as `screenshot`.
- *
- * Every field is optional: the four counters are documented as belonging to
- * request_access / request_teach_access, and `error_kind` only to error
- * results.
- */
- export interface CuCallTelemetry {
- /** request_access / request_teach_access: apps NEWLY granted in THIS call
- * (does NOT include idempotent re-grants of already-allowed apps). */
- granted_count?: number;
- /** request_access / request_teach_access: apps denied in THIS call */
- denied_count?: number;
- /** request_access / request_teach_access: apps safety-denied (browser) this call */
- denied_browser_count?: number;
- /** request_access / request_teach_access: apps safety-denied (terminal) this call */
- denied_terminal_count?: number;
- /** Categorical error class (only set when isError) */
- error_kind?: CuErrorKind;
- }
- /**
- * `CallToolResult` augmented with the screenshot payload. `bindSessionContext`
- * reads `result.screenshot` after a `screenshot` tool call and stashes it in a
- * closure cell for the next pixel-validation. MCP clients never see this
- * field — the host wrapper strips it before returning to the SDK.
- *
- * Both added fields are optional, so a plain `CallToolResult` object is
- * also a valid `CuCallToolResult`.
- */
- export type CuCallToolResult = CallToolResult & {
- /** Screenshot payload attached to the result; optional. */
- screenshot?: ScreenshotResult;
- /** Piggybacked telemetry — stripped by the host wrapper before SDK return. */
- telemetry?: CuCallTelemetry;
- };
- // ---------------------------------------------------------------------------
- // Small result helpers (mirror of chrome-mcp's inline `{content, isError}`)
- // ---------------------------------------------------------------------------
/**
 * Build a failed tool result carrying a single text content item.
 *
 * @param text Message placed in the result's text content.
 * @param errorKind Optional categorical error class. When truthy it is
 *   attached as `telemetry.error_kind`; otherwise `telemetry` is present on
 *   the result but `undefined`.
 * @returns A result with `isError: true`.
 */
function errorResult(text: string, errorKind?: CuErrorKind): CuCallToolResult {
  let telemetry: CuCallTelemetry | undefined;
  if (errorKind) {
    telemetry = { error_kind: errorKind };
  }
  return { content: [{ type: "text", text }], isError: true, telemetry };
}
/**
 * Build a successful tool result whose only content item is `text`.
 *
 * @param text Message placed in the result's text content.
 * @returns A result with a single text content item and no `isError` field.
 */
function okText(text: string): CuCallToolResult {
  const textItem = { type: "text" as const, text };
  return { content: [textItem] };
}
/**
 * Build a successful tool result whose text content is `obj` serialized
 * with `JSON.stringify`.
 *
 * @param obj Value to serialize into the text content.
 * @param telemetry Optional telemetry payload, passed through unchanged
 *   (the `telemetry` key is always present, possibly `undefined`).
 * @returns A result with a single text content item.
 */
function okJson(obj: unknown, telemetry?: CuCallTelemetry): CuCallToolResult {
  const serialized = JSON.stringify(obj);
  return {
    content: [{ type: "text", text: serialized }],
    telemetry,
  };
}
- // ---------------------------------------------------------------------------
- // Arg validation — lightweight, no zod (mirrors chrome-mcp's cast-and-check)
- // ---------------------------------------------------------------------------
/**
 * View raw tool arguments as a string-keyed record.
 *
 * @param args Untrusted tool arguments of unknown shape.
 * @returns `args` itself (same reference) when it is a non-null object —
 *   arrays included, since they are `typeof "object"` — otherwise a fresh
 *   empty object.
 */
function asRecord(args: unknown): Record<string, unknown> {
  const isObjectLike = args !== null && typeof args === "object";
  return isObjectLike ? (args as Record<string, unknown>) : {};
}
/**
 * Read `args[key]` and require it to be a finite number.
 *
 * @param args Record of tool arguments.
 * @param key Name of the argument to read.
 * @returns The number, or an `Error` (returned, not thrown) when the value
 *   is missing, not a number, `NaN`, or infinite.
 */
function requireNumber(
  args: Record<string, unknown>,
  key: string,
): number | Error {
  const candidate = args[key];
  return typeof candidate === "number" && Number.isFinite(candidate)
    ? candidate
    : new Error(`"${key}" must be a finite number.`);
}
/**
 * Read `args[key]` and require it to be a string.
 *
 * @param args Record of tool arguments.
 * @param key Name of the argument to read.
 * @returns The string (the empty string is accepted), or an `Error`
 *   (returned, not thrown) when the value is missing or not a string.
 */
function requireString(
  args: Record<string, unknown>,
  key: string,
): string | Error {
  const candidate = args[key];
  return typeof candidate === "string"
    ? candidate
    : new Error(`"${key}" must be a string.`);
}
/**
 * Extract `(x, y)` from a `coordinate: [x, y]` tuple argument.
 *
 * The value must be an array of length 2 whose entries are both finite,
 * non-negative numbers. `NaN` and `±Infinity` are rejected: `NaN < 0` is
 * false, so a bare sign check would let them through into coordinate
 * scaling. This matches the finiteness rule `requireNumber` applies.
 *
 * @param args Record of tool arguments.
 * @param paramName Name of the tuple argument; defaults to `"coordinate"`.
 * @returns The `[x, y]` pair, or an `Error` (returned, not thrown)
 *   describing why the argument was rejected.
 */
function extractCoordinate(
  args: Record<string, unknown>,
  paramName: string = "coordinate",
): [number, number] | Error {
  const coord = args[paramName];
  if (coord === undefined) {
    return new Error(`${paramName} is required`);
  }
  if (!Array.isArray(coord) || coord.length !== 2) {
    return new Error(`${paramName} must be an array of length 2`);
  }
  const [x, y] = coord;
  if (
    typeof x !== "number" ||
    typeof y !== "number" ||
    !Number.isFinite(x) ||
    !Number.isFinite(y) ||
    x < 0 ||
    y < 0
  ) {
    return new Error(`${paramName} must be a tuple of non-negative numbers`);
  }
  return [x, y];
}
- // ---------------------------------------------------------------------------
- // Coordinate scaling
- // ---------------------------------------------------------------------------
/**
 * Convert model-space coordinates to the logical points that enigo expects.
 *
 * - `normalized_0_100`: each axis is `(raw / 100) * display extent`, rounded,
 *   then shifted by the display origin so the point targets the selected
 *   display in virtual-screen space. `display` is fetched fresh per tool
 *   call — never cached across calls — so a mid-session display-settings
 *   change doesn't leave us stale.
 * - `pixels` with a prior screenshot: the model read image-space pixel
 *   coords off the last screenshot. With the 1568-px long-edge downsample
 *   the screenshot-px → logical-pt ratio is `displayWidth / screenshotWidth`,
 *   NOT `1/scaleFactor`. Geometry and origin come from the snapshot stashed
 *   at CAPTURE time, so the transform matches what the model actually saw
 *   even if display settings changed since. (Chrome's ScreenshotContext
 *   pattern — CDPService.ts:1486-1493.)
 * - `pixels` with no screenshot (cold start): degenerate case; logs a
 *   warning and falls back to dividing by `display.scaleFactor`.
 *
 * @param rawX Model-space x coordinate.
 * @param rawY Model-space y coordinate.
 * @param mode Coordinate mode the model is using.
 * @param display Fresh display geometry for this call.
 * @param lastScreenshot Most recent screenshot, if any.
 * @param logger Receives the cold-start warning.
 * @returns Rounded logical-point coordinates including the origin offset.
 */
function scaleCoord(
  rawX: number,
  rawY: number,
  mode: CoordinateMode,
  display: DisplayGeometry,
  lastScreenshot: ScreenshotResult | undefined,
  logger: Logger,
): { x: number; y: number } {
  if (mode !== "normalized_0_100") {
    // mode === "pixels": model sent image-space pixel coords.
    if (lastScreenshot) {
      // Same transform as Chrome coordinateScaling.ts:22-34 and
      // claude-in-a-box ComputerTool.swift:70-80. Ratio and origin both come
      // from the capture-time snapshot so clicks stay coherent with it.
      const pointsPerPixelX =
        lastScreenshot.displayWidth / lastScreenshot.width;
      const pointsPerPixelY =
        lastScreenshot.displayHeight / lastScreenshot.height;
      return {
        x: Math.round(rawX * pointsPerPixelX) + lastScreenshot.originX,
        y: Math.round(rawY * pointsPerPixelY) + lastScreenshot.originY,
      };
    }

    // Cold start: pixel coords without any screenshot. Fall back to the old
    // /sf behavior and warn.
    logger.warn(
      "[computer-use] pixels-mode coordinate received with no prior screenshot; " +
        "falling back to /scaleFactor. Click may be off if downsample is active.",
    );
    const { scaleFactor } = display;
    return {
      x: Math.round(rawX / scaleFactor) + display.originX,
      y: Math.round(rawY / scaleFactor) + display.originY,
    };
  }

  // Percent of the display extent, offset into virtual-screen space.
  const fractionX = rawX / 100;
  const fractionY = rawY / 100;
  return {
    x: Math.round(fractionX * display.width) + display.originX,
    y: Math.round(fractionY * display.height) + display.originY,
  };
}
/**
 * Convert model-space coordinates to the 0–100 percentage that
 * pixelCompare.ts works in. The staleness check operates in screenshot-image
 * space; a percentage lets both the last and the fresh screenshot be cropped
 * at the same relative location regardless of their absolute dimensions.
 *
 * With the 1568-px downsample, `screenshot.width != display.width * sf`, so
 * the correct denominator is `lastScreenshot.width` (resp. `.height`): the
 * model's raw pixel coord is already in that image's coordinate space.
 * `DisplayGeometry` is not consumed at all.
 *
 * @param rawX Model-space x coordinate.
 * @param rawY Model-space y coordinate.
 * @param mode Coordinate mode the model is using.
 * @param lastScreenshot Most recent screenshot, if any.
 * @returns Percent coordinates. In `normalized_0_100` mode the inputs are
 *   returned unchanged; in `pixels` mode without a screenshot, `(0, 0)`.
 */
function coordToPercentageForPixelCompare(
  rawX: number,
  rawY: number,
  mode: CoordinateMode,
  lastScreenshot: ScreenshotResult | undefined,
): { xPct: number; yPct: number } {
  // Already a percentage — pass through.
  if (mode === "normalized_0_100") return { xPct: rawX, yPct: rawY };

  // mode === "pixels"
  if (lastScreenshot) {
    const { width, height } = lastScreenshot;
    return { xPct: (rawX / width) * 100, yPct: (rawY / height) * 100 };
  }

  // validateClickTarget at pixelCompare.ts:141-143 already skips when
  // lastScreenshot is undefined, so this value never reaches a crop.
  return { xPct: 0, yPct: 0 };
}
- // ---------------------------------------------------------------------------
- // Shared input-action gates
- // ---------------------------------------------------------------------------
- /**
- * Tier needed to perform a given action class. `undefined` → `"full"`
- * (a grant with no tier is treated as `"full"` by `tierSatisfies`, which
- * is the function that consumes this type).
- *
- * - `"mouse_position"` — mouse_move only. Passes at any tier including
- * `"read"`. Pure cursor positioning, no app interaction. Still runs
- * prepareForAction (hide non-allowed apps).
- * - `"mouse"` — plain left click, double/triple, scroll, drag-from.
- * Requires tier `"click"` or `"full"`.
- * - `"mouse_full"` — right/middle click, any click with modifiers,
- * drag-drop (the `to` endpoint of left_click_drag). Requires tier
- * `"full"`. Right-click → context menu Paste, modifier chords →
- * keystrokes before click, drag-drop → text insertion at the drop
- * point. All escalate a click-tier grant to keyboard-equivalent input.
- * Blunt: also rejects same-app drags (scrollbar, panel resize) onto
- * click-tier apps; `scroll` is the tier-"click" way to scroll.
- * - `"keyboard"` — type, key, hold_key. Requires tier `"full"`.
- */
- type CuActionKind = "mouse_position" | "mouse" | "mouse_full" | "keyboard";
/**
 * Decide whether a grant tier permits an action class.
 *
 * @param grantTier Tier the app was granted at; `undefined` is treated as
 *   `"full"`.
 * @param actionKind Class of the action being attempted.
 * @returns `true` for `"mouse_position"` at any tier; for `"keyboard"` and
 *   `"mouse_full"` only at tier `"full"`; for anything else (plain
 *   `"mouse"`) at tier `"click"` or `"full"`.
 */
function tierSatisfies(
  grantTier: CuAppPermTier | undefined,
  actionKind: CuActionKind,
): boolean {
  const effectiveTier = grantTier ?? "full";
  switch (actionKind) {
    case "mouse_position":
      return true;
    case "keyboard":
    case "mouse_full":
      return effectiveTier === "full";
    default:
      // Plain "mouse".
      return effectiveTier === "click" || effectiveTier === "full";
  }
}
// Suffix for every tier_insufficient error. The model may try to route
// around the gate (osascript, System Events, cliclick via Bash) — this
// closes that door explicitly. The text starts with a space so it can be
// concatenated straight onto the preceding sentence.
const TIER_ANTI_SUBVERSION = [
  " Do not attempt to work around this restriction — never use AppleScript,",
  "System Events, shell commands, or any other method to send clicks or",
  "keystrokes to this app.",
].join(" ");
- // ---------------------------------------------------------------------------
- // Clipboard guard — stash+clear while a click-tier app is frontmost
- // ---------------------------------------------------------------------------
- //
- // Threat: tier "click" blocks type/key/right-click-Paste, but a click-tier
- // terminal/IDE may have a UI Paste button that's plain-left-clickable. If the
- // clipboard holds `rm -rf /` — from the user, from a prior full-tier paste,
- // OR from the agent's own write_clipboard call (which doesn't route through
- // runInputActionGates) — a left_click on that button injects it.
- //
- // Mitigation: stash the user's clipboard on first entry to click-tier, then
- // RE-CLEAR before every input action while click-tier stays frontmost. The
- // re-clear is the load-bearing part — a stash-on-transition-only design
- // leaves a gap between an agent write_clipboard and the next left_click.
- // When frontmost becomes anything else, restore. Turn-end restore is inlined
- // in the host's result-handler + leavingRunning (same dual-location as
- // cuHiddenDuringTurn unhide) — reads `session.cuClipboardStash` directly and
- // writes via Electron's `clipboard.writeText`, so no nest-only import.
- //
- // State lives on the session (via `overrides.getClipboardStash` /
- // `onClipboardStashChanged`), not module-level. The CU lock still guarantees
- // one session at a time, but session-scoped state means the host's turn-end
- // restore doesn't need to reach back into this package.
/**
 * Keep the session clipboard stash in step with whether a click-tier app is
 * frontmost.
 *
 * - Click-tier frontmost: if nothing is stashed yet, read the clipboard and
 *   record it via `overrides.onClipboardStashChanged` (recording `""` if
 *   that fails), then write `""` to the clipboard. The write happens on
 *   EVERY call, not only the first.
 * - Anything else frontmost: if a stash exists, write it back to the
 *   clipboard and, only after that write succeeds, clear the stash.
 *
 * Clipboard failures are swallowed on purpose (best effort).
 *
 * @param adapter Host adapter; only `adapter.executor.readClipboard` and
 *   `adapter.executor.writeClipboard` are used.
 * @param overrides Session overrides providing the optional
 *   `getClipboardStash` / `onClipboardStashChanged` hooks.
 * @param frontmostIsClickTier Whether the frontmost app is granted at tier
 *   "click".
 */
async function syncClipboardStash(
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
  frontmostIsClickTier: boolean,
): Promise<void> {
  const stashed = overrides.getClipboardStash?.();

  if (frontmostIsClickTier) {
    // First entry to click-tier: capture the user's clipboard exactly once.
    if (stashed === undefined) {
      try {
        const snapshot = await adapter.executor.readClipboard();
        overrides.onClipboardStashChanged?.(snapshot);
      } catch {
        // Empty sentinel so the next action doesn't retry the stash; the
        // eventual restore becomes a harmless writeClipboard("").
        overrides.onClipboardStashChanged?.("");
      }
    }

    // Re-clear on every click-tier action. An agent write_clipboard between
    // the stash and a left_click on a UI Paste button is clobbered here
    // before the click lands.
    try {
      await adapter.executor.writeClipboard("");
    } catch {
      // Transient pasteboard failure. The tier-"click" right-click/modifier
      // block still holds; this is a net, not a promise.
    }
    return;
  }

  // Not click-tier: restore, then clear. Nothing stashed → nothing to do.
  if (stashed === undefined) {
    return;
  }
  try {
    await adapter.executor.writeClipboard(stashed);
    // Clear only after a successful write — a transient pasteboard failure
    // must not irrecoverably drop the stash.
    overrides.onClipboardStashChanged?.(undefined);
  } catch {
    // Best effort — stash held, next non-click action retries.
  }
}
- /** Every click/type/key/scroll/drag/move_mouse runs through this before
- * touching the executor. Returns null on pass, error-result on block.
- * Any throw inside → caught by handleToolCall's outer try → tool error. */
/**
 * Per-action gates that run before every input action (mouse or keyboard).
 *
 * Sequence:
 *  1. When `subGates.hideBeforeAction` is set, ask the executor to prepare the
 *     screen (hide non-allowlisted apps + defocus the host) and report any
 *     newly hidden apps through `overrides.onAppsHidden`.
 *  2. Read the frontmost app FRESH and look up its granted tier.
 *  3. When `subGates.clipboardGuard` is set, sync the clipboard stash based on
 *     whether the frontmost app is tier "click".
 *  4. Decide whether `actionKind` is permitted against the frontmost app.
 *
 * @param adapter    Host adapter; only `adapter.executor` is used here.
 * @param overrides  Per-session grants (`allowedApps`), display selection and
 *                   callbacks.
 * @param subGates   Feature switches for the hide and clipboard sub-gates.
 * @param actionKind Kind of input action about to be performed.
 * @returns `null` when the action may proceed, otherwise an error result
 *          (`tier_insufficient`, `state_conflict` or `app_not_granted`).
 */
async function runInputActionGates(
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
  actionKind: CuActionKind,
): Promise<CuCallToolResult | null> {
  // Step A+B — hide non-allowlisted apps + defocus us. Once this has run, the
  // frontmost gate below is a rare edge-case detector (something popped up
  // between prepare and action) rather than a normal-path blocker. ALL grant
  // tiers stay visible — visibility is the baseline (tier "read").
  if (subGates.hideBeforeAction) {
    const newlyHidden = await adapter.executor.prepareForAction(
      overrides.allowedApps.map((grant) => grant.bundleId),
      overrides.selectedDisplayId,
    );
    // Only notify when something was actually hidden; after the first action
    // of a turn the list is normally empty and the callback would be spam.
    if (newlyHidden.length > 0) {
      overrides.onAppsHidden?.(newlyHidden);
    }
  }

  // Frontmost gate — never cached, re-read on every call.
  const frontmost = await adapter.executor.getFrontmostApp();
  const tierByBundleId = new Map(
    overrides.allowedApps.map(
      ({ bundleId, tier }) => [bundleId, tier] as const,
    ),
  );
  // After handleToolCall's tier backfill every grant carries a concrete tier,
  // so `undefined` here means "not in the allowlist at all".
  const frontmostTier = frontmost
    ? tierByBundleId.get(frontmost.bundleId)
    : undefined;

  // Clipboard guard. Per-action rather than per-tool-call: it runs for every
  // sub-action inside computer_batch and teach_step/teach_batch, so clicking
  // into a click-tier app mid-batch stashes+clears before the next click
  // lands. It lives here (not in handleToolCall) so that deferAcquire tools
  // (request_access, list_granted_applications), `wait`, and the teach_step
  // blocking-dialog phase never trigger a sync — only input actions do.
  if (subGates.clipboardGuard) {
    const frontmostIsClickTier = frontmostTier === "click";
    await syncClipboardStash(adapter, overrides, frontmostIsClickTier);
  }

  // No frontmost app at all (rare — login window?). Let the action through;
  // the click lands somewhere and PixelCompare catches staleness.
  if (!frontmost) {
    return null;
  }

  const { hostBundleId } = adapter.executor.capabilities;
  const quotedName = `"${frontmost.displayName}"`;

  if (frontmostTier === undefined) {
    // Not in the allowlist. Finder is never-hide and always allowed.
    if (frontmost.bundleId === FINDER_BUNDLE_ID) return null;

    if (frontmost.bundleId !== hostBundleId) {
      // Non-allowlisted, non-us, non-Finder. RARE after the hide loop — means
      // something popped up between prepare and action, or the 5-try loop
      // gave up.
      return errorResult(
        `${quotedName} is not in the allowed applications and is ` +
          `currently in front. Take a new screenshot — it may have appeared ` +
          `since your last one.`,
        "app_not_granted",
      );
    }

    // Our own window is frontmost. Non-keyboard actions are click events and
    // we are click-through (executor's withClickThrough), so they pass.
    if (actionKind !== "keyboard") return null;

    // Keyboard safety net — the defocus in prepareForAction (step B) should
    // have moved focus off us. If not, typing would land in our chat box.
    return errorResult(
      "Claude's own window still has keyboard focus. This should not happen " +
        "after the pre-action defocus. Click on the target application first.",
      "state_conflict",
    );
  }

  if (tierSatisfies(frontmostTier, actionKind)) return null;

  // In the allowlist, but the tier does not cover this action. The guidance
  // is tailored to the actual tier: at "read" nothing is allowed, so hinting
  // at left_click or Bash would be wrong; at "click" the keyboard- and
  // mouse_full-specific messages apply.
  if (frontmostTier === "read") {
    // Tier "read" is not category-unique (browser AND trading map to it), so
    // re-look-up the category and show the CiC hint only for real browsers.
    const isBrowser =
      getDeniedCategoryForApp(frontmost.bundleId, frontmost.displayName) ===
      "browser";
    const readTierHint = isBrowser
      ? " Use the Claude-in-Chrome MCP for browser interaction (tools " +
        "named `mcp__Claude_in_Chrome__*`; load via ToolSearch if " +
        "deferred)."
      : " No interaction is permitted; ask the user to take any " +
        "actions in this app themselves.";
    return errorResult(
      `${quotedName} is granted at tier "read" — ` +
        `visible in screenshots only, no clicks or typing.` +
        readTierHint +
        TIER_ANTI_SUBVERSION,
      "tier_insufficient",
    );
  }

  // Remaining case: tier "click" (tier "full" would have passed
  // tierSatisfies). "mouse" and "mouse_position" pass at "click", so the
  // action is either "keyboard" or "mouse_full".
  const clickTierDetail =
    actionKind === "keyboard"
      ? `typing, key presses, and paste require tier "full". The keys ` +
        `would go to this app's text fields or integrated terminal. To ` +
        `type into a different app, click it first to bring it forward. ` +
        `For shell commands, use the Bash tool.`
      : `right-click, middle-click, and clicks with modifier keys require ` +
        `tier "full". Right-click opens a context menu with Paste/Cut, and ` +
        `modifier chords fire as keystrokes before the click. Plain ` +
        `left_click is allowed here.`;
  return errorResult(
    `${quotedName} is granted at tier "click" — ` +
      clickTierDetail +
      TIER_ANTI_SUBVERSION,
    "tier_insufficient",
  );
}
/**
 * Hit-test gate: rejects a mouse action when the window under (x, y) belongs
 * to an app whose tier does not cover mouse input. This closes the gap where
 * a tier-"full" app is frontmost but the click would land on an overlapping
 * tier-"read" window — `runInputActionGates` passes (frontmost is fine) while
 * the click really goes to the read-tier app.
 *
 * Must run AFTER `scaleCoord` (global coordinates are required) and BEFORE the
 * executor call.
 *
 * When `appUnderPoint` yields nothing (desktop, or a platform without
 * hit-testing) the gate passes — the frontmost check in
 * `runInputActionGates` has already run.
 *
 * @param x          Global x coordinate of the pending mouse action.
 * @param y          Global y coordinate of the pending mouse action.
 * @param actionKind Only mouse kinds are expected here; keyboard input routes
 *                   to the frontmost app, not the window under the cursor.
 * @returns `null` on pass (target is tier "click"/"full", or
 *          desktop/Finder/us), an error result on block.
 */
async function runHitTestGate(
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
  x: number,
  y: number,
  actionKind: CuActionKind,
): Promise<CuCallToolResult | null> {
  const target = await adapter.executor.appUnderPoint(x, y);
  // Nothing under the point (desktop / platform no-op), or Finder (desktop,
  // file dialogs), which is always clickable — same exemption as
  // runInputActionGates. Our own overlay is filtered out by Swift
  // (pid != self).
  if (!target || target.bundleId === FINDER_BUNDLE_ID) {
    return null;
  }

  const tierByBundleId = new Map(
    overrides.allowedApps.map(
      ({ bundleId, tier }) => [bundleId, tier] as const,
    ),
  );
  const landingOn = `Click at these coordinates would land on "${target.displayName}", `;

  if (!tierByBundleId.has(target.bundleId)) {
    // Not in the allowlist at all. The frontmost check would catch this if
    // the target were frontmost, but a different app is in front. This is the
    // "something popped up" edge case — a new window appeared between
    // screenshot and click, or a background app's window overlaps the target.
    return errorResult(
      landingOn +
        `which is not in the allowed applications. Take a fresh screenshot ` +
        `to see the current window layout.`,
      "app_not_granted",
    );
  }

  const targetTier = tierByBundleId.get(target.bundleId);

  // The frontmost-based sync in runInputActionGates misses a click that lands
  // on a NON-FRONTMOST click-tier window. Re-sync using the hit-test target's
  // tier so the stash+clear happens before the click, whatever is frontmost.
  if (subGates.clipboardGuard && targetTier === "click") {
    await syncClipboardStash(adapter, overrides, true);
  }

  if (tierSatisfies(targetTier, actionKind)) {
    return null;
  }

  // The target is allowlisted but its tier does not cover this action. Since
  // this gate only sees mouse/mouse_full, the possibilities are mouse_full on
  // a tier-"click" app (handled next) or any mouse action on tier "read".
  if (targetTier === "click" && actionKind === "mouse_full") {
    return errorResult(
      landingOn +
        `which is granted at tier "click" — right-click, middle-click, and ` +
        `clicks with modifier keys require tier "full" (they can Paste via ` +
        `the context menu or fire modifier-chord keystrokes). Plain ` +
        `left_click is allowed here.` +
        TIER_ANTI_SUBVERSION,
      "tier_insufficient",
    );
  }

  const isBrowser =
    getDeniedCategoryForApp(target.bundleId, target.displayName) === "browser";
  const readTierHint = isBrowser
    ? "Use the Claude-in-Chrome MCP for browser interaction."
    : "Ask the user to take any actions in this app themselves.";
  return errorResult(
    landingOn +
      `which is granted at tier "read" (screenshots only, no interaction). ` +
      readTierHint +
      TIER_ANTI_SUBVERSION,
    "tier_insufficient",
  );
}
- // ---------------------------------------------------------------------------
- // Screenshot helpers
- // ---------------------------------------------------------------------------
/**
 * §6 item 9 — screenshot retry on implausibly-small buffer. Battle-tested
 * threshold (1024 bytes). We retry exactly once.
 */
const MIN_SCREENSHOT_BYTES = 1024;

/**
 * Estimates how many bytes a base64 string decodes to, without decoding it.
 *
 * Uses the 4-chars-to-3-bytes ratio and subtracts one byte per trailing `=`
 * (at most two). Good enough for a threshold check; it does not validate the
 * input.
 *
 * @param base64 Base64-encoded payload.
 * @returns Approximate decoded length in bytes.
 */
function decodedByteLength(base64: string): number {
  let padding = 0;
  if (base64.endsWith("==")) {
    padding = 2;
  } else if (base64.endsWith("=")) {
    padding = 1;
  }
  const unpaddedEstimate = Math.floor((base64.length * 3) / 4);
  return unpaddedEstimate - padding;
}
/**
 * Captures a screenshot and retries exactly once when the first capture
 * decodes to fewer than `MIN_SCREENSHOT_BYTES` bytes. The second capture is
 * returned as-is, without a further size check.
 *
 * @param executor         Executor used to take the screenshot.
 * @param allowedBundleIds Bundle IDs forwarded to `executor.screenshot`.
 * @param logger           Receives a warning when a retry happens.
 * @param displayId        Optional display to capture, forwarded unchanged.
 * @returns The first capture, or the retry when the first was too small.
 */
async function takeScreenshotWithRetry(
  executor: ComputerExecutor,
  allowedBundleIds: string[],
  logger: ComputerUseHostAdapter["logger"],
  displayId?: number,
): Promise<ScreenshotResult> {
  const capture = () => executor.screenshot({ allowedBundleIds, displayId });

  const firstShot = await capture();
  const firstShotBytes = decodedByteLength(firstShot.base64);
  if (firstShotBytes < MIN_SCREENSHOT_BYTES) {
    logger.warn(
      `[computer-use] screenshot implausibly small (${firstShotBytes} bytes decoded), retrying once`,
    );
    return await capture();
  }
  return firstShot;
}
- // ---------------------------------------------------------------------------
- // Grapheme iteration — §6 item 7, ported from the Vercept acquisition
- // ---------------------------------------------------------------------------
/** Pause between typed graphemes, in ms. §6 item 4 — 125 Hz USB polling. */
const INTER_GRAPHEME_SLEEP_MS = 8;

/**
 * Splits `text` into user-perceived characters (grapheme clusters).
 *
 * Uses `Intl.Segmenter` when the runtime provides it. If it is missing or
 * throws, falls back to code-point iteration, which keeps surrogate pairs
 * together but splits ZWJ sequences.
 *
 * @param text String to split.
 * @returns The graphemes (or code points, in the fallback) in order.
 */
function segmentGraphemes(text: string): string[] {
  type SegmenterCtor = new (
    locale?: string,
    options?: { granularity: "grapheme" | "word" | "sentence" },
  ) => { segment: (s: string) => Iterable<{ segment: string }> };

  try {
    // Node 18+ ships Intl.Segmenter; the try/catch is defence against a
    // stripped-down runtime.
    const { Segmenter } = Intl as typeof Intl & { Segmenter?: SegmenterCtor };
    if (typeof Segmenter === "function") {
      const graphemes: string[] = [];
      const segmenter = new Segmenter(undefined, { granularity: "grapheme" });
      for (const { segment } of segmenter.segment(text)) {
        graphemes.push(segment);
      }
      return graphemes;
    }
  } catch {
    // Deliberate: any failure falls through to the code-point fallback.
  }
  return [...text];
}
/**
 * Promise-based delay.
 *
 * @param ms Delay in milliseconds, passed straight to `setTimeout`.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Breaks a chord string such as "ctrl+shift" into its individual key names.
 * Pieces are trimmed and empty pieces are dropped. Same parsing as the `key`
 * tool / executor.key / keyBlocklist.normalizeKeySequence.
 *
 * @param text Chord string with keys separated by "+".
 * @returns Key names in the order they appear.
 */
function parseKeyChord(text: string): string[] {
  const keys: string[] = [];
  for (const piece of text.split("+")) {
    const keyName = piece.trim();
    if (keyName) {
      keys.push(keyName);
    }
  }
  return keys;
}
- // ---------------------------------------------------------------------------
- // left_mouse_down / left_mouse_up held-state tracking
- // ---------------------------------------------------------------------------
/**
 * True while a left_mouse_down has no matching left_mouse_up yet. Used to
 * error on double-down; up-without-down is not an error.
 *
 * Module-level, but reset on every lock acquire (handleToolCall →
 * acquireCuLock branch), so a session interrupted mid-drag (overlay stop
 * during left_mouse_down) cannot leave the flag set for the next lock holder.
 *
 * The scope would still be wrong within one lock cycle if sessions could
 * interleave tool calls, but the lock enforces at-most-one-session-uses-CU,
 * so they cannot. The per-turn reset is the correctness boundary.
 */
let mouseButtonHeld = false;

/**
 * Whether a mouse_move happened between left_mouse_down and left_mouse_up.
 * When still false at mouse-up, the decomposed sequence is a click-release
 * (not a drop) — hit-test at "mouse", not "mouse_full".
 */
let mouseMoved = false;

/**
 * Clears both cross-call drag flags. Called from Gate-3 on lock-acquire and
 * from `bindSessionContext` in mcpServer.ts — a fresh lock holder must not
 * inherit a prior session's mid-drag state.
 */
export function resetMouseButtonHeld(): void {
  mouseMoved = false;
  mouseButtonHeld = false;
}
/**
 * Releases the OS mouse button when a left_mouse_down set it and no matching
 * left_mouse_up ever got its turn. Same release-before-return as handleClick.
 * No-op when the button is not held, so callers need not check first.
 *
 * The drag flags are cleared only after `mouseUp()` resolves; if it rejects,
 * the error propagates and the flags stay as they were.
 *
 * @param adapter Host adapter whose executor performs the release.
 */
async function releaseHeldMouse(
  adapter: ComputerUseHostAdapter,
): Promise<void> {
  if (mouseButtonHeld) {
    await adapter.executor.mouseUp();
    mouseButtonHeld = false;
    mouseMoved = false;
  }
}
/**
 * Reports whether a tool checks the CU lock without acquiring it.
 *
 * `request_access` and `list_granted_applications` hit the CHECK (so a
 * blocked session doesn't show an approval dialog for access it can't use)
 * but defer ACQUIRE — the enter-CU notification/overlay only fires on the
 * first action tool.
 *
 * `request_teach_access` is deliberately NOT included: approving teach mode
 * hides the main window, and the lock must be held before that. See the
 * Gate-3 block in `handleToolCall` for the full explanation.
 *
 * Exported for `bindSessionContext` in mcpServer.ts so the async lock gate
 * uses the same set as the sync one.
 *
 * @param toolName Name of the tool being invoked.
 * @returns `true` for the two deferring tools, `false` for everything else.
 */
export function defersLockAcquire(toolName: string): boolean {
  switch (toolName) {
    case "request_access":
    case "list_granted_applications":
      return true;
    default:
      return false;
  }
}
- // ---------------------------------------------------------------------------
- // request_access helpers
- // ---------------------------------------------------------------------------
/**
 * Reverse-DNS-ish shape: starts with an alphanumeric, contains at least one
 * dot that is followed by an alphanumeric, and consists only of word
 * characters, dots and hyphens (so no spaces and no slashes). Lets raw bundle
 * IDs pass through resolution.
 */
const REVERSE_DNS_RE = /^[A-Za-z0-9][\w.-]*\.[A-Za-z0-9][\w.-]*$/;

/**
 * Heuristic check for "this string is shaped like a bundle ID".
 *
 * @param s Candidate string (an app name or bundle ID).
 * @returns `true` when `s` matches `REVERSE_DNS_RE` and contains no space.
 */
function looksLikeBundleId(s: string): boolean {
  if (!REVERSE_DNS_RE.test(s)) {
    return false;
  }
  return !s.includes(" ");
}
/**
 * Resolves each requested name (display name or raw bundle ID) against the
 * installed-app list and annotates it with sentinel / already-granted /
 * proposed-tier information.
 *
 * Resolution order per request: an exact bundle-ID match when the string is
 * shaped like a bundle ID, then a case-insensitive display-name match.
 *
 * @param requestedNames          Names exactly as requested.
 * @param installed               Installed apps to resolve against.
 * @param alreadyGrantedBundleIds Bundle IDs already in the allowlist.
 * @returns One entry per requested name, in request order; `resolved` is
 *          `undefined` when no installed app matched.
 */
function resolveRequestedApps(
  requestedNames: string[],
  installed: InstalledApp[],
  alreadyGrantedBundleIds: ReadonlySet<string>,
): ResolvedAppRequest[] {
  const byBundleId = new Map<string, InstalledApp>();
  const byLowerDisplayName = new Map<string, InstalledApp>();
  for (const app of installed) {
    byBundleId.set(app.bundleId, app);
    // Last write wins on collisions. Ambiguous-name handling (multiple
    // candidates in the dialog) is plan-documented but deferred — the
    // InstalledApps enumerator dedupes by bundle ID, so true display-name
    // collisions are rare. TODO(chicago, post-P1): surface all candidates.
    byLowerDisplayName.set(app.displayName.toLowerCase(), app);
  }

  const results: ResolvedAppRequest[] = [];
  for (const requestedName of requestedNames) {
    const isBundleIdShaped = looksLikeBundleId(requestedName);
    const resolved =
      (isBundleIdShaped ? byBundleId.get(requestedName) : undefined) ??
      byLowerDisplayName.get(requestedName.toLowerCase());
    const bundleId = resolved?.bundleId;
    // Unresolved but bundle-ID-shaped: use the raw string for the tier lookup
    // (e.g. "company.thebrowser.Browser" with Arc not installed — the
    // reverse-DNS string won't match any display-name substring).
    const bundleIdCandidate =
      bundleId ?? (isBundleIdShaped ? requestedName : undefined);

    results.push({
      requestedName,
      resolved,
      isSentinel: bundleId ? SENTINEL_BUNDLE_IDS.has(bundleId) : false,
      alreadyGranted: bundleId ? alreadyGrantedBundleIds.has(bundleId) : false,
      proposedTier: getDefaultTierForApp(
        bundleIdCandidate,
        resolved?.displayName ?? requestedName,
      ),
    });
  }
  return results;
}
- // ---------------------------------------------------------------------------
- // Individual tool handlers
- // ---------------------------------------------------------------------------
/**
 * Handler for the `request_access` tool: asks the user (through
 * `overrides.onPermissionRequest`) to grant access to the requested apps and
 * optional grant flags, then reports what was granted and denied.
 *
 * Flow:
 *  1. Bail out when no permission handler is wired, or when teach mode is
 *     active.
 *  2. Validate `args.reason`.
 *  3. If `tccState` is present (OS permissions missing), show the TCC panel
 *     instead of the app list, re-check the OS permissions afterwards, and
 *     return an error result either way so the model calls again.
 *  4. Otherwise validate `args.apps`, partition the request via
 *     `buildAccessRequest`, show the dialog when there is anything to ask,
 *     and build the JSON response plus telemetry.
 *
 * @param adapter   Host adapter (executor, logger, OS-permission check).
 * @param args      Raw tool arguments: `reason`, `apps`, and the optional
 *                  booleans `clipboardRead`, `clipboardWrite`,
 *                  `systemKeyCombos`.
 * @param overrides Per-session grants, deny list and callbacks.
 * @param tccState  Snapshot of the OS permission state taken before the
 *                  dialog; `undefined` when the TCC branch should be skipped.
 * @returns The tool result: an error result for the early-outs and the TCC
 *          branch, otherwise the grant summary.
 */
async function handleRequestAccess(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  tccState: { accessibility: boolean; screenRecording: boolean } | undefined,
): Promise<CuCallToolResult> {
  if (!overrides.onPermissionRequest) {
    return errorResult(
      "This session was not wired with a permission handler. Computer control is not available here.",
      "feature_unavailable",
    );
  }

  // Teach mode hides the main window; permission dialogs render in that
  // window. Without this, handleToolPermission blocks on an invisible
  // prompt and the overlay spins forever. Tell the model to exit teach
  // mode, request access, then re-enter.
  if (overrides.getTeachModeActive?.()) {
    return errorResult(
      "Cannot request additional permissions during teach mode — the permission dialog would be hidden. End teach mode (finish the tour or let the turn complete), then call request_access, then start a new tour.",
      "teach_mode_conflict",
    );
  }

  const reason = requireString(args, "reason");
  if (reason instanceof Error) return errorResult(reason.message, "bad_args");

  // TCC-ungranted branch. The renderer shows a toggle panel INSTEAD OF the
  // app list when `tccState` is present on the request, so we skip app
  // resolution entirely (listInstalledApps() may fail without Screen
  // Recording anyway). The user grants the OS perms from inside the dialog,
  // then clicks "Ask again" — both buttons resolve with deny by design
  // (ComputerUseApproval.tsx) so the model re-calls request_access and
  // gets the app list on the next call.
  if (tccState) {
    const req: CuPermissionRequest = {
      requestId: randomUUID(),
      reason,
      apps: [],
      requestedFlags: {},
      screenshotFiltering: adapter.executor.capabilities.screenshotFiltering,
      tccState,
    };
    await overrides.onPermissionRequest(req);

    // Re-check: the user may have granted in System Settings while the
    // dialog was up. The `tccState` arg is a pre-dialog snapshot — reading
    // it here would tell the model "not yet granted" even after the user
    // granted, and the model waits for confirmation instead of retrying.
    // The renderer's TCC panel already live-polls (computerUseTccStore);
    // this is the same re-check on the tool-result side.
    const recheck = await adapter.ensureOsPermissions();
    if (recheck.granted) {
      // NOTE(review): unlike the other error results in this function, no
      // error code is passed here — confirm that is intentional.
      return errorResult(
        "macOS Accessibility and Screen Recording are now both granted. " +
          "Call request_access again immediately — the next call will show " +
          "the app selection list.",
      );
    }
    const missing: string[] = [];
    if (!recheck.accessibility) missing.push("Accessibility");
    if (!recheck.screenRecording) missing.push("Screen Recording");
    return errorResult(
      `macOS ${missing.join(" and ")} permission(s) not yet granted. ` +
        `The permission panel has been shown. Once the user grants the ` +
        `missing permission(s), call request_access again.`,
      "tcc_not_granted",
    );
  }

  const rawApps = args.apps;
  if (!Array.isArray(rawApps) || !rawApps.every((a) => typeof a === "string")) {
    return errorResult('"apps" must be an array of strings.', "bad_args");
  }
  const apps = rawApps as string[];

  // Only flags explicitly passed as booleans are forwarded; anything else is
  // treated as "not requested".
  const requestedFlags: Partial<CuGrantFlags> = {};
  if (typeof args.clipboardRead === "boolean") {
    requestedFlags.clipboardRead = args.clipboardRead;
  }
  if (typeof args.clipboardWrite === "boolean") {
    requestedFlags.clipboardWrite = args.clipboardWrite;
  }
  if (typeof args.systemKeyCombos === "boolean") {
    requestedFlags.systemKeyCombos = args.systemKeyCombos;
  }

  const {
    needDialog,
    skipDialogGrants,
    willHide,
    tieredApps,
    userDenied,
    policyDenied,
  } = await buildAccessRequest(
    adapter,
    apps,
    overrides.allowedApps,
    new Set(overrides.userDeniedBundleIds),
    overrides.selectedDisplayId,
  );

  let dialogGranted: AppGrant[] = [];
  let dialogDenied: Array<{
    bundleId: string;
    reason: "user_denied" | "not_installed";
  }> = [];
  // The dialog is shown only when there is something to ask: new apps and/or
  // explicitly requested flags. The flags carried on the dialog response are
  // not part of this tool's result, so they are not captured here.
  if (needDialog.length > 0 || Object.keys(requestedFlags).length > 0) {
    const req: CuPermissionRequest = {
      requestId: randomUUID(),
      reason,
      apps: needDialog,
      requestedFlags,
      screenshotFiltering: adapter.executor.capabilities.screenshotFiltering,
      // Undefined when empty so the renderer skips the section cleanly.
      ...(willHide.length > 0 && {
        willHide,
        autoUnhideEnabled: adapter.getAutoUnhideEnabled(),
      }),
    };
    const response = await overrides.onPermissionRequest(req);
    dialogGranted = response.granted;
    dialogDenied = response.denied;
  }

  // Do NOT return display geometry or coordinateMode. See COORDINATES.md
  // ("Never give the model a number that invites rescaling"). scaleCoord
  // already transforms server-side; the coordinate convention is baked into
  // the tool param descriptions at server-construction time.
  const allGranted = [...skipDialogGrants, ...dialogGranted];

  // Filter tieredApps to what was actually granted — if the user unchecked
  // Chrome in the dialog, don't explain Chrome's tier.
  const grantedBundleIds = new Set(allGranted.map((g) => g.bundleId));
  const grantedTieredApps = tieredApps.filter((t) =>
    grantedBundleIds.has(t.bundleId),
  );

  // Best-effort — grants are already persisted by wrappedPermissionHandler;
  // a listDisplays/findWindowDisplays failure (monitor hot-unplug, NAPI
  // error) must not tank the grant response. Same discipline as
  // buildMonitorNote's listDisplays try/catch.
  let windowLocations: Awaited<ReturnType<typeof buildWindowLocations>> = [];
  try {
    windowLocations = await buildWindowLocations(adapter, allGranted);
  } catch (e) {
    adapter.logger.warn(
      `[computer-use] buildWindowLocations failed: ${String(e)}`,
    );
  }

  return okJson(
    {
      granted: allGranted,
      denied: dialogDenied,
      // Policy blocklist — precedes userDenied in precedence and response
      // order. No escape hatch; the agent is told to find another approach.
      ...(policyDenied.length > 0 && {
        policyDenied: {
          apps: policyDenied,
          guidance: buildPolicyDeniedGuidance(policyDenied),
        },
      }),
      // User-configured auto-deny — stripped before the dialog; this is the
      // agent's only signal that these apps exist but are user-blocked.
      ...(userDenied.length > 0 && {
        userDenied: {
          apps: userDenied,
          guidance: buildUserDeniedGuidance(userDenied),
        },
      }),
      // Upfront guidance so the model knows what each tier allows BEFORE
      // hitting the gate. Only included when something was tier-restricted.
      ...(grantedTieredApps.length > 0 && {
        tierGuidance: buildTierGuidanceMessage(grantedTieredApps),
      }),
      screenshotFiltering: adapter.executor.capabilities.screenshotFiltering,
      // Where each granted app currently has open windows, across monitors.
      // Omitted when the app isn't running or has no normal windows.
      ...(windowLocations.length > 0 ? { windowLocations } : {}),
    },
    {
      // dialogGranted only — skipDialogGrants are idempotent re-grants of
      // apps already in the allowlist (no user action, dialog skips them).
      // Matching denied_count's this-call-only semantics.
      granted_count: dialogGranted.length,
      denied_count: dialogDenied.length,
      ...tierAssignmentTelemetry(grantedTieredApps),
    },
  );
}
/**
 * Reports, for each granted app that has open windows, which displays those
 * windows are on.
 *
 * Returns an empty array when nothing is granted or when the executor lists
 * at most one display (no multi-monitor signal to give). Apps that are not
 * running, or that have no normal windows, are left out.
 *
 * @param adapter Host adapter; uses `executor.listDisplays` and
 *                `executor.findWindowDisplays`.
 * @param granted Grants to look up, in the order they should be reported.
 * @returns One entry per granted app with at least one window display;
 *          `label` / `isPrimary` are `undefined` for display IDs missing from
 *          `listDisplays`.
 */
async function buildWindowLocations(
  adapter: ComputerUseHostAdapter,
  granted: AppGrant[],
): Promise<
  Array<{
    bundleId: string;
    displayName: string;
    displays: Array<{ id: number; label?: string; isPrimary?: boolean }>;
  }>
> {
  if (granted.length === 0) {
    return [];
  }
  const displays = await adapter.executor.listDisplays();
  if (displays.length <= 1) {
    return [];
  }

  const windowLocs = await adapter.executor.findWindowDisplays(
    granted.map((grant) => grant.bundleId),
  );
  const displayById = new Map(displays.map((d) => [d.displayId, d]));
  const idsByBundle = new Map(windowLocs.map((w) => [w.bundleId, w.displayIds]));

  return granted.flatMap(({ bundleId, displayName }) => {
    const displayIds = idsByBundle.get(bundleId);
    if (!displayIds || displayIds.length === 0) {
      return [];
    }
    const appDisplays = displayIds.map((id) => {
      const display = displayById.get(id);
      return { id, label: display?.label, isPrimary: display?.isPrimary };
    });
    return [{ bundleId, displayName, displays: appDisplays }];
  });
}
/*
 * Types for `buildAccessRequest` — the shared app-resolution + partition +
 * hide-preview pipeline, extracted from `handleRequestAccess` so that
 * `handleRequestTeachAccess` can call the same path.
 *
 * That pipeline does the full app-name→InstalledApp resolution, assigns each
 * app a tier (browser→"read", terminal/IDE→"click", else "full" — see
 * deniedApps.ts), splits the apps into already-granted (skip the dialog,
 * preserve grantedAt+tier) vs need-dialog, and computes the willHide preview.
 * Unlike the previous hard-deny model, ALL apps proceed to the dialog; the
 * tier only constrains which actions are allowed once granted.
 */

/**
 * An app assigned a restricted tier (anything other than `"full"`). Feeds the
 * guidance message that tells the model what it can and cannot do.
 */
interface TieredApp {
  bundleId: string;
  displayName: string;
  /** Never `"full"` — only restricted tiers are collected. */
  tier: "read" | "click";
}

/** Everything `buildAccessRequest` hands back to its callers. */
interface AccessRequestParts {
  /** Requests that still need to go through the permission dialog. */
  needDialog: ResolvedAppRequest[];
  /** Grants for apps already in the allowlist; these skip the dialog. */
  skipDialogGrants: AppGrant[];
  /** Preview of the apps that would be hidden if the request is approved. */
  willHide: { bundleId: string; displayName: string }[];
  /**
   * Resolved apps with `proposedTier !== "full"` — for the guidance text.
   * Unresolved apps are omitted (they go to `denied` with `not_installed`).
   */
  tieredApps: TieredApp[];
  /**
   * Apps stripped by the user's Settings auto-deny list. Surfaced in the
   * response with guidance; never reach the dialog.
   */
  userDenied: { requestedName: string; displayName: string }[];
  /**
   * Apps stripped by the baked-in policy blocklist (streaming/music/ebooks,
   * etc. — `deniedApps.isPolicyDenied`). Precedence over userDenied.
   */
  policyDenied: { requestedName: string; displayName: string }[];
}
/**
 * Shared app-resolution + partition + hide-preview pipeline used by the
 * access-request handlers.
 *
 * Steps, in order:
 *  1. Resolve the requested names against the installed apps.
 *  2. Strip policy-denied apps (precedence: policy > user setting > tier).
 *  3. Strip apps on the user's auto-deny list (bundle-ID match only).
 *  4. Collect resolved survivors with a restricted tier for guidance text.
 *  5. Split survivors into already-granted (skip the dialog) vs need-dialog,
 *     and load icons for the need-dialog apps.
 *  6. Ask the executor for a preview of what would be hidden.
 *
 * @param adapter             Host adapter; uses `executor.listInstalledApps`,
 *                            `executor.getAppIcon`, `executor.previewHideSet`.
 * @param apps                App names / bundle IDs exactly as requested.
 * @param allowedApps         Current grants (the allowlist).
 * @param userDeniedBundleIds Bundle IDs on the user's auto-deny list.
 * @param selectedDisplayId   Optional display forwarded to `previewHideSet`.
 * @returns The partitioned request; see `AccessRequestParts`.
 */
async function buildAccessRequest(
  adapter: ComputerUseHostAdapter,
  apps: string[],
  allowedApps: AppGrant[],
  userDeniedBundleIds: ReadonlySet<string>,
  selectedDisplayId?: number,
): Promise<AccessRequestParts> {
  const alreadyGranted = new Set(allowedApps.map((g) => g.bundleId));
  const installed = await adapter.executor.listInstalledApps();
  const resolved = resolveRequestedApps(apps, installed, alreadyGranted);

  // Policy-level auto-deny (baked-in, not user-configurable). Stripped
  // before userDenied — checks bundle ID AND display name (covers
  // unresolved requests). Precedence: policy > user setting > tier.
  const policyDenied: Array<{ requestedName: string; displayName: string }> =
    [];
  const afterPolicy: typeof resolved = [];
  for (const r of resolved) {
    const displayName = r.resolved?.displayName ?? r.requestedName;
    if (isPolicyDenied(r.resolved?.bundleId, displayName)) {
      policyDenied.push({ requestedName: r.requestedName, displayName });
    } else {
      afterPolicy.push(r);
    }
  }

  // User-configured auto-deny (Settings → Desktop app → Computer Use).
  // Stripped BEFORE tier assignment — these never reach the dialog
  // regardless of category. Bundle-ID match only (the Settings UI picks from
  // installed apps, which always have a bundle ID). Unresolved requests pass
  // through to the tier system; the user can't preemptively deny an app that
  // isn't installed.
  const userDenied: Array<{ requestedName: string; displayName: string }> = [];
  const surviving: typeof afterPolicy = [];
  for (const r of afterPolicy) {
    if (r.resolved && userDeniedBundleIds.has(r.resolved.bundleId)) {
      userDenied.push({
        requestedName: r.requestedName,
        displayName: r.resolved.displayName,
      });
    } else {
      surviving.push(r);
    }
  }

  // Collect resolved apps with a restricted tier for the guidance message.
  // Unresolved apps with a restricted tier (e.g. model asks for "Chrome" but
  // it's not installed) are omitted — they'll end up in the `denied` list
  // with reason "not_installed" and the model will see that instead.
  const tieredApps: TieredApp[] = [];
  for (const r of surviving) {
    if (r.proposedTier === "full" || !r.resolved) continue;
    tieredApps.push({
      bundleId: r.resolved.bundleId,
      displayName: r.resolved.displayName,
      tier: r.proposedTier,
    });
  }

  // Idempotence: apps that are already granted skip the dialog and are
  // merged into the `granted` response. Existing grants keep their tier
  // (which may differ from the current proposedTier if policy changed).
  const skipDialog = surviving.filter((r) => r.alreadyGranted);
  const needDialog = surviving.filter((r) => !r.alreadyGranted);

  // Populate icons only for what the dialog will actually show. Sequential
  // awaits are fine — the Swift module is cached (listInstalledApps above
  // loaded it), each N-API call is synchronous, and the darwin executor
  // memoizes by path. Failures leave iconDataUrl undefined; renderer falls
  // back to a grey box.
  for (const r of needDialog) {
    if (!r.resolved) continue;
    try {
      r.resolved.iconDataUrl = await adapter.executor.getAppIcon(
        r.resolved.path,
      );
    } catch {
      // Deliberate best-effort: leave iconDataUrl undefined.
    }
  }

  // Index existing grants once instead of scanning `allowedApps` for every
  // skipped app. The first grant per bundle ID wins, matching what a
  // front-to-back `find` would return.
  const existingGrantByBundleId = new Map<string, AppGrant>();
  for (const grant of allowedApps) {
    if (!existingGrantByBundleId.has(grant.bundleId)) {
      existingGrantByBundleId.set(grant.bundleId, grant);
    }
  }

  const now = Date.now();
  const skipDialogGrants: AppGrant[] = [];
  for (const r of skipDialog) {
    const app = r.resolved;
    if (!app) continue;
    // Reuse the existing grant (preserving grantedAt + tier) rather than
    // synthesizing a new one — keeps Settings-page "Granted 3m ago" honest.
    skipDialogGrants.push(
      existingGrantByBundleId.get(app.bundleId) ?? {
        bundleId: app.bundleId,
        displayName: app.displayName,
        grantedAt: now,
        tier: r.proposedTier,
      },
    );
  }

  // Preview what will be hidden if the user approves exactly the requested
  // set plus what they already have. All tiers are visible, so everything
  // resolved goes in the exempt set.
  const exemptForPreview = allowedApps.map((a) => a.bundleId);
  for (const r of surviving) {
    if (r.resolved) exemptForPreview.push(r.resolved.bundleId);
  }
  const willHide = await adapter.executor.previewHideSet(
    exemptForPreview,
    selectedDisplayId,
  );

  return {
    needDialog,
    skipDialogGrants,
    willHide,
    tieredApps,
    userDenied,
    policyDenied,
  };
}
/**
 * Builds the guidance text for apps granted at a restricted tier. It is
 * returned inline in the okJson response so the model knows upfront what it
 * may do with each app, rather than discovering it by hitting the tier gate.
 *
 * Up to three paragraphs are produced, in this order: tier-"read" browsers,
 * other tier-"read" apps, tier-"click" apps. They are joined by blank lines
 * and followed by `TIER_ANTI_SUBVERSION`.
 *
 * @param tiered Apps with a restricted tier.
 * @returns The guidance text, or `""` when no paragraph applies.
 */
function buildTierGuidanceMessage(tiered: TieredApp[]): string {
  // Tier "read" is not category-unique, so it is split: browsers get the CiC
  // hint, everything else (e.g. trading platforms) gets "ask the user".
  const readBrowsers: TieredApp[] = [];
  const readOther: TieredApp[] = [];
  const clickTier: TieredApp[] = [];
  for (const app of tiered) {
    if (app.tier === "click") {
      clickTier.push(app);
    } else if (app.tier === "read") {
      const isBrowser =
        getDeniedCategoryForApp(app.bundleId, app.displayName) === "browser";
      (isBrowser ? readBrowsers : readOther).push(app);
    }
  }

  const quoteNames = (group: TieredApp[]): string =>
    group.map((app) => `"${app.displayName}"`).join(", ");

  const parts: string[] = [];

  if (readBrowsers.length > 0) {
    const single = readBrowsers.length === 1;
    parts.push(
      `${quoteNames(readBrowsers)} ${single ? "is a browser" : "are browsers"} — ` +
        `granted at tier "read" (visible in screenshots only; no clicks or ` +
        `typing). You can read what's on screen but cannot navigate, click, ` +
        `or type into ${single ? "it" : "them"}. For browser ` +
        `interaction, use the Claude-in-Chrome MCP (tools named ` +
        `\`mcp__Claude_in_Chrome__*\`; load via ToolSearch if deferred).`,
    );
  }

  if (readOther.length > 0) {
    const single = readOther.length === 1;
    parts.push(
      `${quoteNames(readOther)} ${single ? "is" : "are"} granted at tier ` +
        `"read" (visible in screenshots only; no clicks or typing). You can ` +
        `read what's on screen but cannot interact. Ask the user to take any ` +
        `actions in ${single ? "this app" : "these apps"} ` +
        `themselves.`,
    );
  }

  if (clickTier.length > 0) {
    const single = clickTier.length === 1;
    parts.push(
      `${quoteNames(clickTier)} ${single ? "has" : "have"} terminal or IDE ` +
        `capabilities — granted at tier "click" (visible + plain left-click ` +
        `only; NO typing, key presses, right-click, modifier-clicks, or ` +
        `drag-drop). You can click buttons and scroll output, but ` +
        `${single ? "its" : "their"} integrated terminal and ` +
        `editor are off-limits to keyboard input. Right-click (context-menu ` +
        `Paste) and dragging text onto ${single ? "it" : "them"} ` +
        `require tier "full". For shell commands, use the Bash tool.`,
    );
  }

  if (parts.length === 0) {
    return "";
  }
  // Same anti-subversion clause the gate errors carry — stated upfront so the
  // model doesn't reach for osascript/cliclick after seeing "no clicks/typing".
  return parts.join("\n\n") + TIER_ANTI_SUBVERSION;
}
/**
 * Builds the guidance text for apps stripped by the user's Settings
 * auto-deny list. Returned inline in the okJson response so the agent knows
 * (a) the app is auto-denied by request_access and (b) the escape hatch is to
 * ask the human to edit Settings, not to retry or reword the request.
 *
 * @param userDenied Auto-denied apps; only `displayName` is used.
 * @returns A single guidance sentence block, singular or plural as needed.
 */
function buildUserDeniedGuidance(
  userDenied: Array<{ requestedName: string; displayName: string }>,
): string {
  const isSingle = userDenied.length === 1;
  const verb = isSingle ? "is" : "are";
  const noun = isSingle ? "this app" : "these apps";
  const pronoun = isSingle ? "it" : "them";

  const quotedNames: string[] = [];
  for (const { displayName } of userDenied) {
    quotedNames.push(`"${displayName}"`);
  }

  return [
    `${quotedNames.join(", ")} ${verb} in the user's auto-deny list `,
    `(Settings → Desktop app (General) → Computer Use → Denied apps). `,
    `Requests for ${noun} are automatically denied. `,
    `If you need access for this task, ask the user to remove ${pronoun} `,
    `from their deny list in Settings — you cannot request this through the tool.`,
  ].join("");
}
/**
 * Compose the guidance sentence for apps blocked by the built-in policy
 * blocklist. Unlike the user auto-deny list, the text states that there is
 * no Settings override, so the agent is told to inform the user and look
 * for an alternative rather than try to get around the block.
 *
 * @param policyDenied Apps rejected by policy. Only `displayName` is read;
 *   each name is wrapped in double quotes and comma-separated.
 * @returns A single guidance string, with wording adjusted for one app
 *   versus several.
 */
function buildPolicyDeniedGuidance(
  policyDenied: Array<{ requestedName: string; displayName: string }>,
): string {
  const quotedNames: string[] = [];
  for (const entry of policyDenied) {
    quotedNames.push(`"${entry.displayName}"`);
  }

  const singular = policyDenied.length === 1;
  const verb = singular ? "is" : "are";
  const target = singular ? "this app" : "these apps";

  return [
    `${quotedNames.join(", ")} ${verb} blocked by policy for computer use. `,
    `Requests for ${target} are automatically `,
    `denied regardless of what the user has approved. There is no Settings `,
    `override. Inform the user that you cannot access `,
    `${target} and suggest an alternative `,
    `approach if one exists. Do not try to directly subvert this block `,
    `regardless of the user's request.`,
  ].join("");
}
/**
 * Telemetry helper: tally tiered apps by tier.
 *
 * The `denied_*` field names are retained for schema compatibility; in
 * dashboards they should be read as "assigned a non-full tier".
 * `denied_browser_count` counts every tier-"read" grant (browsers and
 * trading alike) and `denied_terminal_count` counts tier-"click" grants.
 *
 * @param tiered Apps together with the tier each one was assigned.
 * @returns An object holding only the counters that are greater than zero;
 *   an empty object when there are no "read" or "click" tiers.
 */
function tierAssignmentTelemetry(
  tiered: TieredApp[],
): Pick<CuCallTelemetry, "denied_browser_count" | "denied_terminal_count"> {
  let readGrants = 0;
  let clickGrants = 0;
  for (const { tier } of tiered) {
    if (tier === "read") {
      readGrants += 1;
    } else if (tier === "click") {
      clickGrants += 1;
    }
  }

  // Zero counters are omitted entirely rather than reported as 0.
  const counts: Pick<
    CuCallTelemetry,
    "denied_browser_count" | "denied_terminal_count"
  > = {};
  if (readGrants > 0) {
    counts.denied_browser_count = readGrants;
  }
  if (clickGrants > 0) {
    counts.denied_terminal_count = clickGrants;
  }
  return counts;
}
/**
 * Teach-mode sibling of `handleRequestAccess`: resolves the requested apps,
 * routes them to the teach approval dialog, and fires
 * `overrides.onTeachModeActivated` when the user consents.
 *
 * Differences from `request_access`:
 *  - There are no grant-flag checkboxes (clipboard/systemKeys); the teach
 *    tool schema omits those fields.
 *  - The dialog is shown even when every requested app is already granted.
 *    Teach mode is a distinct UX (the main window hides) that the user must
 *    explicitly agree to, so pre-existing app grants do not imply consent.
 *
 * @param adapter Host adapter (executor capabilities, OS permission checks).
 * @param args Raw tool arguments; reads `reason` (string) and `apps`
 *   (array of strings).
 * @param overrides Per-session callbacks and state.
 * @param tccState When defined, macOS permissions are not fully granted and
 *   the TCC panel is shown instead of the app list.
 * @returns An error result for unavailable/conflicting/bad-argument/TCC
 *   cases, otherwise an `okJson` payload describing granted and denied apps
 *   and whether teach mode is now active.
 */
async function handleRequestTeachAccess(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  tccState: { accessibility: boolean; screenRecording: boolean } | undefined,
): Promise<CuCallToolResult> {
  if (!overrides.onTeachPermissionRequest) {
    return errorResult(
      "Teach mode is not available in this session.",
      "feature_unavailable",
    );
  }

  // The dialog renders in the hidden main window, so a second request while
  // a tour is running cannot be shown. The model may plausibly try this
  // because request_access docs say "call again mid-session to add more
  // apps" and this tool uses the same grant model.
  if (overrides.getTeachModeActive?.()) {
    return errorResult(
      "Teach mode is already active. To add more apps, end the current tour first, then call request_teach_access again with the full app list.",
      "teach_mode_conflict",
    );
  }

  const reason = requireString(args, "reason");
  if (reason instanceof Error) {
    return errorResult(reason.message, "bad_args");
  }

  // TCC-ungranted branch — the renderer shows the same TCC toggle panel
  // regardless of which request tool got here.
  if (tccState) {
    const tccRequest: CuTeachPermissionRequest = {
      requestId: randomUUID(),
      reason,
      apps: [],
      screenshotFiltering: adapter.executor.capabilities.screenshotFiltering,
      tccState,
    };
    await overrides.onTeachPermissionRequest(tccRequest);

    // Re-check after the dialog: the user may have granted while it was up,
    // and the pre-dialog snapshot would mislead the model.
    const recheck = await adapter.ensureOsPermissions();
    if (recheck.granted) {
      return errorResult(
        "macOS Accessibility and Screen Recording are now both granted. " +
          "Call request_teach_access again immediately — the next call will " +
          "show the app selection list.",
      );
    }

    const stillMissing = (
      [
        ["Accessibility", recheck.accessibility],
        ["Screen Recording", recheck.screenRecording],
      ] as const
    )
      .filter(([, isGranted]) => !isGranted)
      .map(([permissionName]) => permissionName);
    return errorResult(
      `macOS ${stillMissing.join(" and ")} permission(s) not yet granted. ` +
        `The permission panel has been shown. Once the user grants the ` +
        `missing permission(s), call request_teach_access again.`,
      "tcc_not_granted",
    );
  }

  const rawApps = args.apps;
  const appsAreStrings =
    Array.isArray(rawApps) &&
    rawApps.every((candidate) => typeof candidate === "string");
  if (!appsAreStrings) {
    return errorResult('"apps" must be an array of strings.', "bad_args");
  }

  const plan = await buildAccessRequest(
    adapter,
    rawApps as string[],
    overrides.allowedApps,
    new Set(overrides.userDeniedBundleIds),
    overrides.selectedDisplayId,
  );

  // Builds the optional `policyDenied` / `userDenied` response sections, in
  // that key order, including only the ones that have entries.
  const denialSections = (): Record<string, unknown> => {
    const sections: Record<string, unknown> = {};
    if (plan.policyDenied.length > 0) {
      sections.policyDenied = {
        apps: plan.policyDenied,
        guidance: buildPolicyDeniedGuidance(plan.policyDenied),
      };
    }
    if (plan.userDenied.length > 0) {
      sections.userDenied = {
        apps: plan.userDenied,
        guidance: buildUserDeniedGuidance(plan.userDenied),
      };
    }
    return sections;
  };

  // Nothing to ask about and nothing pre-granted (every requested app was
  // denied or unresolvable) — skip the dialog. Otherwise the user would see
  // an empty approval dialog where Allow and Deny produce the same result
  // (granted=[] → teachModeActive stays false).
  if (plan.needDialog.length === 0 && plan.skipDialogGrants.length === 0) {
    return okJson(
      {
        granted: [],
        denied: [],
        ...denialSections(),
        teachModeActive: false,
        screenshotFiltering: adapter.executor.capabilities.screenshotFiltering,
      },
      { granted_count: 0, denied_count: 0 },
    );
  }

  const dialogRequest: CuTeachPermissionRequest = {
    requestId: randomUUID(),
    reason,
    apps: plan.needDialog,
    screenshotFiltering: adapter.executor.capabilities.screenshotFiltering,
    ...(plan.willHide.length > 0 && {
      willHide: plan.willHide,
      autoUnhideEnabled: adapter.getAutoUnhideEnabled(),
    }),
  };
  const response = await overrides.onTeachPermissionRequest(dialogRequest);
  const granted = [...plan.skipDialogGrants, ...response.granted];

  // Gate on explicit dialog consent, NOT on merged grant length.
  // skipDialogGrants are pre-existing grants and don't mean the user said
  // yes to THIS dialog; without the userConsented check, Deny would still
  // activate teach mode whenever any requested app was previously granted.
  const teachModeActive = response.userConsented === true && granted.length > 0;
  if (teachModeActive) {
    overrides.onTeachModeActivated?.();
  }

  const grantedIds = new Set(granted.map((grant) => grant.bundleId));
  const grantedTieredApps = plan.tieredApps.filter((app) =>
    grantedIds.has(app.bundleId),
  );

  // Assembled step by step so the JSON key order stays: granted, denied,
  // policyDenied, userDenied, tierGuidance, teachModeActive,
  // screenshotFiltering.
  const payload: Record<string, unknown> = {
    granted,
    denied: response.denied,
    ...denialSections(),
  };
  if (grantedTieredApps.length > 0) {
    payload.tierGuidance = buildTierGuidanceMessage(grantedTieredApps);
  }
  payload.teachModeActive = teachModeActive;
  payload.screenshotFiltering =
    adapter.executor.capabilities.screenshotFiltering;

  return okJson(payload, {
    // Counts come from response.granted only — skipDialogGrants are
    // idempotent re-grants and are not counted.
    granted_count: response.granted.length,
    denied_count: response.denied.length,
    ...tierAssignmentTelemetry(grantedTieredApps),
  });
}
- // ---------------------------------------------------------------------------
- // teach_step + teach_batch — shared step primitives
- // ---------------------------------------------------------------------------
/**
 * A teach step that has passed validation, with its anchor already scaled
 * to logical points.
 */
interface ValidatedTeachStep {
  /** Text from the step's required `explanation` argument. */
  explanation: string;
  /** Text from the step's required `next_preview` argument. */
  nextPreview: string;
  /** Scaled anchor position; left unset when the step supplied no `anchor`. */
  anchorLogical: TeachStepRequest["anchorLogical"];
  /** Action records to run after Next; may be empty. */
  actions: Record<string, unknown>[];
}
/**
 * Check one raw teach-step record and convert its anchor to logical points.
 *
 * Every error message is prefixed with `label`, so teach_batch can report
 * `steps[2]: actions[0] …` rather than a bare `actions[0] …`.
 *
 * Coordinates: the model supplies the anchor in image pixels (the same
 * space as click coordinates) and `scaleCoord` maps it to logical points
 * against `overrides.lastScreenshot`. During a teach_batch that screenshot
 * stays at its pre-call value for the whole batch, so anchors for step 2+
 * must point at elements the model expects to be at those coordinates
 * after the earlier steps' actions.
 *
 * @param raw Untrusted step record; reads `explanation`, `next_preview`,
 *   `actions`, and optional `anchor`.
 * @param adapter Host adapter; used for display size and logging.
 * @param overrides Session state (`selectedDisplayId`, `coordinateMode`,
 *   `lastScreenshot`).
 * @param label Prefix for error messages.
 * @returns The validated step, or an `Error` describing the first problem.
 */
async function validateTeachStepArgs(
  raw: Record<string, unknown>,
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
  label: string,
): Promise<ValidatedTeachStep | Error> {
  const fail = (detail: string): Error => new Error(`${label}: ${detail}`);

  const explanation = requireString(raw, "explanation");
  if (explanation instanceof Error) {
    return fail(explanation.message);
  }

  const nextPreview = requireString(raw, "next_preview");
  if (nextPreview instanceof Error) {
    return fail(nextPreview.message);
  }

  const actions = raw.actions;
  if (!Array.isArray(actions)) {
    return fail(`"actions" must be an array (empty is allowed).`);
  }

  // Each entry must be an object whose `action` names a batchable action.
  for (let i = 0; i < actions.length; i += 1) {
    const entry: unknown = actions[i];
    if (entry === null || typeof entry !== "object") {
      return fail(`actions[${i}] must be an object`);
    }
    const action = (entry as Record<string, unknown>).action;
    if (typeof action !== "string") {
      return fail(`actions[${i}].action must be a string`);
    }
    if (!BATCHABLE_ACTIONS.has(action)) {
      return fail(
        `actions[${i}].action="${action}" is not allowed. ` +
          `Allowed: ${[...BATCHABLE_ACTIONS].join(", ")}.`,
      );
    }
  }

  // The anchor is optional; when present it must be a pair of finite numbers.
  let anchorLogical: TeachStepRequest["anchorLogical"];
  const anchor = raw.anchor;
  if (anchor !== undefined) {
    const isFinitePair =
      Array.isArray(anchor) &&
      anchor.length === 2 &&
      typeof anchor[0] === "number" &&
      typeof anchor[1] === "number" &&
      Number.isFinite(anchor[0]) &&
      Number.isFinite(anchor[1]);
    if (!isFinitePair) {
      return fail(`"anchor" must be a [x, y] number tuple or omitted.`);
    }
    const [anchorX, anchorY] = anchor as [number, number];
    const display = await adapter.executor.getDisplaySize(
      overrides.selectedDisplayId,
    );
    anchorLogical = scaleCoord(
      anchorX,
      anchorY,
      overrides.coordinateMode,
      display,
      overrides.lastScreenshot,
      adapter.logger,
    );
  }

  return {
    explanation,
    nextPreview,
    anchorLogical,
    actions: actions as Array<Record<string, unknown>>,
  };
}
/**
 * Result of showing one tooltip and, on Next, running that step's actions.
 *
 *  - `ok`: every action ran (or there were none); `results` has one entry
 *    per action.
 *  - `exit`: the user chose Exit, or the session was aborted mid-step.
 *  - `action_error`: an action failed and the remaining actions were skipped.
 */
type TeachStepOutcome =
  | { kind: "ok"; results: BatchActionResult[] }
  | { kind: "exit" }
  | {
      kind: "action_error";
      /** Number of actions that completed before the failing one. */
      executed: number;
      /** The failing action's result. */
      failed: BatchActionResult;
      /** Number of actions after the failing one that were not run. */
      remaining: number;
      /**
       * The inner action's telemetry (error_kind), forwarded so the caller
       * can hand it to okJson and keep cu_tool_call accurate when the
       * failure happened inside a batch.
       */
      telemetry: CuCallTelemetry | undefined;
    };
/**
 * Display one tooltip, wait for the user to pick Next or Exit, and on Next
 * run the step's actions.
 *
 * Action execution follows `handleComputerBatch`: `prepareForAction` runs
 * ONCE per step (clicking Next is consent for that step's whole sequence),
 * pixel validation is off (the sequence is committed), the frontmost gate
 * still applies per action, and execution stops at the first error with
 * partial results.
 *
 * A step with no actions is valid ("read this, click Next to continue").
 * The caller must have verified that `overrides.onTeachStep` is set.
 *
 * @param step Validated step to present and execute.
 * @param adapter Host adapter used to drive the executor.
 * @param overrides Session callbacks and state.
 * @param subGates Feature gates; a per-step copy disables hide-before-action,
 *   pixel validation, and auto display targeting for the individual actions.
 * @returns `exit`, `ok` with per-action results, or `action_error`.
 */
async function executeTeachStep(
  step: ValidatedTeachStep,
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<TeachStepOutcome> {
  // Block until Next or Exit: the host stores the resolver and the overlay
  // IPC fires it. `!` is safe because both callers guard on
  // overrides.onTeachStep before reaching here.
  const userChoice = await overrides.onTeachStep!({
    explanation: step.explanation,
    nextPreview: step.nextPreview,
    anchorLogical: step.anchorLogical,
  });

  if (userChoice.action === "exit") {
    // The host's Exit handler also calls stopSession, so the turn is already
    // unwinding; the caller decides what to return for the transcript.
    // A PREVIOUS step's left_mouse_down may have left the OS button held.
    await releaseHeldMouse(adapter);
    return { kind: "exit" };
  }

  // Next clicked. Flip the overlay to a spinner before we start driving.
  overrides.onTeachWorking?.();

  const pendingActions = step.actions;
  if (pendingActions.length === 0) {
    return { kind: "ok", results: [] };
  }

  if (subGates.hideBeforeAction) {
    const hiddenApps = await adapter.executor.prepareForAction(
      overrides.allowedApps.map((app) => app.bundleId),
      overrides.selectedDisplayId,
    );
    if (hiddenApps.length > 0) {
      overrides.onAppsHidden?.(hiddenApps);
    }
  }

  const perActionGates: CuSubGates = {
    ...subGates,
    hideBeforeAction: false,
    pixelValidation: false,
    // Anchors are pre-computed against the display at batch start.
    // A mid-batch resolver switch would break tooltip positioning.
    autoTargetDisplay: false,
  };

  const results: BatchActionResult[] = [];
  let index = 0;
  for (const record of pendingActions) {
    // Exit calls stopSession, so an abort seen here IS the exit path — just
    // caught mid-dispatch instead of at the onTeachStep await above.
    if (overrides.isAborted?.()) {
      await releaseHeldMouse(adapter);
      return { kind: "exit" };
    }

    // Brief settle between consecutive actions (none before the first).
    if (index > 0) {
      await sleep(10);
    }

    const action = record.action as string;
    // Drop any screenshot piggybacked on the action result: click coords
    // stay anchored to the screenshot the model took BEFORE calling
    // teach_step/teach_batch.
    const dispatched = await dispatchAction(
      action,
      record,
      adapter,
      overrides,
      perActionGates,
    );
    const { screenshot: _discarded, ...inner } = dispatched;

    const entry = {
      action,
      ok: !inner.isError,
      output: firstTextContent(inner),
    };
    results.push(entry);

    if (inner.isError) {
      await releaseHeldMouse(adapter);
      return {
        kind: "action_error",
        executed: results.length - 1,
        failed: entry,
        remaining: pendingActions.length - results.length,
        telemetry: inner.telemetry,
      };
    }

    index += 1;
  }

  return { kind: "ok", results };
}
/**
 * Attach a fresh screenshot to a teach result.
 *
 * This saves the separate screenshot tool call the model would otherwise
 * make before its next teach_step (one fewer API round trip per step).
 * `handleScreenshot` runs its own prepareForAction, which is intended: the
 * actions may have opened something outside the allowlist. The returned
 * `.screenshot` is what serverDef.ts stashes as `lastScreenshot`, so the
 * next teach_step anchor is scaled against the image the model is now
 * looking at.
 *
 * @param resultJson JSON-serialisable step/batch summary.
 * @param adapter Host adapter passed through to `handleScreenshot`.
 * @param overrides Session state passed through to `handleScreenshot`.
 * @param subGates Feature gates passed through to `handleScreenshot`.
 * @returns The summary text followed by the screenshot content, or just the
 *   summary when capturing failed.
 */
async function appendTeachScreenshot(
  resultJson: unknown,
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  const capture = await handleScreenshot(adapter, overrides, subGates);

  // Hide+screenshot failed (rare — e.g. SCContentFilter error). Don't tank
  // the step; omit the image. The model will call screenshot itself and see
  // the real error.
  if (capture.isError) {
    return okJson(resultJson);
  }

  const summary = { type: "text" as const, text: JSON.stringify(resultJson) };
  return {
    // handleScreenshot's content is [maybeMonitorNote, maybeHiddenNote,
    // image]; all of it is forwarded because the notes are useful context
    // alongside the image.
    content: [summary, ...capture.content],
    // For serverDef.ts to stash; the next teach_step anchor scales against it.
    screenshot: capture.screenshot,
  };
}
/**
 * Tool handler for `teach_step`: show a single guided-tour tooltip, wait
 * for Next or Exit, and on Next run the step's `actions[]` with
 * `computer_batch` semantics.
 *
 * @param adapter Host adapter.
 * @param args Raw step arguments (validated by `validateTeachStepArgs`).
 * @param overrides Session callbacks and state; `onTeachStep` must be set.
 * @param subGates Feature gates.
 * @returns `{exited: true}` on Exit; `{executed, failed, remaining}` on an
 *   action failure; otherwise `{executed, results}`, with a fresh
 *   screenshot appended when at least one action ran.
 */
async function handleTeachStep(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  if (!overrides.onTeachStep) {
    return errorResult(
      "Teach mode is not active. Call request_teach_access first.",
      "teach_mode_not_active",
    );
  }

  const validated = await validateTeachStepArgs(
    args,
    adapter,
    overrides,
    "teach_step",
  );
  if (validated instanceof Error) {
    return errorResult(validated.message, "bad_args");
  }

  const outcome = await executeTeachStep(
    validated,
    adapter,
    overrides,
    subGates,
  );

  if (outcome.kind === "ok") {
    // No screenshot for empty actions — the screen didn't change, so the
    // model's existing screenshot is still accurate.
    if (validated.actions.length === 0) {
      return okJson({ executed: 0, results: [] });
    }
    const ran = outcome.results;
    return appendTeachScreenshot(
      { executed: ran.length, results: ran },
      adapter,
      overrides,
      subGates,
    );
  }

  if (outcome.kind === "exit") {
    return okJson({ exited: true });
  }

  // Remaining case: action_error. Forward the inner action's telemetry.
  const { executed, failed, remaining, telemetry } = outcome;
  return okJson({ executed, failed, remaining }, telemetry);
}
/**
 * Tool handler for `teach_batch`: queue an entire guided tour in one call.
 * Like `computer_batch`, N steps cost one model→API round trip instead of
 * N. The user still paces the tour — each step blocks on its own Next
 * click — but the model does not wait between steps.
 *
 * ALL steps are validated before any tooltip is shown, so a typo in step 5
 * does not surface after the user has clicked through steps 1–4.
 *
 * Every anchor is scaled against the pre-call `lastScreenshot` (the same
 * PRE-BATCH invariant as computer_batch). Steps 2+ should either omit the
 * anchor (centered tooltip) or target elements the model predicts will not
 * have moved.
 *
 * Result shape:
 *   {exited: true, stepsCompleted: N}                   — user clicked Exit
 *   {stepsCompleted, stepFailed, executed, failed, …}   — action error
 *   {stepsCompleted, results: [...]} + screenshot       — all steps ran
 *
 * @param adapter Host adapter.
 * @param args Raw tool arguments; reads `steps` (non-empty array).
 * @param overrides Session callbacks and state; `onTeachStep` must be set.
 * @param subGates Feature gates.
 */
async function handleTeachBatch(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  if (!overrides.onTeachStep) {
    return errorResult(
      "Teach mode is not active. Call request_teach_access first.",
      "teach_mode_not_active",
    );
  }

  const rawSteps = args.steps;
  if (!Array.isArray(rawSteps) || rawSteps.length < 1) {
    return errorResult('"steps" must be a non-empty array.', "bad_args");
  }

  // Phase 1: validate everything — fail fast before showing any tooltip.
  const steps: ValidatedTeachStep[] = [];
  for (let i = 0; i < rawSteps.length; i += 1) {
    const candidate: unknown = rawSteps[i];
    if (candidate === null || typeof candidate !== "object") {
      return errorResult(`steps[${i}] must be an object`, "bad_args");
    }
    const checked = await validateTeachStepArgs(
      candidate as Record<string, unknown>,
      adapter,
      overrides,
      `steps[${i}]`,
    );
    if (checked instanceof Error) {
      return errorResult(checked.message, "bad_args");
    }
    steps.push(checked);
  }

  // Phase 2: run the steps in order, stopping on Exit or the first failure.
  const perStepResults: BatchActionResult[][] = [];
  let completed = 0;
  for (const step of steps) {
    const outcome = await executeTeachStep(step, adapter, overrides, subGates);

    if (outcome.kind === "exit") {
      return okJson({ exited: true, stepsCompleted: completed });
    }

    if (outcome.kind === "action_error") {
      // `results` holds only the steps that finished cleanly; the failing
      // step is described by executed/failed/remaining.
      return okJson(
        {
          stepsCompleted: completed,
          stepFailed: completed,
          executed: outcome.executed,
          failed: outcome.failed,
          remaining: outcome.remaining,
          results: perStepResults,
        },
        outcome.telemetry,
      );
    }

    perStepResults.push(outcome.results);
    completed += 1;
  }

  const resultJson = { stepsCompleted: steps.length, results: perStepResults };

  // Final screenshot only if some step ran actions (the screen changed).
  const anyActionsRan = steps.some((step) => step.actions.length > 0);
  if (anyActionsRan) {
    return appendTeachScreenshot(resultJson, adapter, overrides, subGates);
  }
  return okJson(resultJson);
}
/**
 * Produce the note about hidden apps that travels with a screenshot. It
 * names the apps that were hidden (because they are not in the session
 * allowlist) and tells the model to use request_access to add them.
 *
 * @param adapter Host adapter; `executor.listRunningApps()` supplies the
 *   bundle-id → display-name mapping.
 * @param hiddenSinceLastSeen Bundle ids hidden since the last screenshot.
 * @returns The note, or `undefined` when the list is empty. An id with no
 *   matching running app is shown as the raw bundle id.
 */
async function buildHiddenNote(
  adapter: ComputerUseHostAdapter,
  hiddenSinceLastSeen: string[],
): Promise<string | undefined> {
  if (hiddenSinceLastSeen.length === 0) {
    return undefined;
  }

  const displayNameById = new Map<string, string>();
  for (const app of await adapter.executor.listRunningApps()) {
    displayNameById.set(app.bundleId, app.displayName);
  }

  const quoted = hiddenSinceLastSeen.map(
    (bundleId) => `"${displayNameById.get(bundleId) ?? bundleId}"`,
  );
  const singular = quoted.length === 1;
  const verb = singular ? "was" : "were";
  const opened = singular ? "it" : "one of them";
  const toAdd = singular ? "it" : "them";

  return [
    `${quoted.join(", ")} ${verb} open and got hidden before this screenshot `,
    `(not in the session allowlist). If a previous action was meant to open `,
    `${opened}, that's why you don't see it — call `,
    `request_access to add ${toAdd} to the allowlist.`,
  ].join("");
}
/**
 * Assign a human-readable, unique label to each display.
 *
 * A display without a `label` falls back to `display N` (N = displayId).
 * Displays sharing a label (e.g. matched-pair external monitors) are
 * disambiguated with a ` (2)`, ` (3)`, … suffix. Both buildMonitorNote and
 * handleSwitchDisplay use this, so the name the model sees in a screenshot
 * note is the same name it can pass back to switch_display.
 *
 * Labels are guaranteed distinct: if a generated suffix would clash with a
 * label that is already taken (for instance two monitors named "A" plus a
 * third whose real name is "A (2)"), the suffix is bumped until the label
 * is free. Without this, two displays could receive the same label and the
 * name could no longer identify a single display.
 *
 * @param displays Attached displays; the input array is not mutated.
 * @returns Map from displayId to its unique label, inserted in ascending
 *   displayId order.
 */
function uniqueDisplayLabels(
  displays: readonly DisplayGeometry[],
): Map<number, string> {
  // Sort by displayId so the (N) suffix is stable regardless of the order
  // the displays were enumerated in — the same label maps to the same
  // physical display across the buildMonitorNote → switch_display
  // round-trip, even if the enumeration order changes between the calls.
  const sorted = [...displays].sort((a, b) => a.displayId - b.displayId);
  const counts = new Map<string, number>();
  const taken = new Set<string>();
  const out = new Map<number, string>();
  for (const d of sorted) {
    const base = d.label ?? `display ${d.displayId}`;
    let n = (counts.get(base) ?? 0) + 1;
    let label = n === 1 ? base : `${base} (${n})`;
    // Skip past any label already handed out (a suffixed label can clash
    // with another display's literal name).
    while (taken.has(label)) {
      n += 1;
      label = `${base} (${n})`;
    }
    counts.set(base, n);
    taken.add(label);
    out.set(d.displayId, label);
  }
  return out;
}
/**
 * Produce the monitor-context note that travels with a screenshot. It names
 * the monitor the screenshot came from, lists the other attached monitors,
 * and points out when the monitor differs from the previous screenshot's.
 *
 * A note is produced only when 2+ displays are attached AND this is either
 * the first screenshot or the display changed. Single-monitor setups and
 * repeated screenshots of the same monitor get no note, to avoid noise.
 *
 * @param adapter Host adapter; `executor.listDisplays()` supplies displays.
 * @param shotDisplayId Display the current screenshot was captured on.
 * @param lastDisplayId Display of the previous screenshot; `undefined` or
 *   `0` means there is no usable previous display.
 * @param canSwitchDisplay Whether to append the switch_display hint.
 * @returns The note text, or `undefined` when no note applies or the
 *   display list could not be read.
 */
async function buildMonitorNote(
  adapter: ComputerUseHostAdapter,
  shotDisplayId: number,
  lastDisplayId: number | undefined,
  canSwitchDisplay: boolean,
): Promise<string | undefined> {
  // A listDisplays failure (e.g. zero screens reported during monitor
  // hot-unplug) must not tank the screenshot — this note is optional.
  let attached;
  try {
    attached = await adapter.executor.listDisplays();
  } catch (err) {
    adapter.logger.warn(`[computer-use] listDisplays failed: ${String(err)}`);
    return undefined;
  }
  if (attached.length < 2) {
    return undefined;
  }

  const labelById = uniqueDisplayLabels(attached);
  const describe = (id: number): string =>
    labelById.get(id) ?? `display ${id}`;

  const currentName = describe(shotDisplayId);

  // Sentence listing every other monitor, plus the optional switch hint.
  const otherNames: string[] = [];
  for (const display of attached) {
    if (display.displayId !== shotDisplayId) {
      otherNames.push(`"${describe(display.displayId)}"`);
    }
  }
  let othersSentence = "";
  if (otherNames.length > 0) {
    othersSentence = ` Other attached monitors: ${otherNames.join(", ")}.`;
    if (canSwitchDisplay) {
      othersSentence += " Use switch_display to capture a different monitor.";
    }
  }

  // 0 is kCGNullDirectDisplay (sentinel from old sessions persisted
  // pre-multimon) — treat it the same as undefined.
  if (lastDisplayId === undefined || lastDisplayId === 0) {
    return `This screenshot was taken on monitor "${currentName}".` + othersSentence;
  }

  // Same monitor as last time: nothing worth saying.
  if (lastDisplayId === shotDisplayId) {
    return undefined;
  }

  const previousName = describe(lastDisplayId);
  return (
    `This screenshot was taken on monitor "${currentName}", which is different ` +
    `from your previous screenshot (taken on "${previousName}").` +
    othersSentence
  );
}
/**
 * Tool handler for `screenshot`.
 *
 * Two capture paths, selected by `subGates.autoTargetDisplay`:
 *  - Atomic: one `resolvePrepareCapture` executor call that resolves the
 *    display, optionally hides non-allowlisted apps, and captures.
 *  - Separate calls: `prepareForAction` (when `hideBeforeAction` is on)
 *    followed by `takeScreenshotWithRetry`.
 *
 * Both paths return optional monitor and hidden-app notes followed by the
 * JPEG image, and piggyback the `ScreenshotResult` on `.screenshot` for
 * serverDef.ts to stash.
 *
 * @param adapter Host adapter (executor, logger).
 * @param overrides Session state and callbacks.
 * @param subGates Feature gates (`autoTargetDisplay`, `hideBeforeAction`).
 * @returns Screenshot content, or an error result when the allowlist is
 *   empty or the atomic capture reported an error.
 */
async function handleScreenshot(
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  // §2 — empty allowlist → tool error, no screenshot.
  if (overrides.allowedApps.length === 0) {
    return errorResult(
      "No applications are granted for this session. Call request_access first.",
      "allowlist_empty",
    );
  }

  // Read the allowlist at each point of use rather than caching it, so the
  // ids always reflect `overrides.allowedApps` at that moment.
  const currentAllowedIds = (): string[] =>
    overrides.allowedApps.map((app) => app.bundleId);

  // Shared tail of both paths: build the notes (hidden first, then monitor)
  // and assemble [monitorNote?, hiddenNote?, image] plus the piggyback.
  const assemble = async (
    shot: ScreenshotResult,
    hiddenIds: string[],
  ): Promise<CuCallToolResult> => {
    const hiddenNote = await buildHiddenNote(adapter, hiddenIds);
    const monitorNote = await buildMonitorNote(
      adapter,
      shot.displayId,
      overrides.lastScreenshot?.displayId,
      overrides.onDisplayPinned !== undefined,
    );
    const notes = [monitorNote, hiddenNote]
      .filter((note): note is string => Boolean(note))
      .map((text) => ({ type: "text" as const, text }));
    return {
      content: [
        ...notes,
        {
          type: "image",
          data: shot.base64,
          mimeType: "image/jpeg",
        },
      ],
      // Piggybacked for serverDef.ts to stash on InternalServerContext.
      screenshot: shot,
    };
  };

  // ---- Atomic resolve→prepare→capture (one executor call) ----------------
  if (subGates.autoTargetDisplay) {
    // The model's explicit switch_display pin overrides everything.
    // Otherwise the display is sticky: auto-resolve only when the
    // allowed-app set changed since the display was last resolved, so the
    // resolver doesn't yank the display on every screenshot.
    const allowedBundleIds = currentAllowedIds();
    const currentAppSetKey = [...allowedBundleIds].sort().join(",");
    const appSetChanged = currentAppSetKey !== overrides.displayResolvedForApps;
    const autoResolve = !overrides.displayPinnedByModel && appSetChanged;

    const result = await adapter.executor.resolvePrepareCapture({
      allowedBundleIds,
      preferredDisplayId: overrides.selectedDisplayId,
      autoResolve,
      // Keeps the hideBeforeAction sub-gate independently rollable: the
      // atomic path honors the same toggle the non-atomic path checks.
      doHide: subGates.hideBeforeAction,
    });

    // The non-atomic path retries small captures; the atomic call is
    // expensive, so here an implausibly small image only gets a warning.
    // Skipped when captureError is set (base64 is intentionally empty then).
    if (result.captureError === undefined) {
      const decodedBytes = decodedByteLength(result.base64);
      if (decodedBytes < MIN_SCREENSHOT_BYTES) {
        adapter.logger.warn(
          `[computer-use] resolvePrepareCapture result implausibly small (${decodedBytes} bytes decoded) — possible transient display state`,
        );
      }
    }

    // The resolver picked a different display than the session had
    // selected. Write the pick back so teach overlay positioning and later
    // non-resolver calls track the same display. Fire-and-forget.
    if (result.displayId !== overrides.selectedDisplayId) {
      adapter.logger.debug(
        `[computer-use] resolver: preferred=${overrides.selectedDisplayId} resolved=${result.displayId}`,
      );
      overrides.onResolvedDisplayUpdated?.(result.displayId);
    }

    // Record the app set this display was resolved for, so the next
    // screenshot skips auto-resolve until the set changes. Gated on
    // autoResolve (not just appSetChanged): when pinned nothing was
    // actually resolved, so the key must not move.
    if (autoResolve) {
      overrides.onDisplayResolvedForApps?.(currentAppSetKey);
    }

    // Report hidden apps only when the model has already seen the screen.
    const hiddenSinceLastSeen: string[] =
      overrides.lastScreenshot !== undefined ? result.hidden : [];
    if (result.hidden.length > 0) {
      overrides.onAppsHidden?.(result.hidden);
    }

    // Partial success: hide worked but capture failed. onAppsHidden has
    // already fired above, so the hidden apps are on record; now surface
    // the error to the model.
    if (result.captureError !== undefined) {
      return errorResult(result.captureError, "capture_failed");
    }

    // Cherry-pick fields — spreading `result` would leak resolver fields
    // into lastScreenshot.
    const { base64, width, height, displayWidth, displayHeight } = result;
    const { displayId, originX, originY } = result;
    const atomicShot: ScreenshotResult = {
      base64,
      width,
      height,
      displayWidth,
      displayHeight,
      displayId,
      originX,
      originY,
    };
    return assemble(atomicShot, hiddenSinceLastSeen);
  }

  // ---- Separate-calls path -----------------------------------------------
  // Same hide sequence as input actions. A screenshot needs it too: if a
  // non-allowlisted app is on top, the capture filter would composite it
  // out and show the pixels BELOW it, which is not what is actually on
  // screen. Hiding first makes the screenshot truthful.
  let hiddenSinceLastSeen: string[] = [];
  if (subGates.hideBeforeAction) {
    const hidden = await adapter.executor.prepareForAction(
      currentAllowedIds(),
      overrides.selectedDisplayId,
    );
    // Report "something appeared since the model last looked" whenever
    // prepare hid something AND the model has ALREADY seen the screen
    // (lastScreenshot is set). The second condition silences the first
    // screenshot's expected hide.
    //
    // This is deliberately NOT a delta against a cumulative hidden set: a
    // cumulative set only grows, so a re-hide of an app already in it
    // would delta to zero and a "click → file opens in Preview → Preview
    // hidden" loop would never be explained to the model. With this check
    // every re-hide is reported.
    //
    // Known false positive: the user alt-tabs mid-turn and that app gets
    // re-hidden and reported. Mild noise, preferable to never explaining
    // why a window vanished.
    if (overrides.lastScreenshot !== undefined) {
      hiddenSinceLastSeen = hidden;
    }
    if (hidden.length > 0) {
      overrides.onAppsHidden?.(hidden);
    }
  }

  const shot = await takeScreenshotWithRetry(
    adapter.executor,
    currentAllowedIds(),
    adapter.logger,
    overrides.selectedDisplayId,
  );
  return assemble(shot, hiddenSinceLastSeen);
}
/**
 * Returns an upscaled crop of one region of the most recent screenshot.
 *
 * Coordinate invariant (computer_use_v2.py:1092): click coordinates always
 * refer to the full-screen screenshot, never to a zoomed image. The invariant
 * is structural: the result built here has NO `.screenshot` field, so
 * serverDef.ts's `if (result.screenshot)` branch cannot fire and
 * `cuLastScreenshot` is never touched. `executor.zoom()`'s return type also
 * lacks displayWidth/displayHeight, so it is not assignable to
 * `ScreenshotResult` even by accident.
 *
 * @param adapter Host adapter; its executor performs the crop.
 * @param args Tool arguments. `region` must be `[x0, y0, x1, y1]` expressed
 *   in image pixels of the last screenshot.
 * @param overrides Session state; `lastScreenshot` and `allowedApps` are read.
 * @returns An image-only result, a "bad_args" error for a malformed or
 *   out-of-bounds region, or a "state_conflict" error when no screenshot has
 *   been taken yet.
 */
async function handleZoom(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
): Promise<CuCallToolResult> {
  // The region lives in the same image-pixel space the model reads click
  // coordinates from.
  const { region } = args;
  if (!Array.isArray(region) || region.length !== 4) {
    return errorResult(
      "region must be an array of length 4: [x0, y0, x1, y1]",
      "bad_args",
    );
  }

  const [left, top, right, bottom] = region;
  const corners = [left, top, right, bottom];
  const allNonNegativeNumbers = corners.every(
    (value) => typeof value === "number" && value >= 0,
  );
  if (!allNonNegativeNumbers) {
    return errorResult(
      "region values must be non-negative numbers",
      "bad_args",
    );
  }
  if (right <= left) {
    return errorResult("region x1 must be greater than x0", "bad_args");
  }
  if (bottom <= top) {
    return errorResult("region y1 must be greater than y0", "bad_args");
  }

  const shot = overrides.lastScreenshot;
  if (!shot) {
    return errorResult(
      "take a screenshot before zooming (region coords are relative to it)",
      "state_conflict",
    );
  }

  const overflowsShot = right > shot.width || bottom > shot.height;
  if (overflowsShot) {
    return errorResult(
      `region exceeds screenshot bounds (${shot.width}×${shot.height})`,
      "bad_args",
    );
  }

  // image-px → logical-pt, using the same folded ratio as scaleCoord:
  // displayWidth / width rather than 1 / scaleFactor.
  const scaleX = shot.displayWidth / shot.width;
  const scaleY = shot.displayHeight / shot.height;
  const logicalRegion = {
    x: left * scaleX,
    y: top * scaleY,
    w: (right - left) * scaleX,
    h: (bottom - top) * scaleY,
  };
  const allowedBundleIds = overrides.allowedApps.map((app) => app.bundleId);

  // Crop from the display the last screenshot came from, so the zoomed area
  // lines up with the image the model took its coordinates from.
  const crop = await adapter.executor.zoom(
    logicalRegion,
    allowedBundleIds,
    shot.displayId,
  );

  // Image only. Deliberately NO `.screenshot` piggyback: that is the invariant.
  return {
    content: [{ type: "image", data: crop.base64, mimeType: "image/jpeg" }],
  };
}
/**
 * Shared handler for all five click variants.
 *
 * Sequence: release any stale held button, parse the coordinate and optional
 * modifier chord, run the input-action gates, optionally pixel-validate the
 * target against the last screenshot, scale to logical coordinates, hit-test
 * the destination, then click.
 *
 * @param adapter Host adapter (executor, logger, cropRawPatch).
 * @param args Tool arguments: `coordinate`, plus optional `text` naming a
 *   modifier chord to hold during the click (e.g. "shift").
 * @param overrides Session state (grants, coordinate mode, last screenshot).
 * @param subGates Feature sub-gates; `pixelValidation` is read here.
 * @param button Mouse button to press.
 * @param count Number of clicks (1, 2 or 3).
 * @returns "Clicked." on success, a staleness warning from pixel validation,
 *   or the first error/gate rejection encountered.
 */
async function handleClickVariant(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
  button: "left" | "right" | "middle",
  count: 1 | 2 | 3,
): Promise<CuCallToolResult> {
  // A left_mouse_down without a matching left_mouse_up (for example a drag
  // rejected by a tier gate, after which the model falls back to left_click)
  // leaves mouseButtonHeld set. executor.click() presses and releases by
  // itself, which frees the OS button, but the JS flag would stay true and
  // every later mouse_move would take the held-button path ("mouse"/
  // "mouse_full" actionKind + hit-test), causing spurious rejections on
  // click-tier and read-tier windows. Release up front for a clean slate.
  if (mouseButtonHeld) {
    await adapter.executor.mouseUp();
    mouseButtonHeld = false;
    mouseMoved = false;
  }

  const point = extractCoordinate(args);
  if (point instanceof Error) {
    return errorResult(point.message, "bad_args");
  }
  const [rawX, rawY] = point;

  // left_click(coordinate=[x,y], text="shift"): modifiers held for the
  // duration of the click, parsed the same way as the key tool's chord.
  const chordText = args.text;
  let modifiers: string[] | undefined;
  if (chordText !== undefined) {
    if (typeof chordText !== "string") {
      return errorResult("text must be a string", "bad_args");
    }
    // Same gate as handleKey/handleHoldKey. withModifiers presses each name
    // via native.key(m, "press"), so a non-modifier such as "q" in
    // text="cmd+q" is pressed while Cmd is down and Cmd+Q fires before the
    // click ever happens.
    if (
      isSystemKeyCombo(chordText, adapter.executor.capabilities.platform) &&
      !overrides.grantFlags.systemKeyCombos
    ) {
      return errorResult(
        `The modifier chord "${chordText}" would fire a system shortcut. ` +
          "Request the systemKeyCombos grant flag via request_access, or use " +
          "only modifier keys (shift, ctrl, alt, cmd) in the text parameter.",
        "grant_flag_required",
      );
    }
    modifiers = parseKeyChord(chordText);
  }

  // Right/middle clicks, and any click carrying a modifier chord, count as
  // keyboard-equivalent input at tier "click" (context-menu Paste, chord
  // keystrokes). Decided once and handed to both gates.
  const holdsModifiers = (modifiers?.length ?? 0) > 0;
  let clickActionKind: CuActionKind = "mouse";
  if (button !== "left" || holdsModifiers) {
    clickActionKind = "mouse_full";
  }

  const gateRejection = await runInputActionGates(
    adapter,
    overrides,
    subGates,
    clickActionKind,
  );
  if (gateRejection) {
    return gateRejection;
  }

  const display = await adapter.executor.getDisplaySize(
    overrides.selectedDisplayId,
  );

  // §6 item P: pixel-validation staleness check, sub-gated. It runs after
  // the gates (nothing to validate if the action is refused anyway) and
  // before the executor is asked to click.
  if (subGates.pixelValidation) {
    const { xPct, yPct } = coordToPercentageForPixelCompare(
      rawX,
      rawY,
      overrides.coordinateMode,
      overrides.lastScreenshot,
    );

    // The comparison shot uses the SAME allow-set and the SAME display as
    // the model's last screenshot (not the current selection), so like is
    // compared with like. A failed capture is reported as null.
    const captureFreshShot = async () => {
      const allowedIds = overrides.allowedApps.map((app) => app.bundleId);
      try {
        return await adapter.executor.screenshot({
          allowedBundleIds: allowedIds,
          displayId: overrides.lastScreenshot?.displayId,
        });
      } catch {
        return null;
      }
    };

    const validation = await validateClickTarget(
      adapter.cropRawPatch,
      overrides.lastScreenshot,
      xPct,
      yPct,
      captureFreshShot,
      adapter.logger,
    );
    if (!validation.valid && validation.warning) {
      // Warning result: the model is told to take a new screenshot.
      return okText(validation.warning);
    }
  }

  const { x, y } = scaleCoord(
    rawX,
    rawY,
    overrides.coordinateMode,
    display,
    overrides.lastScreenshot,
    adapter.logger,
  );

  const hitRejection = await runHitTestGate(
    adapter,
    overrides,
    subGates,
    x,
    y,
    clickActionKind,
  );
  if (hitRejection) {
    return hitRejection;
  }

  await adapter.executor.click(x, y, button, count, modifiers);
  return okText("Clicked.");
}
/**
 * Types `args.text`, either in one clipboard paste or grapheme by grapheme.
 *
 * @param adapter Host adapter; `executor.type` and `executor.key` are used.
 * @param args Tool arguments; `text` is required.
 * @param overrides Session state; `grantFlags.clipboardWrite` and the
 *   optional `isAborted` callback are read.
 * @param subGates Feature sub-gates; `clipboardPasteMultiline` is read.
 * @returns A success text result, a "bad_args" error, a gate rejection, or an
 *   abort error reporting how many graphemes were typed before the interrupt.
 */
async function handleType(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  const text = requireString(args, "text");
  if (text instanceof Error) {
    return errorResult(text.message, "bad_args");
  }

  const gateRejection = await runInputActionGates(
    adapter,
    overrides,
    subGates,
    "keyboard",
  );
  if (gateRejection) {
    return gateRejection;
  }

  // §6 item 3: clipboard-paste fast path for multi-line text. It needs both
  // the sub-gate and the clipboardWrite grant. Save/restore and the
  // read-back verification belong to the EXECUTOR (task #5); this function
  // only picks the route.
  const pasteWholeText =
    text.includes("\n") &&
    overrides.grantFlags.clipboardWrite &&
    subGates.clipboardPasteMultiline;
  if (pasteWholeText) {
    await adapter.executor.type(text, { viaClipboard: true });
    return okText("Typed (via clipboard).");
  }

  // §6 item 7: iterate grapheme clusters so ZWJ emoji are not broken into �.
  // §6 item 4: 8ms between graphemes (125 Hz USB polling); the sleep comes
  // BEFORE each keystroke, not after.
  //
  // \n, \r and \t MUST go through executor.key(), never type():
  //   1. enigo.text("\n") on macOS strips the newline and then posts a stale
  //      CGEvent with virtualKey=0, which is the 'a' key, so a ghost 'a' is
  //      typed (upstream bug in enigo 0.6.1 fast_text()).
  //   2. Inserting '\n' as Unicode text is not a Return key press. URL bars
  //      and terminals ignore it and the submit/execute intent is lost.
  // CRLF (\r\n) is a single grapheme cluster (UAX #29 GB3), hence its own case.
  const clusters = segmentGraphemes(text);
  for (const [position, cluster] of clusters.entries()) {
    // Same abort check as handleComputerBatch. At 8ms per grapheme a 50-char
    // type() takes ~400ms, so this loop is where an in-flight batch spends
    // its time.
    if (overrides.isAborted?.()) {
      return errorResult(
        `Typing aborted after ${position} of ${clusters.length} graphemes (user interrupt).`,
      );
    }

    await sleep(INTER_GRAPHEME_SLEEP_MS);

    switch (cluster) {
      case "\n":
      case "\r":
      case "\r\n":
        await adapter.executor.key("return");
        break;
      case "\t":
        await adapter.executor.key("tab");
        break;
      default:
        await adapter.executor.type(cluster, { viaClipboard: false });
        break;
    }
  }

  return okText(`Typed ${clusters.length} grapheme(s).`);
}
/**
 * Presses a key or key chord, optionally repeated.
 *
 * @param adapter Host adapter; `executor.key` performs the press.
 * @param args Tool arguments: `text` (the key sequence, required) and an
 *   optional `repeat` (positive integer, at most 100).
 * @param overrides Session state; `grantFlags.systemKeyCombos` is read.
 * @param subGates Feature sub-gates forwarded to the input-action gates.
 * @returns "Key pressed." on success, otherwise a "bad_args" or
 *   "grant_flag_required" error, or a gate rejection.
 */
async function handleKey(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  const keySequence = requireString(args, "text");
  if (keySequence instanceof Error) {
    return errorResult("text is required", "bad_args");
  }

  // repeat is capped at 100. The original note says the error strings
  // "match", presumably a reference implementation, so they stay verbatim.
  const requestedRepeat = args.repeat;
  let repeat: number | undefined;
  if (requestedRepeat !== undefined) {
    if (
      typeof requestedRepeat !== "number" ||
      !Number.isInteger(requestedRepeat) ||
      requestedRepeat < 1
    ) {
      return errorResult("repeat must be a positive integer", "bad_args");
    }
    if (requestedRepeat > 100) {
      return errorResult("repeat exceeds maximum of 100", "bad_args");
    }
    repeat = requestedRepeat;
  }

  // §2: the blocklist is consulted BEFORE the gates. When a blocked combo is
  // sent while an ungranted app is frontmost, the model should see the
  // blocklist error rather than the frontmost error, because the remedy is
  // requesting the flag, not changing focus.
  const isBlockedCombo =
    isSystemKeyCombo(keySequence, adapter.executor.capabilities.platform) &&
    !overrides.grantFlags.systemKeyCombos;
  if (isBlockedCombo) {
    return errorResult(
      `"${keySequence}" is a system-level shortcut. Request the \`systemKeyCombos\` grant via request_access to use it.`,
      "grant_flag_required",
    );
  }

  const gateRejection = await runInputActionGates(
    adapter,
    overrides,
    subGates,
    "keyboard",
  );
  if (gateRejection) {
    return gateRejection;
  }

  await adapter.executor.key(keySequence, repeat);
  return okText("Key pressed.");
}
/**
 * Scrolls at a coordinate in one of four directions.
 *
 * @param adapter Host adapter; `executor.scroll` performs the scroll.
 * @param args Tool arguments: `coordinate`, `scroll_direction`
 *   ('up' | 'down' | 'left' | 'right') and `scroll_amount` (integer 0..100).
 * @param overrides Session state (coordinate mode, display, last screenshot).
 * @param subGates Feature sub-gates forwarded to the gates.
 * @returns "Scrolled." on success, a "bad_args" error, or a gate rejection.
 */
async function handleScroll(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  const point = extractCoordinate(args);
  if (point instanceof Error) {
    return errorResult(point.message, "bad_args");
  }
  const [rawX, rawY] = point;

  // The tool speaks scroll_direction + scroll_amount; the executor wants a
  // dx/dy pair.
  const dir = args.scroll_direction;
  if (dir !== "up" && dir !== "down" && dir !== "left" && dir !== "right") {
    return errorResult(
      "scroll_direction must be 'up', 'down', 'left', or 'right'",
      "bad_args",
    );
  }

  const amount = args.scroll_amount;
  if (typeof amount !== "number" || !Number.isInteger(amount) || amount < 0) {
    return errorResult("scroll_amount must be a non-negative int", "bad_args");
  }
  if (amount > 100) {
    return errorResult("scroll_amount exceeds maximum of 100", "bad_args");
  }

  // up → dy = -amount; down → dy = +amount; left → dx = -amount;
  // right → dx = +amount. The other axis stays 0.
  let dx = 0;
  let dy = 0;
  switch (dir) {
    case "up":
      dy = -amount;
      break;
    case "down":
      dy = amount;
      break;
    case "left":
      dx = -amount;
      break;
    case "right":
      dx = amount;
      break;
  }

  const gateRejection = await runInputActionGates(
    adapter,
    overrides,
    subGates,
    "mouse",
  );
  if (gateRejection) {
    return gateRejection;
  }

  const display = await adapter.executor.getDisplaySize(
    overrides.selectedDisplayId,
  );
  const { x, y } = scaleCoord(
    rawX,
    rawY,
    overrides.coordinateMode,
    display,
    overrides.lastScreenshot,
    adapter.logger,
  );

  // With the button held, executor.scroll's internal moveMouse produces a
  // leftMouseDragged event (enigo reads NSEvent.pressedMouseButtons), the
  // same mechanism as handleMoveMouse's held-button path. The hit-test is
  // therefore raised to "mouse_full" so a scroll cannot drag-drop text onto
  // a click-tier terminal, and mouseMoved is set so the following
  // left_mouse_up is hit-tested as a drop rather than a click-release.
  const hitRejection = await runHitTestGate(
    adapter,
    overrides,
    subGates,
    x,
    y,
    mouseButtonHeld ? "mouse_full" : "mouse",
  );
  if (hitRejection) {
    return hitRejection;
  }

  if (mouseButtonHeld) {
    mouseMoved = true;
  }
  await adapter.executor.scroll(x, y, dx, dy);
  return okText("Scrolled.");
}
/**
 * Performs an atomic press-move-release drag.
 *
 * @param adapter Host adapter; `executor.drag` performs the drag.
 * @param args Tool arguments: `coordinate` is the END point (required);
 *   `start_coordinate` is optional, and when omitted the drag starts at the
 *   current cursor position.
 * @param overrides Session state (coordinate mode, display, last screenshot).
 * @param subGates Feature sub-gates forwarded to the gates.
 * @returns "Dragged." on success, a "bad_args" error, or a gate rejection.
 */
async function handleDrag(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  // executor.drag() presses and releases on its own. If an earlier
  // left_mouse_down left mouseButtonHeld=true, the flag would stay set across
  // the drag and drift away from the OS state (same mechanism as the clear
  // in handleClickVariant). Release first so drag() starts clean.
  if (mouseButtonHeld) {
    await adapter.executor.mouseUp();
    mouseButtonHeld = false;
    mouseMoved = false;
  }

  const rawTo = extractCoordinate(args, "coordinate");
  if (rawTo instanceof Error) {
    return errorResult(rawTo.message, "bad_args");
  }

  // Left undefined when start_coordinate is absent: the executor then drags
  // from wherever the cursor currently is.
  let rawFrom: [number, number] | undefined;
  if (args.start_coordinate !== undefined) {
    const parsedStart = extractCoordinate(args, "start_coordinate");
    if (parsedStart instanceof Error) {
      return errorResult(parsedStart.message, "bad_args");
    }
    rawFrom = parsedStart;
  }

  const gateRejection = await runInputActionGates(
    adapter,
    overrides,
    subGates,
    "mouse",
  );
  if (gateRejection) {
    return gateRejection;
  }

  const display = await adapter.executor.getDisplaySize(
    overrides.selectedDisplayId,
  );
  // Maps one raw endpoint into executor coordinates.
  const toExecutorPoint = (raw: [number, number]) =>
    scaleCoord(
      raw[0],
      raw[1],
      overrides.coordinateMode,
      display,
      overrides.lastScreenshot,
      adapter.logger,
    );
  const from = rawFrom === undefined ? undefined : toExecutorPoint(rawFrom);
  const to = toExecutorPoint(rawTo);

  // Both endpoints are hit-tested. `from` is where mouseDown lands (pick-up)
  // and `to` is where mouseUp lands (drop). Without start_coordinate the drag
  // begins at the cursor, which is the same bypass as mouse_move →
  // left_mouse_down, so the cursor is read and hit-tested (mirroring
  // handleLeftMouseDown).
  //
  // `to` is checked as "mouse_full" rather than "mouse": dropping text onto a
  // terminal inserts it as if typed (macOS text drag-drop), the same threat
  // as right-click→Paste. `from` stays "mouse" because picking up is a read.
  const pickupPoint = from ?? (await adapter.executor.getCursorPosition());
  const pickupRejection = await runHitTestGate(
    adapter,
    overrides,
    subGates,
    pickupPoint.x,
    pickupPoint.y,
    "mouse",
  );
  if (pickupRejection) {
    return pickupRejection;
  }

  const dropRejection = await runHitTestGate(
    adapter,
    overrides,
    subGates,
    to.x,
    to.y,
    "mouse_full",
  );
  if (dropRejection) {
    return dropRejection;
  }

  await adapter.executor.drag(from, to);
  return okText("Dragged.");
}
/**
 * Moves the cursor to a coordinate.
 *
 * With no button held this is pure positioning: it passes at any tier and is
 * not hit-tested (mouseDown/mouseUp hit-test the cursor themselves, which
 * closes the mouse_move → left_mouse_down decomposition). With the button
 * held, the move generates leftMouseDragged events on the window under the
 * cursor, so it is gated as interaction and the destination is hit-tested.
 *
 * @param adapter Host adapter; `executor.moveMouse` performs the move.
 * @param args Tool arguments; `coordinate` is required.
 * @param overrides Session state (coordinate mode, display, last screenshot).
 * @param subGates Feature sub-gates forwarded to the gates.
 * @returns "Moved." on success, a "bad_args" error, or a gate rejection.
 */
async function handleMoveMouse(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  const point = extractCoordinate(args);
  if (point instanceof Error) {
    return errorResult(point.message, "bad_args");
  }
  const [rawX, rawY] = point;

  // Held button → "mouse" (interaction); otherwise → "mouse_position".
  let actionKind: CuActionKind = "mouse_position";
  if (mouseButtonHeld) {
    actionKind = "mouse";
  }
  const gateRejection = await runInputActionGates(
    adapter,
    overrides,
    subGates,
    actionKind,
  );
  if (gateRejection) {
    return gateRejection;
  }

  const display = await adapter.executor.getDisplaySize(
    overrides.selectedDisplayId,
  );
  const { x, y } = scaleCoord(
    rawX,
    rawY,
    overrides.coordinateMode,
    display,
    overrides.lastScreenshot,
    adapter.logger,
  );

  if (mouseButtonHeld) {
    // "mouse_full", like left_click_drag's to-endpoint. Dragging onto a
    // click-tier terminal is text injection no matter which primitive
    // (atomic drag or decomposed down/move/up) delivers the events.
    const hitRejection = await runHitTestGate(
      adapter,
      overrides,
      subGates,
      x,
      y,
      "mouse_full",
    );
    if (hitRejection) {
      return hitRejection;
    }
  }

  await adapter.executor.moveMouse(x, y);
  if (mouseButtonHeld) {
    mouseMoved = true;
  }
  return okText("Moved.");
}
/**
 * Opens an application that is already on the session allowlist.
 *
 * @param adapter Host adapter; `executor.openApp` and `executor.listDisplays`
 *   are used.
 * @param args Tool arguments; `app` is a bundle ID or a display name.
 * @param overrides Session state; `allowedApps` and `onDisplayPinned` are read.
 * @returns A success text (with a switch_display hint when two or more
 *   displays are reported), a "bad_args" error, or an "app_not_granted" error.
 */
async function handleOpenApplication(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
): Promise<CuCallToolResult> {
  const app = requireString(args, "app");
  if (app instanceof Error) {
    return errorResult(app.message, "bad_args");
  }

  // Resolve display-name → bundle ID with the same logic as request_access.
  // A display name is matched ONLY against the allowlist itself. That keeps
  // listInstalledApps() off the hot path and is arguably more correct: if the
  // user granted "Slack", a request to open "Slack" should match THAT grant.
  const grantedIds = new Set(
    overrides.allowedApps.map((grant) => grant.bundleId),
  );
  const wantedName = app.toLowerCase();
  const targetBundleId =
    looksLikeBundleId(app) && grantedIds.has(app)
      ? app
      : overrides.allowedApps.find(
          (grant) => grant.displayName.toLowerCase() === wantedName,
        )?.bundleId;

  if (!targetBundleId || !grantedIds.has(targetBundleId)) {
    return errorResult(
      `"${app}" is not granted for this session. Call request_access first.`,
      "app_not_granted",
    );
  }

  // open_application is allowed at any tier: bringing an app forward is
  // exactly what tier "read" enables (it must be on screen to be
  // screenshotted). Follow-up interaction is caught by the tier gates on
  // click/type.
  await adapter.executor.openApp(targetBundleId);

  // On multi-monitor setups macOS may put the new window on a monitor the
  // resolver will not pick (e.g. Claude and another allowed app sit together
  // elsewhere). Point the model at switch_display BEFORE it wastes steps
  // clicking dock icons. A single monitor gets no hint, and a listDisplays
  // failure is non-fatal because the hint is advisory only.
  let hasSeveralDisplays = false;
  if (overrides.onDisplayPinned !== undefined) {
    try {
      const displays = await adapter.executor.listDisplays();
      hasSeveralDisplays = displays.length >= 2;
    } catch {
      // hint skipped
    }
  }

  if (hasSeveralDisplays) {
    return okText(
      `Opened "${app}". If it isn't visible in the next screenshot, it may ` +
        `have opened on a different monitor — use switch_display to check.`,
    );
  }
  return okText(`Opened "${app}".`);
}
/**
 * Pins screenshots and input to a named monitor, or returns to automatic
 * selection when `display` is "auto" (case-insensitive).
 *
 * @param adapter Host adapter; `executor.listDisplays` enumerates monitors.
 * @param args Tool arguments; `display` is a monitor label or "auto".
 * @param overrides Session state; `onDisplayPinned` receives the chosen
 *   display ID (or `undefined` for automatic selection).
 * @returns A confirmation text, or an error with code "bad_args",
 *   "feature_unavailable" or "display_error".
 */
async function handleSwitchDisplay(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
): Promise<CuCallToolResult> {
  const display = requireString(args, "display");
  if (display instanceof Error) {
    return errorResult(display.message, "bad_args");
  }
  if (!overrides.onDisplayPinned) {
    return errorResult(
      "Display switching is not available in this session.",
      "feature_unavailable",
    );
  }

  const wanted = display.toLowerCase();
  if (wanted === "auto") {
    overrides.onDisplayPinned(undefined);
    return okText(
      "Returned to automatic monitor selection. Call screenshot to continue.",
    );
  }

  // Labels are resolved to display IDs fresh on every call, from the same
  // source buildMonitorNote reads, so any name the model saw in a screenshot
  // note resolves here.
  let displays;
  try {
    displays = await adapter.executor.listDisplays();
  } catch (e) {
    return errorResult(
      `Failed to enumerate displays: ${String(e)}`,
      "display_error",
    );
  }
  if (displays.length < 2) {
    return errorResult(
      "Only one monitor is connected. There is nothing to switch to.",
      "bad_args",
    );
  }

  const labels = uniqueDisplayLabels(displays);
  const target = displays.find(
    (candidate) => labels.get(candidate.displayId)?.toLowerCase() === wanted,
  );
  if (target) {
    overrides.onDisplayPinned(target.displayId);
    return okText(
      `Switched to monitor "${labels.get(target.displayId)}". Call screenshot to see it.`,
    );
  }

  const available = displays
    .map((candidate) => `"${labels.get(candidate.displayId)}"`)
    .join(", ");
  return errorResult(
    `No monitor named "${display}" is connected. Available monitors: ${available}.`,
    "bad_args",
  );
}
/**
 * Reports the session's current grants as JSON.
 *
 * @param overrides Session state; `allowedApps` and `grantFlags` are read.
 * @returns A JSON result of the shape `{ allowedApps, grantFlags }`.
 */
function handleListGrantedApplications(
  overrides: ComputerUseOverrides,
): CuCallToolResult {
  const { allowedApps, grantFlags } = overrides;
  return okJson({ allowedApps, grantFlags });
}
/**
 * Reads the clipboard text, provided the `clipboardRead` grant is present.
 *
 * @param adapter Host adapter; `executor.getFrontmostApp` and
 *   `executor.readClipboard` are used.
 * @param overrides Session state; `grantFlags.clipboardRead` and
 *   `allowedApps` are read.
 * @param subGates Feature sub-gates; `clipboardGuard` is read.
 * @returns A JSON result `{ text }`, or a "grant_flag_required" error.
 */
async function handleReadClipboard(
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  if (!overrides.grantFlags.clipboardRead) {
    return errorResult(
      "Clipboard read is not granted. Request `clipboardRead` via request_access.",
      "grant_flag_required",
    );
  }

  // read_clipboard never passes through runInputActionGates, so the stash is
  // synced here. A read after clicking into a click-tier app then sees the
  // cleared clipboard, the same thing the app's own Paste would see.
  if (subGates.clipboardGuard) {
    const frontmost = await adapter.executor.getFrontmostApp();

    // Tier of the frontmost app, if it is on the allowlist. Should a bundle
    // ID appear more than once, the last entry decides.
    let frontmostIsClickTier = false;
    if (frontmost) {
      for (const grant of overrides.allowedApps) {
        if (grant.bundleId === frontmost.bundleId) {
          frontmostIsClickTier = grant.tier === "click";
        }
      }
    }

    await syncClipboardStash(adapter, overrides, frontmostIsClickTier);
  }

  // The guard may have stashed and cleared the clipboard, so the actual
  // (possibly empty) content is read: the agent sees what the app would see.
  const text = await adapter.executor.readClipboard();
  return okJson({ text });
}
/**
 * Writes text to the clipboard, provided the `clipboardWrite` grant is
 * present and no tier-"click" app is frontmost while the guard is active.
 *
 * @param adapter Host adapter; `executor.getFrontmostApp` and
 *   `executor.writeClipboard` are used.
 * @param args Tool arguments; `text` is required.
 * @param overrides Session state; `grantFlags.clipboardWrite` and
 *   `allowedApps` are read.
 * @param subGates Feature sub-gates; `clipboardGuard` is read.
 * @returns "Clipboard written." on success, otherwise an error with code
 *   "grant_flag_required", "bad_args" or "tier_insufficient".
 */
async function handleWriteClipboard(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  if (!overrides.grantFlags.clipboardWrite) {
    return errorResult(
      "Clipboard write is not granted. Request `clipboardWrite` via request_access.",
      "grant_flag_required",
    );
  }

  const text = requireString(args, "text");
  if (text instanceof Error) {
    return errorResult(text.message, "bad_args");
  }

  if (subGates.clipboardGuard) {
    const frontmost = await adapter.executor.getFrontmostApp();

    // Tier of the frontmost app, if it is on the allowlist. Should a bundle
    // ID appear more than once, the last entry decides.
    let frontmostIsClickTier = false;
    if (frontmost) {
      for (const grant of overrides.allowedApps) {
        if (grant.bundleId === frontmost.bundleId) {
          frontmostIsClickTier = grant.tier === "click";
        }
      }
    }

    // Defense-in-depth against the clipboardGuard bypass of write_clipboard
    // followed by left_click on a click-tier app's UI Paste button. The
    // re-clear in syncClipboardStash already defeats it (the next action
    // clobbers the write), but rejecting here gives the agent a clear signal
    // instead of silently voiding its write.
    if (frontmost && frontmostIsClickTier) {
      return errorResult(
        `"${frontmost.displayName}" is a tier-"click" app and currently ` +
          `frontmost. write_clipboard is blocked because the next action ` +
          `would clear the clipboard anyway — a UI Paste button in this ` +
          `app cannot be used to inject text. Bring a tier-"full" app ` +
          `forward before writing to the clipboard.` +
          TIER_ANTI_SUBVERSION,
        "tier_insufficient",
      );
    }

    // write_clipboard never passes through runInputActionGates, so the stash
    // is synced here: after clicking away from a click-tier app, the user's
    // stash is restored before the agent's text lands.
    await syncClipboardStash(adapter, overrides, frontmostIsClickTier);
  }

  await adapter.executor.writeClipboard(text);
  return okText("Clipboard written.");
}
/**
 * wait(duration=N): sleeps for N seconds, with N limited to at most 100.
 *
 * There is no frontmost gate because nothing is sent as input, so there is
 * nothing to protect. Kill-switch and TCC are checked in handleToolCall
 * before dispatch gets here.
 *
 * @param args Tool arguments; `duration` is a finite number of seconds in
 *   the range 0..100.
 * @returns A confirmation text, or a "bad_args" error for an invalid duration.
 */
async function handleWait(
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  const { duration } = args;

  const isFiniteNumber =
    typeof duration === "number" && Number.isFinite(duration);
  if (typeof duration !== "number" || !isFiniteNumber) {
    return errorResult("duration must be a number", "bad_args");
  }
  if (duration < 0) {
    return errorResult("duration must be non-negative", "bad_args");
  }
  if (duration > 100) {
    return errorResult(
      "duration is too long. Duration is in seconds.",
      "bad_args",
    );
  }

  const milliseconds = duration * 1000;
  await sleep(milliseconds);
  return okText(`Waited ${duration}s.`);
}
/**
 * Reports the current cursor position.
 *
 * The reference behavior is plain text "X=...,Y=..."; this returns richer
 * JSON with a `coordinateSpace` annotation, and the model handles both shapes.
 *
 * When `lastScreenshot` is present the result is the inverse of scaleCoord:
 * logical points → image pixels via
 * `imageX = logicalX × (screenshotWidth / displayWidth)`, using the
 * capture-time dimensions so the coordinates match what the model would read
 * off that screenshot.
 *
 * The captured display is treated as the half-open rectangle
 * `[0, displayWidth) × [0, displayHeight)` in display-relative points. A
 * cursor sitting exactly on the right or bottom edge is outside it, so it is
 * reported in logical points with a hint. Previously that edge was mapped to
 * image pixel `width`/`height`, one past the last valid pixel.
 *
 * No frontmost gate: this is read-only and sends no input.
 *
 * @param adapter Host adapter; `executor.getCursorPosition` is read.
 * @param overrides Session state; `lastScreenshot` is read.
 * @returns A JSON result `{ x, y, coordinateSpace }`, with an extra `note`
 *   whenever logical points are returned instead of image pixels.
 */
async function handleCursorPosition(
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
): Promise<CuCallToolResult> {
  const logical = await adapter.executor.getCursorPosition();
  const shot = overrides.lastScreenshot;
  if (!shot) {
    return okJson({
      x: logical.x,
      y: logical.y,
      coordinateSpace: "logical_points",
      note: "take a screenshot first for image-pixel coordinates",
    });
  }

  // Inverse of scaleCoord: subtract the capture-time origin to go from
  // virtual-screen to display-relative points before the image-px transform.
  const localX = logical.x - shot.originX;
  const localY = logical.y - shot.originY;

  // Cursor off the captured display (multi-monitor): local coordinates are
  // negative or reach/exceed the display dimensions. Return logical points
  // plus a hint rather than garbage image pixels. The upper bounds are
  // exclusive (>=) because a point at exactly displayWidth/displayHeight is
  // already past the display's last point.
  const isOffCapturedDisplay =
    localX < 0 ||
    localX >= shot.displayWidth ||
    localY < 0 ||
    localY >= shot.displayHeight;
  if (isOffCapturedDisplay) {
    return okJson({
      x: logical.x,
      y: logical.y,
      coordinateSpace: "logical_points",
      note: "cursor is on a different monitor than your last screenshot; take a fresh screenshot",
    });
  }

  const x = Math.round(localX * (shot.width / shot.displayWidth));
  const y = Math.round(localY * (shot.height / shot.displayHeight));
  return okJson({ x, y, coordinateSpace: "image_pixels" });
}
/**
 * Holds a key chord for a duration: each key in the chord is pressed, the
 * chord is held for `duration` seconds, then the keys are released in
 * reverse. The duration bounds are the same as for wait. This is a keyboard
 * action, so the frontmost gate applies, along with the same systemKeyCombos
 * blocklist check as the key tool.
 *
 * @param adapter Host adapter; `executor.holdKey` performs the hold.
 * @param args Tool arguments: `text` (the chord, required) and `duration`
 *   (finite number of seconds in the range 0..100).
 * @param overrides Session state; `grantFlags.systemKeyCombos` is read.
 * @param subGates Feature sub-gates forwarded to the input-action gates.
 * @returns "Key held." on success, otherwise a "bad_args" or
 *   "grant_flag_required" error, or a gate rejection.
 */
async function handleHoldKey(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  const text = requireString(args, "text");
  if (text instanceof Error) {
    return errorResult(text.message, "bad_args");
  }

  const { duration } = args;
  if (typeof duration !== "number" || !Number.isFinite(duration)) {
    return errorResult("duration must be a number", "bad_args");
  }
  if (duration < 0) {
    return errorResult("duration must be non-negative", "bad_args");
  }
  if (duration > 100) {
    return errorResult(
      "duration is too long. Duration is in seconds.",
      "bad_args",
    );
  }

  // The blocklist is consulted BEFORE the gates for the same reason as in
  // handleKey. Holding cmd+q is just as dangerous as tapping it.
  const isBlockedCombo =
    isSystemKeyCombo(text, adapter.executor.capabilities.platform) &&
    !overrides.grantFlags.systemKeyCombos;
  if (isBlockedCombo) {
    return errorResult(
      `"${text}" is a system-level shortcut. Request the \`systemKeyCombos\` grant via request_access to use it.`,
      "grant_flag_required",
    );
  }

  const gateRejection = await runInputActionGates(
    adapter,
    overrides,
    subGates,
    "keyboard",
  );
  if (gateRejection) {
    return gateRejection;
  }

  const chordKeys = parseKeyChord(text);
  const holdMilliseconds = duration * 1000;
  await adapter.executor.holdKey(chordKeys, holdMilliseconds);
  return okText("Key held.");
}
/**
 * Raw left-button press at the current cursor position; it takes no
 * coordinate, so position the cursor with mouse_move first. Returns a
 * "state_conflict" error when the button is already held.
 *
 * @param adapter Host adapter; `executor.getCursorPosition` and
 *   `executor.mouseDown` are used.
 * @param overrides Session state forwarded to the gates.
 * @param subGates Feature sub-gates forwarded to the gates.
 * @returns "Mouse button pressed." on success, a "state_conflict" error, or
 *   a gate rejection.
 */
async function handleLeftMouseDown(
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  if (mouseButtonHeld) {
    return errorResult(
      "mouse button already held, call left_mouse_up first",
      "state_conflict",
    );
  }

  const gateRejection = await runInputActionGates(
    adapter,
    overrides,
    subGates,
    "mouse",
  );
  if (gateRejection) {
    return gateRejection;
  }

  // macOS delivers mouseDown to the window under the cursor, not to the
  // frontmost app. Without a hit-test here, mouse_move (positioning, allowed
  // at any tier) followed by left_mouse_down would decompose a click that
  // lands on a tier-"read" window overlapping a tier-"full" frontmost app,
  // defeating the whole purpose of runHitTestGate. All three actions are
  // batchable, so the bypass would be atomic.
  const { x: cursorX, y: cursorY } =
    await adapter.executor.getCursorPosition();
  const hitRejection = await runHitTestGate(
    adapter,
    overrides,
    subGates,
    cursorX,
    cursorY,
    "mouse",
  );
  if (hitRejection) {
    return hitRejection;
  }

  await adapter.executor.mouseDown();
  mouseButtonHeld = true;
  mouseMoved = false;
  return okText("Mouse button pressed.");
}
/**
 * Raw left-button release at the current cursor position. It does NOT error
 * when the button is not held (idempotent release).
 *
 * @param adapter Host adapter; `executor.getCursorPosition` and
 *   `executor.mouseUp` are used.
 * @param overrides Session state forwarded to the gates.
 * @param subGates Feature sub-gates forwarded to the gates.
 * @returns "Mouse button released." on success, or a gate rejection. In the
 *   rejection case the button is still released before returning.
 */
async function handleLeftMouseUp(
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  // Releases the OS button and resets the held/moved bookkeeping.
  const releaseButton = async (): Promise<void> => {
    await adapter.executor.mouseUp();
    mouseButtonHeld = false;
    mouseMoved = false;
  };

  // Every gate rejection below releases the button FIRST. Otherwise the OS
  // button stays pressed and mouseButtonHeld stays true; recovery attempts
  // (mouse_move back to a safe app) would then emit leftMouseDragged events
  // into whatever window is under the cursor, including the very read-tier
  // window the gate was protecting. One mouseUp on a restricted window is a
  // single event; a stuck button is cascading damage.
  //
  // The frontmost gate is included: focus can change between mouseDown and
  // mouseUp (something else grabbed it), in which case runInputActionGates
  // rejects here even though it passed at mouseDown.
  const gateRejection = await runInputActionGates(
    adapter,
    overrides,
    subGates,
    "mouse",
  );
  if (gateRejection) {
    await releaseButton();
    return gateRejection;
  }

  // If the cursor moved since mouseDown this is a drop (a text-injection
  // vector) and is hit-tested as "mouse_full", like left_click_drag's `to`.
  // If it did NOT move, this is a click-release with the semantics of the
  // atomic left_click, hit-tested as "mouse". Without the distinction a
  // decomposed click on a click-tier app would fail here while the atomic
  // left_click works, and since the button is released anyway the OS would
  // see a complete click while the model gets a misleading error.
  const cursor = await adapter.executor.getCursorPosition();
  const hitRejection = await runHitTestGate(
    adapter,
    overrides,
    subGates,
    cursor.x,
    cursor.y,
    mouseMoved ? "mouse_full" : "mouse",
  );
  if (hitRejection) {
    await releaseButton();
    return hitRejection;
  }

  await releaseButton();
  return okText("Mouse button released.");
}
- // ---------------------------------------------------------------------------
- // Batch dispatch
- // ---------------------------------------------------------------------------
/**
 * Action names that may appear inside a computer_batch call.
 *
 * Deliberately leaves out request_access, open_application, the clipboard
 * tools and list_granted: batching them brings no latency benefit and would
 * complicate the security model.
 *
 * Insertion order is significant: the set is spread into the "Allowed: …"
 * portion of the batch validation error message.
 */
const BATCHABLE_ACTIONS: ReadonlySet<string> = new Set<string>([
  "key", "type",
  "mouse_move",
  "left_click", "left_click_drag",
  "right_click", "middle_click",
  "double_click", "triple_click",
  "scroll",
  "hold_key",
  "screenshot",
  "cursor_position",
  "left_mouse_down", "left_mouse_up",
  "wait",
]);
/** Outcome of one inner action of a computer_batch call. */
type BatchActionResult = {
  /** Name of the inner action that was dispatched. */
  action: string;
  /** True when the inner result was not flagged `isError`. */
  ok: boolean;
  /** First text content item of the inner result ("" when there is none). */
  output: string;
};
/**
 * Executes `actions: [{action, …}, …]` sequentially in ONE model→API round
 * trip — the dominant latency cost (seconds, vs. ~50ms local overhead per
 * action).
 *
 * Gate semantics (the security model):
 *  - Kill-switch + TCC: checked ONCE by handleToolCall before reaching here.
 *  - prepareForAction: run ONCE at the top. The user approved "do this
 *    sequence"; hiding apps per-action is wasted work.
 *  - Frontmost gate: checked PER ACTION. State can change mid-batch — a click
 *    might open a non-allowed app. If action 3 of 5 opened a non-allowed app,
 *    action 4's frontmost check fires and stops the batch there.
 *  - PixelCompare: SKIPPED inside a batch. The model committed to the full
 *    sequence without intermediate screenshots; validating mid-batch clicks
 *    against a pre-batch screenshot would false-positive constantly.
 *
 * The skips are implemented by handing each inner dispatch a copy of
 * `subGates` with `hideBeforeAction`, `pixelValidation` and
 * `autoTargetDisplay` forced to false — the handlers' existing gate logic
 * does the rest, no new code paths.
 *
 * Failure handling:
 *  - An inner action returns `isError`: stop, release any held mouse button,
 *    and return a JSON report `{completed, failed, remaining}` carrying the
 *    inner action's telemetry, so the model sees exactly where the batch
 *    broke and what succeeded before it.
 *  - An inner action THROWS: release any held mouse button (best-effort),
 *    then rethrow the original error unchanged so the caller's fail-closed
 *    handler reports it. Without the release, a throw landing between
 *    left_mouse_down and left_mouse_up would leave the button pressed.
 *  - The host reports an abort (`overrides.isAborted`): release any held
 *    mouse button and return an error result.
 *
 * Mid-batch screenshots are allowed (for inspection) but NEVER piggyback —
 * their `.screenshot` field is dropped. Click coords always refer to the
 * PRE-BATCH `lastScreenshot`. If the model wants to click based on a new
 * screenshot, it ends the batch and screenshots separately.
 *
 * @param adapter   Host adapter; its executor performs the actions.
 * @param args      Raw tool arguments. `args.actions` must be a non-empty
 *                  array of objects whose `action` is a string listed in
 *                  BATCHABLE_ACTIONS.
 * @param overrides Per-call overrides (allowlist, display, abort probe, …).
 * @param subGates  Sub-gate flags for this call.
 * @returns A `bad_args` error for malformed input, an error on abort, or a
 *          JSON result listing completed (and, if any, failed) actions.
 */
async function handleComputerBatch(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  const actions = args.actions;
  if (!Array.isArray(actions) || actions.length === 0) {
    return errorResult("actions must be a non-empty array", "bad_args");
  }

  // Validate the WHOLE batch before executing anything, so a malformed or
  // non-batchable entry late in the list rejects the batch before the first
  // action has any side effect.
  for (const [i, act] of actions.entries()) {
    if (typeof act !== "object" || act === null) {
      return errorResult(`actions[${i}] must be an object`, "bad_args");
    }
    const action = (act as Record<string, unknown>).action;
    if (typeof action !== "string") {
      return errorResult(`actions[${i}].action must be a string`, "bad_args");
    }
    if (!BATCHABLE_ACTIONS.has(action)) {
      return errorResult(
        `actions[${i}].action="${action}" is not allowed in a batch. ` +
          `Allowed: ${[...BATCHABLE_ACTIONS].join(", ")}.`,
        "bad_args",
      );
    }
  }

  // prepareForAction ONCE. After this, inner dispatches skip it via
  // hideBeforeAction:false.
  if (subGates.hideBeforeAction) {
    const hidden = await adapter.executor.prepareForAction(
      overrides.allowedApps.map((a) => a.bundleId),
      overrides.selectedDisplayId,
    );
    if (hidden.length > 0) {
      overrides.onAppsHidden?.(hidden);
    }
  }

  // Inner actions: skip prepare (already ran), skip pixelCompare (stale by
  // design). The frontmost check still runs per action.
  const batchSubGates: CuSubGates = {
    ...subGates,
    hideBeforeAction: false,
    pixelValidation: false,
    // No display auto-targeting mid-batch: a resolver switch would be
    // inconsistent with earlier clicks' lastScreenshot-based scaleCoord
    // targeting.
    autoTargetDisplay: false,
  };

  const results: BatchActionResult[] = [];
  for (const [i, act] of actions.entries()) {
    // Overlay Stop → host's stopSession → lifecycleState leaves "running"
    // synchronously before query.interrupt(). The SDK abort tears down the
    // host's await but not this loop — without this check the remaining
    // actions fire into a dead session.
    if (overrides.isAborted?.()) {
      await releaseHeldMouse(adapter);
      return errorResult(
        `Batch aborted after ${results.length} of ${actions.length} actions (user interrupt).`,
      );
    }

    // Small inter-step settle. Synthetic events post instantly; some apps
    // need a tick to process step N's input before step N+1 lands (e.g. a
    // click opening a menu before the next click targets a menu item).
    if (i > 0) await sleep(10);

    const actionArgs = act as Record<string, unknown>;
    const action = actionArgs.action as string;

    let dispatched: CuCallToolResult;
    try {
      dispatched = await dispatchAction(
        action,
        actionArgs,
        adapter,
        overrides,
        batchSubGates,
      );
    } catch (err) {
      // A thrown action can land between left_mouse_down and left_mouse_up
      // just like a returned error can. Let go of the button before the
      // exception leaves the batch; the release is best-effort so that the
      // ORIGINAL failure is what the caller sees.
      try {
        await releaseHeldMouse(adapter);
      } catch (releaseErr) {
        adapter.logger.error(
          `[${adapter.serverName}] computer_batch: releasing held mouse after "${action}" threw also failed`,
          releaseErr,
        );
      }
      throw err;
    }

    // Drop mid-batch screenshot piggyback (strip .screenshot). Click coords
    // stay anchored to the pre-batch lastScreenshot.
    const { screenshot: _dropped, ...inner } = dispatched;
    const text = firstTextContent(inner);
    const result = { action, ok: !inner.isError, output: text };
    results.push(result);

    if (inner.isError) {
      // Stop-on-first-error. Return everything so far + the error.
      // Forward the inner action's telemetry (error_kind) so the outer
      // tool-call record reflects the actual failure instead of undefined.
      // Release held mouse: the error may be a mid-grapheme abort in
      // handleType, or a frontmost gate, landing between mouse_down and
      // mouse_up.
      await releaseHeldMouse(adapter);
      return okJson(
        {
          completed: results.slice(0, -1),
          failed: result,
          remaining: actions.length - results.length,
        },
        inner.telemetry,
      );
    }
  }
  return okJson({ completed: results });
}
/**
 * Extracts the text of a result's FIRST content item.
 *
 * @param r Tool result to inspect.
 * @returns The first item's text when that item has type "text"; the empty
 *          string when the content list is empty or starts with another kind.
 */
function firstTextContent(r: CuCallToolResult): string {
  const head = r.content[0];
  if (head?.type === "text") {
    return head.text;
  }
  return "";
}
/**
 * Routes a tool name to its handler. Shared by handleToolCall and
 * handleComputerBatch, and only reached AFTER the kill-switch and TCC gates
 * have passed. request_access is not routed here — handleToolCall
 * special-cases it for the tccState thread-through.
 *
 * @param name      Tool/action name to dispatch.
 * @param a         Argument record for the tool.
 * @param adapter   Host adapter passed through to the handler.
 * @param overrides Per-call overrides passed through to the handler.
 * @param subGates  Sub-gate flags passed through to the handler.
 * @returns The handler's result, or a `bad_args` error for an unknown name.
 */
async function dispatchAction(
  name: string,
  a: Record<string, unknown>,
  adapter: ComputerUseHostAdapter,
  overrides: ComputerUseOverrides,
  subGates: CuSubGates,
): Promise<CuCallToolResult> {
  type Route = () => CuCallToolResult | Promise<CuCallToolResult>;

  // Name → thunk table. A Map (rather than a plain object) keeps lookups of
  // names such as "constructor" from resolving to inherited properties.
  // Thunks are only invoked after lookup, so exactly one handler runs.
  const routes = new Map<string, Route>([
    ["screenshot", () => handleScreenshot(adapter, overrides, subGates)],
    ["zoom", () => handleZoom(adapter, a, overrides)],
    [
      "left_click",
      () => handleClickVariant(adapter, a, overrides, subGates, "left", 1),
    ],
    [
      "double_click",
      () => handleClickVariant(adapter, a, overrides, subGates, "left", 2),
    ],
    [
      "triple_click",
      () => handleClickVariant(adapter, a, overrides, subGates, "left", 3),
    ],
    [
      "right_click",
      () => handleClickVariant(adapter, a, overrides, subGates, "right", 1),
    ],
    [
      "middle_click",
      () => handleClickVariant(adapter, a, overrides, subGates, "middle", 1),
    ],
    ["type", () => handleType(adapter, a, overrides, subGates)],
    ["key", () => handleKey(adapter, a, overrides, subGates)],
    ["scroll", () => handleScroll(adapter, a, overrides, subGates)],
    ["left_click_drag", () => handleDrag(adapter, a, overrides, subGates)],
    ["mouse_move", () => handleMoveMouse(adapter, a, overrides, subGates)],
    ["wait", () => handleWait(a)],
    ["cursor_position", () => handleCursorPosition(adapter, overrides)],
    ["hold_key", () => handleHoldKey(adapter, a, overrides, subGates)],
    ["left_mouse_down", () => handleLeftMouseDown(adapter, overrides, subGates)],
    ["left_mouse_up", () => handleLeftMouseUp(adapter, overrides, subGates)],
    ["open_application", () => handleOpenApplication(adapter, a, overrides)],
    ["switch_display", () => handleSwitchDisplay(adapter, a, overrides)],
    [
      "list_granted_applications",
      () => handleListGrantedApplications(overrides),
    ],
    ["read_clipboard", () => handleReadClipboard(adapter, overrides, subGates)],
    [
      "write_clipboard",
      () => handleWriteClipboard(adapter, a, overrides, subGates),
    ],
    ["computer_batch", () => handleComputerBatch(adapter, a, overrides, subGates)],
  ]);

  const route = routes.get(name);
  if (route === undefined) {
    return errorResult(`Unknown tool "${name}".`, "bad_args");
  }
  return route();
}
- // ---------------------------------------------------------------------------
- // Main dispatch
- // ---------------------------------------------------------------------------
/**
 * Entry point for every computer-use tool call.
 *
 * Runs, in order: allowlist normalization, the kill-switch gate, the OS
 * permission (TCC) gate, the global computer-use lock gate, and finally a
 * fail-closed dispatch to the tool's handler.
 *
 * @param adapter      Host adapter (logger, executor, settings probes).
 * @param name         Tool name requested by the model.
 * @param args         Raw, untyped tool arguments.
 * @param rawOverrides Per-call overrides as supplied by the host; the
 *                     allowlist inside is normalized before use.
 * @returns The handler's result, or an error result when a gate rejects or
 *          the handler throws.
 */
export async function handleToolCall(
  adapter: ComputerUseHostAdapter,
  name: string,
  args: unknown,
  rawOverrides: ComputerUseOverrides,
): Promise<CuCallToolResult> {
  const { logger, serverName } = adapter;

  // ─── Allowlist normalization (before any gate) ───────────────────────
  // (a) Drop user-denied apps: a grant from an earlier session must not
  //     survive the user later adding the app to the denied list. Dropped
  //     silently — the agent already saw the userDenied guidance at
  //     request_access time, and a live frontmost-gate rejection cites
  //     "not in allowed applications".
  // (b) Drop policy-denied apps: same story for a grant that predates a
  //     blocklist addition. New requests are denied up front elsewhere;
  //     this catches stale persisted grants.
  // (c) Backfill tier: a grant persisted before the tier field existed has
  //     `tier: undefined`, which tierSatisfies treats as "full". Assign the
  //     default tier for the app instead. Modern grants already carry one.
  // The `.some()` probe means the allowlist is only rebuilt when at least
  // one entry actually needs (a), (b) or (c); otherwise the caller's object
  // is used as-is.
  const userDeniedSet = new Set(rawOverrides.userDeniedBundleIds);
  const normalizeOverrides = (): ComputerUseOverrides => {
    const needsWork = rawOverrides.allowedApps.some(
      (app) =>
        app.tier === undefined ||
        userDeniedSet.has(app.bundleId) ||
        isPolicyDenied(app.bundleId, app.displayName),
    );
    if (!needsWork) {
      return rawOverrides;
    }
    const survivors = rawOverrides.allowedApps.filter(
      (app) =>
        !userDeniedSet.has(app.bundleId) &&
        !isPolicyDenied(app.bundleId, app.displayName),
    );
    return {
      ...rawOverrides,
      allowedApps: survivors.map((app) =>
        app.tier === undefined
          ? { ...app, tier: getDefaultTierForApp(app.bundleId, app.displayName) }
          : app,
      ),
    };
  };
  const overrides = normalizeOverrides();

  // ─── Gate 1: kill switch ─────────────────────────────────────────────
  if (adapter.isDisabled()) {
    return errorResult(
      "Computer control is disabled in Settings. Enable it and try again.",
      "other",
    );
  }

  // ─── Gate 2: TCC ─────────────────────────────────────────────────────
  // Accessibility + Screen Recording on macOS. Pure check — no dialog, no
  // relaunch. The two request_* tools are exempt: they carry the ungranted
  // state through to the renderer, which shows a TCC toggle panel instead
  // of the app list. Every other tool short-circuits here.
  const osPerms = await adapter.ensureOsPermissions();
  const threadsTccState =
    name === "request_access" || name === "request_teach_access";
  if (!osPerms.granted && !threadsTccState) {
    return errorResult(
      "Accessibility and Screen Recording permissions are required. " +
        "Call request_access to show the permission panel.",
      "tcc_not_granted",
    );
  }
  const tccState:
    | { accessibility: boolean; screenRecording: boolean }
    | undefined = osPerms.granted
    ? undefined
    : {
        accessibility: osPerms.accessibility,
        screenRecording: osPerms.screenRecording,
      };

  // ─── Gate 3: global CU lock ──────────────────────────────────────────
  // At most one session uses CU at a time. EVERY tool, request_access
  // included, hits the CHECK — showing an approval dialog while another
  // session holds the lock would be confusing.
  //
  // ACQUIRE is separate: tools for which defersLockAcquire() is true are
  // checked without acquiring, because the overlay + notifications are
  // driven by the lock change and showing "using your computer" while the
  // agent is only ASKING for access is premature. The first action tool
  // acquires. request_teach_access is NOT deferred: approving teach mode
  // hides the main window, and the lock must be held before that happens.
  //
  // The host releases the lock; this package never does. When checkCuLock
  // is undefined (tests / future hosts) there is no gate — absence of the
  // mechanism ≠ locked out.
  const deferAcquire = defersLockAcquire(name);
  const lock = overrides.checkCuLock?.();
  if (lock) {
    const heldByAnotherSession = lock.holder !== undefined && !lock.isSelf;
    if (heldByAnotherSession) {
      return errorResult(
        "Another Claude session is currently using the computer. Wait for " +
          "the user to acknowledge it is finished (stop button in the Claude " +
          "window), or find a non-computer-use approach if one is readily " +
          "apparent.",
        "cu_lock_held",
      );
    }
    const isUnheld = lock.holder === undefined;
    if (isUnheld && !deferAcquire) {
      // Acquire (emits the lock-changed event → overlay shows). Idempotent.
      overrides.acquireCuLock?.();
      // Fresh lock holder → any mouseButtonHeld left over from a prior
      // session (e.g. overlay stop mid-drag) is stale. Clear it so this
      // session does not get a spurious "already held" error.
      resetMouseButtonHeld();
    }
    // Remaining cases fall through:
    //  - held by us            → proceed.
    //  - unheld + deferAcquire → checked only; the first action acquires.
  }

  // Sub-gates are read FRESH on every call so a flag flip takes effect
  // mid-session.
  const subGates = adapter.getSubGates();

  // The clipboard guard intentionally does NOT run here: it runs per action
  // inside the handlers. A per-tool-call sync would run once for
  // computer_batch and miss sub-actions 2..N, and would fire for tools
  // during which no input is happening.
  const a = asRecord(args);
  logger.silly(
    `[${serverName}] tool=${name} args=${JSON.stringify(a).slice(0, 200)}`,
  );

  // ─── Fail-closed dispatch ────────────────────────────────────────────
  // ANY exception from here on becomes a tool error — never an implicit
  // success.
  try {
    switch (name) {
      // Need the tccState thread-through; never routed via dispatchAction
      // (not batchable).
      case "request_access":
        return await handleRequestAccess(adapter, a, overrides, tccState);
      case "request_teach_access":
        return await handleRequestTeachAccess(adapter, a, overrides, tccState);
      // Blocking UI tools, also not batchable; they need subGates for their
      // action-execution phase.
      case "teach_step":
        return await handleTeachStep(adapter, a, overrides, subGates);
      case "teach_batch":
        return await handleTeachBatch(adapter, a, overrides, subGates);
      default:
        return await dispatchAction(name, a, adapter, overrides, subGates);
    }
  } catch (err) {
    // If the gate machinery itself threw, the gated handlers have not yet
    // touched the executor (gates run first). For ungated tools the
    // executor may have been mid-call; either way the outcome is reported
    // as a tool error.
    const msg = err instanceof Error ? err.message : String(err);
    logger.error(`[${serverName}] tool=${name} threw: ${msg}`, err);
    return errorResult(`Tool "${name}" failed: ${msg}`, "executor_threw");
  }
}
/**
 * File-local helpers re-exported under a single object. Judging by the
 * `_test` name this exists so tests can reach them — not intended as public
 * API (NOTE(review): confirm no production caller imports it).
 */
export const _test = {
  scaleCoord, coordToPercentageForPixelCompare,
  segmentGraphemes, decodedByteLength,
  resolveRequestedApps, buildAccessRequest,
  buildTierGuidanceMessage, buildUserDeniedGuidance,
  tierSatisfies, looksLikeBundleId,
  extractCoordinate, parseKeyChord,
  buildMonitorNote, handleSwitchDisplay,
  uniqueDisplayLabels,
};
|