toolCalls.ts 134 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
7327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496349734983499350035013502350335043505350635073508350935103511351235133514351535163517351835193520352135223523352435253526352735283529353035313532353335343535353635373538353935403541354235433544354535463547354835493550355135523553355435553556355735583559356035613562356335643565356635673568356935703571357235733574357535763577357835793580358135823583358435853586358735883589359035913592359335943595359635973598359936003601360236033604360536063607360836093610361136123613361436153616361736183619362036213622362336243625362636273628362936303631363236333634363536363637363836393640364136423643364436453646364736483649
  1. /**
  2. * Tool dispatch. Every security decision from plan §2 is enforced HERE,
  3. * before any executor method is called.
  4. *
  5. * Enforcement order, every call:
  6. * 1. Kill switch (`adapter.isDisabled()`).
  7. * 2. TCC gate (`adapter.ensureOsPermissions()`). `request_access` is
  8. * exempted — it threads the ungranted state to the renderer so the
  9. * user can grant TCC perms from inside the approval dialog.
  10. * 3. Tool-specific gates (see dispatch table) — ANY exception in a gate
  11. * returns a tool error, executor never called.
  12. * 4. Executor call.
  13. *
  14. * For input actions (click/type/key/scroll/drag/move_mouse) the tool-specific
  15. * gates are, in order:
  16. * a. `prepareForAction` — hide every non-allowlisted app, then defocus us
  17. * (battle-tested pre-action sequence from the Vercept acquisition).
  18. * Sub-gated via `hideBeforeAction`. After this runs the screenshot is
  19. * TRUE (what the
  20. * model sees IS what's at each pixel) and we are not keyboard-focused.
  21. * b. Frontmost gate — branched by actionKind:
  22. * mouse: frontmost ∈ allowlist ∪ {hostBundleId, Finder} → pass.
  23. * hostBundleId passes because the executor's
  24. * `withClickThrough` bracket makes us click-through.
  25. * keyboard: frontmost ∈ allowlist ∪ {Finder} → pass.
  26. * hostBundleId → ERROR (safety net — defocus should have
  27. * moved us off; if it didn't, typing would go into our
  28. * own chat box).
  29. * After step (a) this gate fires RARELY — only when something popped
  30. * up between prepare and action, or the 5-try hide loop gave up.
  31. * Checked FRESH on every call, not cached across calls.
  32. *
  33. * For click variants only, AFTER the above gates but BEFORE the executor call:
  34. * c. Pixel-validation staleness check (sub-gated).
  35. */
  36. import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js";
  37. import { randomUUID } from "node:crypto";
  38. import { getDefaultTierForApp, getDeniedCategoryForApp, isPolicyDenied } from "./deniedApps.js";
  39. import type {
  40. ComputerExecutor,
  41. DisplayGeometry,
  42. InstalledApp,
  43. ScreenshotResult,
  44. } from "./executor.js";
  45. import { isSystemKeyCombo } from "./keyBlocklist.js";
  46. import { validateClickTarget } from "./pixelCompare.js";
  47. import { SENTINEL_BUNDLE_IDS } from "./sentinelApps.js";
  48. import type {
  49. AppGrant,
  50. ComputerUseHostAdapter,
  51. ComputerUseOverrides,
  52. CoordinateMode,
  53. CuAppPermTier,
  54. CuGrantFlags,
  55. CuPermissionRequest,
  56. CuSubGates,
  57. CuTeachPermissionRequest,
  58. Logger,
  59. ResolvedAppRequest,
  60. TeachStepRequest,
  61. } from "./types.js";
  /**
   * Bundle identifier for Finder. The hide loop never hides Finder (hiding it
   * kills the Desktop), which is why Finder always counts as an acceptable
   * frontmost app.
   */
  const FINDER_BUNDLE_ID = "com.apple.finder";
  /**
   * Closed set of error categories reported on the cu_tool_call telemetry
   * event. Free text is never used here because error messages may contain
   * file paths or app content (PII).
   */
  export type CuErrorKind =
    | "allowlist_empty"
    | "tcc_not_granted"
    | "cu_lock_held"
    | "teach_mode_conflict"
    | "teach_mode_not_active"
    | "executor_threw"
    | "capture_failed"
    // Retained for schema compatibility only; the tiered model replaced
    // hard-deny, so this value is no longer emitted.
    | "app_denied"
    // Tool args were malformed (type / shape / range / unknown value).
    | "bad_args"
    // Target app is not in the session allowlist (not the same as
    // "allowlist_empty").
    | "app_not_granted"
    // App is in the allowlist, but its tier is too low for the action.
    | "tier_insufficient"
    // Tool is callable, but the session is not wired for it.
    | "feature_unavailable"
    // Wrong state for the action (call sequence, mouse already held).
    | "state_conflict"
    // Action needs a grant flag (systemKeyCombos, clipboard*) obtained via
    // request_access.
    | "grant_flag_required"
    // Display enumeration failed (platform).
    | "display_error"
    | "other";
  /**
   * Telemetry that handlers attach to a tool result. The host wrapper
   * (serverDef.ts) consumes it and strips it before the result is handed to
   * the SDK — the same piggyback pattern used for `screenshot`.
   */
  export interface CuCallTelemetry {
    /**
     * request_access / request_teach_access: number of apps NEWLY granted by
     * this call. Idempotent re-grants of already-allowed apps are not counted.
     */
    granted_count?: number;

    /** request_access / request_teach_access: number of apps denied by this call. */
    denied_count?: number;

    /**
     * request_access / request_teach_access: number of apps safety-denied
     * (browser) by this call.
     */
    denied_browser_count?: number;

    /**
     * request_access / request_teach_access: number of apps safety-denied
     * (terminal) by this call.
     */
    denied_terminal_count?: number;

    /** Categorical error class; only set when the result is an error. */
    error_kind?: CuErrorKind;
  }
  /**
   * A `CallToolResult` that can additionally carry a screenshot payload and
   * telemetry. After a `screenshot` tool call, `bindSessionContext` reads
   * `result.screenshot` and keeps it in a closure cell for the next
   * pixel-validation. The host wrapper strips these extra fields before
   * returning to the SDK, so MCP clients never see them.
   */
  export type CuCallToolResult = CallToolResult & {
    /** Screenshot captured by the call, if any. */
    screenshot?: ScreenshotResult;

    /** Piggybacked telemetry; removed by the host wrapper before SDK return. */
    telemetry?: CuCallTelemetry;
  };
  117. // ---------------------------------------------------------------------------
  118. // Small result helpers (mirror of chrome-mcp's inline `{content, isError}`)
  119. // ---------------------------------------------------------------------------
  /**
   * Build a failed tool result that carries a single text message.
   *
   * @param text Message placed in the result's only content item.
   * @param errorKind Optional categorical class. When truthy it is attached as
   *   `telemetry.error_kind`; otherwise `telemetry` is `undefined`.
   * @returns A result with `isError: true`.
   */
  function errorResult(text: string, errorKind?: CuErrorKind): CuCallToolResult {
    const telemetry: CuCallTelemetry | undefined = errorKind
      ? { error_kind: errorKind }
      : undefined;
    return { content: [{ type: "text", text }], isError: true, telemetry };
  }
  /**
   * Build a successful tool result holding one text content item.
   *
   * @param text Text to return to the caller.
   * @returns A result with only `content` set (no `isError`, no telemetry).
   */
  function okText(text: string): CuCallToolResult {
    const content: CuCallToolResult["content"] = [{ type: "text", text }];
    return { content };
  }
  /**
   * Build a successful tool result whose single text item is the JSON
   * serialization of `obj`.
   *
   * @param obj Value passed to `JSON.stringify`.
   * @param telemetry Optional telemetry attached to the result as-is.
   * @returns A result with `content` and a `telemetry` property (possibly
   *   `undefined`).
   */
  function okJson(obj: unknown, telemetry?: CuCallTelemetry): CuCallToolResult {
    const text = JSON.stringify(obj);
    return { content: [{ type: "text", text }], telemetry };
  }
  136. // ---------------------------------------------------------------------------
  137. // Arg validation — lightweight, no zod (mirrors chrome-mcp's cast-and-check)
  138. // ---------------------------------------------------------------------------
  /**
   * View untyped tool args as a string-keyed record.
   *
   * @param args Raw arguments of unknown shape.
   * @returns `args` itself when it is a non-null object; otherwise a fresh
   *   empty object.
   */
  function asRecord(args: unknown): Record<string, unknown> {
    const isObject = args !== null && typeof args === "object";
    return isObject ? (args as Record<string, unknown>) : {};
  }
  /**
   * Read `args[key]` and require it to be a finite number.
   *
   * @param args Record of tool arguments.
   * @param key Property to read.
   * @returns The number, or an `Error` (returned, not thrown) when the value
   *   is missing, not a number, NaN, or infinite.
   */
  function requireNumber(
    args: Record<string, unknown>,
    key: string,
  ): number | Error {
    const candidate = args[key];
    if (typeof candidate === "number" && Number.isFinite(candidate)) {
      return candidate;
    }
    return new Error(`"${key}" must be a finite number.`);
  }
  /**
   * Read `args[key]` and require it to be a string.
   *
   * @param args Record of tool arguments.
   * @param key Property to read.
   * @returns The string (empty strings are accepted), or an `Error`
   *   (returned, not thrown) for any non-string value.
   */
  function requireString(
    args: Record<string, unknown>,
    key: string,
  ): string | Error {
    const candidate = args[key];
    return typeof candidate === "string"
      ? candidate
      : new Error(`"${key}" must be a string.`);
  }
  /**
   * Extract an `[x, y]` pair from a tuple-valued tool argument.
   *
   * The value must be an array of exactly two finite, non-negative numbers.
   * Non-finite values are rejected so that an `Infinity` component cannot
   * flow into coordinate scaling; this matches the finiteness rule that
   * `requireNumber` applies to scalar args.
   *
   * @param args Record of tool arguments.
   * @param paramName Name of the property holding the tuple (defaults to
   *   `"coordinate"`); also used as the prefix of error messages.
   * @returns The `[x, y]` pair, or an `Error` (returned, not thrown)
   *   describing why the value was rejected.
   */
  function extractCoordinate(
    args: Record<string, unknown>,
    paramName: string = "coordinate",
  ): [number, number] | Error {
    const coord = args[paramName];
    if (coord === undefined) {
      return new Error(`${paramName} is required`);
    }
    if (!Array.isArray(coord) || coord.length !== 2) {
      return new Error(`${paramName} must be an array of length 2`);
    }

    // A component is acceptable only if it is a real, finite, non-negative
    // number. `typeof` alone would let Infinity through.
    const isValidComponent = (value: unknown): value is number =>
      typeof value === "number" && Number.isFinite(value) && value >= 0;

    const [x, y]: unknown[] = coord;
    if (!isValidComponent(x) || !isValidComponent(y)) {
      return new Error(`${paramName} must be a tuple of non-negative numbers`);
    }
    return [x, y];
  }
  186. // ---------------------------------------------------------------------------
  187. // Coordinate scaling
  188. // ---------------------------------------------------------------------------
  /**
   * Map model-space coordinates to the logical points that enigo expects.
   *
   * Three cases, by `mode` and screenshot availability:
   *
   * - `normalized_0_100`: the input is a percentage of the display, so it is
   *   scaled by `display.width` / `display.height`. `display` is fetched
   *   fresh per tool call, never cached across calls, so a mid-session
   *   display-settings change does not leave it stale.
   * - `pixels` with a prior screenshot: the input is in the image space of the
   *   last screenshot. Because of the 1568-px long-edge downsample, the
   *   screenshot-px → logical-pt ratio is `displayWidth / screenshotWidth`,
   *   NOT `1 / scaleFactor`. The geometry stashed at CAPTURE time on
   *   `lastScreenshot` is used rather than the fresh one, so the transform
   *   matches what the model actually saw even if display settings changed
   *   since. (Chrome's ScreenshotContext pattern — CDPService.ts:1486-1493.)
   * - `pixels` without a screenshot (cold start): degenerate; divides by
   *   `display.scaleFactor` and logs a warning.
   *
   * In every case the matching origin offset is added after rounding.
   *
   * @param rawX Model-supplied x.
   * @param rawY Model-supplied y.
   * @param mode Coordinate mode the model is using.
   * @param display Current display geometry.
   * @param lastScreenshot Most recent screenshot, if any.
   * @param logger Receives the cold-start warning.
   * @returns Integer-rounded point plus origin offset.
   */
  function scaleCoord(
    rawX: number,
    rawY: number,
    mode: CoordinateMode,
    display: DisplayGeometry,
    lastScreenshot: ScreenshotResult | undefined,
    logger: Logger,
  ): { x: number; y: number } {
    if (mode === "normalized_0_100") {
      // The origin offset places the point on the selected display in
      // virtual-screen space.
      const x = Math.round((rawX / 100) * display.width) + display.originX;
      const y = Math.round((rawY / 100) * display.height) + display.originY;
      return { x, y };
    }

    // From here on mode is "pixels": image-space pixel coordinates.
    if (lastScreenshot) {
      // Same transform as Chrome coordinateScaling.ts:22-34 and
      // claude-in-a-box ComputerTool.swift:70-80 (two independent, convergent
      // implementations). Geometry and origin both come from the capture-time
      // snapshot so clicks stay coherent with the captured display.
      const { width, height, displayWidth, displayHeight, originX, originY } =
        lastScreenshot;
      return {
        x: Math.round(rawX * (displayWidth / width)) + originX,
        y: Math.round(rawY * (displayHeight / height)) + originY,
      };
    }

    // Cold start: pixel coords arrived before any screenshot was taken. Fall
    // back to the old divide-by-scaleFactor behavior and say so.
    logger.warn(
      "[computer-use] pixels-mode coordinate received with no prior screenshot; " +
        "falling back to /scaleFactor. Click may be off if downsample is active.",
    );
    const { scaleFactor } = display;
    return {
      x: Math.round(rawX / scaleFactor) + display.originX,
      y: Math.round(rawY / scaleFactor) + display.originY,
    };
  }
  /**
   * Express model-space coordinates as the 0–100 percentages that
   * pixelCompare.ts works in.
   *
   * The staleness check runs in screenshot-image space. Working in
   * percentages lets the last and the fresh screenshot be cropped at the same
   * relative spot regardless of their absolute dimensions.
   *
   * With the 1568-px downsample, `screenshot.width != display.width * sf`, so
   * the old `rawX / (display.width * sf)` formula is wrong. The model's raw
   * pixel coordinate already lives in the last screenshot's image space, so
   * the correct denominator is simply `lastScreenshot.width` (and `.height`);
   * `DisplayGeometry` is not consumed here.
   *
   * @param rawX Model-supplied x.
   * @param rawY Model-supplied y.
   * @param mode Coordinate mode the model is using.
   * @param lastScreenshot Most recent screenshot, if any.
   * @returns Percentages along each axis. `{0, 0}` in pixels mode when there
   *   is no screenshot.
   */
  function coordToPercentageForPixelCompare(
    rawX: number,
    rawY: number,
    mode: CoordinateMode,
    lastScreenshot: ScreenshotResult | undefined,
  ): { xPct: number; yPct: number } {
    // Normalized input is already a percentage; pass it through untouched.
    if (mode === "normalized_0_100") {
      return { xPct: rawX, yPct: rawY };
    }

    // Pixels mode with a screenshot: divide by the image dimensions.
    if (lastScreenshot) {
      const { width, height } = lastScreenshot;
      return {
        xPct: (rawX / width) * 100,
        yPct: (rawY / height) * 100,
      };
    }

    // Pixels mode without a screenshot. validateClickTarget
    // (pixelCompare.ts:141-143) already skips when lastScreenshot is
    // undefined, so this placeholder never reaches a crop.
    return { xPct: 0, yPct: 0 };
  }
  279. // ---------------------------------------------------------------------------
  280. // Shared input-action gates
  281. // ---------------------------------------------------------------------------
  /**
   * Action classes, each mapped to the grant tier needed to perform it. A
   * grant whose tier is `undefined` is treated as `"full"`.
   *
   * - `"mouse_position"`: mouse_move only. Allowed at every tier, `"read"`
   *   included, because it is pure cursor positioning with no app
   *   interaction. prepareForAction (hide non-allowed apps) still runs.
   * - `"mouse"`: plain left click, double/triple click, scroll, drag-from.
   *   Needs tier `"click"` or `"full"`.
   * - `"mouse_full"`: right/middle click, any click with modifiers, and
   *   drag-drop (the `to` endpoint of left_click_drag). Needs tier `"full"`.
   *   Each of these can escalate a click-tier grant to keyboard-equivalent
   *   input: right-click → context-menu Paste, modifier chords → keystrokes
   *   before the click, drag-drop → text insertion at the drop point. This is
   *   deliberately blunt: same-app drags (scrollbar, panel resize) onto
   *   click-tier apps are rejected too; `scroll` is the tier-"click" way to
   *   scroll.
   * - `"keyboard"`: type, key, hold_key. Needs tier `"full"`.
   */
  type CuActionKind =
    | "mouse_position"
    | "mouse"
    | "mouse_full"
    | "keyboard";
  /**
   * Decide whether a grant tier is high enough for an action class.
   *
   * @param grantTier Tier of the app's grant; `undefined` is treated as
   *   `"full"`.
   * @param actionKind Class of the action being attempted.
   * @returns `true` for `"mouse_position"` at any tier; for `"keyboard"` and
   *   `"mouse_full"` only when the tier is `"full"`; for everything else
   *   (i.e. `"mouse"`) when the tier is `"click"` or `"full"`.
   */
  function tierSatisfies(
    grantTier: CuAppPermTier | undefined,
    actionKind: CuActionKind,
  ): boolean {
    const effectiveTier = grantTier ?? "full";
    switch (actionKind) {
      case "mouse_position":
        return true;
      case "keyboard":
      case "mouse_full":
        return effectiveTier === "full";
      default:
        // "mouse"
        return effectiveTier === "click" || effectiveTier === "full";
    }
  }
  // Suffix added to every tier_insufficient error. The model may otherwise try
  // to route around the gate (osascript, System Events, cliclick via Bash);
  // this text forbids that explicitly. It starts with a space so it can be
  // concatenated directly onto the preceding sentence.
  const TIER_ANTI_SUBVERSION = [
    " Do not attempt to work around this restriction — never use AppleScript, ",
    "System Events, shell commands, or any other method to send clicks or ",
    "keystrokes to this app.",
  ].join("");
  319. // ---------------------------------------------------------------------------
  320. // Clipboard guard — stash+clear while a click-tier app is frontmost
  321. // ---------------------------------------------------------------------------
  322. //
  323. // Threat: tier "click" blocks type/key/right-click-Paste, but a click-tier
  324. // terminal/IDE may have a UI Paste button that's plain-left-clickable. If the
  325. // clipboard holds `rm -rf /` — from the user, from a prior full-tier paste,
  326. // OR from the agent's own write_clipboard call (which doesn't route through
  327. // runInputActionGates) — a left_click on that button injects it.
  328. //
  329. // Mitigation: stash the user's clipboard on first entry to click-tier, then
  330. // RE-CLEAR before every input action while click-tier stays frontmost. The
  331. // re-clear is the load-bearing part — a stash-on-transition-only design
  332. // leaves a gap between an agent write_clipboard and the next left_click.
  333. // When frontmost becomes anything else, restore. Turn-end restore is inlined
  334. // in the host's result-handler + leavingRunning (same dual-location as
  335. // cuHiddenDuringTurn unhide) — reads `session.cuClipboardStash` directly and
  336. // writes via Electron's `clipboard.writeText`, so no nest-only import.
  337. //
  338. // State lives on the session (via `overrides.getClipboardStash` /
  339. // `onClipboardStashChanged`), not module-level. The CU lock still guarantees
  340. // one session at a time, but session-scoped state means the host's turn-end
  341. // restore doesn't need to reach back into this package.
  /**
   * Keep the session's clipboard stash in step with the frontmost app's tier.
   *
   * - Frontmost is click-tier: if nothing is stashed yet, read the clipboard
   *   and record it through `overrides.onClipboardStashChanged` (an empty
   *   string is recorded instead when that step throws). Then write `""` to
   *   the clipboard. The clear happens on EVERY call, not just the first.
   * - Frontmost is anything else: if a stash exists, write it back to the
   *   clipboard and, only after that write succeeds, report the stash as
   *   `undefined`. With no stash this is a no-op.
   *
   * Clipboard read/write failures are swallowed deliberately (best effort).
   *
   * @param adapter Host adapter; only `adapter.executor.readClipboard` and
   *   `adapter.executor.writeClipboard` are used.
   * @param overrides Supplies the current stash (`getClipboardStash`) and
   *   receives changes (`onClipboardStashChanged`); both are optional.
   * @param frontmostIsClickTier Whether the frontmost app is granted at tier
   *   "click".
   */
  async function syncClipboardStash(
    adapter: ComputerUseHostAdapter,
    overrides: ComputerUseOverrides,
    frontmostIsClickTier: boolean,
  ): Promise<void> {
    const stashed = overrides.getClipboardStash?.();

    if (frontmostIsClickTier) {
      // Capture the user's clipboard only on FIRST entry to click-tier.
      if (stashed === undefined) {
        try {
          const userClipboard = await adapter.executor.readClipboard();
          overrides.onClipboardStashChanged?.(userClipboard);
        } catch {
          // Could not capture. Record an empty sentinel so the stash is not
          // retried on the next action; the eventual restore then becomes a
          // harmless writeClipboard("").
          overrides.onClipboardStashChanged?.("");
        }
      }

      // Clear on every click-tier action. This closes the gap where the agent
      // calls write_clipboard (which does not pass through
      // runInputActionGates) between the stash and a left_click on a UI Paste
      // button: the clear on the next action clobbers that write before the
      // click lands.
      try {
        await adapter.executor.writeClipboard("");
      } catch {
        // Transient pasteboard failure. The tier-"click" right-click/modifier
        // block still holds; this clear is a safety net, not a guarantee.
      }
      return;
    }

    // Frontmost is not click-tier: restore, if there is anything to restore.
    if (stashed === undefined) {
      return;
    }
    try {
      await adapter.executor.writeClipboard(stashed);
      // Drop the stash only once the write has succeeded, so a transient
      // pasteboard failure cannot lose it for good.
      overrides.onClipboardStashChanged?.(undefined);
    } catch {
      // Best effort: the stash is kept and the next non-click action retries.
    }
  }
  /**
   * Shared pre-flight gate: every click/type/key/scroll/drag/move_mouse goes
   * through here before the executor is touched.
   *
   * Order of operations:
   *  1. If `subGates.hideBeforeAction` is set, call `prepareForAction` with the
   *     allowlisted bundle IDs and report anything hidden via `onAppsHidden`.
   *  2. Query the frontmost app fresh and look up its granted tier.
   *  3. If `subGates.clipboardGuard` is set, sync the clipboard stash against
   *     "is the frontmost app tier click?".
   *  4. Decide pass/block from the frontmost app's tier and identity.
   *
   * @returns `null` when the action may proceed; otherwise an error result
   *   coded `tier_insufficient`, `state_conflict` or `app_not_granted`.
   *   Any throw in here is caught by handleToolCall's outer try and surfaces
   *   as a tool error.
   */
  async function runInputActionGates(
    adapter: ComputerUseHostAdapter,
    overrides: ComputerUseOverrides,
    subGates: CuSubGates,
    actionKind: CuActionKind,
  ): Promise<CuCallToolResult | null> {
    const { executor } = adapter;

    // Step A+B (sub-gated): hide non-allowlisted apps and defocus ourselves.
    // Once this has run, the frontmost gate below is an edge-case detector
    // (something popped up between prepare and action), not the normal-path
    // blocker. ALL grant tiers stay visible — visibility is the baseline
    // (tier "read").
    if (subGates.hideBeforeAction) {
      const allowlistedIds = overrides.allowedApps.map((grant) => grant.bundleId);
      const hiddenApps = await executor.prepareForAction(
        allowlistedIds,
        overrides.selectedDisplayId,
      );
      // Skip the callback when nothing was hidden (the common case after the
      // first action of a turn) so it isn't spammed on every action.
      if (hiddenApps.length > 0) {
        overrides.onAppsHidden?.(hiddenApps);
      }
    }

    // Frontmost gate — always a FRESH read.
    const front = await executor.getFrontmostApp();
    const grantedTiers = new Map(
      overrides.allowedApps.map((grant) => [grant.bundleId, grant.tier] as const),
    );
    // handleToolCall backfills a concrete tier on every grant, so an undefined
    // lookup means "not in the allowlist at all".
    const frontTier = front ? grantedTiers.get(front.bundleId) : undefined;

    // Clipboard guard. Per-action rather than per-tool-call: it runs for each
    // sub-action of computer_batch and teach_step/teach_batch, so clicking
    // into a click-tier app mid-batch stashes+clears before the next click
    // lands. It lives here (not in handleToolCall) so deferAcquire tools
    // (request_access, list_granted_applications), `wait`, and the teach_step
    // blocking-dialog phase never trigger a sync — only input actions do.
    if (subGates.clipboardGuard) {
      await syncClipboardStash(adapter, overrides, frontTier === "click");
    }

    // No frontmost app (rare — login window?). Let it through; the click lands
    // somewhere and PixelCompare catches staleness.
    if (!front) return null;

    const { hostBundleId } = executor.capabilities;

    if (frontTier === undefined) {
      // Finder is never-hide, always allowed.
      if (front.bundleId === FINDER_BUNDLE_ID) return null;

      if (front.bundleId !== hostBundleId) {
        // Non-allowlisted, non-us, non-Finder. RARE after the hide loop —
        // something popped up between prepare and action, or the 5-try loop
        // gave up.
        return errorResult(
          `"${front.displayName}" is not in the allowed applications and is ` +
            `currently in front. Take a new screenshot — it may have appeared ` +
            `since your last one.`,
          "app_not_granted",
        );
      }

      // Our own window is frontmost. mouse and mouse_full are both click
      // events and we're click-through (executor's withClickThrough) — pass.
      if (actionKind !== "keyboard") return null;

      // Keyboard safety net: the defocus in prepareForAction (step B) should
      // have moved us off. Still here means typing would go to our chat box.
      return errorResult(
        "Claude's own window still has keyboard focus. This should not happen " +
          "after the pre-action defocus. Click on the target application first.",
        "state_conflict",
      );
    }

    // Allowlisted from here on.
    if (tierSatisfies(frontTier, actionKind)) return null;

    // Allowlisted, but the tier doesn't cover this action. Guidance is
    // tailored to the actual tier: at "read" nothing is allowed (suggesting
    // left_click or Bash would be wrong); at "click" the mouse_full/keyboard
    // specific messages apply.
    if (frontTier === "read") {
      // Tier "read" is not category-unique (browser AND trading map to it), so
      // re-look-up the category and show the CiC hint only for real browsers.
      const category = getDeniedCategoryForApp(front.bundleId, front.displayName);
      const nextStep =
        category === "browser"
          ? " Use the Claude-in-Chrome MCP for browser interaction (tools " +
            "named `mcp__Claude_in_Chrome__*`; load via ToolSearch if " +
            "deferred)."
          : " No interaction is permitted; ask the user to take any " +
            "actions in this app themselves.";
      return errorResult(
        `"${front.displayName}" is granted at tier "read" — ` +
          `visible in screenshots only, no clicks or typing.` +
          nextStep +
          TIER_ANTI_SUBVERSION,
        "tier_insufficient",
      );
    }

    // Tier is "click" here (tier "full" would have passed tierSatisfies).
    const clickTierDetail =
      actionKind === "keyboard"
        ? `typing, key presses, and paste require tier "full". The keys ` +
          `would go to this app's text fields or integrated terminal. To ` +
          `type into a different app, click it first to bring it forward. ` +
          `For shell commands, use the Bash tool.`
        : // actionKind === "mouse_full" ("mouse" and "mouse_position" pass at
          // "click").
          `right-click, middle-click, and clicks with modifier keys require ` +
          `tier "full". Right-click opens a context menu with Paste/Cut, and ` +
          `modifier chords fire as keystrokes before the click. Plain ` +
          `left_click is allowed here.`;
    return errorResult(
      `"${front.displayName}" is granted at tier "click" — ` +
        clickTierDetail +
        TIER_ANTI_SUBVERSION,
      "tier_insufficient",
    );
  }
  /**
   * Hit-test gate: block a mouse action when the window under (x, y) belongs
   * to an app whose tier does not cover mouse input. This closes the gap where
   * a tier-"full" app is frontmost but the click would land on an overlapping
   * tier-"read" window — `runInputActionGates` passes (frontmost is fine) while
   * the click really goes to the read-tier app.
   *
   * Must run AFTER `scaleCoord` (global coordinates are required) and BEFORE
   * the executor call.
   *
   * When `appUnderPoint` yields null (desktop, or a platform without hit-test)
   * the gate passes — the frontmost check in `runInputActionGates` already ran.
   *
   * @returns `null` on pass (target is tier-"click"/"full", or
   *   desktop/Finder/us), otherwise an error result (`app_not_granted` or
   *   `tier_insufficient`).
   */
  async function runHitTestGate(
    adapter: ComputerUseHostAdapter,
    overrides: ComputerUseOverrides,
    subGates: CuSubGates,
    x: number,
    y: number,
    actionKind: CuActionKind,
  ): Promise<CuCallToolResult | null> {
    const hit = await adapter.executor.appUnderPoint(x, y);
    // Nothing under the point (desktop / platform no-op), or Finder — Finder
    // (desktop, file dialogs) is always clickable, same exemption as
    // runInputActionGates. Our own overlay is filtered by Swift (pid != self).
    if (!hit || hit.bundleId === FINDER_BUNDLE_ID) return null;

    const grantedTiers = new Map(
      overrides.allowedApps.map((grant) => [grant.bundleId, grant.tier] as const),
    );
    // Every message from this gate opens the same way.
    const landsOn = `Click at these coordinates would land on "${hit.displayName}", `;

    if (!grantedTiers.has(hit.bundleId)) {
      // Not allowlisted at all. The frontmost check would catch this if the
      // target were frontmost, but a different app is in front. This is the
      // "something popped up" edge case — a new window appeared between
      // screenshot and click, or a background app's window overlaps the target.
      return errorResult(
        landsOn +
          `which is not in the allowed applications. Take a fresh screenshot ` +
          `to see the current window layout.`,
        "app_not_granted",
      );
    }

    const hitTier = grantedTiers.get(hit.bundleId);

    // The frontmost-based sync in runInputActionGates misses clicks that land
    // on a NON-FRONTMOST click-tier window. Re-sync by the hit-test target's
    // tier: if it is click-tier, stash+clear before the click lands,
    // regardless of what is frontmost.
    if (subGates.clipboardGuard && hitTier === "click") {
      await syncClipboardStash(adapter, overrides, true);
    }

    if (tierSatisfies(hitTier, actionKind)) return null;

    // Allowlisted, but the tier doesn't cover this action. This gate is only
    // called with mouse/mouse_full (keyboard routes to frontmost, not to the
    // window under the cursor). The branch below handles mouse_full ∧ click;
    // the only remaining fall-through is tier "read".
    if (hitTier === "click" && actionKind === "mouse_full") {
      return errorResult(
        landsOn +
          `which is granted at tier "click" — right-click, middle-click, and ` +
          `clicks with modifier keys require tier "full" (they can Paste via ` +
          `the context menu or fire modifier-chord keystrokes). Plain ` +
          `left_click is allowed here.` +
          TIER_ANTI_SUBVERSION,
        "tier_insufficient",
      );
    }

    const category = getDeniedCategoryForApp(hit.bundleId, hit.displayName);
    const nextStep =
      category === "browser"
        ? "Use the Claude-in-Chrome MCP for browser interaction."
        : "Ask the user to take any actions in this app themselves.";
    return errorResult(
      landsOn +
        `which is granted at tier "read" (screenshots only, no interaction). ` +
        nextStep +
        TIER_ANTI_SUBVERSION,
      "tier_insufficient",
    );
  }
  581. // ---------------------------------------------------------------------------
  582. // Screenshot helpers
  583. // ---------------------------------------------------------------------------
  /**
   * §6 item 9 — a screenshot whose decoded size falls below this many bytes is
   * treated as implausibly small and retried exactly once. Battle-tested
   * threshold.
   */
  const MIN_SCREENSHOT_BYTES = 1024;

  /**
   * Estimate the decoded size of a base64 string without decoding it:
   * 3 bytes per 4 characters, minus one byte per trailing `=` (at most two).
   * Approximate by design — it only feeds a threshold comparison.
   *
   * @param base64 Base64 text.
   * @returns Estimated decoded length in bytes.
   */
  function decodedByteLength(base64: string): number {
    let padChars = 0;
    if (base64.endsWith("==")) {
      padChars = 2;
    } else if (base64.endsWith("=")) {
      padChars = 1;
    }
    const unpaddedEstimate = Math.floor((base64.length * 3) / 4);
    return unpaddedEstimate - padChars;
  }
  /**
   * Capture a screenshot, retrying once if the first capture decodes to fewer
   * than `MIN_SCREENSHOT_BYTES` bytes. The second capture is returned as-is,
   * whatever its size.
   *
   * @param executor         Executor whose `screenshot` is called.
   * @param allowedBundleIds Forwarded to `screenshot` unchanged.
   * @param logger           Receives a warning when a retry happens.
   * @param displayId        Optional display to capture; forwarded unchanged.
   * @returns The first capture, or the retry when the first was too small.
   */
  async function takeScreenshotWithRetry(
    executor: ComputerExecutor,
    allowedBundleIds: string[],
    logger: ComputerUseHostAdapter["logger"],
    displayId?: number,
  ): Promise<ScreenshotResult> {
    const capture = () => executor.screenshot({ allowedBundleIds, displayId });

    let shot = await capture();
    const decodedSize = decodedByteLength(shot.base64);
    if (decodedSize < MIN_SCREENSHOT_BYTES) {
      logger.warn(
        `[computer-use] screenshot implausibly small (${decodedSize} bytes decoded), retrying once`,
      );
      shot = await capture();
    }
    return shot;
  }
  609. // ---------------------------------------------------------------------------
  610. // Grapheme iteration — §6 item 7, ported from the Vercept acquisition
  611. // ---------------------------------------------------------------------------
  const INTER_GRAPHEME_SLEEP_MS = 8; // §6 item 4 — 125 Hz USB polling

  /**
   * Split text into user-perceived characters (grapheme clusters).
   *
   * Uses `Intl.Segmenter` when the runtime provides it. If it is missing, or
   * anything on that path throws, falls back to code-point iteration, which
   * keeps surrogate pairs together but splits ZWJ sequences.
   *
   * @param text Text to split.
   * @returns One array entry per grapheme (or per code point on the fallback).
   */
  function segmentGraphemes(text: string): string[] {
    type GraphemeSegmenterCtor = new (
      locale?: string,
      options?: { granularity: "grapheme" | "word" | "sentence" },
    ) => { segment: (s: string) => Iterable<{ segment: string }> };

    try {
      // Node 18+ ships Intl.Segmenter; the guard and the try are defence
      // against a stripped-down runtime.
      const { Segmenter } = Intl as typeof Intl & {
        Segmenter?: GraphemeSegmenterCtor;
      };
      if (typeof Segmenter === "function") {
        const graphemeSegmenter = new Segmenter(undefined, {
          granularity: "grapheme",
        });
        const clusters: string[] = [];
        for (const { segment } of graphemeSegmenter.segment(text)) {
          clusters.push(segment);
        }
        return clusters;
      }
    } catch {
      // Deliberately ignored — use the code-point fallback below.
    }
    return [...text];
  }
  /** Promise that resolves (with no value) once `ms` milliseconds have passed. */
  function sleep(ms: number): Promise<void> {
    return new Promise<void>((resolve) => {
      setTimeout(resolve, ms);
    });
  }
  /**
   * Break a chord string such as "ctrl+shift" into its key names: split on
   * "+", trim each piece, and drop pieces that end up empty.
   * Same parsing as `key` tool / executor.key / keyBlocklist.normalizeKeySequence.
   *
   * @param text Chord string.
   * @returns Key names in their original order.
   */
  function parseKeyChord(text: string): string[] {
    const keys: string[] = [];
    for (const piece of text.split("+")) {
      const key = piece.trim();
      if (key) {
        keys.push(key);
      }
    }
    return keys;
  }
  648. // ---------------------------------------------------------------------------
  649. // left_mouse_down / left_mouse_up held-state tracking
  650. // ---------------------------------------------------------------------------
  /**
   * True between a left_mouse_down and its matching left_mouse_up. Drives the
   * "error on double-down, tolerate up-without-down" rule.
   *
   * Module-level, but reset on every lock acquire (handleToolCall →
   * acquireCuLock branch), so a session interrupted mid-drag (overlay stop
   * during left_mouse_down) cannot leave the flag set for the next lock
   * holder.
   *
   * The scoping would still be wrong within one lock cycle if sessions could
   * interleave tool calls, but the lock enforces at-most-one-session-uses-CU,
   * so they can't. The per-turn reset is the correctness boundary.
   */
  let mouseButtonHeld = false;

  /**
   * Whether a mouse_move happened between left_mouse_down and left_mouse_up.
   * If still false at mouseUp, the decomposed sequence is a click-release
   * rather than a drop — hit-test at "mouse", not "mouse_full".
   */
  let mouseMoved = false;

  /**
   * Reset both cross-call drag flags. Called from Gate-3 on lock-acquire and
   * from `bindSessionContext` in mcpServer.ts, so a fresh lock holder never
   * inherits a previous session's mid-drag state.
   */
  export function resetMouseButtonHeld(): void {
    mouseMoved = false;
    mouseButtonHeld = false;
  }
  /**
   * Release the OS mouse button if a left_mouse_down pressed it and the
   * matching left_mouse_up never got its turn. Same release-before-return as
   * handleClick. Does nothing when the button is not held, so callers need not
   * check first.
   *
   * The drag flags are cleared only after `mouseUp()` resolves; if it rejects,
   * the rejection propagates and the flags stay set.
   */
  async function releaseHeldMouse(
    adapter: ComputerUseHostAdapter,
  ): Promise<void> {
    if (mouseButtonHeld) {
      await adapter.executor.mouseUp();
      resetMouseButtonHeld();
    }
  }
  /**
   * Whether a tool checks the lock without acquiring it.
   *
   * `request_access` and `list_granted_applications` go through the CHECK (so
   * a blocked session doesn't show an approval dialog for access it can't use)
   * but defer the ACQUIRE — the enter-CU notification/overlay only fires on the
   * first action tool.
   *
   * `request_teach_access` is deliberately NOT in this set: approving teach
   * mode hides the main window, and the lock must be held before that. See the
   * Gate-3 block in `handleToolCall` for the full explanation.
   *
   * Exported for `bindSessionContext` in mcpServer.ts so the async lock gate
   * uses the same set as the sync one.
   *
   * @param toolName Name of the tool being called.
   * @returns `true` for the two deferring tools, `false` for everything else.
   */
  export function defersLockAcquire(toolName: string): boolean {
    switch (toolName) {
      case "request_access":
      case "list_granted_applications":
        return true;
      default:
        return false;
    }
  }
  703. // ---------------------------------------------------------------------------
  704. // request_access helpers
  705. // ---------------------------------------------------------------------------
  /**
   * Reverse-DNS-ish shape: two runs of word characters, dots and hyphens, each
   * starting with an ASCII letter or digit, joined by a dot. That means at
   * least one dot and no spaces or slashes. Lets raw bundle IDs pass through
   * resolution.
   */
  const REVERSE_DNS_RE = /^[A-Za-z0-9][\w.-]*\.[A-Za-z0-9][\w.-]*$/;

  /** True when `s` contains no space and has the reverse-DNS shape above. */
  function looksLikeBundleId(s: string): boolean {
    if (s.includes(" ")) {
      return false;
    }
    return REVERSE_DNS_RE.test(s);
  }
  /**
   * Map each requested app string to an installed app (when one matches) and
   * annotate it with sentinel / already-granted / proposed-tier information.
   *
   * Matching order per request: an exact bundle-ID lookup (only when the
   * string looks like a bundle ID), then a case-insensitive display-name
   * lookup.
   *
   * @param requestedNames          Strings from the request, in order.
   * @param installed               Installed apps to match against.
   * @param alreadyGrantedBundleIds Bundle IDs that already hold a grant.
   * @returns One entry per requested name, in the same order; `resolved` is
   *   undefined when nothing matched.
   */
  function resolveRequestedApps(
    requestedNames: string[],
    installed: InstalledApp[],
    alreadyGrantedBundleIds: ReadonlySet<string>,
  ): ResolvedAppRequest[] {
    const appsByBundleId = new Map<string, InstalledApp>();
    const appsByLowerName = new Map<string, InstalledApp>();
    for (const app of installed) {
      appsByBundleId.set(app.bundleId, app);
      // Last write wins on collisions. Ambiguous-name handling (multiple
      // candidates in the dialog) is plan-documented but deferred — the
      // InstalledApps enumerator dedupes by bundle ID, so true display-name
      // collisions are rare. TODO(chicago, post-P1): surface all candidates.
      appsByLowerName.set(app.displayName.toLowerCase(), app);
    }

    const resolveOne = (requested: string): ResolvedAppRequest => {
      const bundleIdShaped = looksLikeBundleId(requested);
      const resolved =
        (bundleIdShaped ? appsByBundleId.get(requested) : undefined) ??
        appsByLowerName.get(requested.toLowerCase());
      const matchedBundleId = resolved?.bundleId;

      // Unresolved but bundle-ID-shaped: feed the raw string to the tier
      // lookup (e.g. "company.thebrowser.Browser" with Arc not installed — the
      // reverse-DNS string won't match any display-name substring).
      const tierLookupId =
        matchedBundleId ?? (bundleIdShaped ? requested : undefined);

      return {
        requestedName: requested,
        resolved,
        isSentinel: matchedBundleId
          ? SENTINEL_BUNDLE_IDS.has(matchedBundleId)
          : false,
        alreadyGranted: matchedBundleId
          ? alreadyGrantedBundleIds.has(matchedBundleId)
          : false,
        proposedTier: getDefaultTierForApp(
          tierLookupId,
          resolved?.displayName ?? requested,
        ),
      };
    };

    return requestedNames.map((name) => resolveOne(name));
  }
  753. // ---------------------------------------------------------------------------
  754. // Individual tool handlers
  755. // ---------------------------------------------------------------------------
  /**
   * `request_access` tool handler: ask the user to grant control of the named
   * apps (plus optional grant flags) and report the outcome to the model.
   *
   * Flow:
   *  1. Refuse when no permission handler is wired, or when teach mode is
   *     active.
   *  2. Validate `reason`.
   *  3. TCC branch — when `tccState` is supplied, show the OS-permission panel
   *     instead of the app list, re-check OS permissions afterwards, and tell
   *     the model whether to call again now or after the user grants.
   *  4. Otherwise validate `apps`, collect the boolean grant flags, partition
   *     the request via `buildAccessRequest`, and show the dialog only when
   *     there is something to approve (new apps or requested flags).
   *  5. Return granted/denied lists plus optional policy/user-deny guidance,
   *     tier guidance, and multi-monitor window locations.
   *
   * @param adapter   Host adapter (executor, logger, OS-permission check).
   * @param args      Raw tool arguments. Reads `reason`, `apps`, and the
   *   optional booleans `clipboardRead`, `clipboardWrite`, `systemKeyCombos`.
   * @param overrides Session state and callbacks (allowlist, user deny list,
   *   permission handler, teach-mode probe, selected display).
   * @param tccState  Pre-dialog snapshot of OS permission state; its presence
   *   selects the TCC branch.
   * @returns An error result for bad arguments / unavailable states / the TCC
   *   branch, otherwise a JSON result describing what was granted.
   */
  async function handleRequestAccess(
    adapter: ComputerUseHostAdapter,
    args: Record<string, unknown>,
    overrides: ComputerUseOverrides,
    tccState: { accessibility: boolean; screenRecording: boolean } | undefined,
  ): Promise<CuCallToolResult> {
    if (!overrides.onPermissionRequest) {
      return errorResult(
        "This session was not wired with a permission handler. Computer control is not available here.",
        "feature_unavailable",
      );
    }

    // Teach mode hides the main window; permission dialogs render in that
    // window. Without this, handleToolPermission blocks on an invisible
    // prompt and the overlay spins forever. Tell the model to exit teach
    // mode, request access, then re-enter.
    if (overrides.getTeachModeActive?.()) {
      return errorResult(
        "Cannot request additional permissions during teach mode — the permission dialog would be hidden. End teach mode (finish the tour or let the turn complete), then call request_access, then start a new tour.",
        "teach_mode_conflict",
      );
    }

    const reason = requireString(args, "reason");
    if (reason instanceof Error) return errorResult(reason.message, "bad_args");

    // TCC-ungranted branch. The renderer shows a toggle panel INSTEAD OF the
    // app list when `tccState` is present on the request, so we skip app
    // resolution entirely (listInstalledApps() may fail without Screen
    // Recording anyway). The user grants the OS perms from inside the dialog,
    // then clicks "Ask again" — both buttons resolve with deny by design
    // (ComputerUseApproval.tsx) so the model re-calls request_access and
    // gets the app list on the next call.
    if (tccState) {
      const req: CuPermissionRequest = {
        requestId: randomUUID(),
        reason,
        apps: [],
        requestedFlags: {},
        screenshotFiltering: adapter.executor.capabilities.screenshotFiltering,
        tccState,
      };
      await overrides.onPermissionRequest(req);

      // Re-check: the user may have granted in System Settings while the
      // dialog was up. The `tccState` arg is a pre-dialog snapshot — reading
      // it here would tell the model "not yet granted" even after the user
      // granted, and the model waits for confirmation instead of retrying.
      // The renderer's TCC panel already live-polls (computerUseTccStore);
      // this is the same re-check on the tool-result side.
      const recheck = await adapter.ensureOsPermissions();
      if (recheck.granted) {
        return errorResult(
          "macOS Accessibility and Screen Recording are now both granted. " +
            "Call request_access again immediately — the next call will show " +
            "the app selection list.",
        );
      }

      const missing: string[] = [];
      if (!recheck.accessibility) missing.push("Accessibility");
      if (!recheck.screenRecording) missing.push("Screen Recording");
      return errorResult(
        `macOS ${missing.join(" and ")} permission(s) not yet granted. ` +
          `The permission panel has been shown. Once the user grants the ` +
          `missing permission(s), call request_access again.`,
        "tcc_not_granted",
      );
    }

    const rawApps = args.apps;
    if (!Array.isArray(rawApps) || !rawApps.every((a) => typeof a === "string")) {
      return errorResult('"apps" must be an array of strings.', "bad_args");
    }
    const apps = rawApps as string[];

    // Only flags explicitly passed as booleans are forwarded; anything else
    // is treated as "not requested".
    const requestedFlags: Partial<CuGrantFlags> = {};
    if (typeof args.clipboardRead === "boolean") {
      requestedFlags.clipboardRead = args.clipboardRead;
    }
    if (typeof args.clipboardWrite === "boolean") {
      requestedFlags.clipboardWrite = args.clipboardWrite;
    }
    if (typeof args.systemKeyCombos === "boolean") {
      requestedFlags.systemKeyCombos = args.systemKeyCombos;
    }

    const {
      needDialog,
      skipDialogGrants,
      willHide,
      tieredApps,
      userDenied,
      policyDenied,
    } = await buildAccessRequest(
      adapter,
      apps,
      overrides.allowedApps,
      new Set(overrides.userDeniedBundleIds),
      overrides.selectedDisplayId,
    );

    let dialogGranted: AppGrant[] = [];
    let dialogDenied: Array<{
      bundleId: string;
      reason: "user_denied" | "not_installed";
    }> = [];
    // The dialog is skipped when every requested app is already granted (or
    // was stripped) and no flags were requested.
    if (needDialog.length > 0 || Object.keys(requestedFlags).length > 0) {
      const req: CuPermissionRequest = {
        requestId: randomUUID(),
        reason,
        apps: needDialog,
        requestedFlags,
        screenshotFiltering: adapter.executor.capabilities.screenshotFiltering,
        // Undefined when empty so the renderer skips the section cleanly.
        ...(willHide.length > 0 && {
          willHide,
          autoUnhideEnabled: adapter.getAutoUnhideEnabled(),
        }),
      };
      const response = await overrides.onPermissionRequest(req);
      // Only `granted`/`denied` are used below; the response's `flags` are
      // not part of this tool result.
      dialogGranted = response.granted;
      dialogDenied = response.denied;
    }

    // Do NOT return display geometry or coordinateMode. See COORDINATES.md
    // ("Never give the model a number that invites rescaling"). scaleCoord
    // already transforms server-side; the coordinate convention is baked into
    // the tool param descriptions at server-construction time.
    const allGranted = [...skipDialogGrants, ...dialogGranted];

    // Filter tieredApps to what was actually granted — if the user unchecked
    // Chrome in the dialog, don't explain Chrome's tier.
    const grantedBundleIds = new Set(allGranted.map((g) => g.bundleId));
    const grantedTieredApps = tieredApps.filter((t) =>
      grantedBundleIds.has(t.bundleId),
    );

    // Best-effort — grants are already persisted by wrappedPermissionHandler;
    // a listDisplays/findWindowDisplays failure (monitor hot-unplug, NAPI
    // error) must not tank the grant response. Same discipline as
    // buildMonitorNote's listDisplays try/catch.
    let windowLocations: Awaited<ReturnType<typeof buildWindowLocations>> = [];
    try {
      windowLocations = await buildWindowLocations(adapter, allGranted);
    } catch (e) {
      adapter.logger.warn(
        `[computer-use] buildWindowLocations failed: ${String(e)}`,
      );
    }

    return okJson(
      {
        granted: allGranted,
        denied: dialogDenied,
        // Policy blocklist — precedes userDenied in precedence and response
        // order. No escape hatch; the agent is told to find another approach.
        ...(policyDenied.length > 0 && {
          policyDenied: {
            apps: policyDenied,
            guidance: buildPolicyDeniedGuidance(policyDenied),
          },
        }),
        // User-configured auto-deny — stripped before the dialog; this is the
        // agent's only signal that these apps exist but are user-blocked.
        ...(userDenied.length > 0 && {
          userDenied: {
            apps: userDenied,
            guidance: buildUserDeniedGuidance(userDenied),
          },
        }),
        // Upfront guidance so the model knows what each tier allows BEFORE
        // hitting the gate. Only included when something was tier-restricted.
        ...(grantedTieredApps.length > 0 && {
          tierGuidance: buildTierGuidanceMessage(grantedTieredApps),
        }),
        screenshotFiltering: adapter.executor.capabilities.screenshotFiltering,
        // Where each granted app currently has open windows, across monitors.
        // Omitted when the app isn't running or has no normal windows.
        ...(windowLocations.length > 0 ? { windowLocations } : {}),
      },
      {
        // dialogGranted only — skipDialogGrants are idempotent re-grants of
        // apps already in the allowlist (no user action, dialog skips them).
        // Matching denied_count's this-call-only semantics.
        granted_count: dialogGranted.length,
        denied_count: dialogDenied.length,
        ...tierAssignmentTelemetry(grantedTieredApps),
      },
    );
  }
  /**
   * Report, for each granted app that has open windows, which displays those
   * windows sit on.
   *
   * Returns an empty array when nothing was granted or when there is at most
   * one display (no multi-monitor signal to give). Apps that are not running,
   * or are running without normal windows, are left out.
   *
   * @param adapter Host adapter; `executor.listDisplays` and
   *   `executor.findWindowDisplays` are called.
   * @param granted Grants to look up, in the order they should be reported.
   * @returns One entry per granted app with windows, in `granted` order. A
   *   display ID missing from `listDisplays` keeps its `id` with undefined
   *   `label` / `isPrimary`.
   */
  async function buildWindowLocations(
    adapter: ComputerUseHostAdapter,
    granted: AppGrant[],
  ): Promise<
    Array<{
      bundleId: string;
      displayName: string;
      displays: Array<{ id: number; label?: string; isPrimary?: boolean }>;
    }>
  > {
    if (granted.length === 0) return [];

    const { executor } = adapter;
    const allDisplays = await executor.listDisplays();
    if (allDisplays.length <= 1) return [];

    const windows = await executor.findWindowDisplays(
      granted.map((grant) => grant.bundleId),
    );
    const displayInfoById = new Map(allDisplays.map((d) => [d.displayId, d]));
    const displayIdsByBundle = new Map(
      windows.map((w) => [w.bundleId, w.displayIds]),
    );

    const located: Awaited<ReturnType<typeof buildWindowLocations>> = [];
    for (const { bundleId, displayName } of granted) {
      const ids = displayIdsByBundle.get(bundleId);
      // Not running, or running with no normal windows.
      if (!ids?.length) continue;
      located.push({
        bundleId,
        displayName,
        displays: ids.map((id) => {
          const info = displayInfoById.get(id);
          return { id, label: info?.label, isPrimary: info?.isPrimary };
        }),
      });
    }
    return located;
  }
  974. /**
  975. * Shared app-resolution + partition + hide-preview pipeline. Extracted from
  976. * `handleRequestAccess` so `handleRequestTeachAccess` can call the same path.
  977. *
  978. * Does the full app-name→InstalledApp resolution, assigns each a tier
  979. * (browser→"read", terminal/IDE→"click", else "full" — see deniedApps.ts),
  980. * splits into already-granted (skip the dialog, preserve grantedAt+tier) vs
  981. * need-dialog, and computes the willHide preview. Unlike the previous
  982. * hard-deny model, ALL apps proceed to the dialog; the tier just constrains
  983. * what actions are allowed once granted.
  984. */
  /**
   * An app that was assigned a restricted tier (anything other than `"full"`).
   * Collected to build the guidance message that tells the model what it can
   * and cannot do with the app.
   */
  interface TieredApp {
    /** Bundle ID of the resolved app. */
    bundleId: string;
    /** Display name of the resolved app. */
    displayName: string;
    /** The restricted tier. Never `"full"` — only restricted tiers are collected. */
    tier: "read" | "click";
  }
  /** The pieces `buildAccessRequest` hands back to its callers. */
  interface AccessRequestParts {
    /** Requests that are not already granted and therefore go to the dialog. */
    needDialog: ResolvedAppRequest[];
    /** Grants for already-granted apps; these skip the dialog. */
    skipDialogGrants: AppGrant[];
    /** Preview of the apps that would be hidden. */
    willHide: { bundleId: string; displayName: string }[];
    /**
     * Resolved apps with `proposedTier !== "full"` — for the guidance text.
     * Unresolved apps are omitted (they go to `denied` with `not_installed`).
     */
    tieredApps: TieredApp[];
    /**
     * Apps stripped by the user's Settings auto-deny list. Surfaced in the
     * response with guidance; never reach the dialog.
     */
    userDenied: { requestedName: string; displayName: string }[];
    /**
     * Apps stripped by the baked-in policy blocklist (streaming/music/ebooks,
     * etc. — `deniedApps.isPolicyDenied`). Precedence over userDenied.
     */
    policyDenied: { requestedName: string; displayName: string }[];
  }
  1007. async function buildAccessRequest(
  1008. adapter: ComputerUseHostAdapter,
  1009. apps: string[],
  1010. allowedApps: AppGrant[],
  1011. userDeniedBundleIds: ReadonlySet<string>,
  1012. selectedDisplayId?: number,
  1013. ): Promise<AccessRequestParts> {
  1014. const alreadyGranted = new Set(allowedApps.map((g) => g.bundleId));
  1015. const installed = await adapter.executor.listInstalledApps();
  1016. const resolved = resolveRequestedApps(apps, installed, alreadyGranted);
  1017. // Policy-level auto-deny (baked-in, not user-configurable). Stripped
  1018. // before userDenied — checks bundle ID AND display name (covers
  1019. // unresolved requests). Precedence: policy > user setting > tier.
  1020. const policyDenied: Array<{ requestedName: string; displayName: string }> =
  1021. [];
  1022. const afterPolicy: typeof resolved = [];
  1023. for (const r of resolved) {
  1024. const displayName = r.resolved?.displayName ?? r.requestedName;
  1025. if (isPolicyDenied(r.resolved?.bundleId, displayName)) {
  1026. policyDenied.push({ requestedName: r.requestedName, displayName });
  1027. } else {
  1028. afterPolicy.push(r);
  1029. }
  1030. }
  1031. // User-configured auto-deny (Settings → Desktop app → Computer Use).
  1032. // Stripped BEFORE
  1033. // tier assignment — these never reach the dialog regardless of category.
  1034. // Bundle-ID match only (the Settings UI picks from installed apps, which
  1035. // always have a bundle ID). Unresolved requests pass through to the tier
  1036. // system; the user can't preemptively deny an app that isn't installed.
  1037. const userDenied: Array<{ requestedName: string; displayName: string }> = [];
  1038. const surviving: typeof afterPolicy = [];
  1039. for (const r of afterPolicy) {
  1040. if (r.resolved && userDeniedBundleIds.has(r.resolved.bundleId)) {
  1041. userDenied.push({
  1042. requestedName: r.requestedName,
  1043. displayName: r.resolved.displayName,
  1044. });
  1045. } else {
  1046. surviving.push(r);
  1047. }
  1048. }
  1049. // Collect resolved apps with a restricted tier for the guidance message.
  1050. // Unresolved apps with a restricted tier (e.g. model asks for "Chrome" but
  1051. // it's not installed) are omitted — they'll end up in the `denied` list
  1052. // with reason "not_installed" and the model will see that instead.
  1053. const tieredApps: TieredApp[] = [];
  1054. for (const r of surviving) {
  1055. if (r.proposedTier === "full" || !r.resolved) continue;
  1056. tieredApps.push({
  1057. bundleId: r.resolved.bundleId,
  1058. displayName: r.resolved.displayName,
  1059. tier: r.proposedTier,
  1060. });
  1061. }
  1062. // Idempotence: apps that are already granted skip the dialog and are
  1063. // merged into the `granted` response. Existing grants keep their tier
  1064. // (which may differ from the current proposedTier if policy changed).
  1065. const skipDialog = surviving.filter((r) => r.alreadyGranted);
  1066. const needDialog = surviving.filter((r) => !r.alreadyGranted);
  1067. // Populate icons only for what the dialog will actually show. Sequential
  1068. // awaits are fine — the Swift module is cached (listInstalledApps above
  1069. // loaded it), each N-API call is synchronous, and the darwin executor
  1070. // memoizes by path. Failures leave iconDataUrl undefined; renderer falls
  1071. // back to a grey box.
  1072. for (const r of needDialog) {
  1073. if (!r.resolved) continue;
  1074. try {
  1075. r.resolved.iconDataUrl = await adapter.executor.getAppIcon(
  1076. r.resolved.path,
  1077. );
  1078. } catch {
  1079. // leave undefined
  1080. }
  1081. }
  1082. const now = Date.now();
  1083. const skipDialogGrants: AppGrant[] = skipDialog
  1084. .filter((r) => r.resolved)
  1085. .map((r) => {
  1086. // Reuse the existing grant (preserving grantedAt + tier) rather than
  1087. // synthesizing a new one — keeps Settings-page "Granted 3m ago" honest.
  1088. const existing = allowedApps.find(
  1089. (g) => g.bundleId === r.resolved!.bundleId,
  1090. );
  1091. return (
  1092. existing ?? {
  1093. bundleId: r.resolved!.bundleId,
  1094. displayName: r.resolved!.displayName,
  1095. grantedAt: now,
  1096. tier: r.proposedTier,
  1097. }
  1098. );
  1099. });
  1100. // Preview what will be hidden if the user approves exactly the requested
  1101. // set plus what they already have. All tiers are visible, so everything
  1102. // resolved goes in the exempt set.
  1103. const exemptForPreview = [
  1104. ...allowedApps.map((a) => a.bundleId),
  1105. ...surviving.filter((r) => r.resolved).map((r) => r.resolved!.bundleId),
  1106. ];
  1107. const willHide = await adapter.executor.previewHideSet(
  1108. exemptForPreview,
  1109. selectedDisplayId,
  1110. );
  1111. return {
  1112. needDialog,
  1113. skipDialogGrants,
  1114. willHide,
  1115. tieredApps,
  1116. userDenied,
  1117. policyDenied,
  1118. };
  1119. }
  /**
   * Compose the upfront guidance text for apps that were granted below tier
   * "full". The text is returned inline in the okJson response so the model
   * learns each app's limits immediately instead of by hitting the tier gate.
   *
   * Up to three paragraphs are produced, in this order, separated by a blank
   * line: read-tier browsers, other read-tier apps, click-tier apps.
   *
   * @param tiered Apps that carry a restricted tier.
   * @returns The paragraphs followed by TIER_ANTI_SUBVERSION, or "" when no
   *   app falls into any of the three groups.
   */
  function buildTierGuidanceMessage(tiered: TieredApp[]): string {
    const quoteNames = (apps: TieredApp[]): string =>
      apps.map((app) => `"${app.displayName}"`).join(", ");
    // Tier "read" is shared by more than one category, so browsers are singled
    // out: they get the Claude-in-Chrome pointer, the rest get "ask the user".
    const isBrowser = (app: TieredApp): boolean =>
      getDeniedCategoryForApp(app.bundleId, app.displayName) === "browser";

    const browserApps = tiered.filter(
      (app) => app.tier === "read" && isBrowser(app),
    );
    const otherReadApps = tiered.filter(
      (app) => app.tier === "read" && !isBrowser(app),
    );
    const clickApps = tiered.filter((app) => app.tier === "click");

    const paragraphs: string[] = [];

    if (browserApps.length > 0) {
      const single = browserApps.length === 1;
      paragraphs.push(
        [
          `${quoteNames(browserApps)} ${single ? "is a browser" : "are browsers"} — `,
          `granted at tier "read" (visible in screenshots only; no clicks or `,
          `typing). You can read what's on screen but cannot navigate, click, `,
          `or type into ${single ? "it" : "them"}. For browser `,
          `interaction, use the Claude-in-Chrome MCP (tools named `,
          `\`mcp__Claude_in_Chrome__*\`; load via ToolSearch if deferred).`,
        ].join(""),
      );
    }

    if (otherReadApps.length > 0) {
      const single = otherReadApps.length === 1;
      paragraphs.push(
        [
          `${quoteNames(otherReadApps)} ${single ? "is" : "are"} granted at tier `,
          `"read" (visible in screenshots only; no clicks or typing). You can `,
          `read what's on screen but cannot interact. Ask the user to take any `,
          `actions in ${single ? "this app" : "these apps"} `,
          `themselves.`,
        ].join(""),
      );
    }

    if (clickApps.length > 0) {
      const single = clickApps.length === 1;
      paragraphs.push(
        [
          `${quoteNames(clickApps)} ${single ? "has" : "have"} terminal or IDE `,
          `capabilities — granted at tier "click" (visible + plain left-click `,
          `only; NO typing, key presses, right-click, modifier-clicks, or `,
          `drag-drop). You can click buttons and scroll output, but `,
          `${single ? "its" : "their"} integrated terminal and `,
          `editor are off-limits to keyboard input. Right-click (context-menu `,
          `Paste) and dragging text onto ${single ? "it" : "them"} `,
          `require tier "full". For shell commands, use the Bash tool.`,
        ].join(""),
      );
    }

    // The anti-subversion clause that the gate errors carry is stated upfront
    // too, so "no clicks/typing" is not read as an invitation to reach for
    // osascript/cliclick instead.
    return paragraphs.length > 0
      ? paragraphs.join("\n\n") + TIER_ANTI_SUBVERSION
      : "";
  }
  /**
   * Explain to the agent why apps on the user's Settings auto-deny list were
   * stripped from the request. Returned inline in the okJson response so the
   * agent learns two things: request_access denies these apps automatically,
   * and the only way forward is for the human to edit Settings — retrying or
   * rewording the request will not help.
   *
   * @param userDenied Apps removed by the user's deny list; only
   *   `displayName` is used in the text.
   * @returns One guidance sentence block, singular or plural as appropriate.
   */
  function buildUserDeniedGuidance(
    userDenied: Array<{ requestedName: string; displayName: string }>,
  ): string {
    const quoted = userDenied.map(({ displayName }) => `"${displayName}"`);
    const words =
      userDenied.length === 1
        ? { verb: "is", apps: "this app", pronoun: "it" }
        : { verb: "are", apps: "these apps", pronoun: "them" };
    return [
      `${quoted.join(", ")} ${words.verb} in the user's auto-deny list `,
      `(Settings → Desktop app (General) → Computer Use → Denied apps). `,
      `Requests for `,
      `${words.apps} are automatically denied. If you need access for `,
      `this task, ask the user to remove ${words.pronoun} from their `,
      `deny list in Settings — you cannot request this through the tool.`,
    ].join("");
  }
  /**
   * Explain to the agent that apps were blocked by the baked-in policy
   * blocklist, which the user cannot edit. In contrast to the user-denied
   * guidance, no escape hatch is offered: the agent is told to inform the
   * user and look for another approach.
   *
   * @param policyDenied Apps removed by policy; only `displayName` is used
   *   in the text.
   * @returns One guidance sentence block, singular or plural as appropriate.
   */
  function buildPolicyDeniedGuidance(
    policyDenied: Array<{ requestedName: string; displayName: string }>,
  ): string {
    const quoted = policyDenied.map(({ displayName }) => `"${displayName}"`);
    const single = policyDenied.length === 1;
    const verb = single ? "is" : "are";
    const apps = single ? "this app" : "these apps";
    return [
      `${quoted.join(", ")} ${verb} blocked by policy for computer use. `,
      `Requests for ${apps} are automatically `,
      `denied regardless of what the user has approved. There is no Settings `,
      `override. Inform the user that you cannot access `,
      `${apps} and suggest an alternative `,
      `approach if one exists. Do not try to directly subvert this block `,
      `regardless of the user's request.`,
    ].join("");
  }
  /**
   * Telemetry helper that tallies restricted-tier assignments.
   *
   * The `denied_*` field names are retained for schema compatibility;
   * dashboards should read them as "assigned a non-full tier".
   * `denied_browser_count` counts every tier-"read" app (not only browsers)
   * and `denied_terminal_count` counts every tier-"click" app.
   *
   * @param tiered Apps that carry a restricted tier.
   * @returns An object holding only the counters that are greater than zero.
   */
  function tierAssignmentTelemetry(
    tiered: TieredApp[],
  ): Pick<CuCallTelemetry, "denied_browser_count" | "denied_terminal_count"> {
    let readTotal = 0;
    let clickTotal = 0;
    for (const { tier } of tiered) {
      if (tier === "read") {
        readTotal += 1;
      } else if (tier === "click") {
        clickTotal += 1;
      }
    }
    // Zero counters are left out entirely rather than reported as 0.
    return {
      ...(readTotal > 0 ? { denied_browser_count: readTotal } : {}),
      ...(clickTotal > 0 ? { denied_terminal_count: clickTotal } : {}),
    };
  }
  /**
   * Teach-mode sibling of `handleRequestAccess`: same app resolution and TCC
   * threading, but the request goes to the teach approval dialog and
   * `onTeachModeActivated` fires on success. Teach mode has no grant-flag
   * checkboxes (clipboard/systemKeys) — the tool schema omits those fields.
   *
   * Unlike `request_access`, the dialog is shown even when every requested
   * app is already granted: teach mode is a distinct UX (the main window
   * hides) that the user must explicitly consent to, and idempotent app
   * grants do not imply that consent.
   *
   * @param adapter Host adapter (executor, OS-permission check, settings).
   * @param args Raw tool arguments; reads `reason` (string) and `apps`
   *   (array of strings).
   * @param overrides Per-session callbacks and state.
   * @param tccState Present when macOS permissions are not yet granted; the
   *   TCC panel is then shown instead of the app list.
   * @returns An error result for unavailable/conflicting/bad-argument/TCC
   *   cases, otherwise an okJson payload describing grants, denials and
   *   whether teach mode is now active.
   */
  async function handleRequestTeachAccess(
    adapter: ComputerUseHostAdapter,
    args: Record<string, unknown>,
    overrides: ComputerUseOverrides,
    tccState: { accessibility: boolean; screenRecording: boolean } | undefined,
  ): Promise<CuCallToolResult> {
    if (!overrides.onTeachPermissionRequest) {
      return errorResult(
        "Teach mode is not available in this session.",
        "feature_unavailable",
      );
    }

    // The dialog renders in the main window, which is hidden during a tour.
    // A mid-tour re-call (to add another app) is plausible because the
    // request_access docs say "call again mid-session to add more apps" and
    // this tool shares that grant model — so it is refused explicitly.
    if (overrides.getTeachModeActive?.()) {
      return errorResult(
        "Teach mode is already active. To add more apps, end the current tour first, then call request_teach_access again with the full app list.",
        "teach_mode_conflict",
      );
    }

    const reason = requireString(args, "reason");
    if (reason instanceof Error) return errorResult(reason.message, "bad_args");

    // TCC not granted yet: show the permission panel (apps stay empty), then
    // report the post-dialog state rather than the pre-dialog snapshot.
    if (tccState) {
      const tccRequest: CuTeachPermissionRequest = {
        requestId: randomUUID(),
        reason,
        apps: [],
        screenshotFiltering: adapter.executor.capabilities.screenshotFiltering,
        tccState,
      };
      await overrides.onTeachPermissionRequest(tccRequest);

      // The user may have granted while the dialog was up.
      const recheck = await adapter.ensureOsPermissions();
      if (recheck.granted) {
        return errorResult(
          "macOS Accessibility and Screen Recording are now both granted. " +
            "Call request_teach_access again immediately — the next call will " +
            "show the app selection list.",
        );
      }
      const stillMissing = (
        [
          ["Accessibility", recheck.accessibility],
          ["Screen Recording", recheck.screenRecording],
        ] as const
      )
        .filter(([, isGranted]) => !isGranted)
        .map(([permission]) => permission);
      return errorResult(
        `macOS ${stillMissing.join(" and ")} permission(s) not yet granted. ` +
          `The permission panel has been shown. Once the user grants the ` +
          `missing permission(s), call request_teach_access again.`,
        "tcc_not_granted",
      );
    }

    const rawApps = args.apps;
    const appsAreStrings =
      Array.isArray(rawApps) && rawApps.every((a) => typeof a === "string");
    if (!appsAreStrings) {
      return errorResult('"apps" must be an array of strings.', "bad_args");
    }

    const access = await buildAccessRequest(
      adapter,
      rawApps as string[],
      overrides.allowedApps,
      new Set(overrides.userDeniedBundleIds),
      overrides.selectedDisplayId,
    );

    // policyDenied / userDenied sections appear in both response shapes below.
    // Built lazily so they are evaluated at the point each response is formed.
    const deniedSections = () => ({
      ...(access.policyDenied.length > 0 && {
        policyDenied: {
          apps: access.policyDenied,
          guidance: buildPolicyDeniedGuidance(access.policyDenied),
        },
      }),
      ...(access.userDenied.length > 0 && {
        userDenied: {
          apps: access.userDenied,
          guidance: buildUserDeniedGuidance(access.userDenied),
        },
      }),
    });

    // Nothing left to ask about and nothing pre-granted: skip the dialog.
    // Otherwise the user would face an empty approval dialog in which Allow
    // and Deny lead to the same outcome (granted=[] → teach mode stays off).
    const nothingToShow =
      access.needDialog.length === 0 && access.skipDialogGrants.length === 0;
    if (nothingToShow) {
      return okJson(
        {
          granted: [],
          denied: [],
          ...deniedSections(),
          teachModeActive: false,
          screenshotFiltering: adapter.executor.capabilities.screenshotFiltering,
        },
        { granted_count: 0, denied_count: 0 },
      );
    }

    const dialogRequest: CuTeachPermissionRequest = {
      requestId: randomUUID(),
      reason,
      apps: access.needDialog,
      screenshotFiltering: adapter.executor.capabilities.screenshotFiltering,
      ...(access.willHide.length > 0 && {
        willHide: access.willHide,
        autoUnhideEnabled: adapter.getAutoUnhideEnabled(),
      }),
    };
    const response = await overrides.onTeachPermissionRequest(dialogRequest);
    const granted = access.skipDialogGrants.concat(response.granted);

    // Activation hinges on explicit consent in THIS dialog, not merely on the
    // merged grant list being non-empty: pre-existing grants would otherwise
    // let a Deny click still switch teach mode on.
    const teachModeActive =
      response.userConsented === true && granted.length > 0;
    if (teachModeActive) {
      overrides.onTeachModeActivated?.();
    }

    // Tier guidance covers only the apps that actually ended up granted.
    const grantedIds = new Set(granted.map((grant) => grant.bundleId));
    const grantedTieredApps = access.tieredApps.filter((app) =>
      grantedIds.has(app.bundleId),
    );

    return okJson(
      {
        granted,
        denied: response.denied,
        ...deniedSections(),
        ...(grantedTieredApps.length > 0 && {
          tierGuidance: buildTierGuidanceMessage(grantedTieredApps),
        }),
        teachModeActive,
        screenshotFiltering: adapter.executor.capabilities.screenshotFiltering,
      },
      {
        // Counts come from the dialog response only; skipDialogGrants are
        // idempotent re-grants and are not counted again.
        granted_count: response.granted.length,
        denied_count: response.denied.length,
        ...tierAssignmentTelemetry(grantedTieredApps),
      },
    );
  }
  1408. // ---------------------------------------------------------------------------
  1409. // teach_step + teach_batch — shared step primitives
  1410. // ---------------------------------------------------------------------------
  /**
   * One teach step after validation, with its anchor already converted to
   * logical points.
   */
  interface ValidatedTeachStep {
    /** Tooltip text for this step (from the raw `explanation` field). */
    explanation: string;
    /** Preview text for what comes next (from the raw `next_preview` field). */
    nextPreview: string;
    /** Scaled tooltip anchor; left unset when the raw step had no `anchor`. */
    anchorLogical: TeachStepRequest["anchorLogical"];
    /** Raw action records to run once the user clicks Next; may be empty. */
    actions: Record<string, unknown>[];
  }
  /**
   * Check a single raw step record and convert its anchor to logical points.
   *
   * Every error message is prefixed with `label`, which lets teach_batch
   * report `steps[2]: actions[0]…` rather than a bare `actions[0]…`.
   *
   * Coordinates: the model supplies the anchor in image-pixel space (the same
   * space click coordinates use) and `scaleCoord` maps it to logical points
   * against `overrides.lastScreenshot`. During a teach_batch, lastScreenshot
   * keeps its pre-call value for the whole batch, so anchors for step 2+ must
   * point at elements the model expects to sit at those coordinates once the
   * earlier steps' actions have run.
   *
   * @param raw The unvalidated step record.
   * @param adapter Host adapter; used for the display size and the logger.
   * @param overrides Session state (display id, coordinate mode, screenshot).
   * @param label Prefix for error messages.
   * @returns The validated step, or an Error describing the first problem.
   */
  async function validateTeachStepArgs(
    raw: Record<string, unknown>,
    adapter: ComputerUseHostAdapter,
    overrides: ComputerUseOverrides,
    label: string,
  ): Promise<ValidatedTeachStep | Error> {
    const fail = (detail: string): Error => new Error(`${label}: ${detail}`);

    const explanation = requireString(raw, "explanation");
    if (explanation instanceof Error) return fail(explanation.message);

    const nextPreview = requireString(raw, "next_preview");
    if (nextPreview instanceof Error) return fail(nextPreview.message);

    const actions = raw.actions;
    if (!Array.isArray(actions)) {
      return fail(`"actions" must be an array (empty is allowed).`);
    }
    for (let index = 0; index < actions.length; index += 1) {
      const entry: unknown = actions[index];
      if (entry === null || typeof entry !== "object") {
        return fail(`actions[${index}] must be an object`);
      }
      const actionName = (entry as Record<string, unknown>).action;
      if (typeof actionName !== "string") {
        return fail(`actions[${index}].action must be a string`);
      }
      if (!BATCHABLE_ACTIONS.has(actionName)) {
        return fail(
          `actions[${index}].action="${actionName}" is not allowed. ` +
            `Allowed: ${[...BATCHABLE_ACTIONS].join(", ")}.`,
        );
      }
    }

    // The anchor is optional; when given it must be exactly two finite numbers.
    let anchorLogical: TeachStepRequest["anchorLogical"];
    const anchor = raw.anchor;
    if (anchor !== undefined) {
      const isPoint =
        Array.isArray(anchor) &&
        anchor.length === 2 &&
        typeof anchor[0] === "number" &&
        typeof anchor[1] === "number" &&
        Number.isFinite(anchor[0]) &&
        Number.isFinite(anchor[1]);
      if (!isPoint) {
        return fail(`"anchor" must be a [x, y] number tuple or omitted.`);
      }
      const [x, y] = anchor as [number, number];
      const display = await adapter.executor.getDisplaySize(
        overrides.selectedDisplayId,
      );
      anchorLogical = scaleCoord(
        x,
        y,
        overrides.coordinateMode,
        display,
        overrides.lastScreenshot,
        adapter.logger,
      );
    }

    return {
      explanation,
      nextPreview,
      anchorLogical,
      actions: actions as Array<Record<string, unknown>>,
    };
  }
  /**
   * What happened after one tooltip was shown and its actions (if any) ran.
   *
   * - "exit": the user left the tour, or the session was aborted mid-step.
   * - "ok": every action in the step succeeded (`results` may be empty).
   * - "action_error": an action failed and the rest of the step was skipped.
   */
  type TeachStepOutcome =
    | { kind: "exit" }
    | { kind: "ok"; results: BatchActionResult[] }
    | {
        kind: "action_error";
        /** Number of actions that completed before the failure. */
        executed: number;
        /** The result record of the action that failed. */
        failed: BatchActionResult;
        /** Number of actions after the failed one that were not attempted. */
        remaining: number;
        /**
         * Telemetry from the failed inner action (error_kind). Forwarded so
         * the caller can hand it to okJson and keep cu_tool_call accurate
         * when the failure occurred inside a batch.
         */
        telemetry: CuCallTelemetry | undefined;
      };
  /**
   * Display one tooltip, wait for the user to pick Next or Exit, and on Next
   * run the step's actions.
   *
   * Execution follows `handleComputerBatch`: prepareForAction runs ONCE per
   * step (clicking Next is consent to the step's whole sequence), pixel
   * validation is switched off (the sequence is committed), the sub-gates
   * are otherwise passed through to each dispatched action, and the first
   * failing action stops the step with partial results.
   *
   * A step with no actions is valid ("read this, then click Next").
   * The caller must already have checked that `overrides.onTeachStep` exists.
   *
   * @param step The validated step to present and run.
   * @param adapter Host adapter providing the executor.
   * @param overrides Session callbacks and state.
   * @param subGates Gate settings; a per-step copy with overrides is used.
   * @returns The step outcome: exit, ok with results, or action_error.
   */
  async function executeTeachStep(
    step: ValidatedTeachStep,
    adapter: ComputerUseHostAdapter,
    overrides: ComputerUseOverrides,
    subGates: CuSubGates,
  ): Promise<TeachStepOutcome> {
    const { explanation, nextPreview, anchorLogical, actions } = step;

    // Blocks until the user chooses Next or Exit. The non-null assertion is
    // safe because both callers guard on overrides.onTeachStep beforehand.
    const choice = await overrides.onTeachStep!({
      explanation,
      nextPreview,
      anchorLogical,
    });
    if (choice.action === "exit") {
      // An earlier step's left_mouse_down may still be holding the button.
      await releaseHeldMouse(adapter);
      return { kind: "exit" };
    }

    // Next was clicked: switch the overlay to its working state first.
    overrides.onTeachWorking?.();

    if (actions.length === 0) {
      return { kind: "ok", results: [] };
    }

    if (subGates.hideBeforeAction) {
      const allowedIds = overrides.allowedApps.map((app) => app.bundleId);
      const hidden = await adapter.executor.prepareForAction(
        allowedIds,
        overrides.selectedDisplayId,
      );
      if (hidden.length > 0) {
        overrides.onAppsHidden?.(hidden);
      }
    }

    const perActionGates: CuSubGates = {
      ...subGates,
      // Already done once for the whole step, just above.
      hideBeforeAction: false,
      pixelValidation: false,
      // Anchors were computed against the display chosen at batch start; a
      // display switch in the middle would break tooltip positioning.
      autoTargetDisplay: false,
    };

    const results: BatchActionResult[] = [];
    for (let index = 0; index < actions.length; index += 1) {
      // Exit stops the session, so an abort seen here is the exit path caught
      // mid-dispatch instead of at the onTeachStep await.
      if (overrides.isAborted?.()) {
        await releaseHeldMouse(adapter);
        return { kind: "exit" };
      }
      // Short settle between consecutive actions.
      if (index > 0) await sleep(10);

      const act = actions[index];
      const action = act.action as string;
      // Any screenshot attached to the inner result is discarded so click
      // coordinates stay tied to the screenshot taken before this tool call.
      const { screenshot: _dropped, ...inner } = await dispatchAction(
        action,
        act,
        adapter,
        overrides,
        perActionGates,
      );
      const result = {
        action,
        ok: !inner.isError,
        output: firstTextContent(inner),
      };
      results.push(result);

      if (inner.isError) {
        await releaseHeldMouse(adapter);
        return {
          kind: "action_error",
          executed: index,
          failed: result,
          remaining: actions.length - index - 1,
          telemetry: inner.telemetry,
        };
      }
    }

    return { kind: "ok", results };
  }
  /**
   * Attach a fresh screenshot to a teach result, saving the model the
   * separate screenshot call it would otherwise make before the next
   * teach_step (one fewer API round trip per step).
   *
   * handleScreenshot performs its own prepareForAction, which is intended:
   * the step's actions may have opened something outside the allowlist. The
   * `.screenshot` piggyback is passed on for serverDef.ts to stash, so
   * lastScreenshot updates and the next teach_step anchor is scaled against
   * the image the model is now looking at.
   *
   * @param resultJson The step/batch summary to serialize as the text part.
   * @param adapter Host adapter, forwarded to handleScreenshot.
   * @param overrides Session state, forwarded to handleScreenshot.
   * @param subGates Gate settings, forwarded to handleScreenshot.
   * @returns The summary plus screenshot content, or the summary alone when
   *   the screenshot attempt reported an error.
   */
  async function appendTeachScreenshot(
    resultJson: unknown,
    adapter: ComputerUseHostAdapter,
    overrides: ComputerUseOverrides,
    subGates: CuSubGates,
  ): Promise<CuCallToolResult> {
    const shot = await handleScreenshot(adapter, overrides, subGates);

    if (!shot.isError) {
      return {
        content: [
          { type: "text", text: JSON.stringify(resultJson) },
          // handleScreenshot yields [maybeMonitorNote, maybeHiddenNote, image];
          // all of it is passed through — the notes are context the model
          // expects alongside a screenshot.
          ...shot.content,
        ],
        // Stashed by serverDef.ts; the next teach_step anchor scales against it.
        screenshot: shot.screenshot,
      };
    }

    // Hide+screenshot failed (rare). The step itself succeeded, so only the
    // image is left out; the model can call screenshot and see the real error.
    return okJson(resultJson);
  }
  /**
   * Present a single guided-tour tooltip and wait for Next or Exit. On Next
   * the step's `actions[]` are executed with `computer_batch` semantics.
   *
   * @param adapter Host adapter.
   * @param args Raw tool arguments describing the step.
   * @param overrides Session callbacks and state; `onTeachStep` must be set.
   * @param subGates Gate settings passed through to execution.
   * @returns An error result when teach mode is inactive or the arguments
   *   are invalid; otherwise `{exited: true}`, a partial-failure summary, or
   *   the executed results (with a fresh screenshot when actions ran).
   */
  async function handleTeachStep(
    adapter: ComputerUseHostAdapter,
    args: Record<string, unknown>,
    overrides: ComputerUseOverrides,
    subGates: CuSubGates,
  ): Promise<CuCallToolResult> {
    if (!overrides.onTeachStep) {
      return errorResult(
        "Teach mode is not active. Call request_teach_access first.",
        "teach_mode_not_active",
      );
    }

    const step = await validateTeachStepArgs(
      args,
      adapter,
      overrides,
      "teach_step",
    );
    if (step instanceof Error) return errorResult(step.message, "bad_args");

    const outcome = await executeTeachStep(step, adapter, overrides, subGates);
    switch (outcome.kind) {
      case "exit":
        return okJson({ exited: true });

      case "action_error": {
        const { executed, failed, remaining, telemetry } = outcome;
        return okJson({ executed, failed, remaining }, telemetry);
      }

      default: {
        // Success. With no actions the screen did not change, so the
        // screenshot the model already has is still accurate — skip a new one.
        if (step.actions.length === 0) {
          return okJson({ executed: 0, results: [] });
        }
        const { results } = outcome;
        return appendTeachScreenshot(
          { executed: results.length, results },
          adapter,
          overrides,
          subGates,
        );
      }
    }
  }
  /**
   * Run an entire guided tour from a single tool call, in the manner of
   * `computer_batch`: N steps cost one model→API round trip rather than N.
   * The user still paces the tour — each step waits for its own Next click.
   *
   * All steps are validated before the first tooltip appears, so a mistake
   * in a late step cannot surface after the user has clicked through the
   * earlier ones.
   *
   * Every anchor is scaled against the pre-call `lastScreenshot` (the same
   * PRE-BATCH rule as computer_batch). Steps 2+ should therefore omit the
   * anchor (centered tooltip) or target elements expected not to move.
   *
   * Result shapes:
   *   {exited: true, stepsCompleted: N}                    — user clicked Exit
   *   {stepsCompleted, stepFailed, executed, failed, …}    — action error
   *   {stepsCompleted, results: [...]} (+ screenshot)      — all steps ran
   *
   * @param adapter Host adapter.
   * @param args Raw tool arguments; reads `steps` (non-empty array).
   * @param overrides Session callbacks and state; `onTeachStep` must be set.
   * @param subGates Gate settings passed through to execution.
   */
  async function handleTeachBatch(
    adapter: ComputerUseHostAdapter,
    args: Record<string, unknown>,
    overrides: ComputerUseOverrides,
    subGates: CuSubGates,
  ): Promise<CuCallToolResult> {
    if (!overrides.onTeachStep) {
      return errorResult(
        "Teach mode is not active. Call request_teach_access first.",
        "teach_mode_not_active",
      );
    }

    const rawSteps = args.steps;
    if (!Array.isArray(rawSteps) || rawSteps.length === 0) {
      return errorResult('"steps" must be a non-empty array.', "bad_args");
    }

    // Phase 1: validate everything before any tooltip is shown.
    const validated: ValidatedTeachStep[] = [];
    for (let index = 0; index < rawSteps.length; index += 1) {
      const candidate: unknown = rawSteps[index];
      if (candidate === null || typeof candidate !== "object") {
        return errorResult(`steps[${index}] must be an object`, "bad_args");
      }
      const checked = await validateTeachStepArgs(
        candidate as Record<string, unknown>,
        adapter,
        overrides,
        `steps[${index}]`,
      );
      if (checked instanceof Error) {
        return errorResult(checked.message, "bad_args");
      }
      validated.push(checked);
    }

    // Phase 2: run the steps in order, stopping at the first exit or failure.
    const resultsPerStep: BatchActionResult[][] = [];
    for (let index = 0; index < validated.length; index += 1) {
      const outcome = await executeTeachStep(
        validated[index],
        adapter,
        overrides,
        subGates,
      );
      if (outcome.kind === "exit") {
        return okJson({ exited: true, stepsCompleted: index });
      }
      if (outcome.kind === "action_error") {
        return okJson(
          {
            stepsCompleted: index,
            stepFailed: index,
            executed: outcome.executed,
            failed: outcome.failed,
            remaining: outcome.remaining,
            results: resultsPerStep,
          },
          outcome.telemetry,
        );
      }
      resultsPerStep.push(outcome.results);
    }

    // A closing screenshot is only worthwhile if some step ran actions,
    // i.e. the screen may have changed.
    const summary = {
      stepsCompleted: validated.length,
      results: resultsPerStep,
    };
    const ranAnyAction = validated.some((s) => s.actions.length > 0);
    return ranAnyAction
      ? appendTeachScreenshot(summary, adapter, overrides, subGates)
      : okJson(summary);
  }
  /**
   * Produce the note about hidden apps that goes with a screenshot. It names
   * the apps that were hidden for not being in the allowlist and tells the
   * model how to add them.
   *
   * @param adapter Host adapter; running apps are listed to turn bundle ids
   *   into display names.
   * @param hiddenSinceLastSeen Bundle ids hidden since the last screenshot.
   * @returns The note, or undefined when the list is empty. Ids with no
   *   matching running app are shown as the raw bundle id.
   */
  async function buildHiddenNote(
    adapter: ComputerUseHostAdapter,
    hiddenSinceLastSeen: string[],
  ): Promise<string | undefined> {
    if (hiddenSinceLastSeen.length === 0) {
      return undefined;
    }

    const runningApps = await adapter.executor.listRunningApps();
    const displayNameById = new Map(
      runningApps.map((app) => [app.bundleId, app.displayName]),
    );
    const quotedList = hiddenSinceLastSeen
      .map((bundleId) => displayNameById.get(bundleId) ?? bundleId)
      .map((name) => `"${name}"`)
      .join(", ");

    const words =
      hiddenSinceLastSeen.length === 1
        ? { verb: "was", opened: "it", added: "it" }
        : { verb: "were", opened: "one of them", added: "them" };

    return [
      `${quotedList} ${words.verb} open and got hidden before this screenshot `,
      `(not in the session allowlist). If a previous action was meant to open `,
      `${words.opened}, that's why you don't see it — call `,
      `request_access to add ${words.added} to the allowlist.`,
    ].join("");
  }
  /**
   * Assign a human-readable label to each display. Falls back to `display N`
   * when NSScreen.localizedName is undefined; disambiguates identical labels
   * (matched-pair external monitors) with a `(2)` suffix. Used by both
   * buildMonitorNote and handleSwitchDisplay so the name the model sees in a
   * screenshot note is the same name it can pass back to switch_display.
   *
   * Every returned label is distinct: if a generated `base (N)` label would
   * coincide with a label already handed out (e.g. one monitor is literally
   * named "Foo (2)" next to two monitors named "Foo"), the suffix is bumped
   * until the label is free. Without that, two displays could share a name and
   * the name could not identify a single display.
   *
   * @param displays Displays to label; the input array is not mutated.
   * @returns Map from displayId to its unique label.
   */
  function uniqueDisplayLabels(
    displays: readonly DisplayGeometry[],
  ): Map<number, string> {
    // Sort by displayId so the (N) suffix is stable regardless of
    // NSScreen.screens iteration order — same label always maps to same
    // physical display across buildMonitorNote → switch_display round-trip,
    // even if display configuration reorders between the two calls.
    const sorted = [...displays].sort((a, b) => a.displayId - b.displayId);
    const counts = new Map<string, number>();
    const used = new Set<string>();
    const out = new Map<number, string>();
    for (const d of sorted) {
      const base = d.label ?? `display ${d.displayId}`;
      let n = (counts.get(base) ?? 0) + 1;
      let label = n === 1 ? base : `${base} (${n})`;
      // Skip past any label already issued to another display.
      while (used.has(label)) {
        n += 1;
        label = `${base} (${n})`;
      }
      counts.set(base, n);
      used.add(label);
      out.set(d.displayId, label);
    }
    return out;
  }
  /**
   * Compose the monitor-context sentence attached to a screenshot: names the
   * monitor the shot came from, lists the other attached monitors, and calls
   * out a monitor change relative to the previous screenshot.
   *
   * A note is produced only when 2+ displays are attached AND this is either
   * the first screenshot or the display differs from the previous one.
   * Single-monitor setups and same-monitor follow-ups yield undefined to keep
   * the transcript quiet.
   *
   * @param adapter Host adapter; uses `executor.listDisplays` and `logger.warn`.
   * @param shotDisplayId Display the current screenshot was captured on.
   * @param lastDisplayId Display of the previous screenshot, if any.
   * @param canSwitchDisplay Whether to append the switch_display hint.
   * @returns The note text, or undefined when no note applies.
   */
  async function buildMonitorNote(
    adapter: ComputerUseHostAdapter,
    shotDisplayId: number,
    lastDisplayId: number | undefined,
    canSwitchDisplay: boolean,
  ): Promise<string | undefined> {
    // The note is optional context, so a listDisplays failure (e.g. Swift
    // reporting zero screens during a monitor hot-unplug) is logged and
    // swallowed rather than allowed to fail the screenshot.
    let displays;
    try {
      displays = await adapter.executor.listDisplays();
    } catch (e) {
      adapter.logger.warn(`[computer-use] listDisplays failed: ${String(e)}`);
      return undefined;
    }
    if (displays.length < 2) return undefined;

    const labelById = uniqueDisplayLabels(displays);
    const labelFor = (id: number): string =>
      labelById.get(id) ?? `display ${id}`;
    const currentLabel = labelFor(shotDisplayId);

    // Quoted names of every display except the one just captured.
    const quotedOthers: string[] = [];
    for (const d of displays) {
      if (d.displayId !== shotDisplayId) {
        quotedOthers.push(`"${labelFor(d.displayId)}"`);
      }
    }
    let trailer = "";
    if (quotedOthers.length > 0) {
      trailer = ` Other attached monitors: ${quotedOthers.join(", ")}.`;
      if (canSwitchDisplay) {
        trailer += " Use switch_display to capture a different monitor.";
      }
    }

    // 0 is kCGNullDirectDisplay (sentinel from old sessions persisted
    // pre-multimon) — handled exactly like "no previous screenshot".
    if (lastDisplayId === undefined || lastDisplayId === 0) {
      return `This screenshot was taken on monitor "${currentLabel}".${trailer}`;
    }
    if (lastDisplayId === shotDisplayId) {
      // Steady state on the same monitor: nothing worth saying.
      return undefined;
    }
    const previousLabel = labelFor(lastDisplayId);
    return (
      `This screenshot was taken on monitor "${currentLabel}", which is different ` +
      `from your previous screenshot (taken on "${previousLabel}").` +
      trailer
    );
  }
  /**
   * Screenshot tool handler. Refuses when the allowlist is empty; otherwise
   * captures via one of two paths chosen by `subGates.autoTargetDisplay`:
   *
   *  - atomic: a single `executor.resolvePrepareCapture` call that resolves the
   *    display, optionally hides apps, and captures;
   *  - separate calls: optional `executor.prepareForAction` followed by
   *    `takeScreenshotWithRetry`.
   *
   * Both paths report newly hidden apps through `overrides.onAppsHidden` and
   * return the image preceded by optional monitor / hidden-app notes, with the
   * shot piggybacked on `.screenshot`.
   */
  async function handleScreenshot(
    adapter: ComputerUseHostAdapter,
    overrides: ComputerUseOverrides,
    subGates: CuSubGates,
  ): Promise<CuCallToolResult> {
    // §2 — empty allowlist → tool error, no screenshot.
    if (overrides.allowedApps.length === 0) {
      return errorResult(
        "No applications are granted for this session. Call request_access first.",
        "allowlist_empty",
      );
    }

    const grantedBundleIds = () =>
      overrides.allowedApps.map((app) => app.bundleId);

    // Shared tail of both capture paths: build the optional notes (hidden-app
    // note first, then monitor note) and wrap the shot in a tool result.
    const respondWith = async (
      shot: ScreenshotResult,
      hiddenIds: string[],
    ): Promise<CuCallToolResult> => {
      const hiddenNote = await buildHiddenNote(adapter, hiddenIds);
      const monitorNote = await buildMonitorNote(
        adapter,
        shot.displayId,
        overrides.lastScreenshot?.displayId,
        overrides.onDisplayPinned !== undefined,
      );
      return {
        content: [
          ...(monitorNote ? [{ type: "text" as const, text: monitorNote }] : []),
          ...(hiddenNote ? [{ type: "text" as const, text: hiddenNote }] : []),
          { type: "image", data: shot.base64, mimeType: "image/jpeg" },
        ],
        // Piggybacked for serverDef.ts to stash on InternalServerContext.
        screenshot: shot,
      };
    };

    // Atomic resolve→prepare→capture (one Swift call, no scheduler gap).
    // Gate off → skip to the separate-calls path further down.
    if (subGates.autoTargetDisplay) {
      // An explicit switch_display pin by the model wins outright. Otherwise
      // the display is sticky: auto-resolve only when the allowed-app set has
      // changed since the display was last resolved, so the resolver does not
      // yank the display on every screenshot.
      const allowedBundleIds = grantedBundleIds();
      const appSetKey = [...allowedBundleIds].sort().join(",");
      const autoResolve =
        !overrides.displayPinnedByModel &&
        appSetKey !== overrides.displayResolvedForApps;

      const capture = await adapter.executor.resolvePrepareCapture({
        allowedBundleIds,
        preferredDisplayId: overrides.selectedDisplayId,
        autoResolve,
        // hideBeforeAction stays independently rollable: the atomic path
        // honors the same toggle the separate-calls path checks before
        // prepareForAction.
        doHide: subGates.hideBeforeAction,
      });

      // The separate-calls path retries on tiny screenshots; the atomic call is
      // expensive (resolve+prepare+capture), so here an implausibly small
      // result only earns a warning. Skipped when captureError is set, since
      // base64 is intentionally empty in that case.
      if (capture.captureError === undefined) {
        const decodedBytes = decodedByteLength(capture.base64);
        if (decodedBytes < MIN_SCREENSHOT_BYTES) {
          adapter.logger.warn(
            `[computer-use] resolvePrepareCapture result implausibly small (${decodedBytes} bytes decoded) — possible transient display state`,
          );
        }
      }

      // The resolver chose a display other than the session's selection (host
      // window moved, or an allowed app lives on another display). Write the
      // pick back so teach-overlay positioning and later non-resolver calls
      // follow the same display. Fire-and-forget.
      const { displayId: resolvedDisplayId, hidden } = capture;
      if (resolvedDisplayId !== overrides.selectedDisplayId) {
        adapter.logger.debug(
          `[computer-use] resolver: preferred=${overrides.selectedDisplayId} resolved=${resolvedDisplayId}`,
        );
        overrides.onResolvedDisplayUpdated?.(resolvedDisplayId);
      }

      // Remember which app set this display was resolved for, so the next
      // screenshot skips auto-resolve until the set changes. Keyed on
      // autoResolve rather than on the set having changed: when pinned, nothing
      // was resolved, so the key must not move.
      if (autoResolve) {
        overrides.onDisplayResolvedForApps?.(appSetKey);
      }

      // Hidden apps are only worth reporting once the model has seen the screen.
      const hiddenSinceLastSeen: string[] =
        overrides.lastScreenshot !== undefined ? hidden : [];
      if (hidden.length > 0) {
        overrides.onAppsHidden?.(hidden);
      }

      // Partial success: hide worked, capture failed (SCK permission revoked
      // mid-session). onAppsHidden already fired above, so auto-unhide restores
      // the apps at turn end; now surface the error to the model.
      if (capture.captureError !== undefined) {
        return errorResult(capture.captureError, "capture_failed");
      }

      // Cherry-pick — never spread `capture` (resolver fields would leak into
      // lastScreenshot).
      const {
        base64,
        width,
        height,
        displayWidth,
        displayHeight,
        displayId,
        originX,
        originY,
      } = capture;
      return respondWith(
        {
          base64,
          width,
          height,
          displayWidth,
          displayHeight,
          displayId,
          originX,
          originY,
        },
        hiddenSinceLastSeen,
      );
    }

    // Separate-calls path. Same hide+defocus sequence as input actions: with a
    // non-allowlisted app on top, SCContentFilter would composite it out and
    // the model would see the pixels BELOW it — which are not what is really on
    // screen. Hiding first makes the screenshot TRUE.
    let hiddenSinceLastSeen: string[] = [];
    if (subGates.hideBeforeAction) {
      const hidden = await adapter.executor.prepareForAction(
        grantedBundleIds(),
        overrides.selectedDisplayId,
      );
      // "Something appeared since the model last looked." Reported whenever
      //   (a) prepare hid something, AND
      //   (b) the model has ALREADY SEEN the screen (lastScreenshot is set).
      //
      // (b) silences the expected-noise hide of the very first screenshot. This
      // is deliberately NOT a delta against a cumulative set: that was the
      // earlier bug — cuHiddenDuringTurn only grows, so once Preview was in it
      // (from the first screenshot's hide) later re-hides of Preview produced
      // an empty delta, and the "double-click → Preview opens → re-hide →
      // silence" loop never broke.
      //
      // Now every re-hide is reported. A model looping "click → file opens in
      // Preview → screenshot → Preview hidden" is told EVERY time, and will
      // eventually request_access for Preview (or give up).
      //
      // False positive: the user alt-tabs mid-turn → Safari re-hidden →
      // reported. Rare, and at worst mild noise — far better than the
      // false negative of never explaining why the file vanished.
      if (overrides.lastScreenshot !== undefined) {
        hiddenSinceLastSeen = hidden;
      }
      if (hidden.length > 0) {
        overrides.onAppsHidden?.(hidden);
      }
    }

    const shot = await takeScreenshotWithRetry(
      adapter.executor,
      grantedBundleIds(),
      adapter.logger,
      overrides.selectedDisplayId,
    );
    return respondWith(shot, hiddenSinceLastSeen);
  }
  /**
   * Zoom tool handler: returns an upscaled crop of a region of the last
   * screenshot. Coord invariant (computer_use_v2.py:1092): click coords ALWAYS
   * refer to the full-screen screenshot, never to the zoomed image.
   *
   * The invariant is structural — the result deliberately carries NO
   * `.screenshot` field, so serverDef.ts's `if (result.screenshot)` branch
   * cannot fire and `cuLastScreenshot` stays untouched. `executor.zoom()`'s
   * return type also lacks displayWidth/displayHeight, so it is not assignable
   * to `ScreenshotResult` even by accident.
   *
   * `args.region` is `[x0, y0, x1, y1]` in IMAGE-PX of lastScreenshot — the
   * same space the model reads click coords from.
   */
  async function handleZoom(
    adapter: ComputerUseHostAdapter,
    args: Record<string, unknown>,
    overrides: ComputerUseOverrides,
  ): Promise<CuCallToolResult> {
    const { region } = args;
    if (!Array.isArray(region) || region.length !== 4) {
      return errorResult(
        "region must be an array of length 4: [x0, y0, x1, y1]",
        "bad_args",
      );
    }
    const [x0, y0, x1, y1] = region;
    for (const value of [x0, y0, x1, y1]) {
      // `!(value >= 0)` rather than `value < 0` so NaN is rejected too.
      if (typeof value !== "number" || !(value >= 0)) {
        return errorResult(
          "region values must be non-negative numbers",
          "bad_args",
        );
      }
    }
    if (x1 <= x0) {
      return errorResult("region x1 must be greater than x0", "bad_args");
    }
    if (y1 <= y0) {
      return errorResult("region y1 must be greater than y0", "bad_args");
    }

    const last = overrides.lastScreenshot;
    if (!last) {
      return errorResult(
        "take a screenshot before zooming (region coords are relative to it)",
        "state_conflict",
      );
    }
    const outOfBounds = x1 > last.width || y1 > last.height;
    if (outOfBounds) {
      return errorResult(
        `region exceeds screenshot bounds (${last.width}×${last.height})`,
        "bad_args",
      );
    }

    // image-px → logical-pt. Same ratio as scaleCoord (:198-199) —
    // displayWidth / width, not 1/scaleFactor. The ratio is folded.
    const ptPerPxX = last.displayWidth / last.width;
    const ptPerPxY = last.displayHeight / last.height;
    const regionLogical = {
      x: x0 * ptPerPxX,
      y: y0 * ptPerPxY,
      w: (x1 - x0) * ptPerPxX,
      h: (y1 - y0) * ptPerPxY,
    };

    // Crop from the display lastScreenshot came from, so the zoomed region
    // matches the image the model is reading coords off.
    const zoomed = await adapter.executor.zoom(
      regionLogical,
      overrides.allowedApps.map((app) => app.bundleId),
      last.displayId,
    );

    // Image only. NO `.screenshot` piggyback — this is the invariant.
    return {
      content: [{ type: "image", data: zoomed.base64, mimeType: "image/jpeg" }],
    };
  }
  /**
   * Shared handler for all five click variants.
   *
   * Sequence: release any button left held by a prior left_mouse_down → parse
   * the coordinate and optional modifier chord (`args.text`) → input-action
   * gates → optional pixel-validation staleness check → scale to display
   * coords → hit-test gate → `executor.click`.
   *
   * @param button Mouse button to click.
   * @param count Number of clicks (single / double / triple).
   */
  async function handleClickVariant(
    adapter: ComputerUseHostAdapter,
    args: Record<string, unknown>,
    overrides: ComputerUseOverrides,
    subGates: CuSubGates,
    button: "left" | "right" | "middle",
    count: 1 | 2 | 3,
  ): Promise<CuCallToolResult> {
    // A prior left_mouse_down may have set mouseButtonHeld with no matching
    // left_mouse_up (e.g. a drag rejected by a tier gate, after which the model
    // falls back to left_click). executor.click() performs its own
    // mouseDown+mouseUp and so releases the OS button — but the JS flag would
    // stay true, and every later mouse_move would take the held-button path
    // ("mouse"/"mouse_full" actionKind + hit-test), causing spurious rejections
    // on click-tier and read-tier windows. Release up front for a clean slate.
    if (mouseButtonHeld) {
      await adapter.executor.mouseUp();
      mouseButtonHeld = false;
      mouseMoved = false;
    }

    const coord = extractCoordinate(args);
    if (coord instanceof Error) {
      return errorResult(coord.message, "bad_args");
    }
    const [rawX, rawY] = coord;

    // left_click(coordinate=[x,y], text="shift") — hold modifiers during the
    // click. Chord parsing is shared with the key tool.
    const chordText = args.text;
    let modifiers: string[] | undefined;
    if (chordText !== undefined) {
      if (typeof chordText !== "string") {
        return errorResult("text must be a string", "bad_args");
      }
      // Same gate as handleKey/handleHoldKey. withModifiers presses each name
      // via native.key(m, "press") — a non-modifier such as "q" in
      // text="cmd+q" is pressed while Cmd is held → Cmd+Q fires before the
      // click.
      const needsSystemGrant =
        isSystemKeyCombo(chordText, adapter.executor.capabilities.platform) &&
        !overrides.grantFlags.systemKeyCombos;
      if (needsSystemGrant) {
        return errorResult(
          `The modifier chord "${chordText}" would fire a system shortcut. ` +
            "Request the systemKeyCombos grant flag via request_access, or use " +
            "only modifier keys (shift, ctrl, alt, cmd) in the text parameter.",
          "grant_flag_required",
        );
      }
      modifiers = parseKeyChord(chordText);
    }

    // Right/middle-click and any click carrying a modifier chord escalate to
    // keyboard-equivalent input at tier "click" (context-menu Paste, chord
    // keystrokes). Computed once and handed to both gates.
    const hasModifiers = modifiers !== undefined && modifiers.length > 0;
    const clickActionKind: CuActionKind =
      button === "left" && !hasModifiers ? "mouse" : "mouse_full";

    const gate = await runInputActionGates(
      adapter,
      overrides,
      subGates,
      clickActionKind,
    );
    if (gate) return gate;

    const display = await adapter.executor.getDisplaySize(
      overrides.selectedDisplayId,
    );

    // §6 item P — pixel-validation staleness check. Sub-gated. Placed AFTER the
    // gates (pointless to validate a click that is about to be refused) but
    // BEFORE the executor call.
    if (subGates.pixelValidation) {
      const pct = coordToPercentageForPixelCompare(
        rawX,
        rawY,
        overrides.coordinateMode,
        overrides.lastScreenshot,
      );
      const validation = await validateClickTarget(
        adapter.cropRawPatch,
        overrides.lastScreenshot,
        pct.xPct,
        pct.yPct,
        async () => {
          // Compare like with like: the fresh shot uses the SAME allow-set as
          // the model's last screenshot, and the same display as that
          // screenshot — not the current selection.
          try {
            return await adapter.executor.screenshot({
              allowedBundleIds: overrides.allowedApps.map((app) => app.bundleId),
              displayId: overrides.lastScreenshot?.displayId,
            });
          } catch {
            return null;
          }
        },
        adapter.logger,
      );
      if (!validation.valid && validation.warning) {
        // Warning result — the model is told to re-screenshot.
        return okText(validation.warning);
      }
    }

    const target = scaleCoord(
      rawX,
      rawY,
      overrides.coordinateMode,
      display,
      overrides.lastScreenshot,
      adapter.logger,
    );
    const hitGate = await runHitTestGate(
      adapter,
      overrides,
      subGates,
      target.x,
      target.y,
      clickActionKind,
    );
    if (hitGate) return hitGate;

    await adapter.executor.click(target.x, target.y, button, count, modifiers);
    return okText("Clicked.");
  }
  /**
   * Type tool handler. Requires a string `args.text`, runs the "keyboard"
   * input-action gates, then either pastes via the clipboard (multi-line text,
   * when both the grant and the sub-gate allow it) or types grapheme by
   * grapheme with a pause before each one, checking for a user abort between
   * graphemes.
   */
  async function handleType(
    adapter: ComputerUseHostAdapter,
    args: Record<string, unknown>,
    overrides: ComputerUseOverrides,
    subGates: CuSubGates,
  ): Promise<CuCallToolResult> {
    const text = requireString(args, "text");
    if (text instanceof Error) {
      return errorResult(text.message, "bad_args");
    }

    const gate = await runInputActionGates(
      adapter,
      overrides,
      subGates,
      "keyboard",
    );
    if (gate) return gate;

    // §6 item 3 — clipboard-paste fast path for multi-line text. Sub-gated AND
    // requires the clipboardWrite grant. Save/restore + read-back-verify live
    // in the EXECUTOR (task #5); this handler only routes.
    const isMultiline = text.includes("\n");
    if (
      isMultiline &&
      overrides.grantFlags.clipboardWrite &&
      subGates.clipboardPasteMultiline
    ) {
      await adapter.executor.type(text, { viaClipboard: true });
      return okText("Typed (via clipboard).");
    }

    // §6 item 7 — iterate grapheme clusters. Prevents ZWJ emoji → �.
    // §6 item 4 — 8ms between graphemes (125 Hz USB polling). Battle-tested:
    // sleep BEFORE each keystroke, not after.
    //
    // \n, \r, \t MUST go through executor.key(), never type(). Two reasons:
    //   1. enigo.text("\n") on macOS posts a stale CGEvent with virtualKey=0
    //      after stripping the newline — virtualKey 0 is the 'a' key, so a
    //      ghost 'a' gets typed. Upstream bug in enigo 0.6.1 fast_text().
    //   2. Unicode text-insertion of '\n' is not a Return key press. URL bars
    //      and terminals ignore it; the model's intent (submit/execute) is lost.
    // CRLF (\r\n) is a single grapheme cluster (UAX #29 GB3), hence its own case.
    const graphemes = segmentGraphemes(text);
    const total = graphemes.length;
    for (let typed = 0; typed < total; typed++) {
      // Same abort check as handleComputerBatch. At 8ms/grapheme a 50-char
      // type() runs ~400ms; this loop is where an in-flight batch actually
      // spends its time.
      if (overrides.isAborted?.()) {
        return errorResult(
          `Typing aborted after ${typed} of ${graphemes.length} graphemes (user interrupt).`,
        );
      }
      await sleep(INTER_GRAPHEME_SLEEP_MS);

      const grapheme = graphemes[typed];
      switch (grapheme) {
        case "\n":
        case "\r":
        case "\r\n":
          await adapter.executor.key("return");
          break;
        case "\t":
          await adapter.executor.key("tab");
          break;
        default:
          await adapter.executor.type(grapheme, { viaClipboard: false });
      }
    }
    return okText(`Typed ${graphemes.length} grapheme(s).`);
  }
  /**
   * Key tool handler. Validates `args.text` (the key sequence) and the optional
   * `args.repeat` count (integer in 1..100), refuses system-level shortcuts
   * unless the systemKeyCombos grant is present, then runs the "keyboard"
   * input-action gates and presses the key(s).
   */
  async function handleKey(
    adapter: ComputerUseHostAdapter,
    args: Record<string, unknown>,
    overrides: ComputerUseOverrides,
    subGates: CuSubGates,
  ): Promise<CuCallToolResult> {
    const keySequence = requireString(args, "text");
    if (keySequence instanceof Error) {
      return errorResult("text is required", "bad_args");
    }

    // Cap 100, error strings match.
    const requestedRepeat = args.repeat;
    let repeat: number | undefined;
    if (requestedRepeat !== undefined) {
      if (
        typeof requestedRepeat !== "number" ||
        !Number.isInteger(requestedRepeat) ||
        requestedRepeat < 1
      ) {
        return errorResult("repeat must be a positive integer", "bad_args");
      }
      if (requestedRepeat > 100) {
        return errorResult("repeat exceeds maximum of 100", "bad_args");
      }
      repeat = requestedRepeat;
    }

    // §2 — blocklist check BEFORE the gates. A blocked combo pressed while an
    // ungranted app is frontmost must yield the blocklist error, not the
    // frontmost error: the model's fix is to request the flag, not to change
    // focus.
    const blockedSystemCombo =
      isSystemKeyCombo(keySequence, adapter.executor.capabilities.platform) &&
      !overrides.grantFlags.systemKeyCombos;
    if (blockedSystemCombo) {
      return errorResult(
        `"${keySequence}" is a system-level shortcut. Request the \`systemKeyCombos\` grant via request_access to use it.`,
        "grant_flag_required",
      );
    }

    const gate = await runInputActionGates(
      adapter,
      overrides,
      subGates,
      "keyboard",
    );
    if (gate) return gate;

    await adapter.executor.key(keySequence, repeat);
    return okText("Key pressed.");
  }
  /**
   * Scroll tool handler. Validates the coordinate, `scroll_direction`
   * ('up' | 'down' | 'left' | 'right') and `scroll_amount` (integer in
   * 0..100), maps them onto the executor's dx/dy interface, then runs the
   * input-action gates and the hit-test gate before scrolling at the scaled
   * point.
   */
  async function handleScroll(
    adapter: ComputerUseHostAdapter,
    args: Record<string, unknown>,
    overrides: ComputerUseOverrides,
    subGates: CuSubGates,
  ): Promise<CuCallToolResult> {
    const coord = extractCoordinate(args);
    if (coord instanceof Error) {
      return errorResult(coord.message, "bad_args");
    }
    const [rawX, rawY] = coord;

    // Inputs are scroll_direction + scroll_amount; the executor wants dx/dy.
    const dir = args.scroll_direction;
    if (
      typeof dir !== "string" ||
      !["up", "down", "left", "right"].includes(dir)
    ) {
      return errorResult(
        "scroll_direction must be 'up', 'down', 'left', or 'right'",
        "bad_args",
      );
    }
    const amount = args.scroll_amount;
    if (typeof amount !== "number" || !Number.isInteger(amount) || amount < 0) {
      return errorResult("scroll_amount must be a non-negative int", "bad_args");
    }
    if (amount > 100) {
      return errorResult("scroll_amount exceeds maximum of 100", "bad_args");
    }

    // up → dy = -amount; down → dy = +amount; left → dx = -amount; right → dx = +amount.
    let dx = 0;
    let dy = 0;
    switch (dir) {
      case "up":
        dy = -amount;
        break;
      case "down":
        dy = amount;
        break;
      case "left":
        dx = -amount;
        break;
      case "right":
        dx = amount;
        break;
    }

    const gate = await runInputActionGates(adapter, overrides, subGates, "mouse");
    if (gate) return gate;

    const display = await adapter.executor.getDisplaySize(
      overrides.selectedDisplayId,
    );
    const point = scaleCoord(
      rawX,
      rawY,
      overrides.coordinateMode,
      display,
      overrides.lastScreenshot,
      adapter.logger,
    );

    // With the button held, executor.scroll's internal moveMouse emits a
    // leftMouseDragged event (enigo reads NSEvent.pressedMouseButtons) — the
    // same mechanism as handleMoveMouse's held-button path. So the hit-test is
    // upgraded to "mouse_full" to stop scroll being used to drag-drop text onto
    // a click-tier terminal, and mouseMoved is set so the following
    // left_mouse_up hit-tests as a drop rather than a click-release.
    const hitKind = mouseButtonHeld ? "mouse_full" : "mouse";
    const hitGate = await runHitTestGate(
      adapter,
      overrides,
      subGates,
      point.x,
      point.y,
      hitKind,
    );
    if (hitGate) return hitGate;

    if (mouseButtonHeld) {
      mouseMoved = true;
    }
    await adapter.executor.scroll(point.x, point.y, dx, dy);
    return okText("Scrolled.");
  }
  /**
   * Drag tool handler. `coordinate` is the required END point;
   * `start_coordinate` is optional and, when omitted, the drag starts at the
   * current cursor position. Both endpoints are hit-tested before
   * `executor.drag` runs.
   */
  async function handleDrag(
    adapter: ComputerUseHostAdapter,
    args: Record<string, unknown>,
    overrides: ComputerUseOverrides,
    subGates: CuSubGates,
  ): Promise<CuCallToolResult> {
    // executor.drag() presses and releases internally. Without this defensive
    // clear, an earlier left_mouse_down would leave mouseButtonHeld=true across
    // the drag and desync the flag from OS state — the same mechanism
    // handleClickVariant guards against. Release first for a clean slate.
    if (mouseButtonHeld) {
      await adapter.executor.mouseUp();
      mouseButtonHeld = false;
      mouseMoved = false;
    }

    const rawTo = extractCoordinate(args, "coordinate");
    if (rawTo instanceof Error) {
      return errorResult(rawTo.message, "bad_args");
    }

    // Stays undefined when start_coordinate is omitted → the executor drags
    // from the current cursor position.
    let rawFrom: [number, number] | undefined;
    if (args.start_coordinate !== undefined) {
      const parsedStart = extractCoordinate(args, "start_coordinate");
      if (parsedStart instanceof Error) {
        return errorResult(parsedStart.message, "bad_args");
      }
      rawFrom = parsedStart;
    }

    const gate = await runInputActionGates(adapter, overrides, subGates, "mouse");
    if (gate) return gate;

    const display = await adapter.executor.getDisplaySize(
      overrides.selectedDisplayId,
    );
    const toDisplayPoint = (raw: [number, number]) =>
      scaleCoord(
        raw[0],
        raw[1],
        overrides.coordinateMode,
        display,
        overrides.lastScreenshot,
        adapter.logger,
      );
    const from = rawFrom === undefined ? undefined : toDisplayPoint(rawFrom);
    const to = toDisplayPoint(rawTo);

    // Both endpoints are checked. `from` is where mouseDown happens (pick-up),
    // `to` is where mouseUp happens (drop). With start_coordinate omitted the
    // drag begins at the cursor — the same bypass as mouse_move →
    // left_mouse_down — so the cursor position is read and hit-tested (mirrors
    // handleLeftMouseDown).
    //
    // `to` is tested as "mouse_full" (not "mouse"): dropping text onto a
    // terminal inserts it as if typed (macOS text drag-drop), the same threat
    // as right-click→Paste. `from` stays "mouse" — picking up is a read.
    const pickupPoint = from ?? (await adapter.executor.getCursorPosition());
    const pickupGate = await runHitTestGate(
      adapter,
      overrides,
      subGates,
      pickupPoint.x,
      pickupPoint.y,
      "mouse",
    );
    if (pickupGate) return pickupGate;

    const dropGate = await runHitTestGate(
      adapter,
      overrides,
      subGates,
      to.x,
      to.y,
      "mouse_full",
    );
    if (dropGate) return dropGate;

    await adapter.executor.drag(from, to);
    return okText("Dragged.");
  }
  /**
   * Mouse-move tool handler. With no button held the move is pure positioning
   * ("mouse_position" gates, no hit-test). With the button held the move is a
   * drag in progress: the gates run as "mouse", the destination is hit-tested
   * as "mouse_full", and `mouseMoved` is recorded after the move.
   */
  async function handleMoveMouse(
    adapter: ComputerUseHostAdapter,
    args: Record<string, unknown>,
    overrides: ComputerUseOverrides,
    subGates: CuSubGates,
  ): Promise<CuCallToolResult> {
    const coord = extractCoordinate(args);
    if (coord instanceof Error) {
      return errorResult(coord.message, "bad_args");
    }
    const [rawX, rawY] = coord;

    // While the button is held, moveMouse produces leftMouseDragged events on
    // the window under the cursor — interaction, not positioning — so the
    // action kind is upgraded to "mouse" and the destination is hit-tested.
    // With the button NOT held the move is pure positioning: it passes at any
    // tier with no hit-test (mouseDown/Up hit-test the cursor, which closes the
    // mouse_move→left_mouse_down decomposition).
    let actionKind: CuActionKind = "mouse_position";
    if (mouseButtonHeld) {
      actionKind = "mouse";
    }
    const gate = await runInputActionGates(
      adapter,
      overrides,
      subGates,
      actionKind,
    );
    if (gate) return gate;

    const display = await adapter.executor.getDisplaySize(
      overrides.selectedDisplayId,
    );
    const dest = scaleCoord(
      rawX,
      rawY,
      overrides.coordinateMode,
      display,
      overrides.lastScreenshot,
      adapter.logger,
    );

    if (mouseButtonHeld) {
      // "mouse_full" — same as left_click_drag's to-endpoint. Dragging onto a
      // click-tier terminal is text injection no matter which primitive
      // (atomic drag vs. decomposed down/move/up) delivers the events.
      const hitGate = await runHitTestGate(
        adapter,
        overrides,
        subGates,
        dest.x,
        dest.y,
        "mouse_full",
      );
      if (hitGate) return hitGate;
    }

    await adapter.executor.moveMouse(dest.x, dest.y);
    if (mouseButtonHeld) {
      mouseMoved = true;
    }
    return okText("Moved.");
  }
  2533. async function handleOpenApplication(
  2534. adapter: ComputerUseHostAdapter,
  2535. args: Record<string, unknown>,
  2536. overrides: ComputerUseOverrides,
  2537. ): Promise<CuCallToolResult> {
  2538. const app = requireString(args, "app");
  2539. if (app instanceof Error) return errorResult(app.message, "bad_args");
  2540. // Resolve display-name → bundle ID. Same logic as request_access.
  2541. const allowed = new Set(overrides.allowedApps.map((g) => g.bundleId));
  2542. let targetBundleId: string | undefined;
  2543. if (looksLikeBundleId(app) && allowed.has(app)) {
  2544. targetBundleId = app;
  2545. } else {
  2546. // Try display name → bundle ID, but ONLY against the allowlist itself.
  2547. // Avoids paying the listInstalledApps() cost on the hot path and is
  2548. // arguably more correct: if the user granted "Slack", the model asking
  2549. // to open "Slack" should match THAT grant.
  2550. const match = overrides.allowedApps.find(
  2551. (g) => g.displayName.toLowerCase() === app.toLowerCase(),
  2552. );
  2553. targetBundleId = match?.bundleId;
  2554. }
  2555. if (!targetBundleId || !allowed.has(targetBundleId)) {
  2556. return errorResult(
  2557. `"${app}" is not granted for this session. Call request_access first.`,
  2558. "app_not_granted",
  2559. );
  2560. }
  2561. // open_application works at any tier — bringing an app forward is exactly
  2562. // what tier "read" enables (you need it on screen to screenshot it). The
  2563. // tier gates on click/type catch any follow-up interaction.
  2564. await adapter.executor.openApp(targetBundleId);
  2565. // On multi-monitor setups, macOS may place the opened window on a monitor
  2566. // the resolver won't pick (e.g. Claude + another allowed app are co-located
  2567. // elsewhere). Nudge the model toward switch_display BEFORE it wastes steps
  2568. // clicking on dock icons. Single-monitor → no hint. listDisplays failure is
  2569. // non-fatal — the hint is advisory.
  2570. if (overrides.onDisplayPinned !== undefined) {
  2571. let displayCount = 1;
  2572. try {
  2573. displayCount = (await adapter.executor.listDisplays()).length;
  2574. } catch {
  2575. // hint skipped
  2576. }
  2577. if (displayCount >= 2) {
  2578. return okText(
  2579. `Opened "${app}". If it isn't visible in the next screenshot, it may ` +
  2580. `have opened on a different monitor — use switch_display to check.`,
  2581. );
  2582. }
  2583. }
  2584. return okText(`Opened "${app}".`);
  2585. }
  2586. async function handleSwitchDisplay(
  2587. adapter: ComputerUseHostAdapter,
  2588. args: Record<string, unknown>,
  2589. overrides: ComputerUseOverrides,
  2590. ): Promise<CuCallToolResult> {
  2591. const display = requireString(args, "display");
  2592. if (display instanceof Error) return errorResult(display.message, "bad_args");
  2593. if (!overrides.onDisplayPinned) {
  2594. return errorResult(
  2595. "Display switching is not available in this session.",
  2596. "feature_unavailable",
  2597. );
  2598. }
  2599. if (display.toLowerCase() === "auto") {
  2600. overrides.onDisplayPinned(undefined);
  2601. return okText(
  2602. "Returned to automatic monitor selection. Call screenshot to continue.",
  2603. );
  2604. }
  2605. // Resolve label → displayId fresh. Same source buildMonitorNote reads,
  2606. // so whatever name the model saw in a screenshot note resolves here.
  2607. let displays;
  2608. try {
  2609. displays = await adapter.executor.listDisplays();
  2610. } catch (e) {
  2611. return errorResult(
  2612. `Failed to enumerate displays: ${String(e)}`,
  2613. "display_error",
  2614. );
  2615. }
  2616. if (displays.length < 2) {
  2617. return errorResult(
  2618. "Only one monitor is connected. There is nothing to switch to.",
  2619. "bad_args",
  2620. );
  2621. }
  2622. const labels = uniqueDisplayLabels(displays);
  2623. const wanted = display.toLowerCase();
  2624. const target = displays.find(
  2625. (d) => labels.get(d.displayId)?.toLowerCase() === wanted,
  2626. );
  2627. if (!target) {
  2628. const available = displays
  2629. .map((d) => `"${labels.get(d.displayId)}"`)
  2630. .join(", ");
  2631. return errorResult(
  2632. `No monitor named "${display}" is connected. Available monitors: ${available}.`,
  2633. "bad_args",
  2634. );
  2635. }
  2636. overrides.onDisplayPinned(target.displayId);
  2637. return okText(
  2638. `Switched to monitor "${labels.get(target.displayId)}". Call screenshot to see it.`,
  2639. );
  2640. }
  2641. function handleListGrantedApplications(
  2642. overrides: ComputerUseOverrides,
  2643. ): CuCallToolResult {
  2644. return okJson({
  2645. allowedApps: overrides.allowedApps,
  2646. grantFlags: overrides.grantFlags,
  2647. });
  2648. }
  2649. async function handleReadClipboard(
  2650. adapter: ComputerUseHostAdapter,
  2651. overrides: ComputerUseOverrides,
  2652. subGates: CuSubGates,
  2653. ): Promise<CuCallToolResult> {
  2654. if (!overrides.grantFlags.clipboardRead) {
  2655. return errorResult(
  2656. "Clipboard read is not granted. Request `clipboardRead` via request_access.",
  2657. "grant_flag_required",
  2658. );
  2659. }
  2660. // read_clipboard doesn't route through runInputActionGates — sync here so
  2661. // reading after clicking into a click-tier app sees the cleared clipboard
  2662. // (same as what the app's own Paste would see).
  2663. if (subGates.clipboardGuard) {
  2664. const frontmost = await adapter.executor.getFrontmostApp();
  2665. const tierByBundleId = new Map(
  2666. overrides.allowedApps.map((a) => [a.bundleId, a.tier] as const),
  2667. );
  2668. const frontmostTier = frontmost
  2669. ? tierByBundleId.get(frontmost.bundleId)
  2670. : undefined;
  2671. await syncClipboardStash(adapter, overrides, frontmostTier === "click");
  2672. }
  2673. // clipboardGuard may have stashed+cleared — read the actual (possibly
  2674. // empty) clipboard. The agent sees what the app would see.
  2675. const text = await adapter.executor.readClipboard();
  2676. return okJson({ text });
  2677. }
  2678. async function handleWriteClipboard(
  2679. adapter: ComputerUseHostAdapter,
  2680. args: Record<string, unknown>,
  2681. overrides: ComputerUseOverrides,
  2682. subGates: CuSubGates,
  2683. ): Promise<CuCallToolResult> {
  2684. if (!overrides.grantFlags.clipboardWrite) {
  2685. return errorResult(
  2686. "Clipboard write is not granted. Request `clipboardWrite` via request_access.",
  2687. "grant_flag_required",
  2688. );
  2689. }
  2690. const text = requireString(args, "text");
  2691. if (text instanceof Error) return errorResult(text.message, "bad_args");
  2692. if (subGates.clipboardGuard) {
  2693. const frontmost = await adapter.executor.getFrontmostApp();
  2694. const tierByBundleId = new Map(
  2695. overrides.allowedApps.map((a) => [a.bundleId, a.tier] as const),
  2696. );
  2697. const frontmostTier = frontmost
  2698. ? tierByBundleId.get(frontmost.bundleId)
  2699. : undefined;
  2700. // Defense-in-depth for the clipboardGuard bypass: write_clipboard +
  2701. // left_click on a click-tier app's UI Paste button. The re-clear in
  2702. // syncClipboardStash already defeats it (the next action clobbers the
  2703. // write), but rejecting here gives the agent a clear signal instead of
  2704. // silently voiding its write.
  2705. if (frontmost && frontmostTier === "click") {
  2706. return errorResult(
  2707. `"${frontmost.displayName}" is a tier-"click" app and currently ` +
  2708. `frontmost. write_clipboard is blocked because the next action ` +
  2709. `would clear the clipboard anyway — a UI Paste button in this ` +
  2710. `app cannot be used to inject text. Bring a tier-"full" app ` +
  2711. `forward before writing to the clipboard.` +
  2712. TIER_ANTI_SUBVERSION,
  2713. "tier_insufficient",
  2714. );
  2715. }
  2716. // write_clipboard doesn't route through runInputActionGates — sync here
  2717. // so clicking away from a click-tier app then writing restores the user's
  2718. // stash before the agent's text lands.
  2719. await syncClipboardStash(adapter, overrides, frontmostTier === "click");
  2720. }
  2721. await adapter.executor.writeClipboard(text);
  2722. return okText("Clipboard written.");
  2723. }
  2724. /**
  2725. * wait(duration=N). Sleeps N seconds, capped at 100.
  2726. * No frontmost gate — no input, nothing to protect. Kill-switch + TCC
  2727. * are checked in handleToolCall before dispatch reaches here.
  2728. */
  2729. async function handleWait(
  2730. args: Record<string, unknown>,
  2731. ): Promise<CuCallToolResult> {
  2732. const duration = args.duration;
  2733. if (typeof duration !== "number" || !Number.isFinite(duration)) {
  2734. return errorResult("duration must be a number", "bad_args");
  2735. }
  2736. if (duration < 0) {
  2737. return errorResult("duration must be non-negative", "bad_args");
  2738. }
  2739. if (duration > 100) {
  2740. return errorResult(
  2741. "duration is too long. Duration is in seconds.",
  2742. "bad_args",
  2743. );
  2744. }
  2745. await sleep(duration * 1000);
  2746. return okText(`Waited ${duration}s.`);
  2747. }
  2748. /**
  2749. * Returns "X=...,Y=..." plain text. We return richer JSON with
  2750. * coordinateSpace annotation — the model handles both shapes.
  2751. *
  2752. * When lastScreenshot is present: inverse of scaleCoord — logical points →
  2753. * image-pixels via `imageX = logicalX × (screenshotWidth / displayWidth)`.
  2754. * Uses capture-time dims so the returned coords match what the model would
  2755. * read off that screenshot.
  2756. *
  2757. * No frontmost gate — read-only, no input.
  2758. */
  2759. async function handleCursorPosition(
  2760. adapter: ComputerUseHostAdapter,
  2761. overrides: ComputerUseOverrides,
  2762. ): Promise<CuCallToolResult> {
  2763. const logical = await adapter.executor.getCursorPosition();
  2764. const shot = overrides.lastScreenshot;
  2765. if (shot) {
  2766. // Inverse of scaleCoord: subtract capture-time origin to go from
  2767. // virtual-screen to display-relative before the image-px transform.
  2768. const localX = logical.x - shot.originX;
  2769. const localY = logical.y - shot.originY;
  2770. // Cursor off the captured display (multi-monitor): local coords go
  2771. // negative or exceed display dims. Return logical_points + hint rather
  2772. // than garbage image-px.
  2773. if (
  2774. localX < 0 ||
  2775. localX > shot.displayWidth ||
  2776. localY < 0 ||
  2777. localY > shot.displayHeight
  2778. ) {
  2779. return okJson({
  2780. x: logical.x,
  2781. y: logical.y,
  2782. coordinateSpace: "logical_points",
  2783. note: "cursor is on a different monitor than your last screenshot; take a fresh screenshot",
  2784. });
  2785. }
  2786. const x = Math.round(localX * (shot.width / shot.displayWidth));
  2787. const y = Math.round(localY * (shot.height / shot.displayHeight));
  2788. return okJson({ x, y, coordinateSpace: "image_pixels" });
  2789. }
  2790. return okJson({
  2791. x: logical.x,
  2792. y: logical.y,
  2793. coordinateSpace: "logical_points",
  2794. note: "take a screenshot first for image-pixel coordinates",
  2795. });
  2796. }
  2797. /**
  2798. * Presses each key in the
  2799. * chord, sleeps duration seconds, releases in reverse. Same duration bounds
  2800. * as wait. Keyboard action → frontmost gate applies; same systemKeyCombos
  2801. * blocklist check as key.
  2802. */
  2803. async function handleHoldKey(
  2804. adapter: ComputerUseHostAdapter,
  2805. args: Record<string, unknown>,
  2806. overrides: ComputerUseOverrides,
  2807. subGates: CuSubGates,
  2808. ): Promise<CuCallToolResult> {
  2809. const text = requireString(args, "text");
  2810. if (text instanceof Error) return errorResult(text.message, "bad_args");
  2811. const duration = args.duration;
  2812. if (typeof duration !== "number" || !Number.isFinite(duration)) {
  2813. return errorResult("duration must be a number", "bad_args");
  2814. }
  2815. if (duration < 0) {
  2816. return errorResult("duration must be non-negative", "bad_args");
  2817. }
  2818. if (duration > 100) {
  2819. return errorResult(
  2820. "duration is too long. Duration is in seconds.",
  2821. "bad_args",
  2822. );
  2823. }
  2824. // Blocklist check BEFORE gates — same reasoning as handleKey. Holding
  2825. // cmd+q is just as dangerous as tapping it.
  2826. if (
  2827. isSystemKeyCombo(text, adapter.executor.capabilities.platform) &&
  2828. !overrides.grantFlags.systemKeyCombos
  2829. ) {
  2830. return errorResult(
  2831. `"${text}" is a system-level shortcut. Request the \`systemKeyCombos\` grant via request_access to use it.`,
  2832. "grant_flag_required",
  2833. );
  2834. }
  2835. const gate = await runInputActionGates(
  2836. adapter,
  2837. overrides,
  2838. subGates,
  2839. "keyboard",
  2840. );
  2841. if (gate) return gate;
  2842. const keyNames = parseKeyChord(text);
  2843. await adapter.executor.holdKey(keyNames, duration * 1000);
  2844. return okText("Key held.");
  2845. }
  2846. /**
  2847. * Raw press at current cursor, no coordinate.
  2848. * Move first with mouse_move. Errors if already held.
  2849. */
  2850. async function handleLeftMouseDown(
  2851. adapter: ComputerUseHostAdapter,
  2852. overrides: ComputerUseOverrides,
  2853. subGates: CuSubGates,
  2854. ): Promise<CuCallToolResult> {
  2855. if (mouseButtonHeld) {
  2856. return errorResult(
  2857. "mouse button already held, call left_mouse_up first",
  2858. "state_conflict",
  2859. );
  2860. }
  2861. const gate = await runInputActionGates(adapter, overrides, subGates, "mouse");
  2862. if (gate) return gate;
  2863. // macOS routes mouseDown to the window under the cursor, not the frontmost
  2864. // app. Without this hit-test, mouse_move (positioning, passes at any tier)
  2865. // + left_mouse_down decomposes a click that lands on a tier-"read" window
  2866. // overlapping a tier-"full" frontmost app — bypassing runHitTestGate's
  2867. // whole purpose. All three are batchable, so the bypass is atomic.
  2868. const cursor = await adapter.executor.getCursorPosition();
  2869. const hitGate = await runHitTestGate(
  2870. adapter,
  2871. overrides,
  2872. subGates,
  2873. cursor.x,
  2874. cursor.y,
  2875. "mouse",
  2876. );
  2877. if (hitGate) return hitGate;
  2878. await adapter.executor.mouseDown();
  2879. mouseButtonHeld = true;
  2880. mouseMoved = false;
  2881. return okText("Mouse button pressed.");
  2882. }
  2883. /**
  2884. * Raw release at current cursor. Does NOT error
  2885. * if not held (idempotent release).
  2886. */
  2887. async function handleLeftMouseUp(
  2888. adapter: ComputerUseHostAdapter,
  2889. overrides: ComputerUseOverrides,
  2890. subGates: CuSubGates,
  2891. ): Promise<CuCallToolResult> {
  2892. // Any gate rejection here must release the button FIRST — otherwise the
  2893. // OS button stays pressed and mouseButtonHeld stays true. Recovery
  2894. // attempts (mouse_move back to a safe app) would generate leftMouseDragged
  2895. // events into whatever window is under the cursor, including the very
  2896. // read-tier window the gate was protecting. A single mouseUp on a
  2897. // restricted window is one event; a stuck button is cascading damage.
  2898. //
  2899. // This includes the frontmost gate: focus can change between mouseDown and
  2900. // mouseUp (something else grabbed focus), in which case runInputActionGates
  2901. // rejects here even though it passed at mouseDown.
  2902. const releaseFirst = async (
  2903. err: CuCallToolResult,
  2904. ): Promise<CuCallToolResult> => {
  2905. await adapter.executor.mouseUp();
  2906. mouseButtonHeld = false;
  2907. mouseMoved = false;
  2908. return err;
  2909. };
  2910. const gate = await runInputActionGates(adapter, overrides, subGates, "mouse");
  2911. if (gate) return releaseFirst(gate);
  2912. // When the cursor moved since mouseDown, this is a drop (text-injection
  2913. // vector) — hit-test at "mouse_full" same as left_click_drag's `to`. When
  2914. // NO move happened, this is a click-release — same semantics as the atomic
  2915. // left_click, hit-test at "mouse". Without this distinction, a decomposed
  2916. // click on a click-tier app fails here while the atomic left_click works,
  2917. // and releaseFirst fires mouseUp anyway so the OS sees a complete click
  2918. // while the model gets a misleading error.
  2919. const cursor = await adapter.executor.getCursorPosition();
  2920. const hitGate = await runHitTestGate(
  2921. adapter,
  2922. overrides,
  2923. subGates,
  2924. cursor.x,
  2925. cursor.y,
  2926. mouseMoved ? "mouse_full" : "mouse",
  2927. );
  2928. if (hitGate) return releaseFirst(hitGate);
  2929. await adapter.executor.mouseUp();
  2930. mouseButtonHeld = false;
  2931. mouseMoved = false;
  2932. return okText("Mouse button released.");
  2933. }
  2934. // ---------------------------------------------------------------------------
  2935. // Batch dispatch
  2936. // ---------------------------------------------------------------------------
  2937. /**
  2938. * Actions allowed inside a computer_batch call. Excludes request_access,
  2939. * open_application, clipboard, list_granted (no latency benefit, complicates
  2940. * security model).
  2941. */
  2942. const BATCHABLE_ACTIONS: ReadonlySet<string> = new Set([
  2943. "key",
  2944. "type",
  2945. "mouse_move",
  2946. "left_click",
  2947. "left_click_drag",
  2948. "right_click",
  2949. "middle_click",
  2950. "double_click",
  2951. "triple_click",
  2952. "scroll",
  2953. "hold_key",
  2954. "screenshot",
  2955. "cursor_position",
  2956. "left_mouse_down",
  2957. "left_mouse_up",
  2958. "wait",
  2959. ]);
/** Outcome of one action executed inside a computer_batch call. */
interface BatchActionResult {
  /** Name of the action that was dispatched (e.g. "left_click"). */
  action: string;
  /** True when the inner handler's result was not flagged `isError`. */
  ok: boolean;
  /** First text content item of the inner result; "" when it is not text. */
  output: string;
}
  2965. /**
  2966. * Executes `actions: [{action, …}, …]`
  2967. * sequentially in ONE model→API round trip — the dominant latency cost
  2968. * (seconds, vs. ~50ms local overhead per action).
  2969. *
  2970. * Gate semantics (the security model):
  2971. * - Kill-switch + TCC: checked ONCE by handleToolCall before reaching here.
  2972. * - prepareForAction: run ONCE at the top. The user approved "do this
  2973. * sequence"; hiding apps per-action is wasted work and fast-pathed anyway.
  2974. * - Frontmost gate: checked PER ACTION. State can change mid-batch — a
  2975. * click might open a non-allowed app. This is the safety net: if action
  2976. * 3 of 5 opened Safari (not allowed), action 4's frontmost check fires
  2977. * and stops the batch there.
  2978. * - PixelCompare: SKIPPED inside batch. The model committed to the full
  2979. * sequence without intermediate screenshots; validating mid-batch clicks
  2980. * against a pre-batch screenshot would false-positive constantly.
  2981. *
  2982. * Both skips are implemented by passing `{...subGates, hideBeforeAction:
  2983. * false, pixelValidation: false}` to each inner dispatch — the handlers'
  2984. * existing gate logic does the right thing, no new code paths.
  2985. *
  2986. * Stop-on-first-error: accumulate results, on
  2987. * first `isError` stop executing, return everything so far + the error. The
  2988. * model sees exactly where the batch broke and what succeeded before it.
  2989. *
  2990. * Mid-batch screenshots are allowed (for inspection) but NEVER piggyback —
  2991. * their `.screenshot` field is dropped. Same invariant as zoom: click coords
  2992. * always refer to the PRE-BATCH `lastScreenshot`. If the model wants to click
  2993. * based on a new screenshot, it ends the batch and screenshots separately.
  2994. */
  2995. async function handleComputerBatch(
  2996. adapter: ComputerUseHostAdapter,
  2997. args: Record<string, unknown>,
  2998. overrides: ComputerUseOverrides,
  2999. subGates: CuSubGates,
  3000. ): Promise<CuCallToolResult> {
  3001. const actions = args.actions;
  3002. if (!Array.isArray(actions) || actions.length === 0) {
  3003. return errorResult("actions must be a non-empty array", "bad_args");
  3004. }
  3005. for (const [i, act] of actions.entries()) {
  3006. if (typeof act !== "object" || act === null) {
  3007. return errorResult(`actions[${i}] must be an object`, "bad_args");
  3008. }
  3009. const action = (act as Record<string, unknown>).action;
  3010. if (typeof action !== "string") {
  3011. return errorResult(`actions[${i}].action must be a string`, "bad_args");
  3012. }
  3013. if (!BATCHABLE_ACTIONS.has(action)) {
  3014. return errorResult(
  3015. `actions[${i}].action="${action}" is not allowed in a batch. ` +
  3016. `Allowed: ${[...BATCHABLE_ACTIONS].join(", ")}.`,
  3017. "bad_args",
  3018. );
  3019. }
  3020. }
  3021. // prepareForAction ONCE. After this, inner dispatches skip it via
  3022. // hideBeforeAction:false.
  3023. if (subGates.hideBeforeAction) {
  3024. const hidden = await adapter.executor.prepareForAction(
  3025. overrides.allowedApps.map((a) => a.bundleId),
  3026. overrides.selectedDisplayId,
  3027. );
  3028. if (hidden.length > 0) {
  3029. overrides.onAppsHidden?.(hidden);
  3030. }
  3031. }
  3032. // Inner actions: skip prepare (already ran), skip pixelCompare (stale by
  3033. // design). Frontmost still checked — runInputActionGates does it
  3034. // unconditionally.
  3035. const batchSubGates: CuSubGates = {
  3036. ...subGates,
  3037. hideBeforeAction: false,
  3038. pixelValidation: false,
  3039. // Batch already took its screenshot (appended at end); a mid-batch
  3040. // resolver switch would make that screenshot inconsistent with
  3041. // earlier clicks' lastScreenshot-based scaleCoord targeting.
  3042. autoTargetDisplay: false,
  3043. };
  3044. const results: BatchActionResult[] = [];
  3045. for (const [i, act] of actions.entries()) {
  3046. // Overlay Stop → host's stopSession → lifecycleState leaves "running"
  3047. // synchronously before query.interrupt(). The SDK abort tears down the
  3048. // host's await but not this loop — without this check the remaining
  3049. // actions fire into a dead session.
  3050. if (overrides.isAborted?.()) {
  3051. await releaseHeldMouse(adapter);
  3052. return errorResult(
  3053. `Batch aborted after ${results.length} of ${actions.length} actions (user interrupt).`,
  3054. );
  3055. }
  3056. // Small inter-step settle. Synthetic CGEvents post instantly; some apps
  3057. // need a tick to process step N's input before step N+1 lands (e.g. a
  3058. // click opening a menu before the next click targets a menu item).
  3059. if (i > 0) await sleep(10);
  3060. const actionArgs = act as Record<string, unknown>;
  3061. const action = actionArgs.action as string;
  3062. // Drop mid-batch screenshot piggyback (strip .screenshot). Click coords
  3063. // stay anchored to the pre-batch lastScreenshot.
  3064. const { screenshot: _dropped, ...inner } = await dispatchAction(
  3065. action,
  3066. actionArgs,
  3067. adapter,
  3068. overrides,
  3069. batchSubGates,
  3070. );
  3071. const text = firstTextContent(inner);
  3072. const result = { action, ok: !inner.isError, output: text };
  3073. results.push(result);
  3074. if (inner.isError) {
  3075. // Stop-on-first-error. Return everything so far + the error.
  3076. // Forward the inner action's telemetry (error_kind) so cu_tool_call
  3077. // reflects the actual failure — without this, batch-internal errors
  3078. // emit error_kind: undefined despite the inner handler tagging it.
  3079. // Release held mouse: the error may be a mid-grapheme abort in
  3080. // handleType, or a frontmost gate, landing between mouse_down and
  3081. // mouse_up.
  3082. await releaseHeldMouse(adapter);
  3083. return okJson(
  3084. {
  3085. completed: results.slice(0, -1),
  3086. failed: result,
  3087. remaining: actions.length - results.length,
  3088. },
  3089. inner.telemetry,
  3090. );
  3091. }
  3092. }
  3093. return okJson({ completed: results });
  3094. }
  3095. function firstTextContent(r: CuCallToolResult): string {
  3096. const first = r.content[0];
  3097. return first && first.type === "text" ? first.text : "";
  3098. }
  3099. /**
  3100. * Action dispatch shared by handleToolCall and handleComputerBatch. Called
  3101. * AFTER kill-switch + TCC gates have passed. Never sees request_access — it's
  3102. * special-cased in handleToolCall for the tccState thread-through.
  3103. */
  3104. async function dispatchAction(
  3105. name: string,
  3106. a: Record<string, unknown>,
  3107. adapter: ComputerUseHostAdapter,
  3108. overrides: ComputerUseOverrides,
  3109. subGates: CuSubGates,
  3110. ): Promise<CuCallToolResult> {
  3111. switch (name) {
  3112. case "screenshot":
  3113. return handleScreenshot(adapter, overrides, subGates);
  3114. case "zoom":
  3115. return handleZoom(adapter, a, overrides);
  3116. case "left_click":
  3117. return handleClickVariant(adapter, a, overrides, subGates, "left", 1);
  3118. case "double_click":
  3119. return handleClickVariant(adapter, a, overrides, subGates, "left", 2);
  3120. case "triple_click":
  3121. return handleClickVariant(adapter, a, overrides, subGates, "left", 3);
  3122. case "right_click":
  3123. return handleClickVariant(adapter, a, overrides, subGates, "right", 1);
  3124. case "middle_click":
  3125. return handleClickVariant(adapter, a, overrides, subGates, "middle", 1);
  3126. case "type":
  3127. return handleType(adapter, a, overrides, subGates);
  3128. case "key":
  3129. return handleKey(adapter, a, overrides, subGates);
  3130. case "scroll":
  3131. return handleScroll(adapter, a, overrides, subGates);
  3132. case "left_click_drag":
  3133. return handleDrag(adapter, a, overrides, subGates);
  3134. case "mouse_move":
  3135. return handleMoveMouse(adapter, a, overrides, subGates);
  3136. case "wait":
  3137. return handleWait(a);
  3138. case "cursor_position":
  3139. return handleCursorPosition(adapter, overrides);
  3140. case "hold_key":
  3141. return handleHoldKey(adapter, a, overrides, subGates);
  3142. case "left_mouse_down":
  3143. return handleLeftMouseDown(adapter, overrides, subGates);
  3144. case "left_mouse_up":
  3145. return handleLeftMouseUp(adapter, overrides, subGates);
  3146. case "open_application":
  3147. return handleOpenApplication(adapter, a, overrides);
  3148. case "switch_display":
  3149. return handleSwitchDisplay(adapter, a, overrides);
  3150. case "list_granted_applications":
  3151. return handleListGrantedApplications(overrides);
  3152. case "read_clipboard":
  3153. return handleReadClipboard(adapter, overrides, subGates);
  3154. case "write_clipboard":
  3155. return handleWriteClipboard(adapter, a, overrides, subGates);
  3156. case "computer_batch":
  3157. return handleComputerBatch(adapter, a, overrides, subGates);
  3158. default:
  3159. return errorResult(`Unknown tool "${name}".`, "bad_args");
  3160. }
  3161. }
  3162. // ---------------------------------------------------------------------------
  3163. // Main dispatch
  3164. // ---------------------------------------------------------------------------
  3165. export async function handleToolCall(
  3166. adapter: ComputerUseHostAdapter,
  3167. name: string,
  3168. args: unknown,
  3169. rawOverrides: ComputerUseOverrides,
  3170. ): Promise<CuCallToolResult> {
  3171. const { logger, serverName } = adapter;
  3172. // Normalize the allowlist before any gate runs:
  3173. //
  3174. // (a) Strip user-denied. A grant from a previous session (before the user
  3175. // added the app to Settings → Desktop app → Computer Use → Denied apps)
  3176. // must not survive. Without
  3177. // this, a stale grant bypasses the auto-deny. Stripped silently — the
  3178. // agent already saw the userDenied guidance at request_access time, and
  3179. // a live frontmost-gate rejection cites "not in allowed applications".
  3180. //
  3181. // (b) Strip policy-denied. Same story as (a) for a grant that predates a
  3182. // blocklist addition. buildAccessRequest denies these up front for new
  3183. // requests; this catches stale persisted grants.
  3184. //
  3185. // (c) Backfill tier. A grant persisted before the tier field existed has
  3186. // `tier: undefined`, which `tierSatisfies` treats as `"full"` — wrong
  3187. // for a legacy Chrome grant. Assign the hardcoded tier based on
  3188. // bundle-ID category. Modern grants already have a tier.
  3189. //
  3190. // `.some()` guard keeps the hot path (empty deny list, no legacy grants)
  3191. // zero-alloc.
  3192. const userDeniedSet = new Set(rawOverrides.userDeniedBundleIds);
  3193. const overrides: ComputerUseOverrides = rawOverrides.allowedApps.some(
  3194. (a) =>
  3195. a.tier === undefined ||
  3196. userDeniedSet.has(a.bundleId) ||
  3197. isPolicyDenied(a.bundleId, a.displayName),
  3198. )
  3199. ? {
  3200. ...rawOverrides,
  3201. allowedApps: rawOverrides.allowedApps
  3202. .filter((a) => !userDeniedSet.has(a.bundleId))
  3203. .filter((a) => !isPolicyDenied(a.bundleId, a.displayName))
  3204. .map((a) =>
  3205. a.tier !== undefined
  3206. ? a
  3207. : { ...a, tier: getDefaultTierForApp(a.bundleId, a.displayName) },
  3208. ),
  3209. }
  3210. : rawOverrides;
  3211. // ─── Gate 1: kill switch ─────────────────────────────────────────────
  3212. if (adapter.isDisabled()) {
  3213. return errorResult(
  3214. "Computer control is disabled in Settings. Enable it and try again.",
  3215. "other",
  3216. );
  3217. }
  3218. // ─── Gate 2: TCC ─────────────────────────────────────────────────────
  3219. // Accessibility + Screen Recording on macOS. Pure check — no dialog,
  3220. // no relaunch. `request_access` is exempted: it threads the ungranted
  3221. // state through to the renderer, which shows a TCC toggle panel instead
  3222. // of the app list. Every other tool short-circuits here.
  3223. const osPerms = await adapter.ensureOsPermissions();
  3224. let tccState:
  3225. | { accessibility: boolean; screenRecording: boolean }
  3226. | undefined;
  3227. if (!osPerms.granted) {
  3228. // Both request_* tools thread tccState through to the renderer's
  3229. // TCC toggle panel. Every other tool short-circuits.
  3230. if (name !== "request_access" && name !== "request_teach_access") {
  3231. return errorResult(
  3232. "Accessibility and Screen Recording permissions are required. " +
  3233. "Call request_access to show the permission panel.",
  3234. "tcc_not_granted",
  3235. );
  3236. }
  3237. tccState = {
  3238. accessibility: osPerms.accessibility,
  3239. screenRecording: osPerms.screenRecording,
  3240. };
  3241. }
  3242. // ─── Gate 3: global CU lock ──────────────────────────────────────────
  3243. // At most one session uses CU at a time. Every tool including
  3244. // request_access hits the CHECK — even showing the approval dialog while
  3245. // another session holds the lock would be confusing ("why approve access
  3246. // that can't be used?").
  3247. //
  3248. // But ACQUIRE is split: request_access and list_granted_applications
  3249. // check-without-acquire (the overlay + notifications are driven by
  3250. // cuLockChanged, and showing "Claude is using your computer" while the
  3251. // agent is only ASKING for access is premature). First action tool
  3252. // acquires and the overlay appears. If the user denies and no action
  3253. // follows, the overlay never shows.
  3254. //
  3255. // request_teach_access is NOT in this set — approving teach mode HIDES
  3256. // the main window (via onTeachModeActivated), and the lock must be held
  3257. // before that happens. Otherwise a concurrent session's request_access
  3258. // would render its dialog in an invisible main window during the gap
  3259. // between hide and the first teach_step (seconds of model inference).
  3260. // The old acquire-always-at-Gate-3 behavior was correct for teach; only
  3261. // the non-teach permission tools benefit from deferral.
  3262. //
  3263. // Host releases on idle/stop/archive; this package never releases. Both
  3264. // Cowork (LAM) and CCD (LSM) wire checkCuLock via the shared cuLock
  3265. // singleton. When undefined (tests/future hosts), no gate — absence of
  3266. // the mechanism ≠ locked out.
  3267. const deferAcquire = defersLockAcquire(name);
  3268. const lock = overrides.checkCuLock?.();
  3269. if (lock) {
  3270. if (lock.holder !== undefined && !lock.isSelf) {
  3271. return errorResult(
  3272. "Another Claude session is currently using the computer. Wait for " +
  3273. "the user to acknowledge it is finished (stop button in the Claude " +
  3274. "window), or find a non-computer-use approach if one is readily " +
  3275. "apparent.",
  3276. "cu_lock_held",
  3277. );
  3278. }
  3279. if (lock.holder === undefined && !deferAcquire) {
  3280. // Acquire. Emits cuLockChanged → overlay shows. Idempotent — if
  3281. // someone else acquired between check and here (won't happen on a
  3282. // single-threaded event loop, but defensive), this is a no-op.
  3283. overrides.acquireCuLock?.();
  3284. // Fresh lock holder → any prior session's mouseButtonHeld is stale
  3285. // (e.g. overlay stop mid-drag). Clear it so this session doesn't get
  3286. // a spurious "already held" error. resetMouseButtonHeld is file-local;
  3287. // this is the one non-test callsite.
  3288. resetMouseButtonHeld();
  3289. }
  3290. // lock.isSelf → already held by us, proceed.
  3291. // lock.holder === undefined && deferAcquire →
  3292. // checked but not acquired — proceed, first action will acquire.
  3293. }
  3294. // Sub-gates read FRESH every call so a GrowthBook flip takes effect
  3295. // mid-session (plan §3).
  3296. const subGates = adapter.getSubGates();
  3297. // Clipboard guard runs per-action inside runInputActionGates + inline in
  3298. // handleReadClipboard/handleWriteClipboard. NOT here — per-tool-call sync
  3299. // would run once for computer_batch and miss sub-actions 2..N, and would
  3300. // fire during deferAcquire tools / `wait` / teach_step's blocking-dialog
  3301. // phase where no input is happening.
  3302. const a = asRecord(args);
  3303. logger.silly(
  3304. `[${serverName}] tool=${name} args=${JSON.stringify(a).slice(0, 200)}`,
  3305. );
  3306. // ─── Fail-closed dispatch ────────────────────────────────────────────
  3307. // ANY exception below → tool error, executor never left in a half-called
  3308. // state. Explicit inversion of the prior `catch → return true` fail-open.
  3309. try {
  3310. // request_access / request_teach_access: need tccState thread-through;
  3311. // dispatchAction never sees them (not batchable).
  3312. // teach_step: blocking UI tool, also not batchable; needs subGates for
  3313. // its action-execution phase.
  3314. if (name === "request_access") {
  3315. return await handleRequestAccess(adapter, a, overrides, tccState);
  3316. }
  3317. if (name === "request_teach_access") {
  3318. return await handleRequestTeachAccess(adapter, a, overrides, tccState);
  3319. }
  3320. if (name === "teach_step") {
  3321. return await handleTeachStep(adapter, a, overrides, subGates);
  3322. }
  3323. if (name === "teach_batch") {
  3324. return await handleTeachBatch(adapter, a, overrides, subGates);
  3325. }
  3326. return await dispatchAction(name, a, adapter, overrides, subGates);
  3327. } catch (err) {
  3328. // Fail-closed. If the gate machinery itself throws (e.g.
  3329. // getFrontmostApp() rejects), the executor has NOT been called yet for
  3330. // the gated tools — the gates run before the executor in every handler.
  3331. // For ungated tools, the executor may have been mid-call; that's fine —
  3332. // the result is still a tool error, never an implicit success.
  3333. const msg = err instanceof Error ? err.message : String(err);
  3334. logger.error(`[${serverName}] tool=${name} threw: ${msg}`, err);
  3335. return errorResult(`Tool "${name}" failed: ${msg}`, "executor_threw");
  3336. }
  3337. }
  // Single `_test` export that bundles a set of this file's helpers into one
  // object. Every entry is a shorthand property, so each key is exactly the
  // helper's own identifier (e.g. `_test.scaleCoord`, `_test.parseKeyChord`).
  // NOTE(review): presumably this exists so unit tests can reach these
  // helpers without exporting each one individually — confirm against the
  // test suite before adding or removing entries.
  3338. export const _test = {
  3339. scaleCoord,
  3340. coordToPercentageForPixelCompare,
  3341. segmentGraphemes,
  3342. decodedByteLength,
  3343. resolveRequestedApps,
  3344. buildAccessRequest,
  3345. buildTierGuidanceMessage,
  3346. buildUserDeniedGuidance,
  3347. tierSatisfies,
  3348. looksLikeBundleId,
  3349. extractCoordinate,
  3350. parseKeyChord,
  3351. buildMonitorNote,
  3352. handleSwitchDisplay,
  3353. uniqueDisplayLabels,
  3354. };