claude.ts 123 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419
  1. import type {
  2. BetaContentBlock,
  3. BetaContentBlockParam,
  4. BetaImageBlockParam,
  5. BetaJSONOutputFormat,
  6. BetaMessage,
  7. BetaMessageDeltaUsage,
  8. BetaMessageStreamParams,
  9. BetaOutputConfig,
  10. BetaRawMessageStreamEvent,
  11. BetaRequestDocumentBlock,
  12. BetaStopReason,
  13. BetaToolChoiceAuto,
  14. BetaToolChoiceTool,
  15. BetaToolResultBlockParam,
  16. BetaToolUnion,
  17. BetaUsage,
  18. BetaMessageParam as MessageParam,
  19. } from '@anthropic-ai/sdk/resources/beta/messages/messages.mjs'
  20. import type { TextBlockParam } from '@anthropic-ai/sdk/resources/index.mjs'
  21. import type { Stream } from '@anthropic-ai/sdk/streaming.mjs'
  22. import { randomUUID } from 'crypto'
  23. import {
  24. getAPIProvider,
  25. isFirstPartyAnthropicBaseUrl,
  26. } from 'src/utils/model/providers.js'
  27. import {
  28. getAttributionHeader,
  29. getCLISyspromptPrefix,
  30. } from '../../constants/system.js'
  31. import {
  32. getEmptyToolPermissionContext,
  33. type QueryChainTracking,
  34. type Tool,
  35. type ToolPermissionContext,
  36. type Tools,
  37. toolMatchesName,
  38. } from '../../Tool.js'
  39. import type { AgentDefinition } from '../../tools/AgentTool/loadAgentsDir.js'
  40. import {
  41. type ConnectorTextBlock,
  42. type ConnectorTextDelta,
  43. isConnectorTextBlock,
  44. } from '../../types/connectorText.js'
  45. import type {
  46. AssistantMessage,
  47. Message,
  48. StreamEvent,
  49. SystemAPIErrorMessage,
  50. UserMessage,
  51. } from '../../types/message.js'
  52. import {
  53. type CacheScope,
  54. logAPIPrefix,
  55. splitSysPromptPrefix,
  56. toolToAPISchema,
  57. } from '../../utils/api.js'
  58. import { getOauthAccountInfo } from '../../utils/auth.js'
  59. import {
  60. getBedrockExtraBodyParamsBetas,
  61. getMergedBetas,
  62. getModelBetas,
  63. } from '../../utils/betas.js'
  64. import { getOrCreateUserID } from '../../utils/config.js'
  65. import {
  66. CAPPED_DEFAULT_MAX_TOKENS,
  67. getModelMaxOutputTokens,
  68. getSonnet1mExpTreatmentEnabled,
  69. } from '../../utils/context.js'
  70. import { resolveAppliedEffort } from '../../utils/effort.js'
  71. import { isEnvTruthy } from '../../utils/envUtils.js'
  72. import { errorMessage } from '../../utils/errors.js'
  73. import { computeFingerprintFromMessages } from '../../utils/fingerprint.js'
  74. import { captureAPIRequest, logError } from '../../utils/log.js'
  75. import {
  76. createAssistantAPIErrorMessage,
  77. createUserMessage,
  78. ensureToolResultPairing,
  79. normalizeContentFromAPI,
  80. normalizeMessagesForAPI,
  81. stripAdvisorBlocks,
  82. stripCallerFieldFromAssistantMessage,
  83. stripToolReferenceBlocksFromUserMessage,
  84. } from '../../utils/messages.js'
  85. import {
  86. getDefaultOpusModel,
  87. getDefaultSonnetModel,
  88. getSmallFastModel,
  89. isNonCustomOpusModel,
  90. } from '../../utils/model/model.js'
  91. import {
  92. asSystemPrompt,
  93. type SystemPrompt,
  94. } from '../../utils/systemPromptType.js'
  95. import { tokenCountFromLastAPIResponse } from '../../utils/tokens.js'
  96. import { getDynamicConfig_BLOCKS_ON_INIT } from '../analytics/growthbook.js'
  97. import {
  98. currentLimits,
  99. extractQuotaStatusFromError,
  100. extractQuotaStatusFromHeaders,
  101. } from '../claudeAiLimits.js'
  102. import { getAPIContextManagement } from '../compact/apiMicrocompact.js'
  103. /* eslint-disable @typescript-eslint/no-require-imports */
  104. const autoModeStateModule = feature('TRANSCRIPT_CLASSIFIER')
  105. ? (require('../../utils/permissions/autoModeState.js') as typeof import('../../utils/permissions/autoModeState.js'))
  106. : null
  107. import { feature } from 'bun:bundle'
  108. import type { ClientOptions } from '@anthropic-ai/sdk'
  109. import {
  110. APIConnectionTimeoutError,
  111. APIError,
  112. APIUserAbortError,
  113. } from '@anthropic-ai/sdk/error'
  114. import {
  115. getAfkModeHeaderLatched,
  116. getCacheEditingHeaderLatched,
  117. getFastModeHeaderLatched,
  118. getLastApiCompletionTimestamp,
  119. getPromptCache1hAllowlist,
  120. getPromptCache1hEligible,
  121. getSessionId,
  122. getThinkingClearLatched,
  123. setAfkModeHeaderLatched,
  124. setCacheEditingHeaderLatched,
  125. setFastModeHeaderLatched,
  126. setLastMainRequestId,
  127. setPromptCache1hAllowlist,
  128. setPromptCache1hEligible,
  129. setThinkingClearLatched,
  130. } from 'src/bootstrap/state.js'
  131. import {
  132. AFK_MODE_BETA_HEADER,
  133. CONTEXT_1M_BETA_HEADER,
  134. CONTEXT_MANAGEMENT_BETA_HEADER,
  135. EFFORT_BETA_HEADER,
  136. FAST_MODE_BETA_HEADER,
  137. PROMPT_CACHING_SCOPE_BETA_HEADER,
  138. REDACT_THINKING_BETA_HEADER,
  139. STRUCTURED_OUTPUTS_BETA_HEADER,
  140. TASK_BUDGETS_BETA_HEADER,
  141. } from 'src/constants/betas.js'
  142. import type { QuerySource } from 'src/constants/querySource.js'
  143. import type { Notification } from 'src/context/notifications.js'
  144. import { addToTotalSessionCost } from 'src/cost-tracker.js'
  145. import { getFeatureValue_CACHED_MAY_BE_STALE } from 'src/services/analytics/growthbook.js'
  146. import type { AgentId } from 'src/types/ids.js'
  147. import {
  148. ADVISOR_TOOL_INSTRUCTIONS,
  149. getExperimentAdvisorModels,
  150. isAdvisorEnabled,
  151. isValidAdvisorModel,
  152. modelSupportsAdvisor,
  153. } from 'src/utils/advisor.js'
  154. import { getAgentContext } from 'src/utils/agentContext.js'
  155. import { isClaudeAISubscriber } from 'src/utils/auth.js'
  156. import {
  157. getToolSearchBetaHeader,
  158. modelSupportsStructuredOutputs,
  159. shouldIncludeFirstPartyOnlyBetas,
  160. shouldUseGlobalCacheScope,
  161. } from 'src/utils/betas.js'
  162. import { CLAUDE_IN_CHROME_MCP_SERVER_NAME } from 'src/utils/claudeInChrome/common.js'
  163. import { CHROME_TOOL_SEARCH_INSTRUCTIONS } from 'src/utils/claudeInChrome/prompt.js'
  164. import { getMaxThinkingTokensForModel } from 'src/utils/context.js'
  165. import { logForDebugging } from 'src/utils/debug.js'
  166. import { logForDiagnosticsNoPII } from 'src/utils/diagLogs.js'
  167. import { type EffortValue, modelSupportsEffort } from 'src/utils/effort.js'
  168. import {
  169. isFastModeAvailable,
  170. isFastModeCooldown,
  171. isFastModeEnabled,
  172. isFastModeSupportedByModel,
  173. } from 'src/utils/fastMode.js'
  174. import { returnValue } from 'src/utils/generators.js'
  175. import { headlessProfilerCheckpoint } from 'src/utils/headlessProfiler.js'
  176. import { isMcpInstructionsDeltaEnabled } from 'src/utils/mcpInstructionsDelta.js'
  177. import { calculateUSDCost } from 'src/utils/modelCost.js'
  178. import { endQueryProfile, queryCheckpoint } from 'src/utils/queryProfiler.js'
  179. import {
  180. modelSupportsAdaptiveThinking,
  181. modelSupportsThinking,
  182. type ThinkingConfig,
  183. } from 'src/utils/thinking.js'
  184. import {
  185. extractDiscoveredToolNames,
  186. isDeferredToolsDeltaEnabled,
  187. isToolSearchEnabled,
  188. } from 'src/utils/toolSearch.js'
  189. import { API_MAX_MEDIA_PER_REQUEST } from '../../constants/apiLimits.js'
  190. import { ADVISOR_BETA_HEADER } from '../../constants/betas.js'
  191. import {
  192. formatDeferredToolLine,
  193. isDeferredTool,
  194. TOOL_SEARCH_TOOL_NAME,
  195. } from '../../tools/ToolSearchTool/prompt.js'
  196. import { count } from '../../utils/array.js'
  197. import { insertBlockAfterToolResults } from '../../utils/contentArray.js'
  198. import { validateBoundedIntEnvVar } from '../../utils/envValidation.js'
  199. import { safeParseJSON } from '../../utils/json.js'
  200. import { getInferenceProfileBackingModel } from '../../utils/model/bedrock.js'
  201. import {
  202. normalizeModelStringForAPI,
  203. parseUserSpecifiedModel,
  204. } from '../../utils/model/model.js'
  205. import {
  206. startSessionActivity,
  207. stopSessionActivity,
  208. } from '../../utils/sessionActivity.js'
  209. import { jsonStringify } from '../../utils/slowOperations.js'
  210. import {
  211. isBetaTracingEnabled,
  212. type LLMRequestNewContext,
  213. startLLMRequestSpan,
  214. } from '../../utils/telemetry/sessionTracing.js'
  215. /* eslint-enable @typescript-eslint/no-require-imports */
  216. import {
  217. type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  218. logEvent,
  219. } from '../analytics/index.js'
  220. import {
  221. consumePendingCacheEdits,
  222. getPinnedCacheEdits,
  223. markToolsSentToAPIState,
  224. pinCacheEdits,
  225. } from '../compact/microCompact.js'
  226. import { getInitializationStatus } from '../lsp/manager.js'
  227. import { isToolFromMcpServer } from '../mcp/utils.js'
  228. import { withStreamingVCR, withVCR } from '../vcr.js'
  229. import { CLIENT_REQUEST_ID_HEADER, getAnthropicClient } from './client.js'
  230. import {
  231. API_ERROR_MESSAGE_PREFIX,
  232. CUSTOM_OFF_SWITCH_MESSAGE,
  233. getAssistantMessageFromError,
  234. getErrorMessageIfRefusal,
  235. } from './errors.js'
  236. import {
  237. EMPTY_USAGE,
  238. type GlobalCacheStrategy,
  239. logAPIError,
  240. logAPIQuery,
  241. logAPISuccessAndDuration,
  242. type NonNullableUsage,
  243. } from './logging.js'
  244. import {
  245. CACHE_TTL_1HOUR_MS,
  246. checkResponseForCacheBreak,
  247. recordPromptState,
  248. } from './promptCacheBreakDetection.js'
  249. import {
  250. CannotRetryError,
  251. FallbackTriggeredError,
  252. is529Error,
  253. type RetryContext,
  254. withRetry,
  255. } from './withRetry.js'
  /** Any value that can appear in a parsed JSON document. */
  type JsonValue = string | number | boolean | null | JsonObject | JsonArray
  /** A JSON object: string keys mapped to JSON values. */
  type JsonObject = { [key: string]: JsonValue }
  /** A JSON array. */
  type JsonArray = JsonValue[]

  /**
   * Assemble the extra body parameters for the API request, based on the
   * CLAUDE_CODE_EXTRA_BODY environment variable if present and on any beta
   * headers (primarily for Bedrock requests).
   *
   * The returned `anthropic_beta` array is always a fresh array: it is never
   * the caller's `betaHeaders` array and never the array held by the parsed
   * env value, so later mutation of either cannot leak into the request body.
   * Headers already present (from the env value or earlier in `betaHeaders`)
   * are not added a second time.
   *
   * @param betaHeaders - An array of beta headers to include in the request.
   * @returns A JSON object representing the extra body parameters.
   */
  export function getExtraBodyParams(betaHeaders?: string[]): JsonObject {
    let result: JsonObject = {}

    // Start from the user's extra body parameters, if any.
    const extraBodyStr = process.env.CLAUDE_CODE_EXTRA_BODY
    if (extraBodyStr) {
      try {
        // JSON can be null, boolean, number, string, array or object; only a
        // plain object can be spread into the API parameters.
        const parsed = safeParseJSON(extraBodyStr)
        if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) {
          // Shallow clone — safeParseJSON is LRU-cached and returns the same
          // object reference for the same string. Mutating `result` below
          // would poison the cache, causing stale values to persist.
          result = { ...(parsed as JsonObject) }
        } else {
          logForDebugging(
            `CLAUDE_CODE_EXTRA_BODY env var must be a JSON object, but was given ${extraBodyStr}`,
            { level: 'error' },
          )
        }
      } catch (error) {
        logForDebugging(
          `Error parsing CLAUDE_CODE_EXTRA_BODY: ${errorMessage(error)}`,
          { level: 'error' },
        )
      }
    }

    // Anti-distillation: send fake_tools opt-in for 1P CLI only.
    // NOTE(review): the `feature(...) ? ... : false` shape is kept as-is;
    // presumably it lets the bundler drop the branch — confirm before changing.
    if (
      feature('ANTI_DISTILLATION_CC')
        ? process.env.CLAUDE_CODE_ENTRYPOINT === 'cli' &&
          shouldIncludeFirstPartyOnlyBetas() &&
          getFeatureValue_CACHED_MAY_BE_STALE(
            'tengu_anti_distill_fake_tool_injection',
            false,
          )
        : false
    ) {
      result.anti_distillation = ['fake_tools']
    }

    // Merge the beta headers into a fresh array, skipping duplicates.
    if (betaHeaders && betaHeaders.length > 0) {
      const current = result.anthropic_beta
      // A non-array `anthropic_beta` from the env value is replaced outright.
      const merged: JsonValue[] = Array.isArray(current) ? [...current] : []
      for (const header of betaHeaders) {
        if (!merged.includes(header)) {
          merged.push(header)
        }
      }
      result.anthropic_beta = merged
    }

    return result
  }
  /**
   * Decide whether prompt caching is enabled for `model`.
   *
   * DISABLE_PROMPT_CACHING turns caching off for every model. The
   * DISABLE_PROMPT_CACHING_HAIKU / _SONNET / _OPUS variables turn it off only
   * when `model` equals the small/fast model, the default Sonnet model, or the
   * default Opus model respectively.
   *
   * @param model - The model string the request will be sent with.
   * @returns `false` when a matching opt-out is set, otherwise `true`.
   */
  export function getPromptCachingEnabled(model: string): boolean {
    // The global switch wins over everything else.
    if (isEnvTruthy(process.env.DISABLE_PROMPT_CACHING)) {
      return false
    }

    // Per-model switches, checked in order: small/fast, Sonnet, Opus.
    // Each model getter only runs when its switch is truthy.
    const perModelOptOuts = [
      {
        flag: process.env.DISABLE_PROMPT_CACHING_HAIKU,
        resolveModel: () => getSmallFastModel(),
      },
      {
        flag: process.env.DISABLE_PROMPT_CACHING_SONNET,
        resolveModel: () => getDefaultSonnetModel(),
      },
      {
        flag: process.env.DISABLE_PROMPT_CACHING_OPUS,
        resolveModel: () => getDefaultOpusModel(),
      },
    ]

    const optedOut = perModelOptOuts.some(
      ({ flag, resolveModel }) => isEnvTruthy(flag) && model === resolveModel(),
    )
    return !optedOut
  }
  /**
   * Build the `cache_control` value for a cacheable block.
   *
   * @param scope - Cache scope; it is only emitted when it is `'global'`.
   * @param querySource - Source of the query, used to decide whether the 1h
   *   TTL applies (see should1hCacheTTL).
   * @returns An ephemeral cache-control object, with `ttl: '1h'` and/or
   *   `scope` added when applicable.
   */
  export function getCacheControl({
    scope,
    querySource,
  }: {
    scope?: CacheScope
    querySource?: QuerySource
  } = {}): {
    type: 'ephemeral'
    ttl?: '1h'
    scope?: CacheScope
  } {
    const cacheControl: {
      type: 'ephemeral'
      ttl?: '1h'
      scope?: CacheScope
    } = { type: 'ephemeral' }

    if (should1hCacheTTL(querySource)) {
      cacheControl.ttl = '1h'
    }
    if (scope === 'global') {
      cacheControl.scope = scope
    }
    return cacheControl
  }
  /**
   * Determines if 1h TTL should be used for prompt caching.
   *
   * Bedrock requests get the 1h TTL whenever ENABLE_PROMPT_CACHING_1H_BEDROCK
   * is truthy. Otherwise it is only applied when:
   * 1. User is eligible (ant or subscriber within rate limits)
   * 2. The query source matches a pattern in the GrowthBook allowlist
   *
   * GrowthBook config shape: { allowlist: string[] }
   * Patterns support trailing '*' for prefix matching.
   * Examples:
   * - { allowlist: ["repl_main_thread*", "sdk"] } — main thread + SDK only
   * - { allowlist: ["repl_main_thread*", "sdk", "agent:*"] } — also subagents
   * - { allowlist: ["*"] } — all sources
   *
   * Both the eligibility flag and the allowlist are latched in bootstrap state
   * the first time they are computed, so they stay stable for the session.
   *
   * @param querySource - Source of the query; `undefined` never matches.
   * @returns `true` when the 1h TTL should be sent.
   */
  function should1hCacheTTL(querySource?: QuerySource): boolean {
    // 3P Bedrock users get 1h TTL when opted in via env var — they manage their
    // own billing. No GrowthBook gating needed since 3P users don't have
    // GrowthBook configured.
    const bedrockOptIn =
      getAPIProvider() === 'bedrock' &&
      isEnvTruthy(process.env.ENABLE_PROMPT_CACHING_1H_BEDROCK)
    if (bedrockOptIn) {
      return true
    }

    // Latch eligibility for session stability — prevents mid-session overage
    // flips from changing the cache_control TTL, which would bust the
    // server-side prompt cache (~20K tokens per flip).
    let eligible = getPromptCache1hEligible()
    if (eligible === null) {
      eligible =
        process.env.USER_TYPE === 'ant' ||
        (isClaudeAISubscriber() && !currentLimits.isUsingOverage)
      setPromptCache1hEligible(eligible)
    }
    if (!eligible) {
      return false
    }

    // Latch the allowlist too — prevents mixed TTLs when GrowthBook's disk
    // cache updates mid-request.
    let patterns = getPromptCache1hAllowlist()
    if (patterns === null) {
      const growthbookConfig = getFeatureValue_CACHED_MAY_BE_STALE<{
        allowlist?: string[]
      }>('tengu_prompt_cache_1h_config', {})
      patterns = growthbookConfig.allowlist ?? []
      setPromptCache1hAllowlist(patterns)
    }

    if (querySource === undefined) {
      return false
    }
    for (const pattern of patterns) {
      // A trailing '*' means prefix match; anything else is an exact match.
      const matches = pattern.endsWith('*')
        ? querySource.startsWith(pattern.slice(0, -1))
        : querySource === pattern
      if (matches) {
        return true
      }
    }
    return false
  }
  /**
   * Configure effort parameters for API request.
   *
   * Does nothing when the model does not support effort or when
   * `outputConfig` already has an `effort` key. Otherwise:
   * - `undefined` effort: only the effort beta header is added;
   * - string effort: it is written to `outputConfig.effort` and the beta
   *   header is added;
   * - any other (numeric) effort: when USER_TYPE is 'ant', it is written to
   *   `extraBodyParams.anthropic_internal.effort_override`; otherwise ignored.
   *
   * @param effortValue - Requested effort, if any.
   * @param outputConfig - Output config to mutate in place.
   * @param extraBodyParams - Extra body parameters to mutate in place.
   * @param betas - Beta header list to append to in place.
   * @param model - Model the request targets.
   */
  function configureEffortParams(
    effortValue: EffortValue | undefined,
    outputConfig: BetaOutputConfig,
    extraBodyParams: Record<string, unknown>,
    betas: string[],
    model: string,
  ): void {
    if (!modelSupportsEffort(model) || 'effort' in outputConfig) {
      return
    }

    switch (typeof effortValue) {
      case 'undefined':
        betas.push(EFFORT_BETA_HEADER)
        return
      case 'string':
        // Send string effort level as is
        outputConfig.effort = effortValue
        betas.push(EFFORT_BETA_HEADER)
        return
      default: {
        // Numeric effort override - ant-only (uses anthropic_internal)
        if (process.env.USER_TYPE !== 'ant') {
          return
        }
        const priorInternal =
          (extraBodyParams.anthropic_internal as Record<string, unknown>) || {}
        extraBodyParams.anthropic_internal = {
          ...priorInternal,
          effort_override: effortValue,
        }
      }
    }
  }
  // output_config.task_budget — API-side token budget awareness for the model.
  // Stainless SDK types don't yet include task_budget on BetaOutputConfig, so we
  // define the wire shape locally and cast. The API validates on receipt; see
  // api/api/schemas/messages/request/output_config.py:12-39 in the monorepo.
  // Beta: task-budgets-2026-03-13 (EAP, claude-strudel-eap only as of Mar 2026).
  type TaskBudgetParam = {
    type: 'tokens'
    total: number
    remaining?: number
  }

  /**
   * Write the task budget into `outputConfig.task_budget` and make sure the
   * task-budgets beta header is present in `betas`.
   *
   * Does nothing when no budget is given, when `outputConfig` already has a
   * `task_budget` key, or when first-party-only betas should not be included.
   *
   * @param taskBudget - Budget to send; `remaining` is only sent when defined.
   * @param outputConfig - Output config to mutate in place.
   * @param betas - Beta header list; the header is appended at most once.
   */
  export function configureTaskBudgetParams(
    taskBudget: Options['taskBudget'],
    outputConfig: BetaOutputConfig & { task_budget?: TaskBudgetParam },
    betas: string[],
  ): void {
    if (!taskBudget) {
      return
    }
    if ('task_budget' in outputConfig) {
      return
    }
    if (!shouldIncludeFirstPartyOnlyBetas()) {
      return
    }

    const wireBudget: TaskBudgetParam = {
      type: 'tokens',
      total: taskBudget.total,
    }
    if (taskBudget.remaining !== undefined) {
      wireBudget.remaining = taskBudget.remaining
    }
    outputConfig.task_budget = wireBudget

    if (betas.indexOf(TASK_BUDGETS_BETA_HEADER) === -1) {
      betas.push(TASK_BUDGETS_BETA_HEADER)
    }
  }
  /**
   * Build the `metadata` object sent with API requests.
   *
   * `user_id` is a JSON string combining any object supplied through
   * CLAUDE_CODE_EXTRA_METADATA with the device id, the OAuth account UUID and
   * the session id. The built-in keys are written after the extra ones, so
   * they take precedence on a name clash.
   *
   * @returns An object with a single `user_id` string field.
   */
  export function getAPIMetadata() {
    // https://docs.google.com/document/d/1dURO9ycXXQCBS0V4Vhl4poDBRgkelFc5t2BNPoEgH5Q/edit?tab=t.0#heading=h.5g7nec5b09w5
    const extraStr = process.env.CLAUDE_CODE_EXTRA_METADATA
    let extra: JsonObject = {}
    if (extraStr) {
      const parsed = safeParseJSON(extraStr, false)
      const isPlainObject =
        parsed && typeof parsed === 'object' && !Array.isArray(parsed)
      if (isPlainObject) {
        extra = parsed as JsonObject
      } else {
        logForDebugging(
          `CLAUDE_CODE_EXTRA_METADATA env var must be a JSON object, but was given ${extraStr}`,
          { level: 'error' },
        )
      }
    }

    const identity = {
      ...extra,
      device_id: getOrCreateUserID(),
      // The key is always present; it is '' when no OAuth account UUID is
      // available.
      account_uuid: getOauthAccountInfo()?.accountUuid ?? '',
      session_id: getSessionId(),
    }
    return { user_id: jsonStringify(identity) }
  }
  /**
   * Check an API key by sending a minimal one-token request with it.
   *
   * @param apiKey - The key to verify.
   * @param isNonInteractiveSession - When true (print mode) the check is
   *   skipped and the key is reported as valid without any request.
   * @returns `true` when the request succeeds (or the check is skipped);
   *   `false` when the failure message contains the "invalid x-api-key"
   *   authentication error payload.
   * @throws Any other failure, after logging it. A CannotRetryError is
   *   unwrapped to its `originalError` first.
   */
  export async function verifyApiKey(
    apiKey: string,
    isNonInteractiveSession: boolean,
  ): Promise<boolean> {
    // Skip API verification if running in print mode (isNonInteractiveSession)
    if (isNonInteractiveSession) {
      return true
    }

    try {
      // WARNING: if you change this to use a non-Haiku model, this request will fail in 1P unless it uses getCLISyspromptPrefix.
      const model = getSmallFastModel()
      const betas = getModelBetas(model)

      const createClient = () =>
        getAnthropicClient({
          apiKey,
          maxRetries: 3,
          model,
          source: 'verify_api_key',
        })

      const attempts = withRetry(
        createClient,
        async anthropic => {
          const probeMessages: MessageParam[] = [
            { role: 'user', content: 'test' },
          ]
          // biome-ignore lint/plugin: API key verification is intentionally a minimal direct call
          await anthropic.beta.messages.create({
            model,
            max_tokens: 1,
            messages: probeMessages,
            temperature: 1,
            ...(betas.length > 0 && { betas }),
            metadata: getAPIMetadata(),
            ...getExtraBodyParams(),
          })
          return true
        },
        // Use fewer retries for API key verification
        { maxRetries: 2, model, thinkingConfig: { type: 'disabled' } },
      )
      return await returnValue(attempts)
    } catch (errorFromRetry) {
      const error =
        errorFromRetry instanceof CannotRetryError
          ? errorFromRetry.originalError
          : errorFromRetry
      logError(error)

      // Check for authentication error
      const isInvalidKey =
        error instanceof Error &&
        error.message.includes(
          '{"type":"error","error":{"type":"authentication_error","message":"invalid x-api-key"}}',
        )
      if (isInvalidKey) {
        return false
      }
      throw error
    }
  }
  /**
   * Converts an internal user message into an API `MessageParam`.
   *
   * @param message             Message to convert.
   * @param addCache            When true, the result is eligible for a cache
   *                            breakpoint: string content becomes a single text
   *                            block, and array content is copied block by block.
   * @param enablePromptCaching When true (and `addCache` is set), `cache_control`
   *                            is attached to the last block.
   * @param querySource         Forwarded to `getCacheControl`.
   */
  export function userMessageToMessageParam(
    message: UserMessage,
    addCache = false,
    enablePromptCaching: boolean,
    querySource?: QuerySource,
  ): MessageParam {
    const { content } = message.message

    if (!addCache) {
      // Clone array content to prevent in-place mutations (e.g., insertCacheEditsBlock's
      // splice) from contaminating the original message. Without cloning, multiple calls
      // to addCacheBreakpoints share the same array and each splices in duplicate cache_edits.
      return {
        role: 'user',
        content: Array.isArray(content) ? [...content] : content,
      }
    }

    if (typeof content === 'string') {
      return {
        role: 'user',
        content: [
          {
            type: 'text',
            text: content,
            ...(enablePromptCaching
              ? { cache_control: getCacheControl({ querySource }) }
              : {}),
          },
        ],
      }
    }

    // Only the final block carries the cache breakpoint.
    const lastIndex = content.length - 1
    return {
      role: 'user',
      content: content.map((block, index) => {
        const cacheMarker =
          index === lastIndex && enablePromptCaching
            ? { cache_control: getCacheControl({ querySource }) }
            : {}
        return { ...block, ...cacheMarker }
      }),
    }
  }
  /**
   * Converts an internal assistant message into an API `MessageParam`.
   *
   * @param message             Message to convert.
   * @param addCache            When true, the result is eligible for a cache
   *                            breakpoint; otherwise the content is passed
   *                            through unchanged.
   * @param enablePromptCaching When true (and `addCache` is set), `cache_control`
   *                            is attached to the last block, unless that block
   *                            is thinking, redacted thinking, or (with the
   *                            CONNECTOR_TEXT feature on) a connector text block.
   * @param querySource         Forwarded to `getCacheControl`.
   */
  export function assistantMessageToMessageParam(
    message: AssistantMessage,
    addCache = false,
    enablePromptCaching: boolean,
    querySource?: QuerySource,
  ): MessageParam {
    const { content } = message.message

    if (!addCache) {
      return { role: 'assistant', content }
    }

    if (typeof content === 'string') {
      return {
        role: 'assistant',
        content: [
          {
            type: 'text',
            text: content,
            ...(enablePromptCaching
              ? { cache_control: getCacheControl({ querySource }) }
              : {}),
          },
        ],
      }
    }

    const lastIndex = content.length - 1
    return {
      role: 'assistant',
      content: content.map((block, index) => {
        // Only the final block may carry the marker, and only if its type allows it.
        const isCacheableTail =
          index === lastIndex &&
          block.type !== 'thinking' &&
          block.type !== 'redacted_thinking' &&
          (feature('CONNECTOR_TEXT') ? !isConnectorTextBlock(block) : true)
        const cacheMarker =
          isCacheableTail && enablePromptCaching
            ? { cache_control: getCacheControl({ querySource }) }
            : {}
        return { ...block, ...cacheMarker }
      }),
    }
  }
  /** Per-query settings accepted by the model query functions in this file. */
  export type Options = {
    // Forwarded to the tool-search check and to tool schema generation.
    getToolPermissionContext: () => Promise<ToolPermissionContext>
    // Model requested for this query.
    model: string
    toolChoice?: BetaToolChoiceTool | BetaToolChoiceAuto | undefined
    // Forwarded to the CLI system prompt prefix.
    isNonInteractiveSession: boolean
    // Extra schemas appended after the generated tool schemas.
    extraToolSchemas?: BetaToolUnion[]
    maxOutputTokensOverride?: number
    fallbackModel?: string
    onStreamingFallback?: () => void
    // Origin of the query; also decides whether the query counts as agentic.
    querySource: QuerySource
    // Forwarded to the tool-search check and to tool schema generation.
    agents: AgentDefinition[]
    // Forwarded to tool schema generation.
    allowedAgentTypes?: string[]
    // Forwarded to the CLI system prompt prefix.
    hasAppendSystemPrompt: boolean
    fetchOverride?: ClientOptions['fetch']
    // When undefined, the per-model prompt caching default is used.
    enablePromptCaching?: boolean
    skipCacheWrite?: boolean
    temperatureOverride?: number
    effortValue?: EffortValue
    mcpTools: Tools
    // Keeps tool search available even when no deferred tools exist yet.
    hasPendingMcpServers?: boolean
    queryTracking?: QueryChainTracking
    agentId?: AgentId // Only set for subagents
    outputFormat?: BetaJSONOutputFormat
    // Requested fast mode; only takes effect when the fast-mode checks also pass.
    fastMode?: boolean
    // Requested advisor model; a matching experiment assignment may override it.
    advisorModel?: string
    addNotification?: (notif: Notification) => void
    // API-side task budget (output_config.task_budget). Distinct from the
    // tokenBudget.ts +500k auto-continue feature — this one is sent to the API
    // so the model can pace itself. `remaining` is computed by the caller
    // (query.ts decrements across the agentic loop).
    taskBudget?: { total: number; remaining?: number }
  }
  /**
   * Runs a model query and resolves with the last assistant message it produced.
   *
   * @throws APIUserAbortError when no assistant message arrived and `signal`
   *         is aborted.
   * @throws Error when no assistant message arrived for any other reason.
   */
  export async function queryModelWithoutStreaming({
    messages,
    systemPrompt,
    thinkingConfig,
    tools,
    signal,
    options,
  }: {
    messages: Message[]
    systemPrompt: SystemPrompt
    thinkingConfig: ThinkingConfig
    tools: Tools
    signal: AbortSignal
    options: Options
  }): Promise<AssistantMessage> {
    const events = withStreamingVCR(messages, async function* () {
      yield* queryModel(
        messages,
        systemPrompt,
        thinkingConfig,
        tools,
        signal,
        options,
      )
    })

    // Remember the assistant message but keep draining the generator so that
    // logAPISuccessAndDuration gets called (it runs after all yields).
    let lastAssistant: AssistantMessage | undefined
    for await (const event of events) {
      if (event.type !== 'assistant') continue
      lastAssistant = event
    }

    if (lastAssistant) {
      return lastAssistant
    }
    // If the signal was aborted, throw APIUserAbortError instead of a generic
    // error so callers can handle abort scenarios gracefully.
    if (signal.aborted) {
      throw new APIUserAbortError()
    }
    // The message text below reads "assistant message not found".
    throw new Error('未找到助手消息')
  }
  /**
   * Streaming variant of the model query: re-yields everything `queryModel`
   * produces, routed through `withStreamingVCR`.
   */
  export async function* queryModelWithStreaming({
    messages,
    systemPrompt,
    thinkingConfig,
    tools,
    signal,
    options,
  }: {
    messages: Message[]
    systemPrompt: SystemPrompt
    thinkingConfig: ThinkingConfig
    tools: Tools
    signal: AbortSignal
    options: Options
  }): AsyncGenerator<
    StreamEvent | AssistantMessage | SystemAPIErrorMessage,
    void
  > {
    const runQuery = async function* () {
      yield* queryModel(
        messages,
        systemPrompt,
        thinkingConfig,
        tools,
        signal,
        options,
      )
    }
    return yield* withStreamingVCR(messages, runQuery)
  }
  /**
   * Tells whether an LSP tool should be sent with defer_loading: true because
   * LSP initialization has not completed yet.
   *
   * @returns `false` for tools without a truthy `isLsp` flag; otherwise `true`
   *          while the initialization status is 'pending' or 'not-started'.
   */
  function shouldDeferLspTool(tool: Tool): boolean {
    const isLspTool = 'isLsp' in tool && tool.isLsp
    if (!isLspTool) {
      return false
    }
    // Defer when pending or not started
    const { status } = getInitializationStatus()
    return status === 'pending' || status === 'not-started'
  }
  /**
   * Per-attempt timeout for non-streaming fallback requests, in milliseconds.
   * Reads API_TIMEOUT_MS when set so slow backends and the streaming path
   * share the same ceiling. Only a positive integer value is honored; an
   * unset, non-numeric, zero or negative value falls back to the defaults.
   *
   * Remote sessions default to 120s to stay under CCR's container idle-kill
   * (~5min) so a hung fallback to a wedged backend surfaces a clean
   * APIConnectionTimeoutError instead of stalling past SIGKILL.
   *
   * Otherwise defaults to 300s — long enough for slow backends without
   * approaching the API's 10-minute non-streaming boundary.
   *
   * @returns Timeout in milliseconds, always greater than zero.
   */
  function getNonstreamingFallbackTimeoutMs(): number {
    const override = parseInt(process.env.API_TIMEOUT_MS || '', 10)
    // NaN fails this comparison too, so one check rejects unset, garbage,
    // zero and negative values. A negative number is not a usable deadline.
    if (override > 0) return override
    return isEnvTruthy(process.env.CLAUDE_CODE_REMOTE) ? 120_000 : 300_000
  }
  /**
   * Helper generator for non-streaming API requests.
   *
   * Wraps the request in a withRetry generator, yields the system messages
   * that generator emits, and returns the final BetaMessage.
   *
   * @param clientOptions     Settings used to create the API client per attempt.
   * @param retryOptions      Settings forwarded to withRetry.
   * @param paramsFromContext Builds the request params for the current retry context.
   * @param onAttempt         Called before each attempt with the attempt number,
   *                          start timestamp and the params' max_tokens.
   * @param captureRequest    Called with the params of each attempt.
   */
  export async function* executeNonStreamingRequest(
    clientOptions: {
      model: string
      fetchOverride?: Options['fetchOverride']
      source: string
    },
    retryOptions: {
      model: string
      fallbackModel?: string
      thinkingConfig: ThinkingConfig
      fastMode?: boolean
      signal: AbortSignal
      initialConsecutive529Errors?: number
      querySource?: QuerySource
    },
    paramsFromContext: (context: RetryContext) => BetaMessageStreamParams,
    onAttempt: (attempt: number, start: number, maxOutputTokens: number) => void,
    captureRequest: (params: BetaMessageStreamParams) => void,
    /**
     * Request ID of the failed streaming attempt this fallback is recovering
     * from. Emitted in tengu_nonstreaming_fallback_error for funnel correlation.
     */
    originatingRequestId?: string | null,
  ): AsyncGenerator<SystemAPIErrorMessage, BetaMessage> {
    const timeoutMs = getNonstreamingFallbackTimeoutMs()

    const attempts = withRetry(
      () =>
        getAnthropicClient({
          // Retries are driven by withRetry, not by the client.
          maxRetries: 0,
          model: clientOptions.model,
          fetchOverride: clientOptions.fetchOverride,
          source: clientOptions.source,
        }),
      async (anthropic, attempt, context) => {
        const start = Date.now()
        const params = paramsFromContext(context)
        captureRequest(params)
        onAttempt(attempt, start, params.max_tokens)

        const nonStreamingParams = adjustParamsForNonStreaming(
          params,
          MAX_NON_STREAMING_TOKENS,
        )

        try {
          // biome-ignore lint/plugin: non-streaming API call
          return await anthropic.beta.messages.create(
            {
              ...nonStreamingParams,
              model: normalizeModelStringForAPI(nonStreamingParams.model),
            },
            {
              signal: retryOptions.signal,
              timeout: timeoutMs,
            },
          )
        } catch (err) {
          // User aborts are not errors — re-throw immediately without logging
          if (err instanceof APIUserAbortError) throw err

          // Instrumentation: record when the non-streaming request errors (including
          // timeouts). Lets us distinguish "fallback hung past container kill"
          // (no event) from "fallback hit the bounded timeout" (this event).
          logForDiagnosticsNoPII('error', 'cli_nonstreaming_fallback_error')
          const errorName = err instanceof Error ? err.name : 'unknown'
          logEvent('tengu_nonstreaming_fallback_error', {
            model:
              clientOptions.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
            error:
              errorName as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
            attempt,
            timeout_ms: timeoutMs,
            request_id: (originatingRequestId ??
              'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
          })
          throw err
        }
      },
      {
        model: retryOptions.model,
        fallbackModel: retryOptions.fallbackModel,
        thinkingConfig: retryOptions.thinkingConfig,
        ...(isFastModeEnabled() && { fastMode: retryOptions.fastMode }),
        signal: retryOptions.signal,
        initialConsecutive529Errors: retryOptions.initialConsecutive529Errors,
        querySource: retryOptions.querySource,
      },
    )

    // Drain the retry generator: forward system messages, finish with its
    // return value.
    for (;;) {
      const step = await attempts.next()
      if (step.done) {
        return step.value as BetaMessage
      }
      if (step.value.type === 'system') {
        yield step.value
      }
    }
  }
  /**
   * Finds the request ID of the most recent assistant message that has one.
   * Used to link consecutive API requests in analytics so they can be joined
   * for cache-hit-rate analysis and incremental token tracking.
   *
   * Reading it from the message array (rather than global state) means each
   * query chain (main thread, subagent, teammate) tracks its own request chain
   * independently, and rollback/undo naturally updates the value.
   *
   * @returns The request ID, or `undefined` when no assistant message carries one.
   */
  function getPreviousRequestIdFromMessages(
    messages: Message[],
  ): string | undefined {
    // Walk from newest to oldest.
    let index = messages.length
    while (index-- > 0) {
      const candidate = messages[index]!
      if (candidate.type !== 'assistant') continue
      if (candidate.requestId) {
        return candidate.requestId
      }
    }
    return undefined
  }
  /** Type guard: true for image and document content blocks. */
  function isMedia(
    block: BetaContentBlockParam,
  ): block is BetaImageBlockParam | BetaRequestDocumentBlock {
    switch (block.type) {
      case 'image':
      case 'document':
        return true
      default:
        return false
    }
  }
  /** Type guard: true for tool_result content blocks. */
  function isToolResult(
    block: BetaContentBlockParam,
  ): block is BetaToolResultBlockParam {
    const { type } = block
    return type === 'tool_result'
  }
  /**
   * Ensures messages contain at most `limit` media items (images + documents),
   * counting both top-level blocks and blocks nested inside tool_result content.
   * Media is removed starting from the earliest messages so the most recent
   * items survive; within one message, nested tool_result media is removed
   * before top-level media.
   *
   * The input is never mutated. Messages and blocks that lose nothing are
   * returned as the same objects, and the input array itself is returned when
   * nothing needs to be removed.
   */
  export function stripExcessMediaItems(
    messages: (UserMessage | AssistantMessage)[],
    limit: number,
  ): (UserMessage | AssistantMessage)[] {
    // Pass 1: count every media item in the conversation.
    let mediaCount = 0
    for (const { message } of messages) {
      const blocks = message.content
      if (!Array.isArray(blocks)) continue
      for (const block of blocks) {
        if (isMedia(block)) {
          mediaCount += 1
        }
        if (isToolResult(block) && Array.isArray(block.content)) {
          for (const nested of block.content) {
            if (isMedia(nested)) {
              mediaCount += 1
            }
          }
        }
      }
    }

    let excess = mediaCount - limit
    if (excess <= 0) return messages

    // Pass 2: drop media, oldest message first, until the excess is used up.
    return messages.map(msg => {
      const blocks = msg.message.content
      if (excess <= 0 || !Array.isArray(blocks)) return msg
      const excessBefore = excess

      const withTrimmedToolResults = blocks.map(block => {
        if (
          excess <= 0 ||
          !isToolResult(block) ||
          !Array.isArray(block.content)
        ) {
          return block
        }
        const kept = block.content.filter(nested => {
          if (excess <= 0 || !isMedia(nested)) return true
          excess -= 1
          return false
        })
        // Keep the original block object when nothing was taken out of it.
        return kept.length === block.content.length
          ? block
          : { ...block, content: kept }
      })

      const remaining = withTrimmedToolResults.filter(block => {
        if (excess <= 0 || !isMedia(block)) return true
        excess -= 1
        return false
      })

      if (excess === excessBefore) return msg
      return {
        ...msg,
        message: { ...msg.message, content: remaining },
      }
    }) as (UserMessage | AssistantMessage)[]
  }
  970. async function* queryModel(
  971. messages: Message[],
  972. systemPrompt: SystemPrompt,
  973. thinkingConfig: ThinkingConfig,
  974. tools: Tools,
  975. signal: AbortSignal,
  976. options: Options,
  977. ): AsyncGenerator<
  978. StreamEvent | AssistantMessage | SystemAPIErrorMessage,
  979. void
  980. > {
  981. // Check cheap conditions first — the off-switch await blocks on GrowthBook
  982. // init (~10ms). For non-Opus models (haiku, sonnet) this skips the await
  983. // entirely. Subscribers don't hit this path at all.
  984. if (
  985. !isClaudeAISubscriber() &&
  986. isNonCustomOpusModel(options.model) &&
  987. (
  988. await getDynamicConfig_BLOCKS_ON_INIT<{ activated: boolean }>(
  989. 'tengu-off-switch',
  990. {
  991. activated: false,
  992. },
  993. )
  994. ).activated
  995. ) {
  996. logEvent('tengu_off_switch_query', {})
  997. yield getAssistantMessageFromError(
  998. new Error(CUSTOM_OFF_SWITCH_MESSAGE),
  999. options.model,
  1000. )
  1001. return
  1002. }
  1003. // Derive previous request ID from the last assistant message in this query chain.
  1004. // This is scoped per message array (main thread, subagent, teammate each have their own),
  1005. // so concurrent agents don't clobber each other's request chain tracking.
  1006. // Also naturally handles rollback/undo since removed messages won't be in the array.
  1007. const previousRequestId = getPreviousRequestIdFromMessages(messages)
  1008. const resolvedModel =
  1009. getAPIProvider() === 'bedrock' &&
  1010. options.model.includes('application-inference-profile')
  1011. ? ((await getInferenceProfileBackingModel(options.model)) ??
  1012. options.model)
  1013. : options.model
  1014. queryCheckpoint('query_tool_schema_build_start')
  1015. const isAgenticQuery =
  1016. options.querySource.startsWith('repl_main_thread') ||
  1017. options.querySource.startsWith('agent:') ||
  1018. options.querySource === 'sdk' ||
  1019. options.querySource === 'hook_agent' ||
  1020. options.querySource === 'verification_agent'
  1021. const betas = getMergedBetas(options.model, { isAgenticQuery })
  1022. // Always send the advisor beta header when advisor is enabled, so
  1023. // non-agentic queries (compact, side_question, extract_memories, etc.)
  1024. // can parse advisor server_tool_use blocks already in the conversation history.
  1025. if (isAdvisorEnabled()) {
  1026. betas.push(ADVISOR_BETA_HEADER)
  1027. }
  1028. let advisorModel: string | undefined
  1029. if (isAgenticQuery && isAdvisorEnabled()) {
  1030. let advisorOption = options.advisorModel
  1031. const advisorExperiment = getExperimentAdvisorModels()
  1032. if (advisorExperiment !== undefined) {
  1033. if (
  1034. normalizeModelStringForAPI(advisorExperiment.baseModel) ===
  1035. normalizeModelStringForAPI(options.model)
  1036. ) {
  1037. // Override the advisor model if the base model matches. We
  1038. // should only have experiment models if the user cannot
  1039. // configure it themselves.
  1040. advisorOption = advisorExperiment.advisorModel
  1041. }
  1042. }
  1043. if (advisorOption) {
  1044. const normalizedAdvisorModel = normalizeModelStringForAPI(
  1045. parseUserSpecifiedModel(advisorOption),
  1046. )
  1047. if (!modelSupportsAdvisor(options.model)) {
  1048. logForDebugging(
  1049. `[AdvisorTool] Skipping advisor - base model ${options.model} does not support advisor`,
  1050. )
  1051. } else if (!isValidAdvisorModel(normalizedAdvisorModel)) {
  1052. logForDebugging(
  1053. `[AdvisorTool] Skipping advisor - ${normalizedAdvisorModel} is not a valid advisor model`,
  1054. )
  1055. } else {
  1056. advisorModel = normalizedAdvisorModel
  1057. logForDebugging(
  1058. `[AdvisorTool] Server-side tool enabled with ${advisorModel} as the advisor model`,
  1059. )
  1060. }
  1061. }
  1062. }
  1063. // Check if tool search is enabled (checks mode, model support, and threshold for auto mode)
  1064. // This is async because it may need to calculate MCP tool description sizes for TstAuto mode
  1065. let useToolSearch = await isToolSearchEnabled(
  1066. options.model,
  1067. tools,
  1068. options.getToolPermissionContext,
  1069. options.agents,
  1070. 'query',
  1071. )
  1072. // Precompute once — isDeferredTool does 2 GrowthBook lookups per call
  1073. const deferredToolNames = new Set<string>()
  1074. if (useToolSearch) {
  1075. for (const t of tools) {
  1076. if (isDeferredTool(t)) deferredToolNames.add(t.name)
  1077. }
  1078. }
  1079. // Even if tool search mode is enabled, skip if there are no deferred tools
  1080. // AND no MCP servers are still connecting. When servers are pending, keep
  1081. // ToolSearch available so the model can discover tools after they connect.
  1082. if (
  1083. useToolSearch &&
  1084. deferredToolNames.size === 0 &&
  1085. !options.hasPendingMcpServers
  1086. ) {
  1087. logForDebugging(
  1088. 'Tool search disabled: no deferred tools available to search',
  1089. )
  1090. useToolSearch = false
  1091. }
  1092. // Filter out ToolSearchTool if tool search is not enabled for this model
  1093. // ToolSearchTool returns tool_reference blocks which unsupported models can't handle
  1094. let filteredTools: Tools
  1095. if (useToolSearch) {
  1096. // Dynamic tool loading: Only include deferred tools that have been discovered
  1097. // via tool_reference blocks in the message history. This eliminates the need
  1098. // to predeclare all deferred tools upfront and removes limits on tool quantity.
  1099. const discoveredToolNames = extractDiscoveredToolNames(messages)
  1100. filteredTools = tools.filter(tool => {
  1101. // Always include non-deferred tools
  1102. if (!deferredToolNames.has(tool.name)) return true
  1103. // Always include ToolSearchTool (so it can discover more tools)
  1104. if (toolMatchesName(tool, TOOL_SEARCH_TOOL_NAME)) return true
  1105. // Only include deferred tools that have been discovered
  1106. return discoveredToolNames.has(tool.name)
  1107. })
  1108. } else {
  1109. filteredTools = tools.filter(
  1110. t => !toolMatchesName(t, TOOL_SEARCH_TOOL_NAME),
  1111. )
  1112. }
  1113. // Add tool search beta header if enabled - required for defer_loading to be accepted
  1114. // Header differs by provider: 1P/Foundry use advanced-tool-use, Vertex/Bedrock use tool-search-tool
  1115. // For Bedrock, this header must go in extraBodyParams, not the betas array
  1116. const toolSearchHeader = useToolSearch ? getToolSearchBetaHeader() : null
  1117. if (toolSearchHeader && getAPIProvider() !== 'bedrock') {
  1118. if (!betas.includes(toolSearchHeader)) {
  1119. betas.push(toolSearchHeader)
  1120. }
  1121. }
  1122. // Determine if cached microcompact is enabled for this model.
  1123. // Computed once here (in async context) and captured by paramsFromContext.
  1124. // The beta header is also captured here to avoid a top-level import of the
  1125. // ant-only CACHE_EDITING_BETA_HEADER constant.
  1126. let cachedMCEnabled = false
  1127. let cacheEditingBetaHeader = ''
  1128. if (feature('CACHED_MICROCOMPACT')) {
  1129. const {
  1130. isCachedMicrocompactEnabled,
  1131. isModelSupportedForCacheEditing,
  1132. getCachedMCConfig,
  1133. } = await import('../compact/cachedMicrocompact.js')
  1134. const betas = await import('src/constants/betas.js')
  1135. cacheEditingBetaHeader = betas.CACHE_EDITING_BETA_HEADER
  1136. const featureEnabled = isCachedMicrocompactEnabled()
  1137. const modelSupported = isModelSupportedForCacheEditing(options.model)
  1138. cachedMCEnabled = featureEnabled && modelSupported
  1139. const config = getCachedMCConfig()
  1140. logForDebugging(
  1141. `Cached MC gate: enabled=${featureEnabled} modelSupported=${modelSupported} model=${options.model} supportedModels=${jsonStringify(config.supportedModels)}`,
  1142. )
  1143. }
  1144. const useGlobalCacheFeature = shouldUseGlobalCacheScope()
  1145. const willDefer = (t: Tool) =>
  1146. useToolSearch && (deferredToolNames.has(t.name) || shouldDeferLspTool(t))
  1147. // MCP tools are per-user → dynamic tool section → can't globally cache.
  1148. // Only gate when an MCP tool will actually render (not defer_loading).
  1149. const needsToolBasedCacheMarker =
  1150. useGlobalCacheFeature &&
  1151. filteredTools.some(t => t.isMcp === true && !willDefer(t))
  1152. // Ensure prompt_caching_scope beta header is present when global cache is enabled.
  1153. if (
  1154. useGlobalCacheFeature &&
  1155. !betas.includes(PROMPT_CACHING_SCOPE_BETA_HEADER)
  1156. ) {
  1157. betas.push(PROMPT_CACHING_SCOPE_BETA_HEADER)
  1158. }
  1159. // Determine global cache strategy for logging
  1160. const globalCacheStrategy: GlobalCacheStrategy = useGlobalCacheFeature
  1161. ? needsToolBasedCacheMarker
  1162. ? 'none'
  1163. : 'system_prompt'
  1164. : 'none'
  1165. // Build tool schemas, adding defer_loading for MCP tools when tool search is enabled
  1166. // Note: We pass the full `tools` list (not filteredTools) to toolToAPISchema so that
  1167. // ToolSearchTool's prompt can list ALL available MCP tools. The filtering only affects
  1168. // which tools are actually sent to the API, not what the model sees in tool descriptions.
  1169. const toolSchemas = await Promise.all(
  1170. filteredTools.map(tool =>
  1171. toolToAPISchema(tool, {
  1172. getToolPermissionContext: options.getToolPermissionContext,
  1173. tools,
  1174. agents: options.agents,
  1175. allowedAgentTypes: options.allowedAgentTypes,
  1176. model: options.model,
  1177. deferLoading: willDefer(tool),
  1178. }),
  1179. ),
  1180. )
  1181. if (useToolSearch) {
  1182. const includedDeferredTools = count(filteredTools, t =>
  1183. deferredToolNames.has(t.name),
  1184. )
  1185. logForDebugging(
  1186. `Dynamic tool loading: ${includedDeferredTools}/${deferredToolNames.size} deferred tools included`,
  1187. )
  1188. }
  1189. queryCheckpoint('query_tool_schema_build_end')
  1190. // Normalize messages before building system prompt (needed for fingerprinting)
  1191. // Instrumentation: Track message count before normalization
  1192. logEvent('tengu_api_before_normalize', {
  1193. preNormalizedMessageCount: messages.length,
  1194. })
  1195. queryCheckpoint('query_message_normalization_start')
  1196. let messagesForAPI = normalizeMessagesForAPI(messages, filteredTools)
  1197. queryCheckpoint('query_message_normalization_end')
  1198. // Model-specific post-processing: strip tool-search-specific fields if the
  1199. // selected model doesn't support tool search.
  1200. //
  1201. // Why is this needed in addition to normalizeMessagesForAPI?
  1202. // - normalizeMessagesForAPI uses isToolSearchEnabledNoModelCheck() because it's
  1203. // called from ~20 places (analytics, feedback, sharing, etc.), many of which
  1204. // don't have model context. Adding model to its signature would be a large refactor.
  1205. // - This post-processing uses the model-aware isToolSearchEnabled() check
  1206. // - This handles mid-conversation model switching (e.g., Sonnet → Haiku) where
  1207. // stale tool-search fields from the previous model would cause 400 errors
  1208. //
  1209. // Note: For assistant messages, normalizeMessagesForAPI already normalized the
  1210. // tool inputs, so stripCallerFieldFromAssistantMessage only needs to remove the
  1211. // 'caller' field (not re-normalize inputs).
  1212. if (!useToolSearch) {
  1213. messagesForAPI = messagesForAPI.map(msg => {
  1214. switch (msg.type) {
  1215. case 'user':
  1216. // Strip tool_reference blocks from tool_result content
  1217. return stripToolReferenceBlocksFromUserMessage(msg)
  1218. case 'assistant':
  1219. // Strip 'caller' field from tool_use blocks
  1220. return stripCallerFieldFromAssistantMessage(msg)
  1221. default:
  1222. return msg
  1223. }
  1224. })
  1225. }
  1226. // Repair tool_use/tool_result pairing mismatches that can occur when resuming
  1227. // remote/teleport sessions. Inserts synthetic error tool_results for orphaned
  1228. // tool_uses and strips orphaned tool_results referencing non-existent tool_uses.
  1229. messagesForAPI = ensureToolResultPairing(messagesForAPI)
  1230. // Strip advisor blocks — the API rejects them without the beta header.
  1231. if (!betas.includes(ADVISOR_BETA_HEADER)) {
  1232. messagesForAPI = stripAdvisorBlocks(messagesForAPI)
  1233. }
  1234. // Strip excess media items before making the API call.
  1235. // The API rejects requests with >100 media items but returns a confusing error.
  1236. // Rather than erroring (which is hard to recover from in Cowork/CCD), we
  1237. // silently drop the oldest media items to stay within the limit.
  1238. messagesForAPI = stripExcessMediaItems(
  1239. messagesForAPI,
  1240. API_MAX_MEDIA_PER_REQUEST,
  1241. )
  1242. // Instrumentation: Track message count after normalization
  1243. logEvent('tengu_api_after_normalize', {
  1244. postNormalizedMessageCount: messagesForAPI.length,
  1245. })
  1246. // Compute fingerprint from first user message for attribution.
  1247. // Must run BEFORE injecting synthetic messages (e.g. deferred tool names)
  1248. // so the fingerprint reflects the actual user input.
  1249. const fingerprint = computeFingerprintFromMessages(messagesForAPI)
  1250. // When the delta attachment is enabled, deferred tools are announced
  1251. // via persisted deferred_tools_delta attachments instead of this
  1252. // ephemeral prepend (which busts cache whenever the pool changes).
  1253. if (useToolSearch && !isDeferredToolsDeltaEnabled()) {
  1254. const deferredToolList = tools
  1255. .filter(t => deferredToolNames.has(t.name))
  1256. .map(formatDeferredToolLine)
  1257. .sort()
  1258. .join('\n')
  1259. if (deferredToolList) {
  1260. messagesForAPI = [
  1261. createUserMessage({
  1262. content: `<available-deferred-tools>\n${deferredToolList}\n</available-deferred-tools>`,
  1263. isMeta: true,
  1264. }),
  1265. ...messagesForAPI,
  1266. ]
  1267. }
  1268. }
  1269. // Chrome tool-search instructions: when the delta attachment is enabled,
  1270. // these are carried as a client-side block in mcp_instructions_delta
  1271. // (attachments.ts) instead of here. This per-request sys-prompt append
  1272. // busts the prompt cache when chrome connects late.
  1273. const hasChromeTools = filteredTools.some(t =>
  1274. isToolFromMcpServer(t.name, CLAUDE_IN_CHROME_MCP_SERVER_NAME),
  1275. )
  1276. const injectChromeHere =
  1277. useToolSearch && hasChromeTools && !isMcpInstructionsDeltaEnabled()
  1278. // filter(Boolean) works by converting each element to a boolean - empty strings become false and are filtered out.
  1279. systemPrompt = asSystemPrompt(
  1280. [
  1281. getAttributionHeader(fingerprint),
  1282. getCLISyspromptPrefix({
  1283. isNonInteractive: options.isNonInteractiveSession,
  1284. hasAppendSystemPrompt: options.hasAppendSystemPrompt,
  1285. }),
  1286. ...systemPrompt,
  1287. ...(advisorModel ? [ADVISOR_TOOL_INSTRUCTIONS] : []),
  1288. ...(injectChromeHere ? [CHROME_TOOL_SEARCH_INSTRUCTIONS] : []),
  1289. ].filter(Boolean),
  1290. )
  1291. // Prepend system prompt block for easy API identification
  1292. logAPIPrefix(systemPrompt)
  1293. const enablePromptCaching =
  1294. options.enablePromptCaching ?? getPromptCachingEnabled(options.model)
  1295. const system = buildSystemPromptBlocks(systemPrompt, enablePromptCaching, {
  1296. skipGlobalCacheForSystemPrompt: needsToolBasedCacheMarker,
  1297. querySource: options.querySource,
  1298. })
  1299. const useBetas = betas.length > 0
  1300. // Build minimal context for detailed tracing (when beta tracing is enabled)
  1301. // Note: The actual new_context message extraction is done in sessionTracing.ts using
  1302. // hash-based tracking per querySource (agent) from the messagesForAPI array
  1303. const extraToolSchemas = [...(options.extraToolSchemas ?? [])]
  1304. if (advisorModel) {
  1305. // Server tools must be in the tools array by API contract. Appended after
  1306. // toolSchemas (which carries the cache_control marker) so toggling /advisor
  1307. // only churns the small suffix, not the cached prefix.
  1308. extraToolSchemas.push({
  1309. type: 'advisor_20260301',
  1310. name: 'advisor',
  1311. model: advisorModel,
  1312. } as unknown as BetaToolUnion)
  1313. }
  1314. const allTools = [...toolSchemas, ...extraToolSchemas]
  1315. const isFastMode =
  1316. isFastModeEnabled() &&
  1317. isFastModeAvailable() &&
  1318. !isFastModeCooldown() &&
  1319. isFastModeSupportedByModel(options.model) &&
  1320. !!options.fastMode
  1321. // Sticky-on latches for dynamic beta headers. Each header, once first
  1322. // sent, keeps being sent for the rest of the session so mid-session
  1323. // toggles don't change the server-side cache key and bust ~50-70K tokens.
  1324. // Latches are cleared on /clear and /compact via clearBetaHeaderLatches().
  1325. // Per-call gates (isAgenticQuery, querySource===repl_main_thread) stay
  1326. // per-call so non-agentic queries keep their own stable header set.
  1327. let afkHeaderLatched = getAfkModeHeaderLatched() === true
  1328. if (feature('TRANSCRIPT_CLASSIFIER')) {
  1329. if (
  1330. !afkHeaderLatched &&
  1331. isAgenticQuery &&
  1332. shouldIncludeFirstPartyOnlyBetas() &&
  1333. (autoModeStateModule?.isAutoModeActive() ?? false)
  1334. ) {
  1335. afkHeaderLatched = true
  1336. setAfkModeHeaderLatched(true)
  1337. }
  1338. }
  1339. let fastModeHeaderLatched = getFastModeHeaderLatched() === true
  1340. if (!fastModeHeaderLatched && isFastMode) {
  1341. fastModeHeaderLatched = true
  1342. setFastModeHeaderLatched(true)
  1343. }
  1344. let cacheEditingHeaderLatched = getCacheEditingHeaderLatched() === true
  1345. if (feature('CACHED_MICROCOMPACT')) {
  1346. if (
  1347. !cacheEditingHeaderLatched &&
  1348. cachedMCEnabled &&
  1349. getAPIProvider() === 'firstParty' &&
  1350. options.querySource === 'repl_main_thread'
  1351. ) {
  1352. cacheEditingHeaderLatched = true
  1353. setCacheEditingHeaderLatched(true)
  1354. }
  1355. }
  1356. // Only latch from agentic queries so a classifier call doesn't flip the
  1357. // main thread's context_management mid-turn.
  1358. let thinkingClearLatched = getThinkingClearLatched() === true
  1359. if (!thinkingClearLatched && isAgenticQuery) {
  1360. const lastCompletion = getLastApiCompletionTimestamp()
  1361. if (
  1362. lastCompletion !== null &&
  1363. Date.now() - lastCompletion > CACHE_TTL_1HOUR_MS
  1364. ) {
  1365. thinkingClearLatched = true
  1366. setThinkingClearLatched(true)
  1367. }
  1368. }
  1369. const effort = resolveAppliedEffort(options.model, options.effortValue)
  1370. if (feature('PROMPT_CACHE_BREAK_DETECTION')) {
  1371. // Exclude defer_loading tools from the hash -- the API strips them from the
  1372. // prompt, so they never affect the actual cache key. Including them creates
  1373. // false-positive "tool schemas changed" breaks when tools are discovered or
  1374. // MCP servers reconnect.
  1375. const toolsForCacheDetection = allTools.filter(
  1376. t => !('defer_loading' in t && t.defer_loading),
  1377. )
  1378. // Capture everything that could affect the server-side cache key.
  1379. // Pass latched header values (not live state) so break detection
  1380. // reflects what we actually send, not what the user toggled.
  1381. recordPromptState({
  1382. system,
  1383. toolSchemas: toolsForCacheDetection,
  1384. querySource: options.querySource,
  1385. model: options.model,
  1386. agentId: options.agentId,
  1387. fastMode: fastModeHeaderLatched,
  1388. globalCacheStrategy,
  1389. betas,
  1390. autoModeActive: afkHeaderLatched,
  1391. isUsingOverage: currentLimits.isUsingOverage ?? false,
  1392. cachedMCEnabled: cacheEditingHeaderLatched,
  1393. effortValue: effort,
  1394. extraBodyParams: getExtraBodyParams(),
  1395. })
  1396. }
  1397. const newContext: LLMRequestNewContext | undefined = isBetaTracingEnabled()
  1398. ? {
  1399. systemPrompt: systemPrompt.join('\n\n'),
  1400. querySource: options.querySource,
  1401. tools: jsonStringify(allTools),
  1402. }
  1403. : undefined
  1404. // Capture the span so we can pass it to endLLMRequestSpan later
  1405. // This ensures responses are matched to the correct request when multiple requests run in parallel
  1406. const llmSpan = startLLMRequestSpan(
  1407. options.model,
  1408. newContext,
  1409. messagesForAPI,
  1410. isFastMode,
  1411. )
  1412. const startIncludingRetries = Date.now()
  1413. let start = Date.now()
  1414. let attemptNumber = 0
  1415. const attemptStartTimes: number[] = []
  1416. let stream: Stream<BetaRawMessageStreamEvent> | undefined = undefined
  1417. let streamRequestId: string | null | undefined = undefined
  1418. let clientRequestId: string | undefined = undefined
  1419. // eslint-disable-next-line eslint-plugin-n/no-unsupported-features/node-builtins -- Response is available in Node 18+ and is used by the SDK
  1420. let streamResponse: Response | undefined = undefined
  /**
   * Tears down the active stream together with its HTTP response so that no
   * native memory is leaked.
   *
   * The Response object pins native TLS/socket buffers that live outside the
   * V8 heap (observed on the Node.js/npm path; see GH #32920), so it has to be
   * cancelled and dropped explicitly regardless of how the generator exits.
   */
  function releaseStreamResources(): void {
    cleanupStream(stream)
    stream = undefined

    const pendingResponse = streamResponse
    if (!pendingResponse) {
      return
    }
    // Best effort: a rejection from cancel() is deliberately ignored.
    pendingResponse.body?.cancel().catch(() => {})
    streamResponse = undefined
  }
  1433. // Consume pending cache edits ONCE before paramsFromContext is defined.
  1434. // paramsFromContext is called multiple times (logging, retries), so consuming
  1435. // inside it would cause the first call to steal edits from subsequent calls.
  1436. const consumedCacheEdits = cachedMCEnabled ? consumePendingCacheEdits() : null
  1437. const consumedPinnedEdits = cachedMCEnabled ? getPinnedCacheEdits() : []
  1438. // Capture the betas sent in the last API request, including the ones that
  1439. // were dynamically added, so we can log and send it to telemetry.
  1440. let lastRequestBetas: string[] | undefined
  /**
   * Builds the request body for a single API attempt.
   *
   * This closure is invoked more than once per query: once up front to compute
   * log scalars and again for every attempt made through withRetry. It must
   * therefore not consume one-shot state; the pending cache edits are consumed
   * once before this closure is defined and are only read here.
   *
   * It starts from a copy of `betas` and appends per-attempt beta headers in a
   * fixed order. As a side effect the final list is stored in
   * `lastRequestBetas`.
   *
   * @param retryContext Per-attempt context. This function reads `model`,
   *   `maxTokensOverride` and `fastMode` from it.
   * @returns The params object that the caller spreads into
   *   `anthropic.beta.messages.create` (the caller adds `stream: true`).
   */
  1441. const paramsFromContext = (retryContext: RetryContext) => {
  // Copy so that per-attempt additions never mutate the shared `betas` array.
  1442. const betasParams = [...betas]
  1443. // Append 1M beta dynamically for the Sonnet 1M experiment.
  1444. if (
  1445. !betasParams.includes(CONTEXT_1M_BETA_HEADER) &&
  1446. getSonnet1mExpTreatmentEnabled(retryContext.model)
  1447. ) {
  1448. betasParams.push(CONTEXT_1M_BETA_HEADER)
  1449. }
  1450. // For Bedrock, include both model-based betas and dynamically-added tool search header
  1451. const bedrockBetas =
  1452. getAPIProvider() === 'bedrock'
  1453. ? [
  1454. ...getBedrockExtraBodyParamsBetas(retryContext.model),
  1455. ...(toolSearchHeader ? [toolSearchHeader] : []),
  1456. ]
  1457. : []
  1458. const extraBodyParams = getExtraBodyParams(bedrockBetas)
  // Work on a shallow copy of any output_config carried by extraBodyParams.
  1459. const outputConfig: BetaOutputConfig = {
  1460. ...((extraBodyParams.output_config as BetaOutputConfig) ?? {}),
  1461. }
  // NOTE(review): the return values of the two configure* helpers are ignored,
  // so they presumably mutate outputConfig / extraBodyParams / betasParams in
  // place — confirm against their definitions.
  1462. configureEffortParams(
  1463. effort,
  1464. outputConfig,
  1465. extraBodyParams,
  1466. betasParams,
  1467. options.model,
  1468. )
  1469. configureTaskBudgetParams(
  1470. options.taskBudget,
  1471. outputConfig as BetaOutputConfig & { task_budget?: TaskBudgetParam },
  1472. betasParams,
  1473. )
  1474. // Merge outputFormat into extraBodyParams.output_config alongside effort
  1475. // Requires structured-outputs beta header per SDK (see parse() in messages.mjs)
  // A `format` already present in outputConfig takes precedence over options.outputFormat.
  1476. if (options.outputFormat && !('format' in outputConfig)) {
  1477. outputConfig.format = options.outputFormat as BetaJSONOutputFormat
  1478. // Add beta header if not already present and provider supports it
  1479. if (
  1480. modelSupportsStructuredOutputs(options.model) &&
  1481. !betasParams.includes(STRUCTURED_OUTPUTS_BETA_HEADER)
  1482. ) {
  1483. betasParams.push(STRUCTURED_OUTPUTS_BETA_HEADER)
  1484. }
  1485. }
  1486. // Retry context gets preference because it tries to course correct if we exceed the context window limit
  // `||` (not `??`): a 0 or otherwise falsy override falls through to the next source.
  1487. const maxOutputTokens =
  1488. retryContext?.maxTokensOverride ||
  1489. options.maxOutputTokensOverride ||
  1490. getMaxOutputTokensForModel(options.model)
  1491. const hasThinking =
  1492. thinkingConfig.type !== 'disabled' &&
  1493. !isEnvTruthy(process.env.CLAUDE_CODE_DISABLE_THINKING)
  1494. let thinking: BetaMessageStreamParams['thinking'] | undefined = undefined
  1495. // IMPORTANT: Do not change the adaptive-vs-budget thinking selection below
  1496. // without notifying the model launch DRI and research. This is a sensitive
  1497. // setting that can greatly affect model quality and bashing.
  1498. if (hasThinking && modelSupportsThinking(options.model)) {
  1499. if (
  1500. !isEnvTruthy(process.env.CLAUDE_CODE_DISABLE_ADAPTIVE_THINKING) &&
  1501. modelSupportsAdaptiveThinking(options.model)
  1502. ) {
  1503. // For models that support adaptive thinking, always use adaptive
  1504. // thinking without a budget.
  1505. thinking = {
  1506. type: 'adaptive',
  1507. } satisfies BetaMessageStreamParams['thinking']
  1508. } else {
  1509. // For models that do not support adaptive thinking, use the default
  1510. // thinking budget unless explicitly specified.
  1511. let thinkingBudget = getMaxThinkingTokensForModel(options.model)
  1512. if (
  1513. thinkingConfig.type === 'enabled' &&
  1514. thinkingConfig.budgetTokens !== undefined
  1515. ) {
  1516. thinkingBudget = thinkingConfig.budgetTokens
  1517. }
  // Clamp so the thinking budget stays strictly below max_tokens.
  1518. thinkingBudget = Math.min(maxOutputTokens - 1, thinkingBudget)
  1519. thinking = {
  1520. budget_tokens: thinkingBudget,
  1521. type: 'enabled',
  1522. } satisfies BetaMessageStreamParams['thinking']
  1523. }
  1524. }
  1525. // Get API context management strategies if enabled
  1526. const contextManagement = getAPIContextManagement({
  1527. hasThinking,
  1528. isRedactThinkingActive: betasParams.includes(REDACT_THINKING_BETA_HEADER),
  1529. clearAllThinking: thinkingClearLatched,
  1530. })
  // Shadows the enablePromptCaching computed in the enclosing function; this
  // one is resolved against the retry context's model.
  1531. const enablePromptCaching =
  1532. options.enablePromptCaching ?? getPromptCachingEnabled(retryContext.model)
  1533. // Fast mode: header is latched session-stable (cache-safe), but
  1534. // `speed='fast'` stays dynamic so cooldown still suppresses the actual
  1535. // fast-mode request without changing the cache key.
  1536. let speed: BetaMessageStreamParams['speed']
  1537. const isFastModeForRetry =
  1538. isFastModeEnabled() &&
  1539. isFastModeAvailable() &&
  1540. !isFastModeCooldown() &&
  1541. isFastModeSupportedByModel(options.model) &&
  1542. !!retryContext.fastMode
  1543. if (isFastModeForRetry) {
  1544. speed = 'fast'
  1545. }
  1546. if (fastModeHeaderLatched && !betasParams.includes(FAST_MODE_BETA_HEADER)) {
  1547. betasParams.push(FAST_MODE_BETA_HEADER)
  1548. }
  1549. // AFK mode beta: latched once auto mode is first activated. Still gated
  1550. // by isAgenticQuery per-call so classifiers/compaction don't get it.
  1551. if (feature('TRANSCRIPT_CLASSIFIER')) {
  1552. if (
  1553. afkHeaderLatched &&
  1554. shouldIncludeFirstPartyOnlyBetas() &&
  1555. isAgenticQuery &&
  1556. !betasParams.includes(AFK_MODE_BETA_HEADER)
  1557. ) {
  1558. betasParams.push(AFK_MODE_BETA_HEADER)
  1559. }
  1560. }
  1561. // Cache editing beta: header is latched session-stable; useCachedMC
  1562. // (controls cache_edits body behavior) stays live so edits stop when
  1563. // the feature disables but the header doesn't flip.
  1564. const useCachedMC =
  1565. cachedMCEnabled &&
  1566. getAPIProvider() === 'firstParty' &&
  1567. options.querySource === 'repl_main_thread'
  1568. if (
  1569. cacheEditingHeaderLatched &&
  1570. getAPIProvider() === 'firstParty' &&
  1571. options.querySource === 'repl_main_thread' &&
  1572. !betasParams.includes(cacheEditingBetaHeader)
  1573. ) {
  1574. betasParams.push(cacheEditingBetaHeader)
  1575. logForDebugging(
  1576. 'Cache editing beta header enabled for cached microcompact',
  1577. )
  1578. }
  1579. // Only send temperature when thinking is disabled — the API requires
  1580. // temperature: 1 when thinking is enabled, which is already the default.
  1581. const temperature = !hasThinking
  1582. ? (options.temperatureOverride ?? 1)
  1583. : undefined
  // Recorded on every call, even when useBetas is false and the list is not
  // attached to the request below.
  1584. lastRequestBetas = betasParams
  1585. return {
  1586. model: normalizeModelStringForAPI(options.model),
  1587. messages: addCacheBreakpoints(
  1588. messagesForAPI,
  1589. enablePromptCaching,
  1590. options.querySource,
  1591. useCachedMC,
  1592. consumedCacheEdits,
  1593. consumedPinnedEdits,
  1594. options.skipCacheWrite,
  1595. ),
  1596. system,
  1597. tools: allTools,
  1598. tool_choice: options.toolChoice,
  1599. ...(useBetas && { betas: betasParams }),
  1600. metadata: getAPIMetadata(),
  1601. max_tokens: maxOutputTokens,
  1602. thinking,
  1603. ...(temperature !== undefined && { temperature }),
  1604. ...(contextManagement &&
  1605. useBetas &&
  1606. betasParams.includes(CONTEXT_MANAGEMENT_BETA_HEADER) && {
  1607. context_management: contextManagement,
  1608. }),
  // Spread order matters: keys from extraBodyParams override the fields above,
  // and the merged output_config and speed that follow override extraBodyParams.
  1609. ...extraBodyParams,
  1610. ...(Object.keys(outputConfig).length > 0 && {
  1611. output_config: outputConfig,
  1612. }),
  1613. ...(speed !== undefined && { speed }),
  1614. }
  1615. }
  1616. // Compute log scalars synchronously so the fire-and-forget .then() closure
  1617. // captures only primitives instead of paramsFromContext's full closure scope
  1618. // (messagesForAPI, system, allTools, betas — the entire request-building
  1619. // context), which would otherwise be pinned until the promise resolves.
  1620. {
  1621. const queryParams = paramsFromContext({
  1622. model: options.model,
  1623. thinkingConfig,
  1624. })
  1625. const logMessagesLength = queryParams.messages.length
  1626. const logBetas = useBetas ? (queryParams.betas ?? []) : []
  1627. const logThinkingType = queryParams.thinking?.type ?? 'disabled'
  1628. const logEffortValue = queryParams.output_config?.effort
  1629. void options.getToolPermissionContext().then(permissionContext => {
  1630. logAPIQuery({
  1631. model: options.model,
  1632. messagesLength: logMessagesLength,
  1633. temperature: options.temperatureOverride ?? 1,
  1634. betas: logBetas,
  1635. permissionMode: permissionContext.mode,
  1636. querySource: options.querySource,
  1637. queryTracking: options.queryTracking,
  1638. thinkingType: logThinkingType,
  1639. effortValue: logEffortValue,
  1640. fastMode: isFastMode,
  1641. previousRequestId,
  1642. })
  1643. })
  1644. }
  1645. const newMessages: AssistantMessage[] = []
  1646. let ttftMs = 0
  1647. let partialMessage: BetaMessage | undefined = undefined
  1648. const contentBlocks: (BetaContentBlock | ConnectorTextBlock)[] = []
  1649. let usage: NonNullableUsage = EMPTY_USAGE
  1650. let costUSD = 0
  1651. let stopReason: BetaStopReason | null = null
  1652. let didFallBackToNonStreaming = false
  1653. let fallbackMessage: AssistantMessage | undefined
  1654. let maxOutputTokens = 0
  1655. let responseHeaders: globalThis.Headers | undefined = undefined
  1656. let research: unknown = undefined
  1657. let isFastModeRequest = isFastMode // Keep separate state as it may change if falling back
  1658. let isAdvisorInProgress = false
  1659. try {
  1660. queryCheckpoint('query_client_creation_start')
  1661. const generator = withRetry(
  1662. () =>
  1663. getAnthropicClient({
  1664. maxRetries: 0, // Disabled auto-retry in favor of manual implementation
  1665. model: options.model,
  1666. fetchOverride: options.fetchOverride,
  1667. source: options.querySource,
  1668. }),
  1669. async (anthropic, attempt, context) => {
  1670. attemptNumber = attempt
  1671. isFastModeRequest = context.fastMode ?? false
  1672. start = Date.now()
  1673. attemptStartTimes.push(start)
  1674. // Client has been created by withRetry's getClient() call. This fires
  1675. // once per attempt; on retries the client is usually cached (withRetry
  1676. // only calls getClient() again after auth errors), so the delta from
  1677. // client_creation_start is meaningful on attempt 1.
  1678. queryCheckpoint('query_client_creation_end')
  1679. const params = paramsFromContext(context)
  1680. captureAPIRequest(params, options.querySource) // Capture for bug reports
  1681. maxOutputTokens = params.max_tokens
  1682. // Fire immediately before the fetch is dispatched. .withResponse() below
  1683. // awaits until response headers arrive, so this MUST be before the await
  1684. // or the "Network TTFB" phase measurement is wrong.
  1685. queryCheckpoint('query_api_request_sent')
  1686. if (!options.agentId) {
  1687. headlessProfilerCheckpoint('api_request_sent')
  1688. }
  1689. // Generate and track client request ID so timeouts (which return no
  1690. // server request ID) can still be correlated with server logs.
  1691. // First-party only — 3P providers don't log it (inc-4029 class).
  1692. clientRequestId =
  1693. getAPIProvider() === 'firstParty' && isFirstPartyAnthropicBaseUrl()
  1694. ? randomUUID()
  1695. : undefined
  1696. // Use raw stream instead of BetaMessageStream to avoid O(n²) partial JSON parsing
  1697. // BetaMessageStream calls partialParse() on every input_json_delta, which we don't need
  1698. // since we handle tool input accumulation ourselves
  1699. // biome-ignore lint/plugin: main conversation loop handles attribution separately
  1700. const result = await anthropic.beta.messages
  1701. .create(
  1702. { ...params, stream: true },
  1703. {
  1704. signal,
  1705. ...(clientRequestId && {
  1706. headers: { [CLIENT_REQUEST_ID_HEADER]: clientRequestId },
  1707. }),
  1708. },
  1709. )
  1710. .withResponse()
  1711. queryCheckpoint('query_response_headers_received')
  1712. streamRequestId = result.request_id
  1713. streamResponse = result.response
  1714. return result.data
  1715. },
  1716. {
  1717. model: options.model,
  1718. fallbackModel: options.fallbackModel,
  1719. thinkingConfig,
  1720. ...(isFastModeEnabled() ? { fastMode: isFastMode } : false),
  1721. signal,
  1722. querySource: options.querySource,
  1723. },
  1724. )
  1725. let e
  1726. do {
  1727. e = await generator.next()
  1728. // yield API error messages (the stream has a 'controller' property, error messages don't)
  1729. if (!('controller' in e.value)) {
  1730. yield e.value
  1731. }
  1732. } while (!e.done)
  1733. stream = e.value as Stream<BetaRawMessageStreamEvent>
  1734. // reset state
  1735. newMessages.length = 0
  1736. ttftMs = 0
  1737. partialMessage = undefined
  1738. contentBlocks.length = 0
  1739. usage = EMPTY_USAGE
  1740. stopReason = null
  1741. isAdvisorInProgress = false
  1742. // Streaming idle timeout watchdog: abort the stream if no chunks arrive
  1743. // for STREAM_IDLE_TIMEOUT_MS. Unlike the stall detection below (which only
  1744. // fires when the *next* chunk arrives), this uses setTimeout to actively
  1745. // kill hung streams. Without this, a silently dropped connection can hang
  1746. // the session indefinitely since the SDK's request timeout only covers the
  1747. // initial fetch(), not the streaming body.
  1748. const streamWatchdogEnabled = isEnvTruthy(
  1749. process.env.CLAUDE_ENABLE_STREAM_WATCHDOG,
  1750. )
  1751. const STREAM_IDLE_TIMEOUT_MS =
  1752. parseInt(process.env.CLAUDE_STREAM_IDLE_TIMEOUT_MS || '', 10) || 90_000
  1753. const STREAM_IDLE_WARNING_MS = STREAM_IDLE_TIMEOUT_MS / 2
  1754. let streamIdleAborted = false
  1755. // performance.now() snapshot when watchdog fires, for measuring abort propagation delay
  1756. let streamWatchdogFiredAt: number | null = null
  1757. let streamIdleWarningTimer: ReturnType<typeof setTimeout> | null = null
  1758. let streamIdleTimer: ReturnType<typeof setTimeout> | null = null
  /**
   * Cancels whichever idle-watchdog timers are currently armed (warning first,
   * then the hard timeout) and resets both handles to null.
   */
  function clearStreamIdleTimers(): void {
    for (const armedTimer of [streamIdleWarningTimer, streamIdleTimer]) {
      if (armedTimer !== null) {
        clearTimeout(armedTimer)
      }
    }
    streamIdleWarningTimer = null
    streamIdleTimer = null
  }
  /**
   * (Re)arms the stream idle watchdog.
   *
   * Timers left over from a previous call are cancelled first. If the watchdog
   * is disabled nothing else happens. Otherwise two timers are started: a
   * warning after STREAM_IDLE_WARNING_MS that only logs, and a hard timeout
   * after STREAM_IDLE_TIMEOUT_MS that flags the idle abort, records when it
   * fired, reports the event and releases the stream resources.
   */
  function resetStreamIdleTimer(): void {
    clearStreamIdleTimers()

    if (streamWatchdogEnabled) {
      // The threshold arrives as a timer argument rather than via the closure.
      const onIdleWarning = (idleMs: number): void => {
        logForDebugging(
          `Streaming idle warning: no chunks received for ${idleMs / 1000}s`,
          { level: 'warn' },
        )
        logForDiagnosticsNoPII('warn', 'cli_streaming_idle_warning')
      }

      const onIdleTimeout = (): void => {
        streamIdleAborted = true
        streamWatchdogFiredAt = performance.now()
        logForDebugging(
          `Streaming idle timeout: no chunks received for ${STREAM_IDLE_TIMEOUT_MS / 1000}s, aborting stream`,
          { level: 'error' },
        )
        logForDiagnosticsNoPII('error', 'cli_streaming_idle_timeout')
        logEvent('tengu_streaming_idle_timeout', {
          model:
            options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
          request_id: (streamRequestId ??
            'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
          timeout_ms: STREAM_IDLE_TIMEOUT_MS,
        })
        releaseStreamResources()
      }

      streamIdleWarningTimer = setTimeout(
        onIdleWarning,
        STREAM_IDLE_WARNING_MS,
        STREAM_IDLE_WARNING_MS,
      )
      streamIdleTimer = setTimeout(onIdleTimeout, STREAM_IDLE_TIMEOUT_MS)
    }
  }
  1803. resetStreamIdleTimer()
  1804. startSessionActivity('api_call')
  1805. try {
  1806. // stream in and accumulate state
  1807. let isFirstChunk = true
  1808. let lastEventTime: number | null = null // Set after first chunk to avoid measuring TTFB as a stall
  1809. const STALL_THRESHOLD_MS = 30_000 // 30 seconds
  1810. let totalStallTime = 0
  1811. let stallCount = 0
  1812. for await (const part of stream) {
  1813. resetStreamIdleTimer()
  1814. const now = Date.now()
  1815. // Detect and log streaming stalls (only after first event to avoid counting TTFB)
  1816. if (lastEventTime !== null) {
  1817. const timeSinceLastEvent = now - lastEventTime
  1818. if (timeSinceLastEvent > STALL_THRESHOLD_MS) {
  1819. stallCount++
  1820. totalStallTime += timeSinceLastEvent
  1821. logForDebugging(
  1822. `Streaming stall detected: ${(timeSinceLastEvent / 1000).toFixed(1)}s gap between events (stall #${stallCount})`,
  1823. { level: 'warn' },
  1824. )
  1825. logEvent('tengu_streaming_stall', {
  1826. stall_duration_ms: timeSinceLastEvent,
  1827. stall_count: stallCount,
  1828. total_stall_time_ms: totalStallTime,
  1829. event_type:
  1830. part.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  1831. model:
  1832. options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  1833. request_id: (streamRequestId ??
  1834. 'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  1835. })
  1836. }
  1837. }
  1838. lastEventTime = now
  1839. if (isFirstChunk) {
  1840. logForDebugging('Stream started - received first chunk')
  1841. queryCheckpoint('query_first_chunk_received')
  1842. if (!options.agentId) {
  1843. headlessProfilerCheckpoint('first_chunk')
  1844. }
  1845. endQueryProfile()
  1846. isFirstChunk = false
  1847. }
  1848. switch (part.type) {
  1849. case 'message_start': {
  1850. partialMessage = part.message
  1851. ttftMs = Date.now() - start
  1852. usage = updateUsage(usage, part.message?.usage)
  1853. // Capture research from message_start if available (internal only).
  1854. // Always overwrite with the latest value.
  1855. if (
  1856. process.env.USER_TYPE === 'ant' &&
  1857. 'research' in (part.message as unknown as Record<string, unknown>)
  1858. ) {
  1859. research = (part.message as unknown as Record<string, unknown>)
  1860. .research
  1861. }
  1862. break
  1863. }
  1864. case 'content_block_start':
  1865. switch (part.content_block.type) {
  1866. case 'tool_use':
  1867. contentBlocks[part.index] = {
  1868. ...part.content_block,
  1869. input: '',
  1870. }
  1871. break
  1872. case 'server_tool_use':
  1873. contentBlocks[part.index] = {
  1874. ...part.content_block,
  1875. input: '' as unknown as { [key: string]: unknown },
  1876. }
  1877. if ((part.content_block.name as string) === 'advisor') {
  1878. isAdvisorInProgress = true
  1879. logForDebugging(`[AdvisorTool] Advisor tool called`)
  1880. logEvent('tengu_advisor_tool_call', {
  1881. model:
  1882. options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  1883. advisor_model: (advisorModel ??
  1884. 'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  1885. })
  1886. }
  1887. break
  1888. case 'text':
  1889. contentBlocks[part.index] = {
  1890. ...part.content_block,
  1891. // awkwardly, the sdk sometimes returns text as part of a
  1892. // content_block_start message, then returns the same text
  1893. // again in a content_block_delta message. we ignore it here
  1894. // since there doesn't seem to be a way to detect when a
  1895. // content_block_delta message duplicates the text.
  1896. text: '',
  1897. }
  1898. break
  1899. case 'thinking':
  1900. contentBlocks[part.index] = {
  1901. ...part.content_block,
  1902. // also awkward
  1903. thinking: '',
  1904. // initialize signature to ensure field exists even if signature_delta never arrives
  1905. signature: '',
  1906. }
  1907. break
  1908. default:
  1909. // even more awkwardly, the sdk mutates the contents of text blocks
  1910. // as it works. we want the blocks to be immutable, so that we can
  1911. // accumulate state ourselves.
  1912. contentBlocks[part.index] = { ...part.content_block }
  1913. if (
  1914. (part.content_block.type as string) === 'advisor_tool_result'
  1915. ) {
  1916. isAdvisorInProgress = false
  1917. logForDebugging(`[AdvisorTool] Advisor tool result received`)
  1918. }
  1919. break
  1920. }
  1921. break
  1922. case 'content_block_delta': {
  1923. const contentBlock = contentBlocks[part.index]
  1924. const delta = part.delta as typeof part.delta | ConnectorTextDelta
  1925. if (!contentBlock) {
  1926. logEvent('tengu_streaming_error', {
  1927. error_type:
  1928. 'content_block_not_found_delta' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  1929. part_type:
  1930. part.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  1931. part_index: part.index,
  1932. })
  1933. throw new RangeError('未找到内容块')
  1934. }
  1935. if (
  1936. feature('CONNECTOR_TEXT') &&
  1937. delta.type === 'connector_text_delta'
  1938. ) {
  1939. if (contentBlock.type !== 'connector_text') {
  1940. logEvent('tengu_streaming_error', {
  1941. error_type:
  1942. 'content_block_type_mismatch_connector_text' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  1943. expected_type:
  1944. 'connector_text' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  1945. actual_type:
  1946. contentBlock.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  1947. })
  1948. throw new Error('内容块不是 connector_text 块')
  1949. }
  1950. contentBlock.connector_text += delta.connector_text
  1951. } else {
  1952. switch (delta.type) {
  1953. case 'citations_delta':
  1954. // TODO: handle citations
  1955. break
  1956. case 'input_json_delta':
  1957. if (
  1958. contentBlock.type !== 'tool_use' &&
  1959. contentBlock.type !== 'server_tool_use'
  1960. ) {
  1961. logEvent('tengu_streaming_error', {
  1962. error_type:
  1963. 'content_block_type_mismatch_input_json' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  1964. expected_type:
  1965. 'tool_use' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  1966. actual_type:
  1967. contentBlock.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  1968. })
  1969. throw new Error('内容块不是 input_json 块')
  1970. }
  1971. if (typeof contentBlock.input !== 'string') {
  1972. logEvent('tengu_streaming_error', {
  1973. error_type:
  1974. 'content_block_input_not_string' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  1975. input_type:
  1976. typeof contentBlock.input as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  1977. })
  1978. throw new Error('内容块输入不是字符串')
  1979. }
  1980. contentBlock.input += delta.partial_json
  1981. break
  1982. case 'text_delta':
  1983. if (contentBlock.type !== 'text') {
  1984. logEvent('tengu_streaming_error', {
  1985. error_type:
  1986. 'content_block_type_mismatch_text' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  1987. expected_type:
  1988. 'text' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  1989. actual_type:
  1990. contentBlock.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  1991. })
  1992. throw new Error('内容块不是文本块')
  1993. }
  1994. contentBlock.text += delta.text
  1995. break
  1996. case 'signature_delta':
  1997. if (
  1998. feature('CONNECTOR_TEXT') &&
  1999. contentBlock.type === 'connector_text'
  2000. ) {
  2001. contentBlock.signature = delta.signature
  2002. break
  2003. }
  2004. if (contentBlock.type !== 'thinking') {
  2005. logEvent('tengu_streaming_error', {
  2006. error_type:
  2007. 'content_block_type_mismatch_thinking_signature' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2008. expected_type:
  2009. 'thinking' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2010. actual_type:
  2011. contentBlock.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2012. })
  2013. throw new Error('内容块不是 thinking 块')
  2014. }
  2015. contentBlock.signature = delta.signature
  2016. break
  2017. case 'thinking_delta':
  2018. if (contentBlock.type !== 'thinking') {
  2019. logEvent('tengu_streaming_error', {
  2020. error_type:
  2021. 'content_block_type_mismatch_thinking_delta' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2022. expected_type:
  2023. 'thinking' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2024. actual_type:
  2025. contentBlock.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2026. })
  2027. throw new Error('内容块不是 thinking 块')
  2028. }
  2029. contentBlock.thinking += delta.thinking
  2030. break
  2031. }
  2032. }
  2033. // Capture research from content_block_delta if available (internal only).
  2034. // Always overwrite with the latest value.
  2035. if (process.env.USER_TYPE === 'ant' && 'research' in part) {
  2036. research = (part as { research: unknown }).research
  2037. }
  2038. break
  2039. }
  2040. case 'content_block_stop': {
  2041. const contentBlock = contentBlocks[part.index]
  2042. if (!contentBlock) {
  2043. logEvent('tengu_streaming_error', {
  2044. error_type:
  2045. 'content_block_not_found_stop' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2046. part_type:
  2047. part.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2048. part_index: part.index,
  2049. })
  2050. throw new RangeError('未找到内容块')
  2051. }
  2052. if (!partialMessage) {
  2053. logEvent('tengu_streaming_error', {
  2054. error_type:
  2055. 'partial_message_not_found' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2056. part_type:
  2057. part.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2058. })
  2059. throw new Error('未找到消息')
  2060. }
  2061. const m: AssistantMessage = {
  2062. message: {
  2063. ...partialMessage,
  2064. content: normalizeContentFromAPI(
  2065. [contentBlock] as BetaContentBlock[],
  2066. tools,
  2067. options.agentId,
  2068. ),
  2069. },
  2070. requestId: streamRequestId ?? undefined,
  2071. type: 'assistant',
  2072. uuid: randomUUID(),
  2073. timestamp: new Date().toISOString(),
  2074. ...(process.env.USER_TYPE === 'ant' &&
  2075. research !== undefined && { research }),
  2076. ...(advisorModel && { advisorModel }),
  2077. }
  2078. newMessages.push(m)
  2079. yield m
  2080. break
  2081. }
  2082. case 'message_delta': {
  2083. usage = updateUsage(usage, part.usage)
  2084. // Capture research from message_delta if available (internal only).
  2085. // Always overwrite with the latest value. Also write back to
  2086. // already-yielded messages since message_delta arrives after
  2087. // content_block_stop.
  2088. if (
  2089. process.env.USER_TYPE === 'ant' &&
  2090. 'research' in (part as unknown as Record<string, unknown>)
  2091. ) {
  2092. research = (part as unknown as Record<string, unknown>).research
  2093. for (const msg of newMessages) {
  2094. msg.research = research
  2095. }
  2096. }
  2097. // Write final usage and stop_reason back to the last yielded
  2098. // message. Messages are created at content_block_stop from
  2099. // partialMessage, which was set at message_start before any tokens
  2100. // were generated (output_tokens: 0, stop_reason: null).
  2101. // message_delta arrives after content_block_stop with the real
  2102. // values.
  2103. //
  2104. // IMPORTANT: Use direct property mutation, not object replacement.
  2105. // The transcript write queue holds a reference to message.message
  2106. // and serializes it lazily (100ms flush interval). Object
  2107. // replacement ({ ...lastMsg.message, usage }) would disconnect
  2108. // the queued reference; direct mutation ensures the transcript
  2109. // captures the final values.
  2110. stopReason = part.delta.stop_reason
  2111. const lastMsg = newMessages.at(-1)
  2112. if (lastMsg) {
  2113. lastMsg.message.usage = usage
  2114. lastMsg.message.stop_reason = stopReason
  2115. }
  2116. // Update cost
  2117. const costUSDForPart = calculateUSDCost(resolvedModel, usage)
  2118. costUSD += addToTotalSessionCost(
  2119. costUSDForPart,
  2120. usage,
  2121. options.model,
  2122. )
  2123. const refusalMessage = getErrorMessageIfRefusal(
  2124. part.delta.stop_reason,
  2125. options.model,
  2126. )
  2127. if (refusalMessage) {
  2128. yield refusalMessage
  2129. }
  2130. if (stopReason === 'max_tokens') {
  2131. logEvent('tengu_max_tokens_reached', {
  2132. max_tokens: maxOutputTokens,
  2133. })
  2134. yield createAssistantAPIErrorMessage({
  2135. content: `${API_ERROR_MESSAGE_PREFIX}: Claude's response exceeded the ${
  2136. maxOutputTokens
  2137. } output token maximum. To configure this behavior, set the CLAUDE_CODE_MAX_OUTPUT_TOKENS environment variable.`,
  2138. apiError: 'max_output_tokens',
  2139. error: 'max_output_tokens',
  2140. })
  2141. }
  2142. if (stopReason === 'model_context_window_exceeded') {
  2143. logEvent('tengu_context_window_exceeded', {
  2144. max_tokens: maxOutputTokens,
  2145. output_tokens: usage.output_tokens,
  2146. })
  2147. // Reuse the max_output_tokens recovery path — from the model's
  2148. // perspective, both mean "response was cut off, continue from
  2149. // where you left off."
  2150. yield createAssistantAPIErrorMessage({
  2151. content: `${API_ERROR_MESSAGE_PREFIX}: The model has reached its context window limit.`,
  2152. apiError: 'max_output_tokens',
  2153. error: 'max_output_tokens',
  2154. })
  2155. }
  2156. break
  2157. }
  2158. case 'message_stop':
  2159. break
  2160. }
  2161. yield {
  2162. type: 'stream_event',
  2163. event: part,
  2164. ...(part.type === 'message_start' ? { ttftMs } : undefined),
  2165. }
  2166. }
  2167. // Clear the idle timeout watchdog now that the stream loop has exited
  2168. clearStreamIdleTimers()
  2169. // If the stream was aborted by our idle timeout watchdog, fall back to
  2170. // non-streaming retry rather than treating it as a completed stream.
  2171. if (streamIdleAborted) {
  2172. // Instrumentation: proves the for-await exited after the watchdog fired
  2173. // (vs. hung forever). exit_delay_ms measures abort propagation latency:
  2174. // 0-10ms = abort worked; >>1000ms = something else woke the loop.
  2175. const exitDelayMs =
  2176. streamWatchdogFiredAt !== null
  2177. ? Math.round(performance.now() - streamWatchdogFiredAt)
  2178. : -1
  2179. logForDiagnosticsNoPII(
  2180. 'info',
  2181. 'cli_stream_loop_exited_after_watchdog_clean',
  2182. )
  2183. logEvent('tengu_stream_loop_exited_after_watchdog', {
  2184. request_id: (streamRequestId ??
  2185. 'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2186. exit_delay_ms: exitDelayMs,
  2187. exit_path:
  2188. 'clean' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2189. model:
  2190. options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2191. })
  2192. // Prevent double-emit: this throw lands in the catch block below,
  2193. // whose exit_path='error' probe guards on streamWatchdogFiredAt.
  2194. streamWatchdogFiredAt = null
  2195. throw new Error('流空闲超时 - 未收到任何数据块')
  2196. }
  2197. // Detect when the stream completed without producing any assistant messages.
  2198. // This covers two proxy failure modes:
  2199. // 1. No events at all (!partialMessage): proxy returned 200 with non-SSE body
  2200. // 2. Partial events (partialMessage set but no content blocks completed AND
  2201. // no stop_reason received): proxy returned message_start but stream ended
  2202. // before content_block_stop and before message_delta with stop_reason
  2203. // BetaMessageStream had the first check in _endRequest() but the raw Stream
  2204. // does not - without it the generator silently returns no assistant messages,
  2205. // causing "Execution error" in -p mode.
  2206. // Note: We must check stopReason to avoid false positives. For example, with
  2207. // structured output (--json-schema), the model calls a StructuredOutput tool
  2208. // on turn 1, then on turn 2 responds with end_turn and no content blocks.
  2209. // That's a legitimate empty response, not an incomplete stream.
  2210. if (!partialMessage || (newMessages.length === 0 && !stopReason)) {
  2211. logForDebugging(
  2212. !partialMessage
  2213. ? 'Stream completed without receiving message_start event - triggering non-streaming fallback'
  2214. : 'Stream completed with message_start but no content blocks completed - triggering non-streaming fallback',
  2215. { level: 'error' },
  2216. )
  2217. logEvent('tengu_stream_no_events', {
  2218. model:
  2219. options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2220. request_id: (streamRequestId ??
  2221. 'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2222. })
  2223. throw new Error('流结束但未收到任何事件')
  2224. }
  2225. // Log summary if any stalls occurred during streaming
  2226. if (stallCount > 0) {
  2227. logForDebugging(
  2228. `Streaming completed with ${stallCount} stall(s), total stall time: ${(totalStallTime / 1000).toFixed(1)}s`,
  2229. { level: 'warn' },
  2230. )
  2231. logEvent('tengu_streaming_stall_summary', {
  2232. stall_count: stallCount,
  2233. total_stall_time_ms: totalStallTime,
  2234. model:
  2235. options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2236. request_id: (streamRequestId ??
  2237. 'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2238. })
  2239. }
  2240. // Check if the cache actually broke based on response tokens
  2241. if (feature('PROMPT_CACHE_BREAK_DETECTION')) {
  2242. void checkResponseForCacheBreak(
  2243. options.querySource,
  2244. usage.cache_read_input_tokens,
  2245. usage.cache_creation_input_tokens,
  2246. messages,
  2247. options.agentId,
  2248. streamRequestId,
  2249. )
  2250. }
  2251. // Process fallback percentage header and quota status if available
  2252. // streamResponse is set when the stream is created in the withRetry callback above
  2253. // TypeScript's control flow analysis can't track that streamResponse is set in the callback
  2254. // eslint-disable-next-line eslint-plugin-n/no-unsupported-features/node-builtins
  2255. const resp = streamResponse as unknown as Response | undefined
  2256. if (resp) {
  2257. extractQuotaStatusFromHeaders(resp.headers)
  2258. // Store headers for gateway detection
  2259. responseHeaders = resp.headers
  2260. }
  2261. } catch (streamingError) {
  2262. // Clear the idle timeout watchdog on error path too
  2263. clearStreamIdleTimers()
  2264. // Instrumentation: if the watchdog had already fired and the for-await
  2265. // threw (rather than exiting cleanly), record that the loop DID exit and
  2266. // how long after the watchdog. Distinguishes true hangs from error exits.
  2267. if (streamIdleAborted && streamWatchdogFiredAt !== null) {
  2268. const exitDelayMs = Math.round(
  2269. performance.now() - streamWatchdogFiredAt,
  2270. )
  2271. logForDiagnosticsNoPII(
  2272. 'info',
  2273. 'cli_stream_loop_exited_after_watchdog_error',
  2274. )
  2275. logEvent('tengu_stream_loop_exited_after_watchdog', {
  2276. request_id: (streamRequestId ??
  2277. 'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2278. exit_delay_ms: exitDelayMs,
  2279. exit_path:
  2280. 'error' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2281. error_name:
  2282. streamingError instanceof Error
  2283. ? (streamingError.name as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS)
  2284. : ('unknown' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS),
  2285. model:
  2286. options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2287. })
  2288. }
  2289. if (streamingError instanceof APIUserAbortError) {
  2290. // Check if the abort signal was triggered by the user (ESC key)
  2291. // If the signal is aborted, it's a user-initiated abort
  2292. // If not, it's likely a timeout from the SDK
  2293. if (signal.aborted) {
  2294. // This is a real user abort (ESC key was pressed)
  2295. logForDebugging(
  2296. `Streaming aborted by user: ${errorMessage(streamingError)}`,
  2297. )
  2298. if (isAdvisorInProgress) {
  2299. logEvent('tengu_advisor_tool_interrupted', {
  2300. model:
  2301. options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2302. advisor_model: (advisorModel ??
  2303. 'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2304. })
  2305. }
  2306. throw streamingError
  2307. } else {
  2308. // The SDK threw APIUserAbortError but our signal wasn't aborted
  2309. // This means it's a timeout from the SDK's internal timeout
  2310. logForDebugging(
  2311. `Streaming timeout (SDK abort): ${streamingError.message}`,
  2312. { level: 'error' },
  2313. )
  2314. // Throw a more specific error for timeout
  2315. throw new APIConnectionTimeoutError({ message: '请求超时' })
  2316. }
  2317. }
  2318. // When the flag is enabled, skip the non-streaming fallback and let the
  2319. // error propagate to withRetry. The mid-stream fallback causes double tool
  2320. // execution when streaming tool execution is active: the partial stream
  2321. // starts a tool, then the non-streaming retry produces the same tool_use
  2322. // and runs it again. See inc-4258.
  2323. const disableFallback =
  2324. isEnvTruthy(process.env.CLAUDE_CODE_DISABLE_NONSTREAMING_FALLBACK) ||
  2325. getFeatureValue_CACHED_MAY_BE_STALE(
  2326. 'tengu_disable_streaming_to_non_streaming_fallback',
  2327. false,
  2328. )
  2329. if (disableFallback) {
  2330. logForDebugging(
  2331. `Error streaming (non-streaming fallback disabled): ${errorMessage(streamingError)}`,
  2332. { level: 'error' },
  2333. )
  2334. logEvent('tengu_streaming_fallback_to_non_streaming', {
  2335. model:
  2336. options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2337. error:
  2338. streamingError instanceof Error
  2339. ? (streamingError.name as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS)
  2340. : (String(
  2341. streamingError,
  2342. ) as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS),
  2343. attemptNumber,
  2344. maxOutputTokens,
  2345. thinkingType:
  2346. thinkingConfig.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2347. fallback_disabled: true,
  2348. request_id: (streamRequestId ??
  2349. 'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2350. fallback_cause: (streamIdleAborted
  2351. ? 'watchdog'
  2352. : 'other') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2353. })
  2354. throw streamingError
  2355. }
  2356. logForDebugging(
  2357. `Error streaming, falling back to non-streaming mode: ${errorMessage(streamingError)}`,
  2358. { level: 'error' },
  2359. )
  2360. didFallBackToNonStreaming = true
  2361. if (options.onStreamingFallback) {
  2362. options.onStreamingFallback()
  2363. }
  2364. logEvent('tengu_streaming_fallback_to_non_streaming', {
  2365. model:
  2366. options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2367. error:
  2368. streamingError instanceof Error
  2369. ? (streamingError.name as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS)
  2370. : (String(
  2371. streamingError,
  2372. ) as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS),
  2373. attemptNumber,
  2374. maxOutputTokens,
  2375. thinkingType:
  2376. thinkingConfig.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2377. fallback_disabled: false,
  2378. request_id: (streamRequestId ??
  2379. 'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2380. fallback_cause: (streamIdleAborted
  2381. ? 'watchdog'
  2382. : 'other') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2383. })
  2384. // Fall back to non-streaming mode with retries.
  2385. // If the streaming failure was itself a 529, count it toward the
  2386. // consecutive-529 budget so total 529s-before-model-fallback is the
  2387. // same whether the overload was hit in streaming or non-streaming mode.
  2388. // This is a speculative fix for https://github.com/anthropics/claude-code/issues/1513
  2389. // Instrumentation: proves executeNonStreamingRequest was entered (vs. the
  2390. // fallback event firing but the call itself hanging at dispatch).
  2391. logForDiagnosticsNoPII('info', 'cli_nonstreaming_fallback_started')
  2392. logEvent('tengu_nonstreaming_fallback_started', {
  2393. request_id: (streamRequestId ??
  2394. 'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2395. model:
  2396. options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2397. fallback_cause: (streamIdleAborted
  2398. ? 'watchdog'
  2399. : 'other') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2400. })
  2401. const result = yield* executeNonStreamingRequest(
  2402. { model: options.model, source: options.querySource },
  2403. {
  2404. model: options.model,
  2405. fallbackModel: options.fallbackModel,
  2406. thinkingConfig,
  2407. ...(isFastModeEnabled() && { fastMode: isFastMode }),
  2408. signal,
  2409. initialConsecutive529Errors: is529Error(streamingError) ? 1 : 0,
  2410. querySource: options.querySource,
  2411. },
  2412. paramsFromContext,
  2413. (attempt, _startTime, tokens) => {
  2414. attemptNumber = attempt
  2415. maxOutputTokens = tokens
  2416. },
  2417. params => captureAPIRequest(params, options.querySource),
  2418. streamRequestId,
  2419. )
  2420. const m: AssistantMessage = {
  2421. message: {
  2422. ...result,
  2423. content: normalizeContentFromAPI(
  2424. result.content,
  2425. tools,
  2426. options.agentId,
  2427. ),
  2428. },
  2429. requestId: streamRequestId ?? undefined,
  2430. type: 'assistant',
  2431. uuid: randomUUID(),
  2432. timestamp: new Date().toISOString(),
  2433. ...(process.env.USER_TYPE === 'ant' &&
  2434. research !== undefined && {
  2435. research,
  2436. }),
  2437. ...(advisorModel && {
  2438. advisorModel,
  2439. }),
  2440. }
  2441. newMessages.push(m)
  2442. fallbackMessage = m
  2443. yield m
  2444. } finally {
  2445. clearStreamIdleTimers()
  2446. }
  2447. } catch (errorFromRetry) {
  2448. // FallbackTriggeredError must propagate to query.ts, which performs the
  2449. // actual model switch. Swallowing it here would turn the fallback into a
  2450. // no-op — the user would just see "Model fallback triggered: X -> Y" as
  2451. // an error message with no actual retry on the fallback model.
  2452. if (errorFromRetry instanceof FallbackTriggeredError) {
  2453. throw errorFromRetry
  2454. }
  2455. // Check if this is a 404 error during stream creation that should trigger
  2456. // non-streaming fallback. This handles gateways that return 404 for streaming
  2457. // endpoints but work fine with non-streaming. Before v2.1.8, BetaMessageStream
  2458. // threw 404s during iteration (caught by inner catch with fallback), but now
  2459. // with raw streams, 404s are thrown during creation (caught here).
  2460. const is404StreamCreationError =
  2461. !didFallBackToNonStreaming &&
  2462. errorFromRetry instanceof CannotRetryError &&
  2463. errorFromRetry.originalError instanceof APIError &&
  2464. errorFromRetry.originalError.status === 404
  2465. if (is404StreamCreationError) {
  2466. // 404 is thrown at .withResponse() before streamRequestId is assigned,
  2467. // and CannotRetryError means every retry failed — so grab the failed
  2468. // request's ID from the error header instead.
  2469. const failedRequestId =
  2470. (errorFromRetry.originalError as APIError).requestID ?? 'unknown'
  2471. logForDebugging(
  2472. 'Streaming endpoint returned 404, falling back to non-streaming mode',
  2473. { level: 'warn' },
  2474. )
  2475. didFallBackToNonStreaming = true
  2476. if (options.onStreamingFallback) {
  2477. options.onStreamingFallback()
  2478. }
  2479. logEvent('tengu_streaming_fallback_to_non_streaming', {
  2480. model:
  2481. options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2482. error:
  2483. '404_stream_creation' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2484. attemptNumber,
  2485. maxOutputTokens,
  2486. thinkingType:
  2487. thinkingConfig.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2488. request_id:
  2489. failedRequestId as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2490. fallback_cause:
  2491. '404_stream_creation' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  2492. })
  2493. try {
  2494. // Fall back to non-streaming mode
  2495. const result = yield* executeNonStreamingRequest(
  2496. { model: options.model, source: options.querySource },
  2497. {
  2498. model: options.model,
  2499. fallbackModel: options.fallbackModel,
  2500. thinkingConfig,
  2501. ...(isFastModeEnabled() && { fastMode: isFastMode }),
  2502. signal,
  2503. },
  2504. paramsFromContext,
  2505. (attempt, _startTime, tokens) => {
  2506. attemptNumber = attempt
  2507. maxOutputTokens = tokens
  2508. },
  2509. params => captureAPIRequest(params, options.querySource),
  2510. failedRequestId,
  2511. )
  2512. const m: AssistantMessage = {
  2513. message: {
  2514. ...result,
  2515. content: normalizeContentFromAPI(
  2516. result.content,
  2517. tools,
  2518. options.agentId,
  2519. ),
  2520. },
  2521. requestId: streamRequestId ?? undefined,
  2522. type: 'assistant',
  2523. uuid: randomUUID(),
  2524. timestamp: new Date().toISOString(),
  2525. ...(process.env.USER_TYPE === 'ant' &&
  2526. research !== undefined && { research }),
  2527. ...(advisorModel && { advisorModel }),
  2528. }
  2529. newMessages.push(m)
  2530. fallbackMessage = m
  2531. yield m
  2532. // Continue to success logging below
  2533. } catch (fallbackError) {
  2534. // Propagate model-fallback signal to query.ts (see comment above).
  2535. if (fallbackError instanceof FallbackTriggeredError) {
  2536. throw fallbackError
  2537. }
  2538. // Fallback also failed, handle as normal error
  2539. logForDebugging(
  2540. `Non-streaming fallback also failed: ${errorMessage(fallbackError)}`,
  2541. { level: 'error' },
  2542. )
  2543. let error = fallbackError
  2544. let errorModel = options.model
  2545. if (fallbackError instanceof CannotRetryError) {
  2546. error = fallbackError.originalError
  2547. errorModel = fallbackError.retryContext.model
  2548. }
  2549. if (error instanceof APIError) {
  2550. extractQuotaStatusFromError(error)
  2551. }
  2552. const requestId =
  2553. streamRequestId ||
  2554. (error instanceof APIError ? error.requestID : undefined) ||
  2555. (error instanceof APIError
  2556. ? (error.error as { request_id?: string })?.request_id
  2557. : undefined)
  2558. logAPIError({
  2559. error,
  2560. model: errorModel,
  2561. messageCount: messagesForAPI.length,
  2562. messageTokens: tokenCountFromLastAPIResponse(messagesForAPI),
  2563. durationMs: Date.now() - start,
  2564. durationMsIncludingRetries: Date.now() - startIncludingRetries,
  2565. attempt: attemptNumber,
  2566. requestId,
  2567. clientRequestId,
  2568. didFallBackToNonStreaming,
  2569. queryTracking: options.queryTracking,
  2570. querySource: options.querySource,
  2571. llmSpan,
  2572. fastMode: isFastModeRequest,
  2573. previousRequestId,
  2574. })
  2575. if (error instanceof APIUserAbortError) {
  2576. releaseStreamResources()
  2577. return
  2578. }
  2579. yield getAssistantMessageFromError(error, errorModel, {
  2580. messages,
  2581. messagesForAPI,
  2582. })
  2583. releaseStreamResources()
  2584. return
  2585. }
  2586. } else {
  2587. // Original error handling for non-404 errors
  2588. logForDebugging(`Error in API request: ${errorMessage(errorFromRetry)}`, {
  2589. level: 'error',
  2590. })
  2591. let error = errorFromRetry
  2592. let errorModel = options.model
  2593. if (errorFromRetry instanceof CannotRetryError) {
  2594. error = errorFromRetry.originalError
  2595. errorModel = errorFromRetry.retryContext.model
  2596. }
  2597. // Extract quota status from error headers if it's a rate limit error
  2598. if (error instanceof APIError) {
  2599. extractQuotaStatusFromError(error)
  2600. }
  2601. // Extract requestId from stream, error header, or error body
  2602. const requestId =
  2603. streamRequestId ||
  2604. (error instanceof APIError ? error.requestID : undefined) ||
  2605. (error instanceof APIError
  2606. ? (error.error as { request_id?: string })?.request_id
  2607. : undefined)
  2608. logAPIError({
  2609. error,
  2610. model: errorModel,
  2611. messageCount: messagesForAPI.length,
  2612. messageTokens: tokenCountFromLastAPIResponse(messagesForAPI),
  2613. durationMs: Date.now() - start,
  2614. durationMsIncludingRetries: Date.now() - startIncludingRetries,
  2615. attempt: attemptNumber,
  2616. requestId,
  2617. clientRequestId,
  2618. didFallBackToNonStreaming,
  2619. queryTracking: options.queryTracking,
  2620. querySource: options.querySource,
  2621. llmSpan,
  2622. fastMode: isFastModeRequest,
  2623. previousRequestId,
  2624. })
  2625. // Don't yield an assistant error message for user aborts
  2626. // The interruption message is handled in query.ts
  2627. if (error instanceof APIUserAbortError) {
  2628. releaseStreamResources()
  2629. return
  2630. }
  2631. yield getAssistantMessageFromError(error, errorModel, {
  2632. messages,
  2633. messagesForAPI,
  2634. })
  2635. releaseStreamResources()
  2636. return
  2637. }
  2638. } finally {
  2639. stopSessionActivity('api_call')
  2640. // Must be in the finally block: if the generator is terminated early
  2641. // via .return() (e.g. consumer breaks out of for-await-of, or query.ts
  2642. // encounters an abort), code after the try/finally never executes.
  2643. // Without this, the Response object's native TLS/socket buffers leak
  2644. // until the generator itself is GC'd (see GH #32920).
  2645. releaseStreamResources()
  2646. // Non-streaming fallback cost: the streaming path tracks cost in the
  2647. // message_delta handler before any yield. Fallback pushes to newMessages
  2648. // then yields, so tracking must be here to survive .return() at the yield.
  2649. if (fallbackMessage) {
  2650. const fallbackUsage = fallbackMessage.message.usage
  2651. usage = updateUsage(EMPTY_USAGE, fallbackUsage)
  2652. stopReason = fallbackMessage.message.stop_reason
  2653. const fallbackCost = calculateUSDCost(resolvedModel, fallbackUsage)
  2654. costUSD += addToTotalSessionCost(
  2655. fallbackCost,
  2656. fallbackUsage,
  2657. options.model,
  2658. )
  2659. }
  2660. }
  2661. // Mark all registered tools as sent to API so they become eligible for deletion
  2662. if (feature('CACHED_MICROCOMPACT') && cachedMCEnabled) {
  2663. markToolsSentToAPIState()
  2664. }
  2665. // Track the last requestId for the main conversation chain so shutdown
  2666. // can send a cache eviction hint to inference. Exclude backgrounded
  2667. // sessions (Ctrl+B) which share the repl_main_thread querySource but
  2668. // run inside an agent context — they are independent conversation chains
  2669. // whose cache should not be evicted when the foreground session clears.
  2670. if (
  2671. streamRequestId &&
  2672. !getAgentContext() &&
  2673. (options.querySource.startsWith('repl_main_thread') ||
  2674. options.querySource === 'sdk')
  2675. ) {
  2676. setLastMainRequestId(streamRequestId)
  2677. }
  2678. // Precompute scalars so the fire-and-forget .then() closure doesn't pin the
  2679. // full messagesForAPI array (the entire conversation up to the context window
  2680. // limit) until getToolPermissionContext() resolves.
  2681. const logMessageCount = messagesForAPI.length
  2682. const logMessageTokens = tokenCountFromLastAPIResponse(messagesForAPI)
  2683. void options.getToolPermissionContext().then(permissionContext => {
  2684. logAPISuccessAndDuration({
  2685. model:
  2686. newMessages[0]?.message.model ?? partialMessage?.model ?? options.model,
  2687. preNormalizedModel: options.model,
  2688. usage,
  2689. start,
  2690. startIncludingRetries,
  2691. attempt: attemptNumber,
  2692. messageCount: logMessageCount,
  2693. messageTokens: logMessageTokens,
  2694. requestId: streamRequestId ?? null,
  2695. stopReason,
  2696. ttftMs,
  2697. didFallBackToNonStreaming,
  2698. querySource: options.querySource,
  2699. headers: responseHeaders,
  2700. costUSD,
  2701. queryTracking: options.queryTracking,
  2702. permissionMode: permissionContext.mode,
  2703. // Pass newMessages for beta tracing - extraction happens in logging.ts
  2704. // only when beta tracing is enabled
  2705. newMessages,
  2706. llmSpan,
  2707. globalCacheStrategy,
  2708. requestSetupMs: start - startIncludingRetries,
  2709. attemptStartTimes,
  2710. fastMode: isFastModeRequest,
  2711. previousRequestId,
  2712. betas: lastRequestBetas,
  2713. })
  2714. })
  2715. // Defensive: also release on normal completion (no-op if finally already ran).
  2716. releaseStreamResources()
  2717. }
  /**
   * Cleans up stream resources to prevent memory leaks.
   *
   * This is a best-effort operation:
   * - a missing stream is a no-op,
   * - a controller whose signal is already aborted is left untouched,
   * - any exception raised while aborting is swallowed, because the stream
   *   may already be closed.
   *
   * @param stream - The raw message stream to abort, or `undefined` when no
   *   stream was created.
   * @internal Exported for testing
   */
  export function cleanupStream(
    stream: Stream<BetaRawMessageStreamEvent> | undefined,
  ): void {
    if (!stream) {
      return
    }
    try {
      const { controller } = stream
      // Abort the stream via its controller if not already aborted
      if (!controller.signal.aborted) {
        controller.abort()
      }
    } catch {
      // Ignore - stream may already be closed
    }
  }
  /**
   * Updates usage statistics with new values from streaming API events.
   * Note: Anthropic's streaming API provides cumulative usage totals, not incremental deltas.
   * Each event contains the complete usage up to that point in the stream.
   *
   * Input-related tokens (input_tokens, cache_creation_input_tokens, cache_read_input_tokens)
   * are typically set in message_start and remain constant. message_delta events may send
   * explicit 0 values for these fields, which should not overwrite the values from message_start.
   * We only update these fields if they have a non-null, non-zero value. The same
   * guard is applied to the cache_creation breakdown (ephemeral_1h_input_tokens /
   * ephemeral_5m_input_tokens) so it cannot be zeroed while its parent total,
   * cache_creation_input_tokens, is preserved.
   *
   * @param usage - The usage accumulated so far. It is not mutated.
   * @param partUsage - Usage carried by the latest event. When it is missing,
   *   a shallow copy of `usage` is returned.
   * @returns A new usage object; `service_tier` and `inference_geo` are always
   *   carried over from `usage`.
   */
  export function updateUsage(
    usage: Readonly<NonNullableUsage>,
    partUsage: BetaMessageDeltaUsage | undefined,
  ): NonNullableUsage {
    if (!partUsage) {
      return { ...usage }
    }
    // Take the incoming count only when it is a real, positive number;
    // null, undefined and 0 all keep the current value.
    const positiveOr = (
      incoming: number | null | undefined,
      current: number,
    ): number => (incoming != null && incoming > 0 ? incoming : current)
    // SDK type BetaMessageDeltaUsage is missing cache_creation, but it's real!
    const partCacheCreation = (partUsage as BetaUsage).cache_creation
    return {
      input_tokens: positiveOr(partUsage.input_tokens, usage.input_tokens),
      cache_creation_input_tokens: positiveOr(
        partUsage.cache_creation_input_tokens,
        usage.cache_creation_input_tokens,
      ),
      cache_read_input_tokens: positiveOr(
        partUsage.cache_read_input_tokens,
        usage.cache_read_input_tokens,
      ),
      output_tokens: partUsage.output_tokens ?? usage.output_tokens,
      server_tool_use: {
        web_search_requests:
          partUsage.server_tool_use?.web_search_requests ??
          usage.server_tool_use.web_search_requests,
        web_fetch_requests:
          partUsage.server_tool_use?.web_fetch_requests ??
          usage.server_tool_use.web_fetch_requests,
      },
      service_tier: usage.service_tier,
      cache_creation: {
        // Guarded like cache_creation_input_tokens so that the breakdown and
        // its total always stay in agreement.
        ephemeral_1h_input_tokens: positiveOr(
          partCacheCreation?.ephemeral_1h_input_tokens,
          usage.cache_creation.ephemeral_1h_input_tokens,
        ),
        ephemeral_5m_input_tokens: positiveOr(
          partCacheCreation?.ephemeral_5m_input_tokens,
          usage.cache_creation.ephemeral_5m_input_tokens,
        ),
      },
      // cache_deleted_input_tokens: returned by the API when cache editing
      // deletes KV cache content, but not in SDK types. Kept off NonNullableUsage
      // so the string is eliminated from external builds by dead code elimination.
      // Uses the same > 0 guard as other token fields to prevent message_delta
      // from overwriting the real value with 0. Every runtime reference to the
      // field stays inside this feature-gated ternary for that reason.
      ...(feature('CACHED_MICROCOMPACT')
        ? {
            cache_deleted_input_tokens: positiveOr(
              (partUsage as unknown as { cache_deleted_input_tokens?: number })
                .cache_deleted_input_tokens,
              (usage as unknown as { cache_deleted_input_tokens?: number })
                .cache_deleted_input_tokens ?? 0,
            ),
          }
        : {}),
      inference_geo: usage.inference_geo,
      iterations: partUsage.iterations ?? usage.iterations,
      speed: (partUsage as BetaUsage).speed ?? usage.speed,
    }
  }
  /**
   * Accumulates usage from one message into a total usage object.
   * Used to track cumulative usage across multiple assistant turns.
   *
   * Token counts, server tool request counts and the cache_creation breakdown
   * are summed. service_tier, inference_geo, iterations and speed are not
   * additive, so the values from `messageUsage` (the most recent) are used.
   * Neither argument is mutated.
   *
   * @param totalUsage - The running total so far.
   * @param messageUsage - Usage of the message being added to the total.
   * @returns A new usage object holding the combined totals.
   */
  export function accumulateUsage(
    totalUsage: Readonly<NonNullableUsage>,
    messageUsage: Readonly<NonNullableUsage>,
  ): NonNullableUsage {
    const totalTools = totalUsage.server_tool_use
    const messageTools = messageUsage.server_tool_use
    const totalCache = totalUsage.cache_creation
    const messageCache = messageUsage.cache_creation
    return {
      input_tokens: totalUsage.input_tokens + messageUsage.input_tokens,
      cache_creation_input_tokens:
        totalUsage.cache_creation_input_tokens +
        messageUsage.cache_creation_input_tokens,
      cache_read_input_tokens:
        totalUsage.cache_read_input_tokens + messageUsage.cache_read_input_tokens,
      output_tokens: totalUsage.output_tokens + messageUsage.output_tokens,
      server_tool_use: {
        web_search_requests:
          totalTools.web_search_requests + messageTools.web_search_requests,
        web_fetch_requests:
          totalTools.web_fetch_requests + messageTools.web_fetch_requests,
      },
      service_tier: messageUsage.service_tier, // Use the most recent service tier
      cache_creation: {
        ephemeral_1h_input_tokens:
          totalCache.ephemeral_1h_input_tokens +
          messageCache.ephemeral_1h_input_tokens,
        ephemeral_5m_input_tokens:
          totalCache.ephemeral_5m_input_tokens +
          messageCache.ephemeral_5m_input_tokens,
      },
      // cache_deleted_input_tokens is deliberately not declared on
      // NonNullableUsage: every runtime reference to it lives inside this
      // feature-gated ternary, which keeps the string out of external builds.
      // A side that lacks the field counts as 0.
      ...(feature('CACHED_MICROCOMPACT')
        ? {
            cache_deleted_input_tokens:
              ((totalUsage as unknown as { cache_deleted_input_tokens?: number })
                .cache_deleted_input_tokens ?? 0) +
              ((
                messageUsage as unknown as { cache_deleted_input_tokens?: number }
              ).cache_deleted_input_tokens ?? 0),
          }
        : {}),
      inference_geo: messageUsage.inference_geo, // Use the most recent
      iterations: messageUsage.iterations, // Use the most recent
      speed: messageUsage.speed, // Use the most recent
    }
  }
  2861. function isToolResultBlock(
  2862. block: unknown,
  2863. ): block is { type: 'tool_result'; tool_use_id: string } {
  2864. return (
  2865. block !== null &&
  2866. typeof block === 'object' &&
  2867. 'type' in block &&
  2868. (block as { type: string }).type === 'tool_result' &&
  2869. 'tool_use_id' in block
  2870. )
  2871. }
  2872. type CachedMCEditsBlock = {
  2873. type: 'cache_edits'
  2874. edits: { type: 'delete'; cache_reference: string }[]
  2875. }
  2876. type CachedMCPinnedEdits = {
  2877. userMessageIndex: number
  2878. block: CachedMCEditsBlock
  2879. }
  2880. // Exported for testing cache_reference placement constraints
  2881. export function addCacheBreakpoints(
  2882. messages: (UserMessage | AssistantMessage)[],
  2883. enablePromptCaching: boolean,
  2884. querySource?: QuerySource,
  2885. useCachedMC = false,
  2886. newCacheEdits?: CachedMCEditsBlock | null,
  2887. pinnedEdits?: CachedMCPinnedEdits[],
  2888. skipCacheWrite = false,
  2889. ): MessageParam[] {
  2890. logEvent('tengu_api_cache_breakpoints', {
  2891. totalMessageCount: messages.length,
  2892. cachingEnabled: enablePromptCaching,
  2893. skipCacheWrite,
  2894. })
  2895. // Exactly one message-level cache_control marker per request. Mycro's
  2896. // turn-to-turn eviction (page_manager/index.rs: Index::insert) frees
  2897. // local-attention KV pages at any cached prefix position NOT in
  2898. // cache_store_int_token_boundaries. With two markers the second-to-last
  2899. // position is protected and its locals survive an extra turn even though
  2900. // nothing will ever resume from there — with one marker they're freed
  2901. // immediately. For fire-and-forget forks (skipCacheWrite) we shift the
  2902. // marker to the second-to-last message: that's the last shared-prefix
  2903. // point, so the write is a no-op merge on mycro (entry already exists)
  2904. // and the fork doesn't leave its own tail in the KVCC. Dense pages are
  2905. // refcounted and survive via the new hash either way.
  2906. const markerIndex = skipCacheWrite ? messages.length - 2 : messages.length - 1
  2907. const result = messages.map((msg, index) => {
  2908. const addCache = index === markerIndex
  2909. if (msg.type === 'user') {
  2910. return userMessageToMessageParam(
  2911. msg,
  2912. addCache,
  2913. enablePromptCaching,
  2914. querySource,
  2915. )
  2916. }
  2917. return assistantMessageToMessageParam(
  2918. msg,
  2919. addCache,
  2920. enablePromptCaching,
  2921. querySource,
  2922. )
  2923. })
  2924. if (!useCachedMC) {
  2925. return result
  2926. }
  2927. // Track all cache_references being deleted to prevent duplicates across blocks.
  2928. const seenDeleteRefs = new Set<string>()
  2929. // Helper to deduplicate a cache_edits block against already-seen deletions
  2930. const deduplicateEdits = (block: CachedMCEditsBlock): CachedMCEditsBlock => {
  2931. const uniqueEdits = block.edits.filter(edit => {
  2932. if (seenDeleteRefs.has(edit.cache_reference)) {
  2933. return false
  2934. }
  2935. seenDeleteRefs.add(edit.cache_reference)
  2936. return true
  2937. })
  2938. return { ...block, edits: uniqueEdits }
  2939. }
  2940. // Re-insert all previously-pinned cache_edits at their original positions
  2941. for (const pinned of pinnedEdits ?? []) {
  2942. const msg = result[pinned.userMessageIndex]
  2943. if (msg && msg.role === 'user') {
  2944. if (!Array.isArray(msg.content)) {
  2945. msg.content = [{ type: 'text', text: msg.content as string }]
  2946. }
  2947. const dedupedBlock = deduplicateEdits(pinned.block)
  2948. if (dedupedBlock.edits.length > 0) {
  2949. insertBlockAfterToolResults(msg.content, dedupedBlock)
  2950. }
  2951. }
  2952. }
  2953. // Insert new cache_edits into the last user message and pin them
  2954. if (newCacheEdits && result.length > 0) {
  2955. const dedupedNewEdits = deduplicateEdits(newCacheEdits)
  2956. if (dedupedNewEdits.edits.length > 0) {
  2957. for (let i = result.length - 1; i >= 0; i--) {
  2958. const msg = result[i]
  2959. if (msg && msg.role === 'user') {
  2960. if (!Array.isArray(msg.content)) {
  2961. msg.content = [{ type: 'text', text: msg.content as string }]
  2962. }
  2963. insertBlockAfterToolResults(msg.content, dedupedNewEdits)
  2964. // Pin so this block is re-sent at the same position in future calls
  2965. pinCacheEdits(i, newCacheEdits)
  2966. logForDebugging(
  2967. `Added cache_edits block with ${dedupedNewEdits.edits.length} deletion(s) to message[${i}]: ${dedupedNewEdits.edits.map(e => e.cache_reference).join(', ')}`,
  2968. )
  2969. break
  2970. }
  2971. }
  2972. }
  2973. }
  2974. // Add cache_reference to tool_result blocks that are within the cached prefix.
  2975. // Must be done AFTER cache_edits insertion since that modifies content arrays.
  2976. if (enablePromptCaching) {
  2977. // Find the last message containing a cache_control marker
  2978. let lastCCMsg = -1
  2979. for (let i = 0; i < result.length; i++) {
  2980. const msg = result[i]!
  2981. if (Array.isArray(msg.content)) {
  2982. for (const block of msg.content) {
  2983. if (block && typeof block === 'object' && 'cache_control' in block) {
  2984. lastCCMsg = i
  2985. }
  2986. }
  2987. }
  2988. }
  2989. // Add cache_reference to tool_result blocks that are strictly before
  2990. // the last cache_control marker. The API requires cache_reference to
  2991. // appear "before or on" the last cache_control — we use strict "before"
  2992. // to avoid edge cases where cache_edits splicing shifts block indices.
  2993. //
  2994. // Create new objects instead of mutating in-place to avoid contaminating
  2995. // blocks reused by secondary queries that use models without cache_editing support.
  2996. if (lastCCMsg >= 0) {
  2997. for (let i = 0; i < lastCCMsg; i++) {
  2998. const msg = result[i]!
  2999. if (msg.role !== 'user' || !Array.isArray(msg.content)) {
  3000. continue
  3001. }
  3002. let cloned = false
  3003. for (let j = 0; j < msg.content.length; j++) {
  3004. const block = msg.content[j]
  3005. if (block && isToolResultBlock(block)) {
  3006. if (!cloned) {
  3007. msg.content = [...msg.content]
  3008. cloned = true
  3009. }
  3010. msg.content[j] = Object.assign({}, block, {
  3011. cache_reference: block.tool_use_id,
  3012. })
  3013. }
  3014. }
  3015. }
  3016. }
  3017. }
  3018. return result
  3019. }
  3020. export function buildSystemPromptBlocks(
  3021. systemPrompt: SystemPrompt,
  3022. enablePromptCaching: boolean,
  3023. options?: {
  3024. skipGlobalCacheForSystemPrompt?: boolean
  3025. querySource?: QuerySource
  3026. },
  3027. ): TextBlockParam[] {
  3028. // IMPORTANT: Do not add any more blocks for caching or you will get a 400
  3029. return splitSysPromptPrefix(systemPrompt, {
  3030. skipGlobalCacheForSystemPrompt: options?.skipGlobalCacheForSystemPrompt,
  3031. }).map(block => {
  3032. return {
  3033. type: 'text' as const,
  3034. text: block.text,
  3035. ...(enablePromptCaching &&
  3036. block.cacheScope !== null && {
  3037. cache_control: getCacheControl({
  3038. scope: block.cacheScope,
  3039. querySource: options?.querySource,
  3040. }),
  3041. }),
  3042. }
  3043. })
  3044. }
  3045. type HaikuOptions = Omit<Options, 'model' | 'getToolPermissionContext'>
  3046. export async function queryHaiku({
  3047. systemPrompt = asSystemPrompt([]),
  3048. userPrompt,
  3049. outputFormat,
  3050. signal,
  3051. options,
  3052. }: {
  3053. systemPrompt: SystemPrompt
  3054. userPrompt: string
  3055. outputFormat?: BetaJSONOutputFormat
  3056. signal: AbortSignal
  3057. options: HaikuOptions
  3058. }): Promise<AssistantMessage> {
  3059. const result = await withVCR(
  3060. [
  3061. createUserMessage({
  3062. content: systemPrompt.map(text => ({ type: 'text', text })),
  3063. }),
  3064. createUserMessage({
  3065. content: userPrompt,
  3066. }),
  3067. ],
  3068. async () => {
  3069. const messages = [
  3070. createUserMessage({
  3071. content: userPrompt,
  3072. }),
  3073. ]
  3074. const result = await queryModelWithoutStreaming({
  3075. messages,
  3076. systemPrompt,
  3077. thinkingConfig: { type: 'disabled' },
  3078. tools: [],
  3079. signal,
  3080. options: {
  3081. ...options,
  3082. model: getSmallFastModel(),
  3083. enablePromptCaching: options.enablePromptCaching ?? false,
  3084. outputFormat,
  3085. async getToolPermissionContext() {
  3086. return getEmptyToolPermissionContext()
  3087. },
  3088. },
  3089. })
  3090. return [result]
  3091. },
  3092. )
  3093. // We don't use streaming for Haiku so this is safe
  3094. return result[0]! as AssistantMessage
  3095. }
  3096. type QueryWithModelOptions = Omit<Options, 'getToolPermissionContext'>
  3097. /**
  3098. * Query a specific model through the Claude Code infrastructure.
  3099. * This goes through the full query pipeline including proper authentication,
  3100. * betas, and headers - unlike direct API calls.
  3101. */
  3102. export async function queryWithModel({
  3103. systemPrompt = asSystemPrompt([]),
  3104. userPrompt,
  3105. outputFormat,
  3106. signal,
  3107. options,
  3108. }: {
  3109. systemPrompt: SystemPrompt
  3110. userPrompt: string
  3111. outputFormat?: BetaJSONOutputFormat
  3112. signal: AbortSignal
  3113. options: QueryWithModelOptions
  3114. }): Promise<AssistantMessage> {
  3115. const result = await withVCR(
  3116. [
  3117. createUserMessage({
  3118. content: systemPrompt.map(text => ({ type: 'text', text })),
  3119. }),
  3120. createUserMessage({
  3121. content: userPrompt,
  3122. }),
  3123. ],
  3124. async () => {
  3125. const messages = [
  3126. createUserMessage({
  3127. content: userPrompt,
  3128. }),
  3129. ]
  3130. const result = await queryModelWithoutStreaming({
  3131. messages,
  3132. systemPrompt,
  3133. thinkingConfig: { type: 'disabled' },
  3134. tools: [],
  3135. signal,
  3136. options: {
  3137. ...options,
  3138. enablePromptCaching: options.enablePromptCaching ?? false,
  3139. outputFormat,
  3140. async getToolPermissionContext() {
  3141. return getEmptyToolPermissionContext()
  3142. },
  3143. },
  3144. })
  3145. return [result]
  3146. },
  3147. )
  3148. return result[0]! as AssistantMessage
  3149. }
  3150. // Non-streaming requests have a 10min max per the docs:
  3151. // https://platform.claude.com/docs/en/api/errors#long-requests
  3152. // The SDK's 21333-token cap is derived from 10min × 128k tokens/hour, but we
  3153. // bypass it by setting a client-level timeout, so we can cap higher.
  3154. export const MAX_NON_STREAMING_TOKENS = 64_000
  3155. /**
  3156. * Adjusts thinking budget when max_tokens is capped for non-streaming fallback.
  3157. * Ensures the API constraint: max_tokens > thinking.budget_tokens
  3158. *
  3159. * @param params - The parameters that will be sent to the API
  3160. * @param maxTokensCap - The maximum allowed tokens (MAX_NON_STREAMING_TOKENS)
  3161. * @returns Adjusted parameters with thinking budget capped if needed
  3162. */
  3163. export function adjustParamsForNonStreaming<
  3164. T extends {
  3165. max_tokens: number
  3166. thinking?: BetaMessageStreamParams['thinking']
  3167. },
  3168. >(params: T, maxTokensCap: number): T {
  3169. const cappedMaxTokens = Math.min(params.max_tokens, maxTokensCap)
  3170. // Adjust thinking budget if it would exceed capped max_tokens
  3171. // to maintain the constraint: max_tokens > thinking.budget_tokens
  3172. const adjustedParams = { ...params }
  3173. if (
  3174. adjustedParams.thinking?.type === 'enabled' &&
  3175. adjustedParams.thinking.budget_tokens
  3176. ) {
  3177. adjustedParams.thinking = {
  3178. ...adjustedParams.thinking,
  3179. budget_tokens: Math.min(
  3180. adjustedParams.thinking.budget_tokens,
  3181. cappedMaxTokens - 1, // Must be at least 1 less than max_tokens
  3182. ),
  3183. }
  3184. }
  3185. return {
  3186. ...adjustedParams,
  3187. max_tokens: cappedMaxTokens,
  3188. }
  3189. }
  3190. function isMaxTokensCapEnabled(): boolean {
  3191. // 3P default: false (not validated on Bedrock/Vertex)
  3192. return getFeatureValue_CACHED_MAY_BE_STALE('tengu_otk_slot_v1', false)
  3193. }
  3194. export function getMaxOutputTokensForModel(model: string): number {
  3195. const maxOutputTokens = getModelMaxOutputTokens(model)
  3196. // Slot-reservation cap: drop default to 8k for all models. BQ p99 output
  3197. // = 4,911 tokens; 32k/64k defaults over-reserve 8-16× slot capacity.
  3198. // Requests hitting the cap get one clean retry at 64k (query.ts
  3199. // max_output_tokens_escalate). Math.min keeps models with lower native
  3200. // defaults (e.g. claude-3-opus at 4k) at their native value. Applied
  3201. // before the env-var override so CLAUDE_CODE_MAX_OUTPUT_TOKENS still wins.
  3202. const defaultTokens = isMaxTokensCapEnabled()
  3203. ? Math.min(maxOutputTokens.default, CAPPED_DEFAULT_MAX_TOKENS)
  3204. : maxOutputTokens.default
  3205. const result = validateBoundedIntEnvVar(
  3206. 'CLAUDE_CODE_MAX_OUTPUT_TOKENS',
  3207. process.env.CLAUDE_CODE_MAX_OUTPUT_TOKENS,
  3208. defaultTokens,
  3209. maxOutputTokens.upperLimit,
  3210. )
  3211. return result.effective
  3212. }