// gogadmin/claudecode
							/**
 * Port of the API's image transcoder target-size algorithm. Pre-sizing
 * screenshots to this function's output means the API's early-return fires
 * (tokens ≤ max) and the image is NOT resized server-side — so the model
 * sees exactly the dimensions in `ScreenshotResult.width/height` and
 * `scaleCoord` stays coherent.
 *
 * Rust reference: api/api/image_transcoder/rust_transcoder/src/utils/resize.rs
 * Sibling TS port: apps/claude-browser-use/src/utils/imageResize.ts (identical
 * algorithm, lives in the Chrome extension tree — not a shared package).
 *
 * See COORDINATES.md for why this matters for click accuracy.
 */

/**
 * Tunables for the target-size computation: the tile size used to count
 * tokens, plus the two caps a pre-sized image must stay within.
 */
export interface ResizeParams {
  /**
   * Edge length, in pixels, of one token tile. Each image edge costs
   * ceil(px / pxPerToken) tokens.
   */
  pxPerToken: number;
  /** Cap, in pixels, applied to both edges — i.e. to the long edge. */
  maxTargetPx: number;
  /** Cap on the total token count (tiles across × tiles down). */
  maxTargetTokens: number;
}

/**
 * Production defaults — match `resize.rs:160-164` and Chrome's
 * `CDPService.ts:638-642`. Vision encoder uses 28px tiles; 1568 is both
 * the long-edge cap (56 tiles) AND the token budget.
 *
 * The two 1568 values are numerically equal but in different units
 * (pixels vs. tokens); they are independent constraints.
 */
export const API_RESIZE_PARAMS: ResizeParams = {
  // Tile edge length in pixels.
  pxPerToken: 28,
  // Long-edge cap in pixels: 56 tiles × 28 px.
  maxTargetPx: 1568,
  // Token budget for the whole image (tiles across × tiles down).
  maxTargetTokens: 1568,
};

/**
 * Number of `pxPerToken`-wide tiles needed to cover an edge of `px` pixels,
 * i.e. ceil(px / pxPerToken) for whole-pixel inputs. Written in the integer
 * ceil-div form to mirror resize.rs:74-76.
 *
 * @param px - Edge length in pixels.
 * @param pxPerToken - Tile edge length in pixels.
 * @returns Tile count along that edge.
 */
export function nTokensForPx(px: number, pxPerToken: number): number {
  // Tiles fully consumed before reaching the last pixel; that last pixel
  // always needs one more tile.
  const tilesBeforeLastPixel = Math.floor((px - 1) / pxPerToken);
  return tilesBeforeLastPixel + 1;
}

/**
 * Token cost of a `width` × `height` image: tiles across multiplied by
 * tiles down, each edge measured with `nTokensForPx`.
 *
 * @param width - Image width in pixels.
 * @param height - Image height in pixels.
 * @param pxPerToken - Tile edge length in pixels.
 * @returns Total tile (token) count for the image.
 */
function nTokensForImg(
  width: number,
  height: number,
  pxPerToken: number,
): number {
  const tilesAcross = nTokensForPx(width, pxPerToken);
  const tilesDown = nTokensForPx(height, pxPerToken);
  return tilesAcross * tilesDown;
}

/**
 * Binary-search along the width dimension for the largest image that:
 *   - preserves the input aspect ratio
 *   - has long edge ≤ maxTargetPx
 *   - has ceil(w/pxPerToken) × ceil(h/pxPerToken) ≤ maxTargetTokens
 *
 * The long-edge constraint alone (what we used to use) is insufficient on
 * squarer-than-16:9 displays: 1568×1014 (MBP 16" AR) is 56×37 = 2072 tokens,
 * over budget, and gets server-resized to 1372×887 — model then clicks in
 * 1372-space but scaleCoord assumed 1568-space → ~14% coord error.
 *
 * For whole-pixel inputs the search is the one in resize.rs:91-155. Unlike a
 * literal port, it also terminates for fractional dimensions and for limits
 * so small that even a 1-pixel-wide image does not fit.
 *
 * @param width - Source width in pixels.
 * @param height - Source height in pixels.
 * @param params - Tile size and the pixel/token caps to respect.
 * @returns `[width, height]`. The input is returned unchanged when it already
 *   satisfies every constraint; otherwise the largest fitting whole-pixel
 *   size is returned. The search never goes below 1 pixel wide, so with
 *   degenerate limits the result may still exceed them.
 * @throws RangeError if a dimension is NaN or infinite and the input does not
 *   already fit (no search bound can be derived from it).
 */
export function targetImageSize(
  width: number,
  height: number,
  params: ResizeParams,
): [number, number] {
  const { pxPerToken, maxTargetPx, maxTargetTokens } = params;

  // Already within every limit: leave the image untouched.
  if (
    width <= maxTargetPx &&
    height <= maxTargetPx &&
    nTokensForImg(width, height, pxPerToken) <= maxTargetTokens
  ) {
    return [width, height];
  }

  // NaN/Infinity make every comparison below false, which would keep the
  // search from ever narrowing. Fail loudly instead of spinning.
  if (!Number.isFinite(width) || !Number.isFinite(height)) {
    throw new RangeError(
      `targetImageSize: width and height must be finite, got ${width}x${height}`,
    );
  }

  // Normalize to landscape for the search; transpose result back.
  if (height > width) {
    const [w, h] = targetImageSize(height, width, params);
    return [h, w];
  }

  const aspectRatio = width / height;
  // Height that keeps the aspect ratio for a candidate width, never below 1px.
  const heightForWidth = (candidateWidth: number): number =>
    Math.max(Math.round(candidateWidth / aspectRatio), 1);

  // Loop invariant: lowerBoundWidth is always valid, upperBoundWidth is
  // always invalid. ~12 iterations for a 4000px image.
  // Both bounds are kept integral (ceil of the input width) so the midpoint
  // always lands strictly between them and the gap is guaranteed to shrink.
  let upperBoundWidth = Math.ceil(width);
  let lowerBoundWidth = 1;

  while (upperBoundWidth - lowerBoundWidth > 1) {
    const middleWidth = Math.floor((lowerBoundWidth + upperBoundWidth) / 2);
    const middleHeight = heightForWidth(middleWidth);

    if (
      middleWidth <= maxTargetPx &&
      nTokensForImg(middleWidth, middleHeight, pxPerToken) <= maxTargetTokens
    ) {
      lowerBoundWidth = middleWidth;
    } else {
      upperBoundWidth = middleWidth;
    }
  }

  return [lowerBoundWidth, heightForWidth(lowerBoundWidth)];
}