imageResize.ts 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  1. /**
  2. * Port of the API's image transcoder target-size algorithm. Pre-sizing
  3. * screenshots to this function's output means the API's early-return fires
  4. * (tokens ≤ max) and the image is NOT resized server-side — so the model
  5. * sees exactly the dimensions in `ScreenshotResult.width/height` and
  6. * `scaleCoord` stays coherent.
  7. *
  8. * Rust reference: api/api/image_transcoder/rust_transcoder/src/utils/resize.rs
  9. * Sibling TS port: apps/claude-browser-use/src/utils/imageResize.ts (identical
  10. * algorithm, lives in the Chrome extension tree — not a shared package).
  11. *
  12. * See COORDINATES.md for why this matters for click accuracy.
  13. */
  /**
   * Tunables for the target-size computation: the tile size used to count
   * tokens, plus the two caps a resized image must stay under.
   */
  export interface ResizeParams {
    /** Edge length, in pixels, of one token tile (tokens per axis = ceil(px / pxPerToken)). */
    pxPerToken: number;
    /** Upper limit, in pixels, on the image's long edge. */
    maxTargetPx: number;
    /** Upper limit on the total token count (tiles across × tiles down). */
    maxTargetTokens: number;
  }
  /**
   * Production defaults. These mirror `resize.rs:160-164` and Chrome's
   * `CDPService.ts:638-642`.
   *
   * The vision encoder works on 28px tiles. The value 1568 plays two roles:
   * it is the long-edge cap in pixels (56 tiles × 28px) and, separately, the
   * total token budget.
   */
  export const API_RESIZE_PARAMS: ResizeParams = {
    pxPerToken: 28,
    maxTargetPx: 56 * 28, // 1568px long edge = 56 tiles
    maxTargetTokens: 1568,
  };
  /**
   * Number of token tiles needed to cover `px` pixels along one axis.
   *
   * For whole-number `px` this equals ceil(px / pxPerToken). It is written in
   * the floor-based integer ceil-div form to match resize.rs:74-76.
   */
  export function nTokensForPx(px: number, pxPerToken: number): number {
    // Tiles fully or partly used by every pixel before the last one...
    const tilesBeforeLastPixel = Math.floor((px - 1) / pxPerToken);
    // ...plus one, which rounds any partial tile up.
    return tilesBeforeLastPixel + 1;
  }
  /**
   * Total token count for a `width` × `height` image: the tile count along
   * each axis (see `nTokensForPx`), multiplied together.
   */
  function nTokensForImg(
    width: number,
    height: number,
    pxPerToken: number,
  ): number {
    const tilesAcross = nTokensForPx(width, pxPerToken);
    const tilesDown = nTokensForPx(height, pxPerToken);
    return tilesAcross * tilesDown;
  }
  /**
   * Binary-searches along the width for the largest size that:
   * - preserves the input aspect ratio (height is re-derived from the width
   *   and rounded, never below 1)
   * - has long edge ≤ maxTargetPx
   * - has ceil(w/pxPerToken) × ceil(h/pxPerToken) ≤ maxTargetTokens
   *
   * No-op if the input already satisfies all three.
   *
   * The long-edge constraint alone (what we used to use) is insufficient on
   * squarer-than-16:9 displays: 1568×1014 (MBP 16" AR) is 56×37 = 2072 tokens,
   * over budget, and gets server-resized to 1372×887 — model then clicks in
   * 1372-space but scaleCoord assumed 1568-space → ~14% coord error.
   *
   * For whole-number dimensions the search visits the same midpoints as the
   * port of resize.rs:91-155. Fractional widths are searched over whole
   * widths below `Math.ceil(width)`, so the loop always terminates.
   *
   * @param width - Source width in pixels.
   * @param height - Source height in pixels.
   * @param params - Tile size and caps (e.g. `API_RESIZE_PARAMS`).
   * @returns `[width, height]` of the target size. Never smaller than 1×1 once
   *   the search runs, even if `params` make a 1×1 image invalid.
   * @throws RangeError if `width` or `height` is NaN or ±Infinity.
   */
  export function targetImageSize(
    width: number,
    height: number,
    params: ResizeParams,
  ): [number, number] {
    // A NaN/±Infinity bound can never converge in the search below; fail fast
    // instead of spinning forever.
    if (!Number.isFinite(width) || !Number.isFinite(height)) {
      throw new RangeError(
        `targetImageSize: dimensions must be finite, got ${width}x${height}`,
      );
    }

    const { pxPerToken, maxTargetPx, maxTargetTokens } = params;

    // Already within every limit: hand the input back untouched.
    if (
      width <= maxTargetPx &&
      height <= maxTargetPx &&
      nTokensForImg(width, height, pxPerToken) <= maxTargetTokens
    ) {
      return [width, height];
    }

    // Normalize to landscape for the search; transpose result back.
    if (height > width) {
      const [w, h] = targetImageSize(height, width, params);
      return [h, w];
    }

    const aspectRatio = width / height;
    const heightForWidth = (w: number): number =>
      Math.max(Math.round(w / aspectRatio), 1);

    // Loop invariant: lowerBoundWidth is treated as valid, upperBoundWidth as
    // invalid. Both bounds are whole numbers (the upper one via Math.ceil), so
    // each midpoint lies strictly between them and the gap shrinks every
    // iteration. ~12 iterations for a 4000px image.
    let lowerBoundWidth = 1;
    let upperBoundWidth = Math.ceil(width);
    while (upperBoundWidth - lowerBoundWidth > 1) {
      const middleWidth = Math.floor((lowerBoundWidth + upperBoundWidth) / 2);
      const middleHeight = heightForWidth(middleWidth);
      if (
        middleWidth <= maxTargetPx &&
        nTokensForImg(middleWidth, middleHeight, pxPerToken) <= maxTargetTokens
      ) {
        lowerBoundWidth = middleWidth;
      } else {
        upperBoundWidth = middleWidth;
      }
    }

    return [lowerBoundWidth, heightForWidth(lowerBoundWidth)];
  }