// mock_parity_harness.rs (29 KB)
  1. use std::collections::BTreeMap;
  2. use std::fs;
  3. use std::io::Write;
  4. use std::os::unix::fs::PermissionsExt;
  5. use std::path::{Path, PathBuf};
  6. use std::process::{Command, Output, Stdio};
  7. use std::sync::atomic::{AtomicU64, Ordering};
  8. use std::time::{SystemTime, UNIX_EPOCH};
  9. use mock_anthropic_service::{MockAnthropicService, SCENARIO_PREFIX};
  10. use serde_json::{json, Value};
/// Process-wide counter that `unique_temp_dir` appends to each directory name,
/// so two directories requested within the same millisecond still get distinct
/// paths.
static TEMP_COUNTER: AtomicU64 = AtomicU64::new(0);
  12. #[test]
  13. #[allow(clippy::too_many_lines)]
  14. fn clean_env_cli_reaches_mock_anthropic_service_across_scripted_parity_scenarios() {
  15. let manifest_entries = load_scenario_manifest();
  16. let manifest = manifest_entries
  17. .iter()
  18. .cloned()
  19. .map(|entry| (entry.name.clone(), entry))
  20. .collect::<BTreeMap<_, _>>();
  21. let runtime = tokio::runtime::Runtime::new().expect("tokio runtime should build");
  22. let server = runtime
  23. .block_on(MockAnthropicService::spawn())
  24. .expect("mock service should start");
  25. let base_url = server.base_url();
  26. let cases = [
  27. ScenarioCase {
  28. name: "streaming_text",
  29. permission_mode: "read-only",
  30. allowed_tools: None,
  31. stdin: None,
  32. prepare: prepare_noop,
  33. assert: assert_streaming_text,
  34. extra_env: None,
  35. resume_session: None,
  36. },
  37. ScenarioCase {
  38. name: "read_file_roundtrip",
  39. permission_mode: "read-only",
  40. allowed_tools: Some("read_file"),
  41. stdin: None,
  42. prepare: prepare_read_fixture,
  43. assert: assert_read_file_roundtrip,
  44. extra_env: None,
  45. resume_session: None,
  46. },
  47. ScenarioCase {
  48. name: "grep_chunk_assembly",
  49. permission_mode: "read-only",
  50. allowed_tools: Some("grep_search"),
  51. stdin: None,
  52. prepare: prepare_grep_fixture,
  53. assert: assert_grep_chunk_assembly,
  54. extra_env: None,
  55. resume_session: None,
  56. },
  57. ScenarioCase {
  58. name: "write_file_allowed",
  59. permission_mode: "workspace-write",
  60. allowed_tools: Some("write_file"),
  61. stdin: None,
  62. prepare: prepare_noop,
  63. assert: assert_write_file_allowed,
  64. extra_env: None,
  65. resume_session: None,
  66. },
  67. ScenarioCase {
  68. name: "write_file_denied",
  69. permission_mode: "read-only",
  70. allowed_tools: Some("write_file"),
  71. stdin: None,
  72. prepare: prepare_noop,
  73. assert: assert_write_file_denied,
  74. extra_env: None,
  75. resume_session: None,
  76. },
  77. ScenarioCase {
  78. name: "multi_tool_turn_roundtrip",
  79. permission_mode: "read-only",
  80. allowed_tools: Some("read_file,grep_search"),
  81. stdin: None,
  82. prepare: prepare_multi_tool_fixture,
  83. assert: assert_multi_tool_turn_roundtrip,
  84. extra_env: None,
  85. resume_session: None,
  86. },
  87. ScenarioCase {
  88. name: "bash_stdout_roundtrip",
  89. permission_mode: "danger-full-access",
  90. allowed_tools: Some("bash"),
  91. stdin: None,
  92. prepare: prepare_noop,
  93. assert: assert_bash_stdout_roundtrip,
  94. extra_env: None,
  95. resume_session: None,
  96. },
  97. ScenarioCase {
  98. name: "bash_permission_prompt_approved",
  99. permission_mode: "workspace-write",
  100. allowed_tools: Some("bash"),
  101. stdin: Some("y\n"),
  102. prepare: prepare_noop,
  103. assert: assert_bash_permission_prompt_approved,
  104. extra_env: None,
  105. resume_session: None,
  106. },
  107. ScenarioCase {
  108. name: "bash_permission_prompt_denied",
  109. permission_mode: "workspace-write",
  110. allowed_tools: Some("bash"),
  111. stdin: Some("n\n"),
  112. prepare: prepare_noop,
  113. assert: assert_bash_permission_prompt_denied,
  114. extra_env: None,
  115. resume_session: None,
  116. },
  117. ScenarioCase {
  118. name: "plugin_tool_roundtrip",
  119. permission_mode: "workspace-write",
  120. allowed_tools: None,
  121. stdin: None,
  122. prepare: prepare_plugin_fixture,
  123. assert: assert_plugin_tool_roundtrip,
  124. extra_env: None,
  125. resume_session: None,
  126. },
  127. ScenarioCase {
  128. name: "auto_compact_triggered",
  129. permission_mode: "read-only",
  130. allowed_tools: None,
  131. stdin: None,
  132. prepare: prepare_noop,
  133. assert: assert_auto_compact_triggered,
  134. extra_env: None,
  135. resume_session: None,
  136. },
  137. ScenarioCase {
  138. name: "token_cost_reporting",
  139. permission_mode: "read-only",
  140. allowed_tools: None,
  141. stdin: None,
  142. prepare: prepare_noop,
  143. assert: assert_token_cost_reporting,
  144. extra_env: None,
  145. resume_session: None,
  146. },
  147. ];
  148. let case_names = cases.iter().map(|case| case.name).collect::<Vec<_>>();
  149. let manifest_names = manifest_entries
  150. .iter()
  151. .map(|entry| entry.name.as_str())
  152. .collect::<Vec<_>>();
  153. assert_eq!(
  154. case_names, manifest_names,
  155. "manifest and harness cases must stay aligned"
  156. );
  157. let mut scenario_reports = Vec::new();
  158. for case in cases {
  159. let workspace = HarnessWorkspace::new(unique_temp_dir(case.name));
  160. workspace.create().expect("workspace should exist");
  161. (case.prepare)(&workspace);
  162. let run = run_case(case, &workspace, &base_url);
  163. (case.assert)(&workspace, &run);
  164. let manifest_entry = manifest
  165. .get(case.name)
  166. .unwrap_or_else(|| panic!("missing manifest entry for {}", case.name));
  167. scenario_reports.push(build_scenario_report(
  168. case.name,
  169. manifest_entry,
  170. &run.response,
  171. ));
  172. fs::remove_dir_all(&workspace.root).expect("workspace cleanup should succeed");
  173. }
  174. let captured = runtime.block_on(server.captured_requests());
  175. assert_eq!(
  176. captured.len(),
  177. 21,
  178. "twelve scenarios should produce twenty-one requests"
  179. );
  180. assert!(captured
  181. .iter()
  182. .all(|request| request.path == "/v1/messages"));
  183. assert!(captured.iter().all(|request| request.stream));
  184. let scenarios = captured
  185. .iter()
  186. .map(|request| request.scenario.as_str())
  187. .collect::<Vec<_>>();
  188. assert_eq!(
  189. scenarios,
  190. vec![
  191. "streaming_text",
  192. "read_file_roundtrip",
  193. "read_file_roundtrip",
  194. "grep_chunk_assembly",
  195. "grep_chunk_assembly",
  196. "write_file_allowed",
  197. "write_file_allowed",
  198. "write_file_denied",
  199. "write_file_denied",
  200. "multi_tool_turn_roundtrip",
  201. "multi_tool_turn_roundtrip",
  202. "bash_stdout_roundtrip",
  203. "bash_stdout_roundtrip",
  204. "bash_permission_prompt_approved",
  205. "bash_permission_prompt_approved",
  206. "bash_permission_prompt_denied",
  207. "bash_permission_prompt_denied",
  208. "plugin_tool_roundtrip",
  209. "plugin_tool_roundtrip",
  210. "auto_compact_triggered",
  211. "token_cost_reporting",
  212. ]
  213. );
  214. let mut request_counts = BTreeMap::new();
  215. for request in &captured {
  216. *request_counts
  217. .entry(request.scenario.as_str())
  218. .or_insert(0_usize) += 1;
  219. }
  220. for report in &mut scenario_reports {
  221. report.request_count = *request_counts
  222. .get(report.name.as_str())
  223. .unwrap_or_else(|| panic!("missing request count for {}", report.name));
  224. }
  225. maybe_write_report(&scenario_reports);
  226. }
/// One scripted scenario: how to invoke the CLI and how to check the result.
#[derive(Clone, Copy)]
struct ScenarioCase {
    /// Scenario identifier; used as the temp-dir label, appended to
    /// `SCENARIO_PREFIX` to form the prompt, and used as the manifest key.
    name: &'static str,
    /// Value passed to `--permission-mode`.
    permission_mode: &'static str,
    /// Value passed to `--allowedTools`; the flag is omitted when `None`.
    allowed_tools: Option<&'static str>,
    /// Text written to the child's stdin; when `None` no stdin pipe is set up.
    stdin: Option<&'static str>,
    /// Called with the freshly created workspace before the CLI runs.
    prepare: fn(&HarnessWorkspace),
    /// Called with the workspace and the run result after the CLI exits.
    assert: fn(&HarnessWorkspace, &ScenarioRun),
    /// Optional single `(key, value)` environment variable added to the child.
    extra_env: Option<(&'static str, &'static str)>,
    /// Value passed to `--resume`; the flag is omitted when `None`.
    resume_session: Option<&'static str>,
}
  238. struct HarnessWorkspace {
  239. root: PathBuf,
  240. config_home: PathBuf,
  241. home: PathBuf,
  242. }
  243. impl HarnessWorkspace {
  244. fn new(root: PathBuf) -> Self {
  245. Self {
  246. config_home: root.join("config-home"),
  247. home: root.join("home"),
  248. root,
  249. }
  250. }
  251. fn create(&self) -> std::io::Result<()> {
  252. fs::create_dir_all(&self.root)?;
  253. fs::create_dir_all(&self.config_home)?;
  254. fs::create_dir_all(&self.home)?;
  255. Ok(())
  256. }
  257. }
/// Result of one CLI invocation, as produced by `run_case`.
struct ScenarioRun {
    /// JSON value extracted from the captured stdout by `parse_json_output`.
    response: Value,
    /// Full captured stdout, decoded lossily as UTF-8.
    stdout: String,
}
/// One entry of `mock_parity_scenarios.json`, as read by `load_scenario_manifest`.
#[derive(Debug, Clone)]
struct ScenarioManifestEntry {
    /// Scenario name; must match a `ScenarioCase::name`.
    name: String,
    /// Value of the entry's `category` field.
    category: String,
    /// Value of the entry's `description` field.
    description: String,
    /// Strings from the entry's `parity_refs` array.
    parity_refs: Vec<String>,
}
/// Per-scenario summary that `maybe_write_report` serializes.
#[derive(Debug)]
struct ScenarioReport {
    /// Scenario name.
    name: String,
    /// Copied from the manifest entry.
    category: String,
    /// Copied from the manifest entry.
    description: String,
    /// Copied from the manifest entry.
    parity_refs: Vec<String>,
    /// The response's `iterations` value.
    iterations: u64,
    /// Number of requests the mock captured for this scenario; initialised to
    /// zero by `build_scenario_report` and filled in by the test afterwards.
    request_count: usize,
    /// `name` of each entry in the response's `tool_uses` array.
    tool_uses: Vec<String>,
    /// Number of `tool_results` entries whose `is_error` is `true`.
    tool_error_count: usize,
    /// The response's `message` text.
    final_message: String,
}
  281. fn run_case(case: ScenarioCase, workspace: &HarnessWorkspace, base_url: &str) -> ScenarioRun {
  282. let mut command = Command::new(env!("CARGO_BIN_EXE_claw"));
  283. command
  284. .current_dir(&workspace.root)
  285. .env_clear()
  286. .env("ANTHROPIC_API_KEY", "test-parity-key")
  287. .env("ANTHROPIC_BASE_URL", base_url)
  288. .env("CLAW_CONFIG_HOME", &workspace.config_home)
  289. .env("HOME", &workspace.home)
  290. .env("NO_COLOR", "1")
  291. .env("PATH", "/usr/bin:/bin")
  292. .args([
  293. "--model",
  294. "sonnet",
  295. "--permission-mode",
  296. case.permission_mode,
  297. "--output-format=json",
  298. ]);
  299. if let Some(allowed_tools) = case.allowed_tools {
  300. command.args(["--allowedTools", allowed_tools]);
  301. }
  302. if let Some((key, value)) = case.extra_env {
  303. command.env(key, value);
  304. }
  305. if let Some(session_id) = case.resume_session {
  306. command.args(["--resume", session_id]);
  307. }
  308. let prompt = format!("{SCENARIO_PREFIX}{}", case.name);
  309. command.arg(prompt);
  310. let output = if let Some(stdin) = case.stdin {
  311. let mut child = command
  312. .stdin(Stdio::piped())
  313. .stdout(Stdio::piped())
  314. .stderr(Stdio::piped())
  315. .spawn()
  316. .expect("claw should launch");
  317. child
  318. .stdin
  319. .as_mut()
  320. .expect("stdin should be piped")
  321. .write_all(stdin.as_bytes())
  322. .expect("stdin should write");
  323. child.wait_with_output().expect("claw should finish")
  324. } else {
  325. command.output().expect("claw should launch")
  326. };
  327. assert_success(&output);
  328. let stdout = String::from_utf8_lossy(&output.stdout).into_owned();
  329. ScenarioRun {
  330. response: parse_json_output(&stdout),
  331. stdout,
  332. }
  333. }
  334. #[allow(dead_code)]
  335. fn prepare_auto_compact_fixture(workspace: &HarnessWorkspace) {
  336. let sessions_dir = workspace.root.join(".claw").join("sessions");
  337. fs::create_dir_all(&sessions_dir).expect("sessions dir should exist");
  338. // Write a pre-seeded session with 6 messages so auto-compact can remove them
  339. let session_id = "parity-auto-compact-seed";
  340. let session_jsonl = r#"{"type":"session_meta","version":3,"session_id":"parity-auto-compact-seed","created_at_ms":1743724800000,"updated_at_ms":1743724800000}
  341. {"type":"message","message":{"role":"user","blocks":[{"type":"text","text":"step one of the parity scenario"}]}}
  342. {"type":"message","message":{"role":"assistant","blocks":[{"type":"text","text":"acknowledged step one"}]}}
  343. {"type":"message","message":{"role":"user","blocks":[{"type":"text","text":"step two of the parity scenario"}]}}
  344. {"type":"message","message":{"role":"assistant","blocks":[{"type":"text","text":"acknowledged step two"}]}}
  345. {"type":"message","message":{"role":"user","blocks":[{"type":"text","text":"step three of the parity scenario"}]}}
  346. {"type":"message","message":{"role":"assistant","blocks":[{"type":"text","text":"acknowledged step three"}]}}
  347. "#;
  348. fs::write(
  349. sessions_dir.join(format!("{session_id}.jsonl")),
  350. session_jsonl,
  351. )
  352. .expect("pre-seeded session should write");
  353. }
  354. fn prepare_noop(_: &HarnessWorkspace) {}
  355. fn prepare_read_fixture(workspace: &HarnessWorkspace) {
  356. fs::write(workspace.root.join("fixture.txt"), "alpha parity line\n")
  357. .expect("fixture should write");
  358. }
  359. fn prepare_grep_fixture(workspace: &HarnessWorkspace) {
  360. fs::write(
  361. workspace.root.join("fixture.txt"),
  362. "alpha parity line\nbeta line\ngamma parity line\n",
  363. )
  364. .expect("grep fixture should write");
  365. }
  366. fn prepare_multi_tool_fixture(workspace: &HarnessWorkspace) {
  367. fs::write(
  368. workspace.root.join("fixture.txt"),
  369. "alpha parity line\nbeta line\ngamma parity line\n",
  370. )
  371. .expect("multi tool fixture should write");
  372. }
  373. fn prepare_plugin_fixture(workspace: &HarnessWorkspace) {
  374. let plugin_root = workspace
  375. .root
  376. .join("external-plugins")
  377. .join("parity-plugin");
  378. let tool_dir = plugin_root.join("tools");
  379. let manifest_dir = plugin_root.join(".claude-plugin");
  380. fs::create_dir_all(&tool_dir).expect("plugin tools dir");
  381. fs::create_dir_all(&manifest_dir).expect("plugin manifest dir");
  382. let script_path = tool_dir.join("echo-json.sh");
  383. fs::write(
  384. &script_path,
  385. "#!/bin/sh\nINPUT=$(cat)\nprintf '{\"plugin\":\"%s\",\"tool\":\"%s\",\"input\":%s}\\n' \"$CLAWD_PLUGIN_ID\" \"$CLAWD_TOOL_NAME\" \"$INPUT\"\n",
  386. )
  387. .expect("plugin script should write");
  388. let mut permissions = fs::metadata(&script_path)
  389. .expect("plugin script metadata")
  390. .permissions();
  391. permissions.set_mode(0o755);
  392. fs::set_permissions(&script_path, permissions).expect("plugin script should be executable");
  393. fs::write(
  394. manifest_dir.join("plugin.json"),
  395. r#"{
  396. "name": "parity-plugin",
  397. "version": "1.0.0",
  398. "description": "mock parity plugin",
  399. "tools": [
  400. {
  401. "name": "plugin_echo",
  402. "description": "Echo JSON input",
  403. "inputSchema": {
  404. "type": "object",
  405. "properties": {
  406. "message": { "type": "string" }
  407. },
  408. "required": ["message"],
  409. "additionalProperties": false
  410. },
  411. "command": "./tools/echo-json.sh",
  412. "requiredPermission": "workspace-write"
  413. }
  414. ]
  415. }"#,
  416. )
  417. .expect("plugin manifest should write");
  418. fs::write(
  419. workspace.config_home.join("settings.json"),
  420. json!({
  421. "enabledPlugins": {
  422. "parity-plugin@external": true
  423. },
  424. "plugins": {
  425. "externalDirectories": [plugin_root.parent().expect("plugin parent").display().to_string()]
  426. }
  427. })
  428. .to_string(),
  429. )
  430. .expect("plugin settings should write");
  431. }
  432. fn assert_streaming_text(_: &HarnessWorkspace, run: &ScenarioRun) {
  433. assert_eq!(
  434. run.response["message"],
  435. Value::String("Mock streaming says hello from the parity harness.".to_string())
  436. );
  437. assert_eq!(run.response["iterations"], Value::from(1));
  438. assert_eq!(run.response["tool_uses"], Value::Array(Vec::new()));
  439. assert_eq!(run.response["tool_results"], Value::Array(Vec::new()));
  440. }
  441. fn assert_read_file_roundtrip(workspace: &HarnessWorkspace, run: &ScenarioRun) {
  442. assert_eq!(run.response["iterations"], Value::from(2));
  443. assert_eq!(
  444. run.response["tool_uses"][0]["name"],
  445. Value::String("read_file".to_string())
  446. );
  447. assert_eq!(
  448. run.response["tool_uses"][0]["input"],
  449. Value::String(r#"{"path":"fixture.txt"}"#.to_string())
  450. );
  451. assert!(run.response["message"]
  452. .as_str()
  453. .expect("message text")
  454. .contains("alpha parity line"));
  455. let output = run.response["tool_results"][0]["output"]
  456. .as_str()
  457. .expect("tool output");
  458. assert!(output.contains(&workspace.root.join("fixture.txt").display().to_string()));
  459. assert!(output.contains("alpha parity line"));
  460. }
  461. fn assert_grep_chunk_assembly(_: &HarnessWorkspace, run: &ScenarioRun) {
  462. assert_eq!(run.response["iterations"], Value::from(2));
  463. assert_eq!(
  464. run.response["tool_uses"][0]["name"],
  465. Value::String("grep_search".to_string())
  466. );
  467. assert_eq!(
  468. run.response["tool_uses"][0]["input"],
  469. Value::String(
  470. r#"{"pattern":"parity","path":"fixture.txt","output_mode":"count"}"#.to_string()
  471. )
  472. );
  473. assert!(run.response["message"]
  474. .as_str()
  475. .expect("message text")
  476. .contains("2 occurrences"));
  477. assert_eq!(
  478. run.response["tool_results"][0]["is_error"],
  479. Value::Bool(false)
  480. );
  481. }
  482. fn assert_write_file_allowed(workspace: &HarnessWorkspace, run: &ScenarioRun) {
  483. assert_eq!(run.response["iterations"], Value::from(2));
  484. assert_eq!(
  485. run.response["tool_uses"][0]["name"],
  486. Value::String("write_file".to_string())
  487. );
  488. assert!(run.response["message"]
  489. .as_str()
  490. .expect("message text")
  491. .contains("generated/output.txt"));
  492. let generated = workspace.root.join("generated").join("output.txt");
  493. let contents = fs::read_to_string(&generated).expect("generated file should exist");
  494. assert_eq!(contents, "created by mock service\n");
  495. assert_eq!(
  496. run.response["tool_results"][0]["is_error"],
  497. Value::Bool(false)
  498. );
  499. }
  500. fn assert_write_file_denied(workspace: &HarnessWorkspace, run: &ScenarioRun) {
  501. assert_eq!(run.response["iterations"], Value::from(2));
  502. assert_eq!(
  503. run.response["tool_uses"][0]["name"],
  504. Value::String("write_file".to_string())
  505. );
  506. let tool_output = run.response["tool_results"][0]["output"]
  507. .as_str()
  508. .expect("tool output");
  509. assert!(tool_output.contains("requires workspace-write permission"));
  510. assert_eq!(
  511. run.response["tool_results"][0]["is_error"],
  512. Value::Bool(true)
  513. );
  514. assert!(run.response["message"]
  515. .as_str()
  516. .expect("message text")
  517. .contains("denied as expected"));
  518. assert!(!workspace.root.join("generated").join("denied.txt").exists());
  519. }
  520. fn assert_multi_tool_turn_roundtrip(_: &HarnessWorkspace, run: &ScenarioRun) {
  521. assert_eq!(run.response["iterations"], Value::from(2));
  522. let tool_uses = run.response["tool_uses"]
  523. .as_array()
  524. .expect("tool uses array");
  525. assert_eq!(
  526. tool_uses.len(),
  527. 2,
  528. "expected two tool uses in a single turn"
  529. );
  530. assert_eq!(tool_uses[0]["name"], Value::String("read_file".to_string()));
  531. assert_eq!(
  532. tool_uses[1]["name"],
  533. Value::String("grep_search".to_string())
  534. );
  535. let tool_results = run.response["tool_results"]
  536. .as_array()
  537. .expect("tool results array");
  538. assert_eq!(
  539. tool_results.len(),
  540. 2,
  541. "expected two tool results in a single turn"
  542. );
  543. assert!(run.response["message"]
  544. .as_str()
  545. .expect("message text")
  546. .contains("alpha parity line"));
  547. assert!(run.response["message"]
  548. .as_str()
  549. .expect("message text")
  550. .contains("2 occurrences"));
  551. }
  552. fn assert_bash_stdout_roundtrip(_: &HarnessWorkspace, run: &ScenarioRun) {
  553. assert_eq!(run.response["iterations"], Value::from(2));
  554. assert_eq!(
  555. run.response["tool_uses"][0]["name"],
  556. Value::String("bash".to_string())
  557. );
  558. let tool_output = run.response["tool_results"][0]["output"]
  559. .as_str()
  560. .expect("tool output");
  561. let parsed: Value = serde_json::from_str(tool_output).expect("bash output json");
  562. assert_eq!(
  563. parsed["stdout"],
  564. Value::String("alpha from bash".to_string())
  565. );
  566. assert_eq!(
  567. run.response["tool_results"][0]["is_error"],
  568. Value::Bool(false)
  569. );
  570. assert!(run.response["message"]
  571. .as_str()
  572. .expect("message text")
  573. .contains("alpha from bash"));
  574. }
  575. fn assert_bash_permission_prompt_approved(_: &HarnessWorkspace, run: &ScenarioRun) {
  576. assert!(run.stdout.contains("Permission approval required"));
  577. assert!(run.stdout.contains("Approve this tool call? [y/N]:"));
  578. assert_eq!(run.response["iterations"], Value::from(2));
  579. assert_eq!(
  580. run.response["tool_results"][0]["is_error"],
  581. Value::Bool(false)
  582. );
  583. let tool_output = run.response["tool_results"][0]["output"]
  584. .as_str()
  585. .expect("tool output");
  586. let parsed: Value = serde_json::from_str(tool_output).expect("bash output json");
  587. assert_eq!(
  588. parsed["stdout"],
  589. Value::String("approved via prompt".to_string())
  590. );
  591. assert!(run.response["message"]
  592. .as_str()
  593. .expect("message text")
  594. .contains("approved and executed"));
  595. }
  596. fn assert_bash_permission_prompt_denied(_: &HarnessWorkspace, run: &ScenarioRun) {
  597. assert!(run.stdout.contains("Permission approval required"));
  598. assert!(run.stdout.contains("Approve this tool call? [y/N]:"));
  599. assert_eq!(run.response["iterations"], Value::from(2));
  600. let tool_output = run.response["tool_results"][0]["output"]
  601. .as_str()
  602. .expect("tool output");
  603. assert!(tool_output.contains("denied by user approval prompt"));
  604. assert_eq!(
  605. run.response["tool_results"][0]["is_error"],
  606. Value::Bool(true)
  607. );
  608. assert!(run.response["message"]
  609. .as_str()
  610. .expect("message text")
  611. .contains("denied as expected"));
  612. }
  613. fn assert_plugin_tool_roundtrip(_: &HarnessWorkspace, run: &ScenarioRun) {
  614. assert_eq!(run.response["iterations"], Value::from(2));
  615. assert_eq!(
  616. run.response["tool_uses"][0]["name"],
  617. Value::String("plugin_echo".to_string())
  618. );
  619. let tool_output = run.response["tool_results"][0]["output"]
  620. .as_str()
  621. .expect("tool output");
  622. let parsed: Value = serde_json::from_str(tool_output).expect("plugin output json");
  623. assert_eq!(
  624. parsed["plugin"],
  625. Value::String("parity-plugin@external".to_string())
  626. );
  627. assert_eq!(parsed["tool"], Value::String("plugin_echo".to_string()));
  628. assert_eq!(
  629. parsed["input"]["message"],
  630. Value::String("hello from plugin parity".to_string())
  631. );
  632. assert!(run.response["message"]
  633. .as_str()
  634. .expect("message text")
  635. .contains("hello from plugin parity"));
  636. }
  637. fn assert_auto_compact_triggered(_: &HarnessWorkspace, run: &ScenarioRun) {
  638. // Validates that the auto_compaction field is present in JSON output (format parity).
  639. // Trigger behavior is covered by conversation::tests::auto_compacts_when_cumulative_input_threshold_is_crossed.
  640. assert_eq!(run.response["iterations"], Value::from(1));
  641. assert_eq!(run.response["tool_uses"], Value::Array(Vec::new()));
  642. assert!(
  643. run.response["message"]
  644. .as_str()
  645. .expect("message text")
  646. .contains("auto compact parity complete."),
  647. "expected auto compact message in response"
  648. );
  649. // auto_compaction key must be present in JSON (may be null for below-threshold sessions)
  650. assert!(
  651. run.response
  652. .as_object()
  653. .expect("response object")
  654. .contains_key("auto_compaction"),
  655. "auto_compaction key must be present in JSON output"
  656. );
  657. // Verify input_tokens field reflects the large mock token counts
  658. let input_tokens = run.response["usage"]["input_tokens"]
  659. .as_u64()
  660. .expect("input_tokens should be present");
  661. assert!(
  662. input_tokens >= 50_000,
  663. "input_tokens should reflect mock service value (got {input_tokens})"
  664. );
  665. }
  666. fn assert_token_cost_reporting(_: &HarnessWorkspace, run: &ScenarioRun) {
  667. assert_eq!(run.response["iterations"], Value::from(1));
  668. assert!(run.response["message"]
  669. .as_str()
  670. .expect("message text")
  671. .contains("token cost reporting parity complete."),);
  672. let usage = &run.response["usage"];
  673. assert!(
  674. usage["input_tokens"].as_u64().unwrap_or(0) > 0,
  675. "input_tokens should be non-zero"
  676. );
  677. assert!(
  678. usage["output_tokens"].as_u64().unwrap_or(0) > 0,
  679. "output_tokens should be non-zero"
  680. );
  681. assert!(
  682. run.response["estimated_cost"]
  683. .as_str()
  684. .map(|cost| cost.starts_with('$'))
  685. .unwrap_or(false),
  686. "estimated_cost should be a dollar-prefixed string"
  687. );
  688. }
  689. fn parse_json_output(stdout: &str) -> Value {
  690. if let Some(index) = stdout.rfind("{\"auto_compaction\"") {
  691. return serde_json::from_str(&stdout[index..]).unwrap_or_else(|error| {
  692. panic!("failed to parse JSON response from stdout: {error}\n{stdout}")
  693. });
  694. }
  695. stdout
  696. .lines()
  697. .rev()
  698. .find_map(|line| {
  699. let trimmed = line.trim();
  700. if trimmed.starts_with('{') && trimmed.ends_with('}') {
  701. serde_json::from_str(trimmed).ok()
  702. } else {
  703. None
  704. }
  705. })
  706. .unwrap_or_else(|| panic!("no JSON response line found in stdout:\n{stdout}"))
  707. }
  708. fn build_scenario_report(
  709. name: &str,
  710. manifest_entry: &ScenarioManifestEntry,
  711. response: &Value,
  712. ) -> ScenarioReport {
  713. ScenarioReport {
  714. name: name.to_string(),
  715. category: manifest_entry.category.clone(),
  716. description: manifest_entry.description.clone(),
  717. parity_refs: manifest_entry.parity_refs.clone(),
  718. iterations: response["iterations"]
  719. .as_u64()
  720. .expect("iterations should exist"),
  721. request_count: 0,
  722. tool_uses: response["tool_uses"]
  723. .as_array()
  724. .expect("tool uses array")
  725. .iter()
  726. .filter_map(|value| value["name"].as_str().map(ToOwned::to_owned))
  727. .collect(),
  728. tool_error_count: response["tool_results"]
  729. .as_array()
  730. .expect("tool results array")
  731. .iter()
  732. .filter(|value| value["is_error"].as_bool().unwrap_or(false))
  733. .count(),
  734. final_message: response["message"]
  735. .as_str()
  736. .expect("message text")
  737. .to_string(),
  738. }
  739. }
  740. fn maybe_write_report(reports: &[ScenarioReport]) {
  741. let Some(path) = std::env::var_os("MOCK_PARITY_REPORT_PATH") else {
  742. return;
  743. };
  744. let payload = json!({
  745. "scenario_count": reports.len(),
  746. "request_count": reports.iter().map(|report| report.request_count).sum::<usize>(),
  747. "scenarios": reports.iter().map(scenario_report_json).collect::<Vec<_>>(),
  748. });
  749. fs::write(
  750. path,
  751. serde_json::to_vec_pretty(&payload).expect("report json should serialize"),
  752. )
  753. .expect("report should write");
  754. }
  755. fn load_scenario_manifest() -> Vec<ScenarioManifestEntry> {
  756. let manifest_path =
  757. Path::new(env!("CARGO_MANIFEST_DIR")).join("../../mock_parity_scenarios.json");
  758. let manifest = fs::read_to_string(&manifest_path).expect("scenario manifest should exist");
  759. serde_json::from_str::<Vec<Value>>(&manifest)
  760. .expect("scenario manifest should parse")
  761. .into_iter()
  762. .map(|entry| ScenarioManifestEntry {
  763. name: entry["name"]
  764. .as_str()
  765. .expect("scenario name should be a string")
  766. .to_string(),
  767. category: entry["category"]
  768. .as_str()
  769. .expect("scenario category should be a string")
  770. .to_string(),
  771. description: entry["description"]
  772. .as_str()
  773. .expect("scenario description should be a string")
  774. .to_string(),
  775. parity_refs: entry["parity_refs"]
  776. .as_array()
  777. .expect("parity refs should be an array")
  778. .iter()
  779. .map(|value| {
  780. value
  781. .as_str()
  782. .expect("parity ref should be a string")
  783. .to_string()
  784. })
  785. .collect(),
  786. })
  787. .collect()
  788. }
  789. fn scenario_report_json(report: &ScenarioReport) -> Value {
  790. json!({
  791. "name": report.name,
  792. "category": report.category,
  793. "description": report.description,
  794. "parity_refs": report.parity_refs,
  795. "iterations": report.iterations,
  796. "request_count": report.request_count,
  797. "tool_uses": report.tool_uses,
  798. "tool_error_count": report.tool_error_count,
  799. "final_message": report.final_message,
  800. })
  801. }
  802. fn assert_success(output: &Output) {
  803. assert!(
  804. output.status.success(),
  805. "stdout:\n{}\n\nstderr:\n{}",
  806. String::from_utf8_lossy(&output.stdout),
  807. String::from_utf8_lossy(&output.stderr)
  808. );
  809. }
  810. fn unique_temp_dir(label: &str) -> PathBuf {
  811. let millis = SystemTime::now()
  812. .duration_since(UNIX_EPOCH)
  813. .expect("clock should be after epoch")
  814. .as_millis();
  815. let counter = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed);
  816. std::env::temp_dir().join(format!(
  817. "claw-mock-parity-{label}-{}-{millis}-{counter}",
  818. std::process::id()
  819. ))
  820. }