|
@@ -1,16 +1,26 @@
|
|
|
|
|
+use std::collections::BTreeMap;
|
|
|
use std::fs;
|
|
use std::fs;
|
|
|
|
|
+use std::io::Write;
|
|
|
|
|
+use std::os::unix::fs::PermissionsExt;
|
|
|
use std::path::{Path, PathBuf};
|
|
use std::path::{Path, PathBuf};
|
|
|
-use std::process::{Command, Output};
|
|
|
|
|
|
|
+use std::process::{Command, Output, Stdio};
|
|
|
use std::sync::atomic::{AtomicU64, Ordering};
|
|
use std::sync::atomic::{AtomicU64, Ordering};
|
|
|
use std::time::{SystemTime, UNIX_EPOCH};
|
|
use std::time::{SystemTime, UNIX_EPOCH};
|
|
|
|
|
|
|
|
use mock_anthropic_service::{MockAnthropicService, SCENARIO_PREFIX};
|
|
use mock_anthropic_service::{MockAnthropicService, SCENARIO_PREFIX};
|
|
|
-use serde_json::Value;
|
|
|
|
|
|
|
+use serde_json::{json, Value};
|
|
|
|
|
|
|
|
static TEMP_COUNTER: AtomicU64 = AtomicU64::new(0);
|
|
static TEMP_COUNTER: AtomicU64 = AtomicU64::new(0);
|
|
|
|
|
|
|
|
#[test]
|
|
#[test]
|
|
|
|
|
+#[allow(clippy::too_many_lines)]
|
|
|
fn clean_env_cli_reaches_mock_anthropic_service_across_scripted_parity_scenarios() {
|
|
fn clean_env_cli_reaches_mock_anthropic_service_across_scripted_parity_scenarios() {
|
|
|
|
|
+ let manifest_entries = load_scenario_manifest();
|
|
|
|
|
+ let manifest = manifest_entries
|
|
|
|
|
+ .iter()
|
|
|
|
|
+ .cloned()
|
|
|
|
|
+ .map(|entry| (entry.name.clone(), entry))
|
|
|
|
|
+ .collect::<BTreeMap<_, _>>();
|
|
|
let runtime = tokio::runtime::Runtime::new().expect("tokio runtime should build");
|
|
let runtime = tokio::runtime::Runtime::new().expect("tokio runtime should build");
|
|
|
let server = runtime
|
|
let server = runtime
|
|
|
.block_on(MockAnthropicService::spawn())
|
|
.block_on(MockAnthropicService::spawn())
|
|
@@ -22,53 +32,121 @@ fn clean_env_cli_reaches_mock_anthropic_service_across_scripted_parity_scenarios
|
|
|
name: "streaming_text",
|
|
name: "streaming_text",
|
|
|
permission_mode: "read-only",
|
|
permission_mode: "read-only",
|
|
|
allowed_tools: None,
|
|
allowed_tools: None,
|
|
|
- seed: seed_noop,
|
|
|
|
|
|
|
+ stdin: None,
|
|
|
|
|
+ prepare: prepare_noop,
|
|
|
assert: assert_streaming_text,
|
|
assert: assert_streaming_text,
|
|
|
},
|
|
},
|
|
|
ScenarioCase {
|
|
ScenarioCase {
|
|
|
name: "read_file_roundtrip",
|
|
name: "read_file_roundtrip",
|
|
|
permission_mode: "read-only",
|
|
permission_mode: "read-only",
|
|
|
allowed_tools: Some("read_file"),
|
|
allowed_tools: Some("read_file"),
|
|
|
- seed: seed_read_fixture,
|
|
|
|
|
|
|
+ stdin: None,
|
|
|
|
|
+ prepare: prepare_read_fixture,
|
|
|
assert: assert_read_file_roundtrip,
|
|
assert: assert_read_file_roundtrip,
|
|
|
},
|
|
},
|
|
|
ScenarioCase {
|
|
ScenarioCase {
|
|
|
name: "grep_chunk_assembly",
|
|
name: "grep_chunk_assembly",
|
|
|
permission_mode: "read-only",
|
|
permission_mode: "read-only",
|
|
|
allowed_tools: Some("grep_search"),
|
|
allowed_tools: Some("grep_search"),
|
|
|
- seed: seed_grep_fixture,
|
|
|
|
|
|
|
+ stdin: None,
|
|
|
|
|
+ prepare: prepare_grep_fixture,
|
|
|
assert: assert_grep_chunk_assembly,
|
|
assert: assert_grep_chunk_assembly,
|
|
|
},
|
|
},
|
|
|
ScenarioCase {
|
|
ScenarioCase {
|
|
|
name: "write_file_allowed",
|
|
name: "write_file_allowed",
|
|
|
permission_mode: "workspace-write",
|
|
permission_mode: "workspace-write",
|
|
|
allowed_tools: Some("write_file"),
|
|
allowed_tools: Some("write_file"),
|
|
|
- seed: seed_noop,
|
|
|
|
|
|
|
+ stdin: None,
|
|
|
|
|
+ prepare: prepare_noop,
|
|
|
assert: assert_write_file_allowed,
|
|
assert: assert_write_file_allowed,
|
|
|
},
|
|
},
|
|
|
ScenarioCase {
|
|
ScenarioCase {
|
|
|
name: "write_file_denied",
|
|
name: "write_file_denied",
|
|
|
permission_mode: "read-only",
|
|
permission_mode: "read-only",
|
|
|
allowed_tools: Some("write_file"),
|
|
allowed_tools: Some("write_file"),
|
|
|
- seed: seed_noop,
|
|
|
|
|
|
|
+ stdin: None,
|
|
|
|
|
+ prepare: prepare_noop,
|
|
|
assert: assert_write_file_denied,
|
|
assert: assert_write_file_denied,
|
|
|
},
|
|
},
|
|
|
|
|
+ ScenarioCase {
|
|
|
|
|
+ name: "multi_tool_turn_roundtrip",
|
|
|
|
|
+ permission_mode: "read-only",
|
|
|
|
|
+ allowed_tools: Some("read_file,grep_search"),
|
|
|
|
|
+ stdin: None,
|
|
|
|
|
+ prepare: prepare_multi_tool_fixture,
|
|
|
|
|
+ assert: assert_multi_tool_turn_roundtrip,
|
|
|
|
|
+ },
|
|
|
|
|
+ ScenarioCase {
|
|
|
|
|
+ name: "bash_stdout_roundtrip",
|
|
|
|
|
+ permission_mode: "danger-full-access",
|
|
|
|
|
+ allowed_tools: Some("bash"),
|
|
|
|
|
+ stdin: None,
|
|
|
|
|
+ prepare: prepare_noop,
|
|
|
|
|
+ assert: assert_bash_stdout_roundtrip,
|
|
|
|
|
+ },
|
|
|
|
|
+ ScenarioCase {
|
|
|
|
|
+ name: "bash_permission_prompt_approved",
|
|
|
|
|
+ permission_mode: "workspace-write",
|
|
|
|
|
+ allowed_tools: Some("bash"),
|
|
|
|
|
+ stdin: Some("y\n"),
|
|
|
|
|
+ prepare: prepare_noop,
|
|
|
|
|
+ assert: assert_bash_permission_prompt_approved,
|
|
|
|
|
+ },
|
|
|
|
|
+ ScenarioCase {
|
|
|
|
|
+ name: "bash_permission_prompt_denied",
|
|
|
|
|
+ permission_mode: "workspace-write",
|
|
|
|
|
+ allowed_tools: Some("bash"),
|
|
|
|
|
+ stdin: Some("n\n"),
|
|
|
|
|
+ prepare: prepare_noop,
|
|
|
|
|
+ assert: assert_bash_permission_prompt_denied,
|
|
|
|
|
+ },
|
|
|
|
|
+ ScenarioCase {
|
|
|
|
|
+ name: "plugin_tool_roundtrip",
|
|
|
|
|
+ permission_mode: "workspace-write",
|
|
|
|
|
+ allowed_tools: None,
|
|
|
|
|
+ stdin: None,
|
|
|
|
|
+ prepare: prepare_plugin_fixture,
|
|
|
|
|
+ assert: assert_plugin_tool_roundtrip,
|
|
|
|
|
+ },
|
|
|
];
|
|
];
|
|
|
|
|
|
|
|
|
|
+ let case_names = cases.iter().map(|case| case.name).collect::<Vec<_>>();
|
|
|
|
|
+ let manifest_names = manifest_entries
|
|
|
|
|
+ .iter()
|
|
|
|
|
+ .map(|entry| entry.name.as_str())
|
|
|
|
|
+ .collect::<Vec<_>>();
|
|
|
|
|
+ assert_eq!(
|
|
|
|
|
+ case_names, manifest_names,
|
|
|
|
|
+ "manifest and harness cases must stay aligned"
|
|
|
|
|
+ );
|
|
|
|
|
+
|
|
|
|
|
+ let mut scenario_reports = Vec::new();
|
|
|
|
|
+
|
|
|
for case in cases {
|
|
for case in cases {
|
|
|
- let workspace = unique_temp_dir(case.name);
|
|
|
|
|
- fs::create_dir_all(&workspace).expect("workspace should exist");
|
|
|
|
|
- (case.seed)(&workspace);
|
|
|
|
|
- let response = run_case(case, &workspace, &base_url);
|
|
|
|
|
- (case.assert)(&workspace, &response);
|
|
|
|
|
- fs::remove_dir_all(&workspace).expect("workspace cleanup should succeed");
|
|
|
|
|
|
|
+ let workspace = HarnessWorkspace::new(unique_temp_dir(case.name));
|
|
|
|
|
+ workspace.create().expect("workspace should exist");
|
|
|
|
|
+ (case.prepare)(&workspace);
|
|
|
|
|
+
|
|
|
|
|
+ let run = run_case(case, &workspace, &base_url);
|
|
|
|
|
+ (case.assert)(&workspace, &run);
|
|
|
|
|
+
|
|
|
|
|
+ let manifest_entry = manifest
|
|
|
|
|
+ .get(case.name)
|
|
|
|
|
+ .unwrap_or_else(|| panic!("missing manifest entry for {}", case.name));
|
|
|
|
|
+ scenario_reports.push(build_scenario_report(
|
|
|
|
|
+ case.name,
|
|
|
|
|
+ manifest_entry,
|
|
|
|
|
+ &run.response,
|
|
|
|
|
+ ));
|
|
|
|
|
+
|
|
|
|
|
+ fs::remove_dir_all(&workspace.root).expect("workspace cleanup should succeed");
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
let captured = runtime.block_on(server.captured_requests());
|
|
let captured = runtime.block_on(server.captured_requests());
|
|
|
assert_eq!(
|
|
assert_eq!(
|
|
|
captured.len(),
|
|
captured.len(),
|
|
|
- 9,
|
|
|
|
|
- "five scenarios should produce nine requests"
|
|
|
|
|
|
|
+ 19,
|
|
|
|
|
+ "ten scenarios should produce nineteen requests"
|
|
|
);
|
|
);
|
|
|
assert!(captured
|
|
assert!(captured
|
|
|
.iter()
|
|
.iter()
|
|
@@ -91,8 +169,32 @@ fn clean_env_cli_reaches_mock_anthropic_service_across_scripted_parity_scenarios
|
|
|
"write_file_allowed",
|
|
"write_file_allowed",
|
|
|
"write_file_denied",
|
|
"write_file_denied",
|
|
|
"write_file_denied",
|
|
"write_file_denied",
|
|
|
|
|
+ "multi_tool_turn_roundtrip",
|
|
|
|
|
+ "multi_tool_turn_roundtrip",
|
|
|
|
|
+ "bash_stdout_roundtrip",
|
|
|
|
|
+ "bash_stdout_roundtrip",
|
|
|
|
|
+ "bash_permission_prompt_approved",
|
|
|
|
|
+ "bash_permission_prompt_approved",
|
|
|
|
|
+ "bash_permission_prompt_denied",
|
|
|
|
|
+ "bash_permission_prompt_denied",
|
|
|
|
|
+ "plugin_tool_roundtrip",
|
|
|
|
|
+ "plugin_tool_roundtrip",
|
|
|
]
|
|
]
|
|
|
);
|
|
);
|
|
|
|
|
+
|
|
|
|
|
+ let mut request_counts = BTreeMap::new();
|
|
|
|
|
+ for request in &captured {
|
|
|
|
|
+ *request_counts
|
|
|
|
|
+ .entry(request.scenario.as_str())
|
|
|
|
|
+ .or_insert(0_usize) += 1;
|
|
|
|
|
+ }
|
|
|
|
|
+ for report in &mut scenario_reports {
|
|
|
|
|
+ report.request_count = *request_counts
|
|
|
|
|
+ .get(report.name.as_str())
|
|
|
|
|
+ .unwrap_or_else(|| panic!("missing request count for {}", report.name));
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ maybe_write_report(&scenario_reports);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
#[derive(Clone, Copy)]
|
|
#[derive(Clone, Copy)]
|
|
@@ -100,25 +202,71 @@ struct ScenarioCase {
|
|
|
name: &'static str,
|
|
name: &'static str,
|
|
|
permission_mode: &'static str,
|
|
permission_mode: &'static str,
|
|
|
allowed_tools: Option<&'static str>,
|
|
allowed_tools: Option<&'static str>,
|
|
|
- seed: fn(&Path),
|
|
|
|
|
- assert: fn(&Path, &Value),
|
|
|
|
|
|
|
+ stdin: Option<&'static str>,
|
|
|
|
|
+ prepare: fn(&HarnessWorkspace),
|
|
|
|
|
+ assert: fn(&HarnessWorkspace, &ScenarioRun),
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-fn run_case(case: ScenarioCase, workspace: &Path, base_url: &str) -> Value {
|
|
|
|
|
- let config_home = workspace.join("config-home");
|
|
|
|
|
- let home = workspace.join("home");
|
|
|
|
|
- fs::create_dir_all(config_home.join(".claw")).expect("config home should exist");
|
|
|
|
|
- fs::create_dir_all(&home).expect("home should exist");
|
|
|
|
|
|
|
/// Filesystem layout used for a single scenario run.
///
/// All paths are derived from `root` by [`HarnessWorkspace::new`]; nothing is
/// created on disk until [`HarnessWorkspace::create`] is called.
#[derive(Debug)]
struct HarnessWorkspace {
    /// Top-level directory of the scenario workspace.
    root: PathBuf,
    /// `<root>/config-home`.
    config_home: PathBuf,
    /// `<root>/home`.
    home: PathBuf,
}

impl HarnessWorkspace {
    /// Derives the workspace layout from `root` without touching the filesystem.
    fn new(root: PathBuf) -> Self {
        let config_home = root.join("config-home");
        let home = root.join("home");
        Self {
            root,
            config_home,
            home,
        }
    }

    /// Creates `root`, `config_home` and `home`, including missing parents.
    ///
    /// # Errors
    ///
    /// Returns the first I/O error reported by `fs::create_dir_all`.
    fn create(&self) -> std::io::Result<()> {
        for dir in [&self.root, &self.config_home, &self.home].iter() {
            fs::create_dir_all(dir)?;
        }
        Ok(())
    }
}
|
|
|
|
|
+
|
|
|
|
|
+struct ScenarioRun {
|
|
|
|
|
+ response: Value,
|
|
|
|
|
+ stdout: String,
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
/// One scenario description as loaded from the scenario manifest.
#[derive(Debug, Clone)]
struct ScenarioManifestEntry {
    /// Scenario identifier; must match a harness case name.
    name: String,
    /// Value of the manifest's `category` field.
    category: String,
    /// Value of the manifest's `description` field.
    description: String,
    /// Values of the manifest's `parity_refs` array.
    parity_refs: Vec<String>,
}
|
|
|
|
|
+
|
|
|
|
|
/// Per-scenario summary that ends up in the optional JSON report.
#[derive(Debug)]
struct ScenarioReport {
    /// Scenario identifier.
    name: String,
    /// Category copied from the manifest entry.
    category: String,
    /// Description copied from the manifest entry.
    description: String,
    /// Parity references copied from the manifest entry.
    parity_refs: Vec<String>,
    /// `iterations` value taken from the CLI response.
    iterations: u64,
    /// Number of requests attributed to this scenario.
    request_count: usize,
    /// Names of the tools listed in the response's `tool_uses`.
    tool_uses: Vec<String>,
    /// Number of `tool_results` entries whose `is_error` is `true`.
    tool_error_count: usize,
    /// `message` text taken from the CLI response.
    final_message: String,
}
|
|
|
|
|
+
|
|
|
|
|
+fn run_case(case: ScenarioCase, workspace: &HarnessWorkspace, base_url: &str) -> ScenarioRun {
|
|
|
let mut command = Command::new(env!("CARGO_BIN_EXE_claw"));
|
|
let mut command = Command::new(env!("CARGO_BIN_EXE_claw"));
|
|
|
command
|
|
command
|
|
|
- .current_dir(workspace)
|
|
|
|
|
|
|
+ .current_dir(&workspace.root)
|
|
|
.env_clear()
|
|
.env_clear()
|
|
|
.env("ANTHROPIC_API_KEY", "test-parity-key")
|
|
.env("ANTHROPIC_API_KEY", "test-parity-key")
|
|
|
.env("ANTHROPIC_BASE_URL", base_url)
|
|
.env("ANTHROPIC_BASE_URL", base_url)
|
|
|
- .env("CLAW_CONFIG_HOME", &config_home)
|
|
|
|
|
- .env("HOME", &home)
|
|
|
|
|
|
|
+ .env("CLAW_CONFIG_HOME", &workspace.config_home)
|
|
|
|
|
+ .env("HOME", &workspace.home)
|
|
|
.env("NO_COLOR", "1")
|
|
.env("NO_COLOR", "1")
|
|
|
|
|
+ .env("PATH", "/usr/bin:/bin")
|
|
|
.args([
|
|
.args([
|
|
|
"--model",
|
|
"--model",
|
|
|
"sonnet",
|
|
"sonnet",
|
|
@@ -132,107 +280,453 @@ fn run_case(case: ScenarioCase, workspace: &Path, base_url: &str) -> Value {
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
let prompt = format!("{SCENARIO_PREFIX}{}", case.name);
|
|
let prompt = format!("{SCENARIO_PREFIX}{}", case.name);
|
|
|
- let output = command.arg(prompt).output().expect("claw should launch");
|
|
|
|
|
|
|
+ command.arg(prompt);
|
|
|
|
|
+
|
|
|
|
|
+ let output = if let Some(stdin) = case.stdin {
|
|
|
|
|
+ let mut child = command
|
|
|
|
|
+ .stdin(Stdio::piped())
|
|
|
|
|
+ .stdout(Stdio::piped())
|
|
|
|
|
+ .stderr(Stdio::piped())
|
|
|
|
|
+ .spawn()
|
|
|
|
|
+ .expect("claw should launch");
|
|
|
|
|
+ child
|
|
|
|
|
+ .stdin
|
|
|
|
|
+ .as_mut()
|
|
|
|
|
+ .expect("stdin should be piped")
|
|
|
|
|
+ .write_all(stdin.as_bytes())
|
|
|
|
|
+ .expect("stdin should write");
|
|
|
|
|
+ child.wait_with_output().expect("claw should finish")
|
|
|
|
|
+ } else {
|
|
|
|
|
+ command.output().expect("claw should launch")
|
|
|
|
|
+ };
|
|
|
|
|
+
|
|
|
assert_success(&output);
|
|
assert_success(&output);
|
|
|
- serde_json::from_slice(&output.stdout).expect("prompt output should be valid json")
|
|
|
|
|
|
|
+ let stdout = String::from_utf8_lossy(&output.stdout).into_owned();
|
|
|
|
|
+ ScenarioRun {
|
|
|
|
|
+ response: parse_json_output(&stdout),
|
|
|
|
|
+ stdout,
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-fn seed_noop(_: &Path) {}
|
|
|
|
|
|
|
+fn prepare_noop(_: &HarnessWorkspace) {}
|
|
|
|
|
|
|
|
-fn seed_read_fixture(workspace: &Path) {
|
|
|
|
|
- fs::write(workspace.join("fixture.txt"), "alpha parity line\n").expect("fixture should write");
|
|
|
|
|
|
|
+fn prepare_read_fixture(workspace: &HarnessWorkspace) {
|
|
|
|
|
+ fs::write(workspace.root.join("fixture.txt"), "alpha parity line\n")
|
|
|
|
|
+ .expect("fixture should write");
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-fn seed_grep_fixture(workspace: &Path) {
|
|
|
|
|
|
|
+fn prepare_grep_fixture(workspace: &HarnessWorkspace) {
|
|
|
fs::write(
|
|
fs::write(
|
|
|
- workspace.join("fixture.txt"),
|
|
|
|
|
|
|
+ workspace.root.join("fixture.txt"),
|
|
|
"alpha parity line\nbeta line\ngamma parity line\n",
|
|
"alpha parity line\nbeta line\ngamma parity line\n",
|
|
|
)
|
|
)
|
|
|
.expect("grep fixture should write");
|
|
.expect("grep fixture should write");
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-fn assert_streaming_text(_: &Path, response: &Value) {
|
|
|
|
|
|
|
+fn prepare_multi_tool_fixture(workspace: &HarnessWorkspace) {
|
|
|
|
|
+ fs::write(
|
|
|
|
|
+ workspace.root.join("fixture.txt"),
|
|
|
|
|
+ "alpha parity line\nbeta line\ngamma parity line\n",
|
|
|
|
|
+ )
|
|
|
|
|
+ .expect("multi tool fixture should write");
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+fn prepare_plugin_fixture(workspace: &HarnessWorkspace) {
|
|
|
|
|
+ let plugin_root = workspace
|
|
|
|
|
+ .root
|
|
|
|
|
+ .join("external-plugins")
|
|
|
|
|
+ .join("parity-plugin");
|
|
|
|
|
+ let tool_dir = plugin_root.join("tools");
|
|
|
|
|
+ let manifest_dir = plugin_root.join(".claude-plugin");
|
|
|
|
|
+ fs::create_dir_all(&tool_dir).expect("plugin tools dir");
|
|
|
|
|
+ fs::create_dir_all(&manifest_dir).expect("plugin manifest dir");
|
|
|
|
|
+
|
|
|
|
|
+ let script_path = tool_dir.join("echo-json.sh");
|
|
|
|
|
+ fs::write(
|
|
|
|
|
+ &script_path,
|
|
|
|
|
+ "#!/bin/sh\nINPUT=$(cat)\nprintf '{\"plugin\":\"%s\",\"tool\":\"%s\",\"input\":%s}\\n' \"$CLAWD_PLUGIN_ID\" \"$CLAWD_TOOL_NAME\" \"$INPUT\"\n",
|
|
|
|
|
+ )
|
|
|
|
|
+ .expect("plugin script should write");
|
|
|
|
|
+ let mut permissions = fs::metadata(&script_path)
|
|
|
|
|
+ .expect("plugin script metadata")
|
|
|
|
|
+ .permissions();
|
|
|
|
|
+ permissions.set_mode(0o755);
|
|
|
|
|
+ fs::set_permissions(&script_path, permissions).expect("plugin script should be executable");
|
|
|
|
|
+
|
|
|
|
|
+ fs::write(
|
|
|
|
|
+ manifest_dir.join("plugin.json"),
|
|
|
|
|
+ r#"{
|
|
|
|
|
+ "name": "parity-plugin",
|
|
|
|
|
+ "version": "1.0.0",
|
|
|
|
|
+ "description": "mock parity plugin",
|
|
|
|
|
+ "tools": [
|
|
|
|
|
+ {
|
|
|
|
|
+ "name": "plugin_echo",
|
|
|
|
|
+ "description": "Echo JSON input",
|
|
|
|
|
+ "inputSchema": {
|
|
|
|
|
+ "type": "object",
|
|
|
|
|
+ "properties": {
|
|
|
|
|
+ "message": { "type": "string" }
|
|
|
|
|
+ },
|
|
|
|
|
+ "required": ["message"],
|
|
|
|
|
+ "additionalProperties": false
|
|
|
|
|
+ },
|
|
|
|
|
+ "command": "./tools/echo-json.sh",
|
|
|
|
|
+ "requiredPermission": "workspace-write"
|
|
|
|
|
+ }
|
|
|
|
|
+ ]
|
|
|
|
|
+}"#,
|
|
|
|
|
+ )
|
|
|
|
|
+ .expect("plugin manifest should write");
|
|
|
|
|
+
|
|
|
|
|
+ fs::write(
|
|
|
|
|
+ workspace.config_home.join("settings.json"),
|
|
|
|
|
+ json!({
|
|
|
|
|
+ "enabledPlugins": {
|
|
|
|
|
+ "parity-plugin@external": true
|
|
|
|
|
+ },
|
|
|
|
|
+ "plugins": {
|
|
|
|
|
+ "externalDirectories": [plugin_root.parent().expect("plugin parent").display().to_string()]
|
|
|
|
|
+ }
|
|
|
|
|
+ })
|
|
|
|
|
+ .to_string(),
|
|
|
|
|
+ )
|
|
|
|
|
+ .expect("plugin settings should write");
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+fn assert_streaming_text(_: &HarnessWorkspace, run: &ScenarioRun) {
|
|
|
assert_eq!(
|
|
assert_eq!(
|
|
|
- response["message"],
|
|
|
|
|
|
|
+ run.response["message"],
|
|
|
Value::String("Mock streaming says hello from the parity harness.".to_string())
|
|
Value::String("Mock streaming says hello from the parity harness.".to_string())
|
|
|
);
|
|
);
|
|
|
- assert_eq!(response["iterations"], Value::from(1));
|
|
|
|
|
- assert_eq!(response["tool_uses"], Value::Array(Vec::new()));
|
|
|
|
|
- assert_eq!(response["tool_results"], Value::Array(Vec::new()));
|
|
|
|
|
|
|
+ assert_eq!(run.response["iterations"], Value::from(1));
|
|
|
|
|
+ assert_eq!(run.response["tool_uses"], Value::Array(Vec::new()));
|
|
|
|
|
+ assert_eq!(run.response["tool_results"], Value::Array(Vec::new()));
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-fn assert_read_file_roundtrip(workspace: &Path, response: &Value) {
|
|
|
|
|
- assert_eq!(response["iterations"], Value::from(2));
|
|
|
|
|
|
|
+fn assert_read_file_roundtrip(workspace: &HarnessWorkspace, run: &ScenarioRun) {
|
|
|
|
|
+ assert_eq!(run.response["iterations"], Value::from(2));
|
|
|
assert_eq!(
|
|
assert_eq!(
|
|
|
- response["tool_uses"][0]["name"],
|
|
|
|
|
|
|
+ run.response["tool_uses"][0]["name"],
|
|
|
Value::String("read_file".to_string())
|
|
Value::String("read_file".to_string())
|
|
|
);
|
|
);
|
|
|
assert_eq!(
|
|
assert_eq!(
|
|
|
- response["tool_uses"][0]["input"],
|
|
|
|
|
|
|
+ run.response["tool_uses"][0]["input"],
|
|
|
Value::String(r#"{"path":"fixture.txt"}"#.to_string())
|
|
Value::String(r#"{"path":"fixture.txt"}"#.to_string())
|
|
|
);
|
|
);
|
|
|
- assert!(response["message"]
|
|
|
|
|
|
|
+ assert!(run.response["message"]
|
|
|
.as_str()
|
|
.as_str()
|
|
|
.expect("message text")
|
|
.expect("message text")
|
|
|
.contains("alpha parity line"));
|
|
.contains("alpha parity line"));
|
|
|
- let output = response["tool_results"][0]["output"]
|
|
|
|
|
|
|
+ let output = run.response["tool_results"][0]["output"]
|
|
|
.as_str()
|
|
.as_str()
|
|
|
.expect("tool output");
|
|
.expect("tool output");
|
|
|
- assert!(output.contains(&workspace.join("fixture.txt").display().to_string()));
|
|
|
|
|
|
|
+ assert!(output.contains(&workspace.root.join("fixture.txt").display().to_string()));
|
|
|
assert!(output.contains("alpha parity line"));
|
|
assert!(output.contains("alpha parity line"));
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-fn assert_grep_chunk_assembly(_: &Path, response: &Value) {
|
|
|
|
|
- assert_eq!(response["iterations"], Value::from(2));
|
|
|
|
|
|
|
+fn assert_grep_chunk_assembly(_: &HarnessWorkspace, run: &ScenarioRun) {
|
|
|
|
|
+ assert_eq!(run.response["iterations"], Value::from(2));
|
|
|
assert_eq!(
|
|
assert_eq!(
|
|
|
- response["tool_uses"][0]["name"],
|
|
|
|
|
|
|
+ run.response["tool_uses"][0]["name"],
|
|
|
Value::String("grep_search".to_string())
|
|
Value::String("grep_search".to_string())
|
|
|
);
|
|
);
|
|
|
assert_eq!(
|
|
assert_eq!(
|
|
|
- response["tool_uses"][0]["input"],
|
|
|
|
|
|
|
+ run.response["tool_uses"][0]["input"],
|
|
|
Value::String(
|
|
Value::String(
|
|
|
r#"{"pattern":"parity","path":"fixture.txt","output_mode":"count"}"#.to_string()
|
|
r#"{"pattern":"parity","path":"fixture.txt","output_mode":"count"}"#.to_string()
|
|
|
)
|
|
)
|
|
|
);
|
|
);
|
|
|
- assert!(response["message"]
|
|
|
|
|
|
|
+ assert!(run.response["message"]
|
|
|
.as_str()
|
|
.as_str()
|
|
|
.expect("message text")
|
|
.expect("message text")
|
|
|
.contains("2 occurrences"));
|
|
.contains("2 occurrences"));
|
|
|
- assert_eq!(response["tool_results"][0]["is_error"], Value::Bool(false));
|
|
|
|
|
|
|
+ assert_eq!(
|
|
|
|
|
+ run.response["tool_results"][0]["is_error"],
|
|
|
|
|
+ Value::Bool(false)
|
|
|
|
|
+ );
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-fn assert_write_file_allowed(workspace: &Path, response: &Value) {
|
|
|
|
|
- assert_eq!(response["iterations"], Value::from(2));
|
|
|
|
|
|
|
+fn assert_write_file_allowed(workspace: &HarnessWorkspace, run: &ScenarioRun) {
|
|
|
|
|
+ assert_eq!(run.response["iterations"], Value::from(2));
|
|
|
assert_eq!(
|
|
assert_eq!(
|
|
|
- response["tool_uses"][0]["name"],
|
|
|
|
|
|
|
+ run.response["tool_uses"][0]["name"],
|
|
|
Value::String("write_file".to_string())
|
|
Value::String("write_file".to_string())
|
|
|
);
|
|
);
|
|
|
- assert!(response["message"]
|
|
|
|
|
|
|
+ assert!(run.response["message"]
|
|
|
.as_str()
|
|
.as_str()
|
|
|
.expect("message text")
|
|
.expect("message text")
|
|
|
.contains("generated/output.txt"));
|
|
.contains("generated/output.txt"));
|
|
|
- let generated = workspace.join("generated").join("output.txt");
|
|
|
|
|
|
|
+ let generated = workspace.root.join("generated").join("output.txt");
|
|
|
let contents = fs::read_to_string(&generated).expect("generated file should exist");
|
|
let contents = fs::read_to_string(&generated).expect("generated file should exist");
|
|
|
assert_eq!(contents, "created by mock service\n");
|
|
assert_eq!(contents, "created by mock service\n");
|
|
|
- assert_eq!(response["tool_results"][0]["is_error"], Value::Bool(false));
|
|
|
|
|
|
|
+ assert_eq!(
|
|
|
|
|
+ run.response["tool_results"][0]["is_error"],
|
|
|
|
|
+ Value::Bool(false)
|
|
|
|
|
+ );
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-fn assert_write_file_denied(workspace: &Path, response: &Value) {
|
|
|
|
|
- assert_eq!(response["iterations"], Value::from(2));
|
|
|
|
|
|
|
+fn assert_write_file_denied(workspace: &HarnessWorkspace, run: &ScenarioRun) {
|
|
|
|
|
+ assert_eq!(run.response["iterations"], Value::from(2));
|
|
|
assert_eq!(
|
|
assert_eq!(
|
|
|
- response["tool_uses"][0]["name"],
|
|
|
|
|
|
|
+ run.response["tool_uses"][0]["name"],
|
|
|
Value::String("write_file".to_string())
|
|
Value::String("write_file".to_string())
|
|
|
);
|
|
);
|
|
|
- let tool_output = response["tool_results"][0]["output"]
|
|
|
|
|
|
|
+ let tool_output = run.response["tool_results"][0]["output"]
|
|
|
.as_str()
|
|
.as_str()
|
|
|
.expect("tool output");
|
|
.expect("tool output");
|
|
|
assert!(tool_output.contains("requires workspace-write permission"));
|
|
assert!(tool_output.contains("requires workspace-write permission"));
|
|
|
- assert_eq!(response["tool_results"][0]["is_error"], Value::Bool(true));
|
|
|
|
|
- assert!(response["message"]
|
|
|
|
|
|
|
+ assert_eq!(
|
|
|
|
|
+ run.response["tool_results"][0]["is_error"],
|
|
|
|
|
+ Value::Bool(true)
|
|
|
|
|
+ );
|
|
|
|
|
+ assert!(run.response["message"]
|
|
|
|
|
+ .as_str()
|
|
|
|
|
+ .expect("message text")
|
|
|
|
|
+ .contains("denied as expected"));
|
|
|
|
|
+ assert!(!workspace.root.join("generated").join("denied.txt").exists());
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+fn assert_multi_tool_turn_roundtrip(_: &HarnessWorkspace, run: &ScenarioRun) {
|
|
|
|
|
+ assert_eq!(run.response["iterations"], Value::from(2));
|
|
|
|
|
+ let tool_uses = run.response["tool_uses"]
|
|
|
|
|
+ .as_array()
|
|
|
|
|
+ .expect("tool uses array");
|
|
|
|
|
+ assert_eq!(
|
|
|
|
|
+ tool_uses.len(),
|
|
|
|
|
+ 2,
|
|
|
|
|
+ "expected two tool uses in a single turn"
|
|
|
|
|
+ );
|
|
|
|
|
+ assert_eq!(tool_uses[0]["name"], Value::String("read_file".to_string()));
|
|
|
|
|
+ assert_eq!(
|
|
|
|
|
+ tool_uses[1]["name"],
|
|
|
|
|
+ Value::String("grep_search".to_string())
|
|
|
|
|
+ );
|
|
|
|
|
+ let tool_results = run.response["tool_results"]
|
|
|
|
|
+ .as_array()
|
|
|
|
|
+ .expect("tool results array");
|
|
|
|
|
+ assert_eq!(
|
|
|
|
|
+ tool_results.len(),
|
|
|
|
|
+ 2,
|
|
|
|
|
+ "expected two tool results in a single turn"
|
|
|
|
|
+ );
|
|
|
|
|
+ assert!(run.response["message"]
|
|
|
|
|
+ .as_str()
|
|
|
|
|
+ .expect("message text")
|
|
|
|
|
+ .contains("alpha parity line"));
|
|
|
|
|
+ assert!(run.response["message"]
|
|
|
|
|
+ .as_str()
|
|
|
|
|
+ .expect("message text")
|
|
|
|
|
+ .contains("2 occurrences"));
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+fn assert_bash_stdout_roundtrip(_: &HarnessWorkspace, run: &ScenarioRun) {
|
|
|
|
|
+ assert_eq!(run.response["iterations"], Value::from(2));
|
|
|
|
|
+ assert_eq!(
|
|
|
|
|
+ run.response["tool_uses"][0]["name"],
|
|
|
|
|
+ Value::String("bash".to_string())
|
|
|
|
|
+ );
|
|
|
|
|
+ let tool_output = run.response["tool_results"][0]["output"]
|
|
|
|
|
+ .as_str()
|
|
|
|
|
+ .expect("tool output");
|
|
|
|
|
+ let parsed: Value = serde_json::from_str(tool_output).expect("bash output json");
|
|
|
|
|
+ assert_eq!(
|
|
|
|
|
+ parsed["stdout"],
|
|
|
|
|
+ Value::String("alpha from bash".to_string())
|
|
|
|
|
+ );
|
|
|
|
|
+ assert_eq!(
|
|
|
|
|
+ run.response["tool_results"][0]["is_error"],
|
|
|
|
|
+ Value::Bool(false)
|
|
|
|
|
+ );
|
|
|
|
|
+ assert!(run.response["message"]
|
|
|
|
|
+ .as_str()
|
|
|
|
|
+ .expect("message text")
|
|
|
|
|
+ .contains("alpha from bash"));
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+fn assert_bash_permission_prompt_approved(_: &HarnessWorkspace, run: &ScenarioRun) {
|
|
|
|
|
+ assert!(run.stdout.contains("Permission approval required"));
|
|
|
|
|
+ assert!(run.stdout.contains("Approve this tool call? [y/N]:"));
|
|
|
|
|
+ assert_eq!(run.response["iterations"], Value::from(2));
|
|
|
|
|
+ assert_eq!(
|
|
|
|
|
+ run.response["tool_results"][0]["is_error"],
|
|
|
|
|
+ Value::Bool(false)
|
|
|
|
|
+ );
|
|
|
|
|
+ let tool_output = run.response["tool_results"][0]["output"]
|
|
|
|
|
+ .as_str()
|
|
|
|
|
+ .expect("tool output");
|
|
|
|
|
+ let parsed: Value = serde_json::from_str(tool_output).expect("bash output json");
|
|
|
|
|
+ assert_eq!(
|
|
|
|
|
+ parsed["stdout"],
|
|
|
|
|
+ Value::String("approved via prompt".to_string())
|
|
|
|
|
+ );
|
|
|
|
|
+ assert!(run.response["message"]
|
|
|
|
|
+ .as_str()
|
|
|
|
|
+ .expect("message text")
|
|
|
|
|
+ .contains("approved and executed"));
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+fn assert_bash_permission_prompt_denied(_: &HarnessWorkspace, run: &ScenarioRun) {
|
|
|
|
|
+ assert!(run.stdout.contains("Permission approval required"));
|
|
|
|
|
+ assert!(run.stdout.contains("Approve this tool call? [y/N]:"));
|
|
|
|
|
+ assert_eq!(run.response["iterations"], Value::from(2));
|
|
|
|
|
+ let tool_output = run.response["tool_results"][0]["output"]
|
|
|
|
|
+ .as_str()
|
|
|
|
|
+ .expect("tool output");
|
|
|
|
|
+ assert!(tool_output.contains("denied by user approval prompt"));
|
|
|
|
|
+ assert_eq!(
|
|
|
|
|
+ run.response["tool_results"][0]["is_error"],
|
|
|
|
|
+ Value::Bool(true)
|
|
|
|
|
+ );
|
|
|
|
|
+ assert!(run.response["message"]
|
|
|
.as_str()
|
|
.as_str()
|
|
|
.expect("message text")
|
|
.expect("message text")
|
|
|
.contains("denied as expected"));
|
|
.contains("denied as expected"));
|
|
|
- assert!(!workspace.join("generated").join("denied.txt").exists());
|
|
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+fn assert_plugin_tool_roundtrip(_: &HarnessWorkspace, run: &ScenarioRun) {
|
|
|
|
|
+ assert_eq!(run.response["iterations"], Value::from(2));
|
|
|
|
|
+ assert_eq!(
|
|
|
|
|
+ run.response["tool_uses"][0]["name"],
|
|
|
|
|
+ Value::String("plugin_echo".to_string())
|
|
|
|
|
+ );
|
|
|
|
|
+ let tool_output = run.response["tool_results"][0]["output"]
|
|
|
|
|
+ .as_str()
|
|
|
|
|
+ .expect("tool output");
|
|
|
|
|
+ let parsed: Value = serde_json::from_str(tool_output).expect("plugin output json");
|
|
|
|
|
+ assert_eq!(
|
|
|
|
|
+ parsed["plugin"],
|
|
|
|
|
+ Value::String("parity-plugin@external".to_string())
|
|
|
|
|
+ );
|
|
|
|
|
+ assert_eq!(parsed["tool"], Value::String("plugin_echo".to_string()));
|
|
|
|
|
+ assert_eq!(
|
|
|
|
|
+ parsed["input"]["message"],
|
|
|
|
|
+ Value::String("hello from plugin parity".to_string())
|
|
|
|
|
+ );
|
|
|
|
|
+ assert!(run.response["message"]
|
|
|
|
|
+ .as_str()
|
|
|
|
|
+ .expect("message text")
|
|
|
|
|
+ .contains("hello from plugin parity"));
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+fn parse_json_output(stdout: &str) -> Value {
|
|
|
|
|
+ if let Some(index) = stdout.rfind("{\"auto_compaction\"") {
|
|
|
|
|
+ return serde_json::from_str(&stdout[index..]).unwrap_or_else(|error| {
|
|
|
|
|
+ panic!("failed to parse JSON response from stdout: {error}\n{stdout}")
|
|
|
|
|
+ });
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ stdout
|
|
|
|
|
+ .lines()
|
|
|
|
|
+ .rev()
|
|
|
|
|
+ .find_map(|line| {
|
|
|
|
|
+ let trimmed = line.trim();
|
|
|
|
|
+ if trimmed.starts_with('{') && trimmed.ends_with('}') {
|
|
|
|
|
+ serde_json::from_str(trimmed).ok()
|
|
|
|
|
+ } else {
|
|
|
|
|
+ None
|
|
|
|
|
+ }
|
|
|
|
|
+ })
|
|
|
|
|
+ .unwrap_or_else(|| panic!("no JSON response line found in stdout:\n{stdout}"))
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+fn build_scenario_report(
|
|
|
|
|
+ name: &str,
|
|
|
|
|
+ manifest_entry: &ScenarioManifestEntry,
|
|
|
|
|
+ response: &Value,
|
|
|
|
|
+) -> ScenarioReport {
|
|
|
|
|
+ ScenarioReport {
|
|
|
|
|
+ name: name.to_string(),
|
|
|
|
|
+ category: manifest_entry.category.clone(),
|
|
|
|
|
+ description: manifest_entry.description.clone(),
|
|
|
|
|
+ parity_refs: manifest_entry.parity_refs.clone(),
|
|
|
|
|
+ iterations: response["iterations"]
|
|
|
|
|
+ .as_u64()
|
|
|
|
|
+ .expect("iterations should exist"),
|
|
|
|
|
+ request_count: 0,
|
|
|
|
|
+ tool_uses: response["tool_uses"]
|
|
|
|
|
+ .as_array()
|
|
|
|
|
+ .expect("tool uses array")
|
|
|
|
|
+ .iter()
|
|
|
|
|
+ .filter_map(|value| value["name"].as_str().map(ToOwned::to_owned))
|
|
|
|
|
+ .collect(),
|
|
|
|
|
+ tool_error_count: response["tool_results"]
|
|
|
|
|
+ .as_array()
|
|
|
|
|
+ .expect("tool results array")
|
|
|
|
|
+ .iter()
|
|
|
|
|
+ .filter(|value| value["is_error"].as_bool().unwrap_or(false))
|
|
|
|
|
+ .count(),
|
|
|
|
|
+ final_message: response["message"]
|
|
|
|
|
+ .as_str()
|
|
|
|
|
+ .expect("message text")
|
|
|
|
|
+ .to_string(),
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+fn maybe_write_report(reports: &[ScenarioReport]) {
|
|
|
|
|
+ let Some(path) = std::env::var_os("MOCK_PARITY_REPORT_PATH") else {
|
|
|
|
|
+ return;
|
|
|
|
|
+ };
|
|
|
|
|
+
|
|
|
|
|
+ let payload = json!({
|
|
|
|
|
+ "scenario_count": reports.len(),
|
|
|
|
|
+ "request_count": reports.iter().map(|report| report.request_count).sum::<usize>(),
|
|
|
|
|
+ "scenarios": reports.iter().map(scenario_report_json).collect::<Vec<_>>(),
|
|
|
|
|
+ });
|
|
|
|
|
+ fs::write(
|
|
|
|
|
+ path,
|
|
|
|
|
+ serde_json::to_vec_pretty(&payload).expect("report json should serialize"),
|
|
|
|
|
+ )
|
|
|
|
|
+ .expect("report should write");
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+fn load_scenario_manifest() -> Vec<ScenarioManifestEntry> {
|
|
|
|
|
+ let manifest_path =
|
|
|
|
|
+ Path::new(env!("CARGO_MANIFEST_DIR")).join("../../mock_parity_scenarios.json");
|
|
|
|
|
+ let manifest = fs::read_to_string(&manifest_path).expect("scenario manifest should exist");
|
|
|
|
|
+ serde_json::from_str::<Vec<Value>>(&manifest)
|
|
|
|
|
+ .expect("scenario manifest should parse")
|
|
|
|
|
+ .into_iter()
|
|
|
|
|
+ .map(|entry| ScenarioManifestEntry {
|
|
|
|
|
+ name: entry["name"]
|
|
|
|
|
+ .as_str()
|
|
|
|
|
+ .expect("scenario name should be a string")
|
|
|
|
|
+ .to_string(),
|
|
|
|
|
+ category: entry["category"]
|
|
|
|
|
+ .as_str()
|
|
|
|
|
+ .expect("scenario category should be a string")
|
|
|
|
|
+ .to_string(),
|
|
|
|
|
+ description: entry["description"]
|
|
|
|
|
+ .as_str()
|
|
|
|
|
+ .expect("scenario description should be a string")
|
|
|
|
|
+ .to_string(),
|
|
|
|
|
+ parity_refs: entry["parity_refs"]
|
|
|
|
|
+ .as_array()
|
|
|
|
|
+ .expect("parity refs should be an array")
|
|
|
|
|
+ .iter()
|
|
|
|
|
+ .map(|value| {
|
|
|
|
|
+ value
|
|
|
|
|
+ .as_str()
|
|
|
|
|
+ .expect("parity ref should be a string")
|
|
|
|
|
+ .to_string()
|
|
|
|
|
+ })
|
|
|
|
|
+ .collect(),
|
|
|
|
|
+ })
|
|
|
|
|
+ .collect()
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+fn scenario_report_json(report: &ScenarioReport) -> Value {
|
|
|
|
|
+ json!({
|
|
|
|
|
+ "name": report.name,
|
|
|
|
|
+ "category": report.category,
|
|
|
|
|
+ "description": report.description,
|
|
|
|
|
+ "parity_refs": report.parity_refs,
|
|
|
|
|
+ "iterations": report.iterations,
|
|
|
|
|
+ "request_count": report.request_count,
|
|
|
|
|
+ "tool_uses": report.tool_uses,
|
|
|
|
|
+ "tool_error_count": report.tool_error_count,
|
|
|
|
|
+ "final_message": report.final_message,
|
|
|
|
|
+ })
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
fn assert_success(output: &Output) {
|
|
fn assert_success(output: &Output) {
|