recovery_recipes.rs 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554
  1. //! Recovery recipes for common failure scenarios.
  2. //!
  3. //! Encodes known automatic recoveries for the six failure scenarios
  4. //! listed in ROADMAP item 8, and enforces one automatic recovery
  5. //! attempt before escalation. Each attempt is emitted as a structured
  6. //! recovery event.
  7. use std::collections::HashMap;
  8. use serde::{Deserialize, Serialize};
  9. /// The six failure scenarios that have known recovery recipes.
  10. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
  11. #[serde(rename_all = "snake_case")]
  12. pub enum FailureScenario {
  13. TrustPromptUnresolved,
  14. PromptMisdelivery,
  15. StaleBranch,
  16. CompileRedCrossCrate,
  17. McpHandshakeFailure,
  18. PartialPluginStartup,
  19. }
  20. impl FailureScenario {
  21. /// Returns all known failure scenarios.
  22. #[must_use]
  23. pub fn all() -> &'static [FailureScenario] {
  24. &[
  25. Self::TrustPromptUnresolved,
  26. Self::PromptMisdelivery,
  27. Self::StaleBranch,
  28. Self::CompileRedCrossCrate,
  29. Self::McpHandshakeFailure,
  30. Self::PartialPluginStartup,
  31. ]
  32. }
  33. }
  34. impl std::fmt::Display for FailureScenario {
  35. fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  36. match self {
  37. Self::TrustPromptUnresolved => write!(f, "trust_prompt_unresolved"),
  38. Self::PromptMisdelivery => write!(f, "prompt_misdelivery"),
  39. Self::StaleBranch => write!(f, "stale_branch"),
  40. Self::CompileRedCrossCrate => write!(f, "compile_red_cross_crate"),
  41. Self::McpHandshakeFailure => write!(f, "mcp_handshake_failure"),
  42. Self::PartialPluginStartup => write!(f, "partial_plugin_startup"),
  43. }
  44. }
  45. }
  46. /// Individual step that can be executed as part of a recovery recipe.
  47. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
  48. #[serde(rename_all = "snake_case")]
  49. pub enum RecoveryStep {
  50. AcceptTrustPrompt,
  51. RedirectPromptToAgent,
  52. RebaseBranch,
  53. CleanBuild,
  54. RetryMcpHandshake { timeout: u64 },
  55. RestartPlugin { name: String },
  56. EscalateToHuman { reason: String },
  57. }
  58. /// Policy governing what happens when automatic recovery is exhausted.
  59. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
  60. #[serde(rename_all = "snake_case")]
  61. pub enum EscalationPolicy {
  62. AlertHuman,
  63. LogAndContinue,
  64. Abort,
  65. }
  66. /// A recovery recipe encodes the sequence of steps to attempt for a
  67. /// given failure scenario, along with the maximum number of automatic
  68. /// attempts and the escalation policy.
  69. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
  70. pub struct RecoveryRecipe {
  71. pub scenario: FailureScenario,
  72. pub steps: Vec<RecoveryStep>,
  73. pub max_attempts: u32,
  74. pub escalation_policy: EscalationPolicy,
  75. }
  76. /// Outcome of a recovery attempt.
  77. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
  78. #[serde(rename_all = "snake_case")]
  79. pub enum RecoveryResult {
  80. Recovered {
  81. steps_taken: u32,
  82. },
  83. PartialRecovery {
  84. recovered: Vec<RecoveryStep>,
  85. remaining: Vec<RecoveryStep>,
  86. },
  87. EscalationRequired {
  88. reason: String,
  89. },
  90. }
  91. /// Structured event emitted during recovery.
  92. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
  93. #[serde(rename_all = "snake_case")]
  94. pub enum RecoveryEvent {
  95. RecoveryAttempted {
  96. scenario: FailureScenario,
  97. recipe: RecoveryRecipe,
  98. result: RecoveryResult,
  99. },
  100. RecoverySucceeded,
  101. RecoveryFailed,
  102. Escalated,
  103. }
  104. /// Minimal context for tracking recovery state and emitting events.
  105. ///
  106. /// Holds per-scenario attempt counts, a structured event log, and an
  107. /// optional simulation knob for controlling step outcomes during tests.
  108. #[derive(Debug, Clone, Default)]
  109. pub struct RecoveryContext {
  110. attempts: HashMap<FailureScenario, u32>,
  111. events: Vec<RecoveryEvent>,
  112. /// Optional step index at which simulated execution fails.
  113. /// `None` means all steps succeed.
  114. fail_at_step: Option<usize>,
  115. }
  116. impl RecoveryContext {
  117. #[must_use]
  118. pub fn new() -> Self {
  119. Self::default()
  120. }
  121. /// Configure a step index at which simulated execution will fail.
  122. #[must_use]
  123. pub fn with_fail_at_step(mut self, index: usize) -> Self {
  124. self.fail_at_step = Some(index);
  125. self
  126. }
  127. /// Returns the structured event log populated during recovery.
  128. #[must_use]
  129. pub fn events(&self) -> &[RecoveryEvent] {
  130. &self.events
  131. }
  132. /// Returns the number of recovery attempts made for a scenario.
  133. #[must_use]
  134. pub fn attempt_count(&self, scenario: &FailureScenario) -> u32 {
  135. self.attempts.get(scenario).copied().unwrap_or(0)
  136. }
  137. }
  138. /// Returns the known recovery recipe for the given failure scenario.
  139. #[must_use]
  140. pub fn recipe_for(scenario: &FailureScenario) -> RecoveryRecipe {
  141. match scenario {
  142. FailureScenario::TrustPromptUnresolved => RecoveryRecipe {
  143. scenario: *scenario,
  144. steps: vec![RecoveryStep::AcceptTrustPrompt],
  145. max_attempts: 1,
  146. escalation_policy: EscalationPolicy::AlertHuman,
  147. },
  148. FailureScenario::PromptMisdelivery => RecoveryRecipe {
  149. scenario: *scenario,
  150. steps: vec![RecoveryStep::RedirectPromptToAgent],
  151. max_attempts: 1,
  152. escalation_policy: EscalationPolicy::AlertHuman,
  153. },
  154. FailureScenario::StaleBranch => RecoveryRecipe {
  155. scenario: *scenario,
  156. steps: vec![RecoveryStep::RebaseBranch, RecoveryStep::CleanBuild],
  157. max_attempts: 1,
  158. escalation_policy: EscalationPolicy::AlertHuman,
  159. },
  160. FailureScenario::CompileRedCrossCrate => RecoveryRecipe {
  161. scenario: *scenario,
  162. steps: vec![RecoveryStep::CleanBuild],
  163. max_attempts: 1,
  164. escalation_policy: EscalationPolicy::AlertHuman,
  165. },
  166. FailureScenario::McpHandshakeFailure => RecoveryRecipe {
  167. scenario: *scenario,
  168. steps: vec![RecoveryStep::RetryMcpHandshake { timeout: 5000 }],
  169. max_attempts: 1,
  170. escalation_policy: EscalationPolicy::Abort,
  171. },
  172. FailureScenario::PartialPluginStartup => RecoveryRecipe {
  173. scenario: *scenario,
  174. steps: vec![
  175. RecoveryStep::RestartPlugin {
  176. name: "stalled".to_string(),
  177. },
  178. RecoveryStep::RetryMcpHandshake { timeout: 3000 },
  179. ],
  180. max_attempts: 1,
  181. escalation_policy: EscalationPolicy::LogAndContinue,
  182. },
  183. }
  184. }
  185. /// Attempts automatic recovery for the given failure scenario.
  186. ///
  187. /// Looks up the recipe, enforces the one-attempt-before-escalation
  188. /// policy, simulates step execution (controlled by the context), and
  189. /// emits structured [`RecoveryEvent`]s for every attempt.
  190. pub fn attempt_recovery(scenario: &FailureScenario, ctx: &mut RecoveryContext) -> RecoveryResult {
  191. let recipe = recipe_for(scenario);
  192. let attempt_count = ctx.attempts.entry(*scenario).or_insert(0);
  193. // Enforce one automatic recovery attempt before escalation.
  194. if *attempt_count >= recipe.max_attempts {
  195. let result = RecoveryResult::EscalationRequired {
  196. reason: format!(
  197. "max recovery attempts ({}) exceeded for {}",
  198. recipe.max_attempts, scenario
  199. ),
  200. };
  201. ctx.events.push(RecoveryEvent::RecoveryAttempted {
  202. scenario: *scenario,
  203. recipe,
  204. result: result.clone(),
  205. });
  206. ctx.events.push(RecoveryEvent::Escalated);
  207. return result;
  208. }
  209. *attempt_count += 1;
  210. // Execute steps, honoring the optional fail_at_step simulation.
  211. let fail_index = ctx.fail_at_step;
  212. let mut executed = Vec::new();
  213. let mut failed = false;
  214. for (i, step) in recipe.steps.iter().enumerate() {
  215. if fail_index == Some(i) {
  216. failed = true;
  217. break;
  218. }
  219. executed.push(step.clone());
  220. }
  221. let result = if failed {
  222. let remaining: Vec<RecoveryStep> = recipe.steps[executed.len()..].to_vec();
  223. if executed.is_empty() {
  224. RecoveryResult::EscalationRequired {
  225. reason: format!("recovery failed at first step for {}", scenario),
  226. }
  227. } else {
  228. RecoveryResult::PartialRecovery {
  229. recovered: executed,
  230. remaining,
  231. }
  232. }
  233. } else {
  234. RecoveryResult::Recovered {
  235. steps_taken: recipe.steps.len() as u32,
  236. }
  237. };
  238. // Emit the attempt as structured event data.
  239. ctx.events.push(RecoveryEvent::RecoveryAttempted {
  240. scenario: *scenario,
  241. recipe,
  242. result: result.clone(),
  243. });
  244. match &result {
  245. RecoveryResult::Recovered { .. } => {
  246. ctx.events.push(RecoveryEvent::RecoverySucceeded);
  247. }
  248. RecoveryResult::PartialRecovery { .. } => {
  249. ctx.events.push(RecoveryEvent::RecoveryFailed);
  250. }
  251. RecoveryResult::EscalationRequired { .. } => {
  252. ctx.events.push(RecoveryEvent::Escalated);
  253. }
  254. }
  255. result
  256. }
  #[cfg(test)]
  mod tests {
      use super::*;

      /// Every scenario maps to a recipe for itself with at least one step and
      /// at least one permitted attempt.
      #[test]
      fn each_scenario_has_a_matching_recipe() {
          for scenario in FailureScenario::all() {
              let recipe = recipe_for(scenario);

              assert_eq!(
                  recipe.scenario, *scenario,
                  "recipe scenario should match requested scenario"
              );
              assert!(
                  !recipe.steps.is_empty(),
                  "recipe for {} should have at least one step",
                  scenario
              );
              assert!(
                  recipe.max_attempts >= 1,
                  "recipe for {} should allow at least one attempt",
                  scenario
              );
          }
      }

      /// A clean run yields `Recovered` and logs the attempt plus a success
      /// marker, in that order.
      #[test]
      fn successful_recovery_returns_recovered_and_emits_events() {
          let scenario = FailureScenario::TrustPromptUnresolved;
          let mut ctx = RecoveryContext::new();

          let result = attempt_recovery(&scenario, &mut ctx);

          assert_eq!(result, RecoveryResult::Recovered { steps_taken: 1 });

          let log = ctx.events();
          assert_eq!(log.len(), 2);
          assert!(matches!(
              &log[0],
              RecoveryEvent::RecoveryAttempted {
                  scenario: logged_scenario,
                  result: logged_result,
                  ..
              } if *logged_scenario == FailureScenario::TrustPromptUnresolved
                  && matches!(logged_result, RecoveryResult::Recovered { steps_taken: 1 })
          ));
          assert_eq!(log[1], RecoveryEvent::RecoverySucceeded);
      }

      /// The second attempt for the same scenario is refused and escalated
      /// without consuming another attempt.
      #[test]
      fn escalation_after_max_attempts_exceeded() {
          let scenario = FailureScenario::PromptMisdelivery;
          let mut ctx = RecoveryContext::new();

          // First attempt is within budget and succeeds.
          let first = attempt_recovery(&scenario, &mut ctx);
          assert!(matches!(first, RecoveryResult::Recovered { .. }));

          // Second attempt exceeds the budget.
          let second = attempt_recovery(&scenario, &mut ctx);

          assert!(
              matches!(
                  &second,
                  RecoveryResult::EscalationRequired { reason }
                      if reason.contains("max recovery attempts")
              ),
              "second attempt should require escalation, got: {second:?}"
          );
          assert_eq!(ctx.attempt_count(&scenario), 1);
          assert!(ctx.events().contains(&RecoveryEvent::Escalated));
      }

      /// Failing at the second of two steps reports the first as recovered and
      /// the second as remaining.
      #[test]
      fn partial_recovery_when_step_fails_midway() {
          // PartialPluginStartup has two steps; fail at step index 1.
          let scenario = FailureScenario::PartialPluginStartup;
          let mut ctx = RecoveryContext::new().with_fail_at_step(1);

          let result = attempt_recovery(&scenario, &mut ctx);

          match &result {
              RecoveryResult::PartialRecovery {
                  recovered,
                  remaining,
              } => {
                  assert_eq!(recovered.len(), 1, "one step should have succeeded");
                  assert_eq!(remaining.len(), 1, "one step should remain");
                  assert!(matches!(recovered[0], RecoveryStep::RestartPlugin { .. }));
                  assert!(matches!(
                      remaining[0],
                      RecoveryStep::RetryMcpHandshake { .. }
                  ));
              }
              other => panic!("expected PartialRecovery, got {other:?}"),
          }
          assert!(ctx.events().contains(&RecoveryEvent::RecoveryFailed));
      }

      /// Failing at step index 0 escalates instead of reporting partial progress.
      #[test]
      fn first_step_failure_escalates_immediately() {
          let scenario = FailureScenario::CompileRedCrossCrate;
          let mut ctx = RecoveryContext::new().with_fail_at_step(0);

          let result = attempt_recovery(&scenario, &mut ctx);

          assert!(
              matches!(
                  &result,
                  RecoveryResult::EscalationRequired { reason }
                      if reason.contains("failed at first step")
              ),
              "zero-step failure should escalate, got: {result:?}"
          );
          assert!(ctx.events().contains(&RecoveryEvent::Escalated));
      }

      /// The `RecoveryAttempted` event carries scenario, recipe and result, and
      /// serializes to JSON containing the scenario name.
      #[test]
      fn emitted_events_include_structured_attempt_data() {
          let scenario = FailureScenario::McpHandshakeFailure;
          let mut ctx = RecoveryContext::new();

          let _ = attempt_recovery(&scenario, &mut ctx);

          // Pull the payload out of the first RecoveryAttempted event.
          let (logged_scenario, logged_recipe, logged_result) = ctx
              .events()
              .iter()
              .find_map(|event| match event {
                  RecoveryEvent::RecoveryAttempted {
                      scenario: s,
                      recipe,
                      result,
                  } => Some((s, recipe, result)),
                  _ => None,
              })
              .expect("should have emitted RecoveryAttempted event");

          assert_eq!(*logged_scenario, scenario);
          assert_eq!(logged_recipe.scenario, scenario);
          assert!(!logged_recipe.steps.is_empty());
          assert!(matches!(logged_result, RecoveryResult::Recovered { .. }));

          // Verify the event is serializable as structured JSON.
          let json = serde_json::to_string(&ctx.events()[0])
              .expect("recovery event should be serializable to JSON");
          assert!(
              json.contains("mcp_handshake_failure"),
              "serialized event should contain scenario name"
          );
      }

      /// Attempt counters are kept separately for each scenario.
      #[test]
      fn recovery_context_tracks_attempts_per_scenario() {
          let mut ctx = RecoveryContext::new();
          assert_eq!(ctx.attempt_count(&FailureScenario::StaleBranch), 0);

          let _ = attempt_recovery(&FailureScenario::StaleBranch, &mut ctx);

          assert_eq!(ctx.attempt_count(&FailureScenario::StaleBranch), 1);
          assert_eq!(ctx.attempt_count(&FailureScenario::PromptMisdelivery), 0);
      }

      /// The stale-branch recipe is exactly: rebase, then clean build.
      #[test]
      fn stale_branch_recipe_has_rebase_then_clean_build() {
          let recipe = recipe_for(&FailureScenario::StaleBranch);

          assert_eq!(
              recipe.steps,
              vec![RecoveryStep::RebaseBranch, RecoveryStep::CleanBuild]
          );
      }

      /// The partial-plugin-startup recipe restarts the plugin, retries the
      /// handshake with timeout 3000, and uses the log-and-continue policy.
      #[test]
      fn partial_plugin_startup_recipe_has_restart_then_handshake() {
          let recipe = recipe_for(&FailureScenario::PartialPluginStartup);

          assert!(matches!(
              recipe.steps.as_slice(),
              [
                  RecoveryStep::RestartPlugin { .. },
                  RecoveryStep::RetryMcpHandshake { timeout: 3000 }
              ]
          ));
          assert_eq!(recipe.escalation_policy, EscalationPolicy::LogAndContinue);
      }

      /// `Display` prints the snake_case identifier of every variant.
      #[test]
      fn failure_scenario_display_all_variants() {
          let expectations = [
              (
                  FailureScenario::TrustPromptUnresolved,
                  "trust_prompt_unresolved",
              ),
              (FailureScenario::PromptMisdelivery, "prompt_misdelivery"),
              (FailureScenario::StaleBranch, "stale_branch"),
              (
                  FailureScenario::CompileRedCrossCrate,
                  "compile_red_cross_crate",
              ),
              (
                  FailureScenario::McpHandshakeFailure,
                  "mcp_handshake_failure",
              ),
              (
                  FailureScenario::PartialPluginStartup,
                  "partial_plugin_startup",
              ),
          ];

          for (scenario, expected) in expectations.iter() {
              assert_eq!(scenario.to_string(), *expected);
          }
      }

      /// A two-step recipe with no simulated failure reports two steps taken.
      #[test]
      fn multi_step_success_reports_correct_steps_taken() {
          let scenario = FailureScenario::StaleBranch;
          let mut ctx = RecoveryContext::new();

          let result = attempt_recovery(&scenario, &mut ctx);

          assert_eq!(result, RecoveryResult::Recovered { steps_taken: 2 });
      }

      /// The MCP handshake recipe aborts on exhaustion and allows one attempt.
      #[test]
      fn mcp_handshake_recipe_uses_abort_escalation_policy() {
          let recipe = recipe_for(&FailureScenario::McpHandshakeFailure);

          assert_eq!(recipe.escalation_policy, EscalationPolicy::Abort);
          assert_eq!(recipe.max_attempts, 1);
      }
  }