recovery_recipes.rs 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630
//! Recovery recipes for common failure scenarios.
//!
//! Encodes known automatic recoveries for the failure scenarios listed
//! in ROADMAP item 8 (see [`FailureScenario`] for the full set), and
//! enforces one automatic recovery attempt per scenario before
//! escalation. Each attempt is emitted as a structured recovery event.
  7. use std::collections::HashMap;
  8. use serde::{Deserialize, Serialize};
  9. use crate::worker_boot::WorkerFailureKind;
/// The failure scenarios that have known recovery recipes (see `recipe_for`).
///
/// Variants are serialized with `snake_case` names, and the `Display`
/// impl in this file renders the same `snake_case` identifiers.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FailureScenario {
    /// Produced for `WorkerFailureKind::TrustGate`; the recipe is a single
    /// `AcceptTrustPrompt` step.
    TrustPromptUnresolved,
    /// Produced for `WorkerFailureKind::PromptDelivery`; the recipe is a
    /// single `RedirectPromptToAgent` step.
    PromptMisdelivery,
    /// The recipe is `RebaseBranch` followed by `CleanBuild`.
    StaleBranch,
    /// The recipe is a single `CleanBuild` step.
    CompileRedCrossCrate,
    /// Produced for `WorkerFailureKind::Protocol`; the recipe retries the
    /// MCP handshake once.
    McpHandshakeFailure,
    /// The recipe is `RestartPlugin` followed by `RetryMcpHandshake`.
    PartialPluginStartup,
    /// Produced for `WorkerFailureKind::Provider`; the recipe is a single
    /// `RestartWorker` step.
    ProviderFailure,
}
  22. impl FailureScenario {
  23. /// Returns all known failure scenarios.
  24. #[must_use]
  25. pub fn all() -> &'static [FailureScenario] {
  26. &[
  27. Self::TrustPromptUnresolved,
  28. Self::PromptMisdelivery,
  29. Self::StaleBranch,
  30. Self::CompileRedCrossCrate,
  31. Self::McpHandshakeFailure,
  32. Self::PartialPluginStartup,
  33. Self::ProviderFailure,
  34. ]
  35. }
  36. /// Map a `WorkerFailureKind` to the corresponding `FailureScenario`.
  37. /// This is the bridge that lets recovery policy consume worker boot events.
  38. #[must_use]
  39. pub fn from_worker_failure_kind(kind: WorkerFailureKind) -> Self {
  40. match kind {
  41. WorkerFailureKind::TrustGate => Self::TrustPromptUnresolved,
  42. WorkerFailureKind::PromptDelivery => Self::PromptMisdelivery,
  43. WorkerFailureKind::Protocol => Self::McpHandshakeFailure,
  44. WorkerFailureKind::Provider => Self::ProviderFailure,
  45. }
  46. }
  47. }
  48. impl std::fmt::Display for FailureScenario {
  49. fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  50. match self {
  51. Self::TrustPromptUnresolved => write!(f, "trust_prompt_unresolved"),
  52. Self::PromptMisdelivery => write!(f, "prompt_misdelivery"),
  53. Self::StaleBranch => write!(f, "stale_branch"),
  54. Self::CompileRedCrossCrate => write!(f, "compile_red_cross_crate"),
  55. Self::McpHandshakeFailure => write!(f, "mcp_handshake_failure"),
  56. Self::PartialPluginStartup => write!(f, "partial_plugin_startup"),
  57. Self::ProviderFailure => write!(f, "provider_failure"),
  58. }
  59. }
  60. }
/// Individual step that can be executed as part of a recovery recipe.
///
/// Steps are plain data. Within this file they are only listed in recipes
/// and copied into `RecoveryResult`s; `attempt_recovery` simulates their
/// execution rather than performing any work.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RecoveryStep {
    /// Sole step of the `TrustPromptUnresolved` recipe.
    AcceptTrustPrompt,
    /// Sole step of the `PromptMisdelivery` recipe.
    RedirectPromptToAgent,
    /// First step of the `StaleBranch` recipe.
    RebaseBranch,
    /// Used by the `StaleBranch` and `CompileRedCrossCrate` recipes.
    CleanBuild,
    /// Used by the `McpHandshakeFailure` and `PartialPluginStartup` recipes.
    ///
    /// NOTE(review): the built-in recipes pass `5000` and `3000` for
    /// `timeout`, presumably milliseconds. The unit is not stated in this
    /// file; confirm against the step executor.
    RetryMcpHandshake { timeout: u64 },
    /// First step of the `PartialPluginStartup` recipe, which passes the
    /// literal name `"stalled"`.
    RestartPlugin { name: String },
    /// Sole step of the `ProviderFailure` recipe.
    RestartWorker,
    /// Not produced by any recipe returned from `recipe_for`.
    EscalateToHuman { reason: String },
}
/// Policy governing what happens when automatic recovery is exhausted.
///
/// NOTE(review): this file only stores the policy on each `RecoveryRecipe`;
/// `attempt_recovery` never reads it, so the concrete effect of each variant
/// is decided by whichever caller consumes the recipe.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum EscalationPolicy {
    /// Chosen by every built-in recipe except the two listed below.
    AlertHuman,
    /// Chosen by the `PartialPluginStartup` recipe.
    LogAndContinue,
    /// Chosen by the `McpHandshakeFailure` recipe.
    Abort,
}
/// A recovery recipe encodes the sequence of steps to attempt for a
/// given failure scenario, along with the maximum number of automatic
/// attempts and the escalation policy.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct RecoveryRecipe {
    /// Scenario this recipe addresses.
    pub scenario: FailureScenario,
    /// Steps to run, in order.
    pub steps: Vec<RecoveryStep>,
    /// Number of automatic attempts `attempt_recovery` allows before it
    /// reports `EscalationRequired`; every recipe from `recipe_for` uses 1.
    pub max_attempts: u32,
    /// Policy recorded for when attempts are exhausted (not interpreted
    /// anywhere in this file).
    pub escalation_policy: EscalationPolicy,
}
/// Outcome of a recovery attempt, as returned by `attempt_recovery`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RecoveryResult {
    /// Every step of the recipe ran without a (simulated) failure.
    Recovered {
        /// Number of steps in the recipe that was run.
        steps_taken: u32,
    },
    /// At least one step ran before a later step failed.
    PartialRecovery {
        /// Steps that ran before the failure, in recipe order.
        recovered: Vec<RecoveryStep>,
        /// The failing step followed by every step after it.
        remaining: Vec<RecoveryStep>,
    },
    /// No recovery happened: either the scenario's attempt budget was
    /// already used up, or the very first step failed.
    EscalationRequired {
        /// Human-readable explanation of why escalation is needed.
        reason: String,
    },
}
/// Structured event emitted during recovery.
///
/// Each call to `attempt_recovery` pushes one `RecoveryAttempted` event
/// followed by exactly one of the three marker events below.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RecoveryEvent {
    /// Records the scenario, the recipe that applied, and the result.
    /// It is also emitted when the attempt budget was already exhausted.
    RecoveryAttempted {
        scenario: FailureScenario,
        recipe: RecoveryRecipe,
        result: RecoveryResult,
    },
    /// Follows an attempt whose result was `RecoveryResult::Recovered`.
    RecoverySucceeded,
    /// Follows an attempt whose result was `RecoveryResult::PartialRecovery`.
    RecoveryFailed,
    /// Follows an attempt whose result was `RecoveryResult::EscalationRequired`.
    Escalated,
}
/// Minimal context for tracking recovery state and emitting events.
///
/// Holds per-scenario attempt counts, a structured event log, and an
/// optional simulation knob for controlling step outcomes during tests.
#[derive(Debug, Clone, Default)]
pub struct RecoveryContext {
    /// Attempts already consumed per scenario; a scenario without an
    /// entry is reported as zero by `attempt_count`.
    attempts: HashMap<FailureScenario, u32>,
    /// Events pushed by `attempt_recovery`, oldest first.
    events: Vec<RecoveryEvent>,
    /// Optional step index at which simulated execution fails.
    /// `None` means all steps succeed.
    fail_at_step: Option<usize>,
}
  132. impl RecoveryContext {
  133. #[must_use]
  134. pub fn new() -> Self {
  135. Self::default()
  136. }
  137. /// Configure a step index at which simulated execution will fail.
  138. #[must_use]
  139. pub fn with_fail_at_step(mut self, index: usize) -> Self {
  140. self.fail_at_step = Some(index);
  141. self
  142. }
  143. /// Returns the structured event log populated during recovery.
  144. #[must_use]
  145. pub fn events(&self) -> &[RecoveryEvent] {
  146. &self.events
  147. }
  148. /// Returns the number of recovery attempts made for a scenario.
  149. #[must_use]
  150. pub fn attempt_count(&self, scenario: &FailureScenario) -> u32 {
  151. self.attempts.get(scenario).copied().unwrap_or(0)
  152. }
  153. }
  154. /// Returns the known recovery recipe for the given failure scenario.
  155. #[must_use]
  156. pub fn recipe_for(scenario: &FailureScenario) -> RecoveryRecipe {
  157. match scenario {
  158. FailureScenario::TrustPromptUnresolved => RecoveryRecipe {
  159. scenario: *scenario,
  160. steps: vec![RecoveryStep::AcceptTrustPrompt],
  161. max_attempts: 1,
  162. escalation_policy: EscalationPolicy::AlertHuman,
  163. },
  164. FailureScenario::PromptMisdelivery => RecoveryRecipe {
  165. scenario: *scenario,
  166. steps: vec![RecoveryStep::RedirectPromptToAgent],
  167. max_attempts: 1,
  168. escalation_policy: EscalationPolicy::AlertHuman,
  169. },
  170. FailureScenario::StaleBranch => RecoveryRecipe {
  171. scenario: *scenario,
  172. steps: vec![RecoveryStep::RebaseBranch, RecoveryStep::CleanBuild],
  173. max_attempts: 1,
  174. escalation_policy: EscalationPolicy::AlertHuman,
  175. },
  176. FailureScenario::CompileRedCrossCrate => RecoveryRecipe {
  177. scenario: *scenario,
  178. steps: vec![RecoveryStep::CleanBuild],
  179. max_attempts: 1,
  180. escalation_policy: EscalationPolicy::AlertHuman,
  181. },
  182. FailureScenario::McpHandshakeFailure => RecoveryRecipe {
  183. scenario: *scenario,
  184. steps: vec![RecoveryStep::RetryMcpHandshake { timeout: 5000 }],
  185. max_attempts: 1,
  186. escalation_policy: EscalationPolicy::Abort,
  187. },
  188. FailureScenario::PartialPluginStartup => RecoveryRecipe {
  189. scenario: *scenario,
  190. steps: vec![
  191. RecoveryStep::RestartPlugin {
  192. name: "stalled".to_string(),
  193. },
  194. RecoveryStep::RetryMcpHandshake { timeout: 3000 },
  195. ],
  196. max_attempts: 1,
  197. escalation_policy: EscalationPolicy::LogAndContinue,
  198. },
  199. FailureScenario::ProviderFailure => RecoveryRecipe {
  200. scenario: *scenario,
  201. steps: vec![RecoveryStep::RestartWorker],
  202. max_attempts: 1,
  203. escalation_policy: EscalationPolicy::AlertHuman,
  204. },
  205. }
  206. }
  207. /// Attempts automatic recovery for the given failure scenario.
  208. ///
  209. /// Looks up the recipe, enforces the one-attempt-before-escalation
  210. /// policy, simulates step execution (controlled by the context), and
  211. /// emits structured [`RecoveryEvent`]s for every attempt.
  212. pub fn attempt_recovery(scenario: &FailureScenario, ctx: &mut RecoveryContext) -> RecoveryResult {
  213. let recipe = recipe_for(scenario);
  214. let attempt_count = ctx.attempts.entry(*scenario).or_insert(0);
  215. // Enforce one automatic recovery attempt before escalation.
  216. if *attempt_count >= recipe.max_attempts {
  217. let result = RecoveryResult::EscalationRequired {
  218. reason: format!(
  219. "max recovery attempts ({}) exceeded for {}",
  220. recipe.max_attempts, scenario
  221. ),
  222. };
  223. ctx.events.push(RecoveryEvent::RecoveryAttempted {
  224. scenario: *scenario,
  225. recipe,
  226. result: result.clone(),
  227. });
  228. ctx.events.push(RecoveryEvent::Escalated);
  229. return result;
  230. }
  231. *attempt_count += 1;
  232. // Execute steps, honoring the optional fail_at_step simulation.
  233. let fail_index = ctx.fail_at_step;
  234. let mut executed = Vec::new();
  235. let mut failed = false;
  236. for (i, step) in recipe.steps.iter().enumerate() {
  237. if fail_index == Some(i) {
  238. failed = true;
  239. break;
  240. }
  241. executed.push(step.clone());
  242. }
  243. let result = if failed {
  244. let remaining: Vec<RecoveryStep> = recipe.steps[executed.len()..].to_vec();
  245. if executed.is_empty() {
  246. RecoveryResult::EscalationRequired {
  247. reason: format!("recovery failed at first step for {}", scenario),
  248. }
  249. } else {
  250. RecoveryResult::PartialRecovery {
  251. recovered: executed,
  252. remaining,
  253. }
  254. }
  255. } else {
  256. RecoveryResult::Recovered {
  257. steps_taken: recipe.steps.len() as u32,
  258. }
  259. };
  260. // Emit the attempt as structured event data.
  261. ctx.events.push(RecoveryEvent::RecoveryAttempted {
  262. scenario: *scenario,
  263. recipe,
  264. result: result.clone(),
  265. });
  266. match &result {
  267. RecoveryResult::Recovered { .. } => {
  268. ctx.events.push(RecoveryEvent::RecoverySucceeded);
  269. }
  270. RecoveryResult::PartialRecovery { .. } => {
  271. ctx.events.push(RecoveryEvent::RecoveryFailed);
  272. }
  273. RecoveryResult::EscalationRequired { .. } => {
  274. ctx.events.push(RecoveryEvent::Escalated);
  275. }
  276. }
  277. result
  278. }
  279. #[cfg(test)]
  280. mod tests {
  281. use super::*;
  282. #[test]
  283. fn each_scenario_has_a_matching_recipe() {
  284. // given
  285. let scenarios = FailureScenario::all();
  286. // when / then
  287. for scenario in scenarios {
  288. let recipe = recipe_for(scenario);
  289. assert_eq!(
  290. recipe.scenario, *scenario,
  291. "recipe scenario should match requested scenario"
  292. );
  293. assert!(
  294. !recipe.steps.is_empty(),
  295. "recipe for {} should have at least one step",
  296. scenario
  297. );
  298. assert!(
  299. recipe.max_attempts >= 1,
  300. "recipe for {} should allow at least one attempt",
  301. scenario
  302. );
  303. }
  304. }
  305. #[test]
  306. fn successful_recovery_returns_recovered_and_emits_events() {
  307. // given
  308. let mut ctx = RecoveryContext::new();
  309. let scenario = FailureScenario::TrustPromptUnresolved;
  310. // when
  311. let result = attempt_recovery(&scenario, &mut ctx);
  312. // then
  313. assert_eq!(result, RecoveryResult::Recovered { steps_taken: 1 });
  314. assert_eq!(ctx.events().len(), 2);
  315. assert!(matches!(
  316. &ctx.events()[0],
  317. RecoveryEvent::RecoveryAttempted {
  318. scenario: s,
  319. result: r,
  320. ..
  321. } if *s == FailureScenario::TrustPromptUnresolved
  322. && matches!(r, RecoveryResult::Recovered { steps_taken: 1 })
  323. ));
  324. assert_eq!(ctx.events()[1], RecoveryEvent::RecoverySucceeded);
  325. }
  326. #[test]
  327. fn escalation_after_max_attempts_exceeded() {
  328. // given
  329. let mut ctx = RecoveryContext::new();
  330. let scenario = FailureScenario::PromptMisdelivery;
  331. // when — first attempt succeeds
  332. let first = attempt_recovery(&scenario, &mut ctx);
  333. assert!(matches!(first, RecoveryResult::Recovered { .. }));
  334. // when — second attempt should escalate
  335. let second = attempt_recovery(&scenario, &mut ctx);
  336. // then
  337. assert!(
  338. matches!(
  339. &second,
  340. RecoveryResult::EscalationRequired { reason }
  341. if reason.contains("max recovery attempts")
  342. ),
  343. "second attempt should require escalation, got: {second:?}"
  344. );
  345. assert_eq!(ctx.attempt_count(&scenario), 1);
  346. assert!(ctx
  347. .events()
  348. .iter()
  349. .any(|e| matches!(e, RecoveryEvent::Escalated)));
  350. }
  351. #[test]
  352. fn partial_recovery_when_step_fails_midway() {
  353. // given — PartialPluginStartup has two steps; fail at step index 1
  354. let mut ctx = RecoveryContext::new().with_fail_at_step(1);
  355. let scenario = FailureScenario::PartialPluginStartup;
  356. // when
  357. let result = attempt_recovery(&scenario, &mut ctx);
  358. // then
  359. match &result {
  360. RecoveryResult::PartialRecovery {
  361. recovered,
  362. remaining,
  363. } => {
  364. assert_eq!(recovered.len(), 1, "one step should have succeeded");
  365. assert_eq!(remaining.len(), 1, "one step should remain");
  366. assert!(matches!(recovered[0], RecoveryStep::RestartPlugin { .. }));
  367. assert!(matches!(
  368. remaining[0],
  369. RecoveryStep::RetryMcpHandshake { .. }
  370. ));
  371. }
  372. other => panic!("expected PartialRecovery, got {other:?}"),
  373. }
  374. assert!(ctx
  375. .events()
  376. .iter()
  377. .any(|e| matches!(e, RecoveryEvent::RecoveryFailed)));
  378. }
  379. #[test]
  380. fn first_step_failure_escalates_immediately() {
  381. // given — fail at step index 0
  382. let mut ctx = RecoveryContext::new().with_fail_at_step(0);
  383. let scenario = FailureScenario::CompileRedCrossCrate;
  384. // when
  385. let result = attempt_recovery(&scenario, &mut ctx);
  386. // then
  387. assert!(
  388. matches!(
  389. &result,
  390. RecoveryResult::EscalationRequired { reason }
  391. if reason.contains("failed at first step")
  392. ),
  393. "zero-step failure should escalate, got: {result:?}"
  394. );
  395. assert!(ctx
  396. .events()
  397. .iter()
  398. .any(|e| matches!(e, RecoveryEvent::Escalated)));
  399. }
  400. #[test]
  401. fn emitted_events_include_structured_attempt_data() {
  402. // given
  403. let mut ctx = RecoveryContext::new();
  404. let scenario = FailureScenario::McpHandshakeFailure;
  405. // when
  406. let _ = attempt_recovery(&scenario, &mut ctx);
  407. // then — verify the RecoveryAttempted event carries full context
  408. let attempted = ctx
  409. .events()
  410. .iter()
  411. .find(|e| matches!(e, RecoveryEvent::RecoveryAttempted { .. }))
  412. .expect("should have emitted RecoveryAttempted event");
  413. match attempted {
  414. RecoveryEvent::RecoveryAttempted {
  415. scenario: s,
  416. recipe,
  417. result,
  418. } => {
  419. assert_eq!(*s, scenario);
  420. assert_eq!(recipe.scenario, scenario);
  421. assert!(!recipe.steps.is_empty());
  422. assert!(matches!(result, RecoveryResult::Recovered { .. }));
  423. }
  424. _ => unreachable!(),
  425. }
  426. // Verify the event is serializable as structured JSON
  427. let json = serde_json::to_string(&ctx.events()[0])
  428. .expect("recovery event should be serializable to JSON");
  429. assert!(
  430. json.contains("mcp_handshake_failure"),
  431. "serialized event should contain scenario name"
  432. );
  433. }
  434. #[test]
  435. fn recovery_context_tracks_attempts_per_scenario() {
  436. // given
  437. let mut ctx = RecoveryContext::new();
  438. // when
  439. assert_eq!(ctx.attempt_count(&FailureScenario::StaleBranch), 0);
  440. attempt_recovery(&FailureScenario::StaleBranch, &mut ctx);
  441. // then
  442. assert_eq!(ctx.attempt_count(&FailureScenario::StaleBranch), 1);
  443. assert_eq!(ctx.attempt_count(&FailureScenario::PromptMisdelivery), 0);
  444. }
  445. #[test]
  446. fn stale_branch_recipe_has_rebase_then_clean_build() {
  447. // given
  448. let recipe = recipe_for(&FailureScenario::StaleBranch);
  449. // then
  450. assert_eq!(recipe.steps.len(), 2);
  451. assert_eq!(recipe.steps[0], RecoveryStep::RebaseBranch);
  452. assert_eq!(recipe.steps[1], RecoveryStep::CleanBuild);
  453. }
  454. #[test]
  455. fn partial_plugin_startup_recipe_has_restart_then_handshake() {
  456. // given
  457. let recipe = recipe_for(&FailureScenario::PartialPluginStartup);
  458. // then
  459. assert_eq!(recipe.steps.len(), 2);
  460. assert!(matches!(
  461. recipe.steps[0],
  462. RecoveryStep::RestartPlugin { .. }
  463. ));
  464. assert!(matches!(
  465. recipe.steps[1],
  466. RecoveryStep::RetryMcpHandshake { timeout: 3000 }
  467. ));
  468. assert_eq!(recipe.escalation_policy, EscalationPolicy::LogAndContinue);
  469. }
  470. #[test]
  471. fn failure_scenario_display_all_variants() {
  472. // given
  473. let cases = [
  474. (
  475. FailureScenario::TrustPromptUnresolved,
  476. "trust_prompt_unresolved",
  477. ),
  478. (FailureScenario::PromptMisdelivery, "prompt_misdelivery"),
  479. (FailureScenario::StaleBranch, "stale_branch"),
  480. (
  481. FailureScenario::CompileRedCrossCrate,
  482. "compile_red_cross_crate",
  483. ),
  484. (
  485. FailureScenario::McpHandshakeFailure,
  486. "mcp_handshake_failure",
  487. ),
  488. (
  489. FailureScenario::PartialPluginStartup,
  490. "partial_plugin_startup",
  491. ),
  492. ];
  493. // when / then
  494. for (scenario, expected) in &cases {
  495. assert_eq!(scenario.to_string(), *expected);
  496. }
  497. }
  498. #[test]
  499. fn multi_step_success_reports_correct_steps_taken() {
  500. // given — StaleBranch has 2 steps, no simulated failure
  501. let mut ctx = RecoveryContext::new();
  502. let scenario = FailureScenario::StaleBranch;
  503. // when
  504. let result = attempt_recovery(&scenario, &mut ctx);
  505. // then
  506. assert_eq!(result, RecoveryResult::Recovered { steps_taken: 2 });
  507. }
  508. #[test]
  509. fn mcp_handshake_recipe_uses_abort_escalation_policy() {
  510. // given
  511. let recipe = recipe_for(&FailureScenario::McpHandshakeFailure);
  512. // then
  513. assert_eq!(recipe.escalation_policy, EscalationPolicy::Abort);
  514. assert_eq!(recipe.max_attempts, 1);
  515. }
  516. #[test]
  517. fn worker_failure_kind_maps_to_failure_scenario() {
  518. // given / when / then — verify the bridge is correct
  519. assert_eq!(
  520. FailureScenario::from_worker_failure_kind(WorkerFailureKind::TrustGate),
  521. FailureScenario::TrustPromptUnresolved,
  522. );
  523. assert_eq!(
  524. FailureScenario::from_worker_failure_kind(WorkerFailureKind::PromptDelivery),
  525. FailureScenario::PromptMisdelivery,
  526. );
  527. assert_eq!(
  528. FailureScenario::from_worker_failure_kind(WorkerFailureKind::Protocol),
  529. FailureScenario::McpHandshakeFailure,
  530. );
  531. assert_eq!(
  532. FailureScenario::from_worker_failure_kind(WorkerFailureKind::Provider),
  533. FailureScenario::ProviderFailure,
  534. );
  535. }
  536. #[test]
  537. fn provider_failure_recipe_uses_restart_worker_step() {
  538. // given
  539. let recipe = recipe_for(&FailureScenario::ProviderFailure);
  540. // then
  541. assert_eq!(recipe.scenario, FailureScenario::ProviderFailure);
  542. assert!(recipe.steps.contains(&RecoveryStep::RestartWorker));
  543. assert_eq!(recipe.escalation_policy, EscalationPolicy::AlertHuman);
  544. assert_eq!(recipe.max_attempts, 1);
  545. }
  546. #[test]
  547. fn provider_failure_recovery_attempt_succeeds_then_escalates() {
  548. // given
  549. let mut ctx = RecoveryContext::new();
  550. let scenario = FailureScenario::ProviderFailure;
  551. // when — first attempt
  552. let first = attempt_recovery(&scenario, &mut ctx);
  553. assert!(matches!(first, RecoveryResult::Recovered { .. }));
  554. // when — second attempt should escalate (max_attempts=1)
  555. let second = attempt_recovery(&scenario, &mut ctx);
  556. assert!(matches!(second, RecoveryResult::EscalationRequired { .. }));
  557. assert!(ctx
  558. .events()
  559. .iter()
  560. .any(|e| matches!(e, RecoveryEvent::Escalated)));
  561. }
  562. }