Parcourir la source

feat(runtime): recovery recipes

Jobdori il y a 2 mois
Parent
commit
2787981632
2 fichiers modifiés avec 559 ajouts et 0 suppressions
  1. 5 0
      rust/crates/runtime/src/lib.rs
  2. 554 0
      rust/crates/runtime/src/recovery_recipes.rs

+ 5 - 0
rust/crates/runtime/src/lib.rs

@@ -14,6 +14,7 @@ mod mcp_stdio;
 pub mod mcp_tool_bridge;
 mod oauth;
 pub mod permission_enforcer;
+pub mod recovery_recipes;
 mod permissions;
 mod prompt;
 mod remote;
@@ -80,6 +81,10 @@ pub use permissions::{
     PermissionContext, PermissionMode, PermissionOutcome, PermissionOverride, PermissionPolicy,
     PermissionPromptDecision, PermissionPrompter, PermissionRequest,
 };
+pub use recovery_recipes::{
+    attempt_recovery, recipe_for, EscalationPolicy, FailureScenario, RecoveryContext,
+    RecoveryEvent, RecoveryRecipe, RecoveryResult, RecoveryStep,
+};
 pub use prompt::{
     load_system_prompt, prepend_bullets, ContextFile, ProjectContext, PromptBuildError,
     SystemPromptBuilder, FRONTIER_MODEL_NAME, SYSTEM_PROMPT_DYNAMIC_BOUNDARY,

+ 554 - 0
rust/crates/runtime/src/recovery_recipes.rs

@@ -0,0 +1,554 @@
+//! Recovery recipes for common failure scenarios.
+//!
+//! Encodes known automatic recoveries for the six failure scenarios
+//! listed in ROADMAP item 8, and enforces one automatic recovery
+//! attempt before escalation. Each attempt is emitted as a structured
+//! recovery event.
+
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+
+/// The six failure scenarios that have known recovery recipes.
+///
+/// Serde serializes the variants in `snake_case`; the `Display` impl in this
+/// file writes the same snake_case names. The per-variant notes below list the
+/// steps that `recipe_for` returns for each scenario.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum FailureScenario {
+    /// Recipe: [`RecoveryStep::AcceptTrustPrompt`].
+    TrustPromptUnresolved,
+    /// Recipe: [`RecoveryStep::RedirectPromptToAgent`].
+    PromptMisdelivery,
+    /// Recipe: [`RecoveryStep::RebaseBranch`], then [`RecoveryStep::CleanBuild`].
+    StaleBranch,
+    /// Recipe: [`RecoveryStep::CleanBuild`].
+    CompileRedCrossCrate,
+    /// Recipe: [`RecoveryStep::RetryMcpHandshake`].
+    McpHandshakeFailure,
+    /// Recipe: [`RecoveryStep::RestartPlugin`], then [`RecoveryStep::RetryMcpHandshake`].
+    PartialPluginStartup,
+}
+
+impl FailureScenario {
+    /// Every variant, in declaration order.
+    const ALL: [Self; 6] = [
+        Self::TrustPromptUnresolved,
+        Self::PromptMisdelivery,
+        Self::StaleBranch,
+        Self::CompileRedCrossCrate,
+        Self::McpHandshakeFailure,
+        Self::PartialPluginStartup,
+    ];
+
+    /// Returns a static slice containing each known failure scenario once,
+    /// in declaration order.
+    #[must_use]
+    pub fn all() -> &'static [FailureScenario] {
+        &Self::ALL
+    }
+}
+
+impl std::fmt::Display for FailureScenario {
+    /// Writes the scenario's snake_case name (for example `stale_branch`).
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Pick the label first so there is a single write call at the end.
+        let label = match self {
+            Self::TrustPromptUnresolved => "trust_prompt_unresolved",
+            Self::PromptMisdelivery => "prompt_misdelivery",
+            Self::StaleBranch => "stale_branch",
+            Self::CompileRedCrossCrate => "compile_red_cross_crate",
+            Self::McpHandshakeFailure => "mcp_handshake_failure",
+            Self::PartialPluginStartup => "partial_plugin_startup",
+        };
+        f.write_str(label)
+    }
+}
+
+/// Individual step that can be executed as part of a recovery recipe.
+///
+/// Within this file the steps are only data: `attempt_recovery` simulates
+/// their execution and never performs the named action.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum RecoveryStep {
+    /// Sole step of the `TrustPromptUnresolved` recipe.
+    AcceptTrustPrompt,
+    /// Sole step of the `PromptMisdelivery` recipe.
+    RedirectPromptToAgent,
+    /// First step of the `StaleBranch` recipe.
+    RebaseBranch,
+    /// Used by the `StaleBranch` (second step) and `CompileRedCrossCrate` recipes.
+    CleanBuild,
+    /// Used by the `McpHandshakeFailure` and `PartialPluginStartup` recipes.
+    ///
+    /// NOTE(review): the unit of `timeout` is not stated in this file; the
+    /// recipes use 5000 and 3000, so presumably milliseconds — confirm.
+    RetryMcpHandshake { timeout: u64 },
+    /// First step of the `PartialPluginStartup` recipe, which passes the
+    /// fixed name `"stalled"`.
+    RestartPlugin { name: String },
+    /// Not used by any recipe that `recipe_for` returns.
+    EscalateToHuman { reason: String },
+}
+
+/// Policy governing what happens when automatic recovery is exhausted.
+///
+/// NOTE(review): `attempt_recovery` copies the policy into the emitted
+/// recipe but never branches on it; acting on the policy is presumably the
+/// caller's job — confirm.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum EscalationPolicy {
+    /// Policy of the `TrustPromptUnresolved`, `PromptMisdelivery`,
+    /// `StaleBranch`, and `CompileRedCrossCrate` recipes.
+    AlertHuman,
+    /// Policy of the `PartialPluginStartup` recipe.
+    LogAndContinue,
+    /// Policy of the `McpHandshakeFailure` recipe.
+    Abort,
+}
+
+/// A recovery recipe encodes the sequence of steps to attempt for a
+/// given failure scenario, along with the maximum number of automatic
+/// attempts and the escalation policy.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct RecoveryRecipe {
+    /// Scenario this recipe was built for.
+    pub scenario: FailureScenario,
+    /// Steps in the order `attempt_recovery` walks them.
+    pub steps: Vec<RecoveryStep>,
+    /// Number of attempts `attempt_recovery` allows for the scenario before
+    /// it answers with `EscalationRequired`; every recipe from `recipe_for`
+    /// sets this to 1.
+    pub max_attempts: u32,
+    /// Policy recorded for when automatic recovery is exhausted.
+    pub escalation_policy: EscalationPolicy,
+}
+
+/// Outcome of a recovery attempt.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum RecoveryResult {
+    /// No step failed. `attempt_recovery` sets `steps_taken` to the number
+    /// of steps in the recipe.
+    Recovered {
+        steps_taken: u32,
+    },
+    /// A step after the first one failed. `recovered` holds the steps before
+    /// the failing one; `remaining` holds the failing step and those after it.
+    PartialRecovery {
+        recovered: Vec<RecoveryStep>,
+        remaining: Vec<RecoveryStep>,
+    },
+    /// Returned when the attempt budget for the scenario is already spent, or
+    /// when the very first step fails. `reason` is a human-readable message.
+    EscalationRequired {
+        reason: String,
+    },
+}
+
+/// Structured event emitted during recovery.
+///
+/// Each call to `attempt_recovery` appends a `RecoveryAttempted` event
+/// followed by exactly one of the three unit events below.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum RecoveryEvent {
+    /// Logged for every call, including calls rejected because the attempt
+    /// budget was already spent. Carries the full recipe and the result.
+    RecoveryAttempted {
+        scenario: FailureScenario,
+        recipe: RecoveryRecipe,
+        result: RecoveryResult,
+    },
+    /// Follows an attempt whose result was `RecoveryResult::Recovered`.
+    RecoverySucceeded,
+    /// Follows an attempt whose result was `RecoveryResult::PartialRecovery`.
+    RecoveryFailed,
+    /// Follows an attempt whose result was `RecoveryResult::EscalationRequired`.
+    Escalated,
+}
+
+/// Minimal context for tracking recovery state and emitting events.
+///
+/// Holds per-scenario attempt counts, a structured event log, and an
+/// optional simulation knob for controlling step outcomes during tests.
+#[derive(Debug, Clone, Default)]
+pub struct RecoveryContext {
+    /// Attempts consumed so far, keyed by scenario. Calls rejected for
+    /// exceeding the budget do not increase the count.
+    attempts: HashMap<FailureScenario, u32>,
+    /// Events appended by `attempt_recovery`, oldest first.
+    events: Vec<RecoveryEvent>,
+    /// Optional step index at which simulated execution fails.
+    /// `None` means all steps succeed. An index at or beyond the recipe's
+    /// step count never matches a step, so it also results in success.
+    fail_at_step: Option<usize>,
+}
+
+impl RecoveryContext {
+    /// Creates an empty context: no attempts recorded, no events logged,
+    /// and no simulated step failure configured.
+    #[must_use]
+    pub fn new() -> Self {
+        Self {
+            attempts: HashMap::new(),
+            events: Vec::new(),
+            fail_at_step: None,
+        }
+    }
+
+    /// Builder-style setter: returns the context with simulated execution
+    /// set to fail at the step with the given zero-based `index`.
+    #[must_use]
+    pub fn with_fail_at_step(self, index: usize) -> Self {
+        Self {
+            fail_at_step: Some(index),
+            ..self
+        }
+    }
+
+    /// Borrows the event log accumulated so far, oldest event first.
+    #[must_use]
+    pub fn events(&self) -> &[RecoveryEvent] {
+        self.events.as_slice()
+    }
+
+    /// Number of recovery attempts consumed for `scenario`; zero when the
+    /// scenario has never been attempted through this context.
+    #[must_use]
+    pub fn attempt_count(&self, scenario: &FailureScenario) -> u32 {
+        self.attempts.get(scenario).map_or(0, |&count| count)
+    }
+}
+
+/// Builds the recovery recipe associated with `scenario`.
+///
+/// Only the step list and the escalation policy differ between scenarios;
+/// every recipe allows exactly one automatic attempt.
+#[must_use]
+pub fn recipe_for(scenario: &FailureScenario) -> RecoveryRecipe {
+    // Select the scenario-specific parts first, then assemble the recipe once.
+    let (steps, escalation_policy) = match scenario {
+        FailureScenario::TrustPromptUnresolved => (
+            vec![RecoveryStep::AcceptTrustPrompt],
+            EscalationPolicy::AlertHuman,
+        ),
+        FailureScenario::PromptMisdelivery => (
+            vec![RecoveryStep::RedirectPromptToAgent],
+            EscalationPolicy::AlertHuman,
+        ),
+        FailureScenario::StaleBranch => (
+            vec![RecoveryStep::RebaseBranch, RecoveryStep::CleanBuild],
+            EscalationPolicy::AlertHuman,
+        ),
+        FailureScenario::CompileRedCrossCrate => (
+            vec![RecoveryStep::CleanBuild],
+            EscalationPolicy::AlertHuman,
+        ),
+        FailureScenario::McpHandshakeFailure => (
+            vec![RecoveryStep::RetryMcpHandshake { timeout: 5000 }],
+            EscalationPolicy::Abort,
+        ),
+        FailureScenario::PartialPluginStartup => (
+            vec![
+                RecoveryStep::RestartPlugin {
+                    name: "stalled".to_string(),
+                },
+                RecoveryStep::RetryMcpHandshake { timeout: 3000 },
+            ],
+            EscalationPolicy::LogAndContinue,
+        ),
+    };
+
+    RecoveryRecipe {
+        scenario: *scenario,
+        steps,
+        max_attempts: 1,
+        escalation_policy,
+    }
+}
+
+/// Attempts automatic recovery for the given failure scenario.
+///
+/// Looks up the recipe with [`recipe_for`], enforces the recipe's attempt
+/// budget (`max_attempts`), simulates step execution, and appends
+/// structured [`RecoveryEvent`]s to `ctx`.
+///
+/// Step execution is simulated only: a step "fails" when `ctx` was
+/// configured with a fail-at-step index that falls inside the recipe's step
+/// list; otherwise every step counts as successful.
+///
+/// # Returns
+///
+/// * [`RecoveryResult::EscalationRequired`] when the budget for the scenario
+///   is already spent (the attempt count is left unchanged), or when the
+///   first step fails.
+/// * [`RecoveryResult::PartialRecovery`] when a later step fails.
+/// * [`RecoveryResult::Recovered`] when no step fails.
+///
+/// # Events
+///
+/// Every call appends exactly two events: a
+/// [`RecoveryEvent::RecoveryAttempted`] carrying the recipe and the result,
+/// followed by `RecoverySucceeded`, `RecoveryFailed`, or `Escalated`
+/// according to the result.
+pub fn attempt_recovery(scenario: &FailureScenario, ctx: &mut RecoveryContext) -> RecoveryResult {
+    let recipe = recipe_for(scenario);
+    let attempts_made = ctx.attempts.entry(*scenario).or_insert(0);
+
+    let result = if *attempts_made >= recipe.max_attempts {
+        // Budget spent: escalate without running a step or bumping the counter.
+        RecoveryResult::EscalationRequired {
+            reason: format!(
+                "max recovery attempts ({}) exceeded for {}",
+                recipe.max_attempts, scenario
+            ),
+        }
+    } else {
+        *attempts_made += 1;
+        simulate_steps(*scenario, &recipe.steps, ctx.fail_at_step)
+    };
+
+    // Decide the follow-up event before `recipe` is moved into the log entry.
+    let follow_up = match &result {
+        RecoveryResult::Recovered { .. } => RecoveryEvent::RecoverySucceeded,
+        RecoveryResult::PartialRecovery { .. } => RecoveryEvent::RecoveryFailed,
+        RecoveryResult::EscalationRequired { .. } => RecoveryEvent::Escalated,
+    };
+
+    // Emit the attempt as structured event data, then its one-word verdict.
+    ctx.events.push(RecoveryEvent::RecoveryAttempted {
+        scenario: *scenario,
+        recipe,
+        result: result.clone(),
+    });
+    ctx.events.push(follow_up);
+
+    result
+}
+
+/// Simulates running `steps` in order, failing at index `fail_at_step` when
+/// that index refers to an existing step.
+fn simulate_steps(
+    scenario: FailureScenario,
+    steps: &[RecoveryStep],
+    fail_at_step: Option<usize>,
+) -> RecoveryResult {
+    // An index past the end never matches a step, so it is treated as "no failure".
+    match fail_at_step.filter(|&index| index < steps.len()) {
+        None => RecoveryResult::Recovered {
+            // Saturate rather than silently truncate if the step count ever
+            // exceeded `u32::MAX`.
+            steps_taken: u32::try_from(steps.len()).unwrap_or(u32::MAX),
+        },
+        // Nothing succeeded before the failure: there is no partial progress to report.
+        Some(0) => RecoveryResult::EscalationRequired {
+            reason: format!("recovery failed at first step for {}", scenario),
+        },
+        Some(failed_at) => {
+            // Steps before the failing index succeeded; the failing step and
+            // everything after it are still outstanding.
+            let (recovered, remaining) = steps.split_at(failed_at);
+            RecoveryResult::PartialRecovery {
+                recovered: recovered.to_vec(),
+                remaining: remaining.to_vec(),
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Every scenario from `FailureScenario::all()` yields a recipe that names
+    /// the same scenario, has at least one step, and permits one attempt.
+    #[test]
+    fn each_scenario_has_a_matching_recipe() {
+        for &scenario in FailureScenario::all() {
+            // when
+            let recipe = recipe_for(&scenario);
+
+            // then
+            assert_eq!(
+                recipe.scenario, scenario,
+                "recipe scenario should match requested scenario"
+            );
+            assert!(
+                !recipe.steps.is_empty(),
+                "recipe for {} should have at least one step",
+                scenario
+            );
+            assert!(
+                recipe.max_attempts >= 1,
+                "recipe for {} should allow at least one attempt",
+                scenario
+            );
+        }
+    }
+
+    /// A first attempt with no simulated failure recovers and logs exactly
+    /// two events: the attempt itself, then `RecoverySucceeded`.
+    #[test]
+    fn successful_recovery_returns_recovered_and_emits_events() {
+        // given
+        let scenario = FailureScenario::TrustPromptUnresolved;
+        let mut ctx = RecoveryContext::new();
+
+        // when
+        let outcome = attempt_recovery(&scenario, &mut ctx);
+
+        // then
+        assert_eq!(outcome, RecoveryResult::Recovered { steps_taken: 1 });
+        assert_eq!(ctx.events().len(), 2);
+        // Check the whole log in one slice pattern: the attempt, then the verdict.
+        assert!(matches!(
+            ctx.events(),
+            [
+                RecoveryEvent::RecoveryAttempted {
+                    scenario: FailureScenario::TrustPromptUnresolved,
+                    result: RecoveryResult::Recovered { steps_taken: 1 },
+                    ..
+                },
+                RecoveryEvent::RecoverySucceeded,
+            ]
+        ));
+    }
+
+    /// A second attempt for the same scenario is refused: it escalates, the
+    /// attempt count stays at one, and an `Escalated` event is logged.
+    #[test]
+    fn escalation_after_max_attempts_exceeded() {
+        // given
+        let scenario = FailureScenario::PromptMisdelivery;
+        let mut ctx = RecoveryContext::new();
+
+        // when — the first attempt uses up the budget
+        let first = attempt_recovery(&scenario, &mut ctx);
+        assert!(matches!(first, RecoveryResult::Recovered { .. }));
+
+        // when — the second attempt exceeds it
+        let second = attempt_recovery(&scenario, &mut ctx);
+
+        // then
+        let escalated_for_budget = match &second {
+            RecoveryResult::EscalationRequired { reason } => {
+                reason.contains("max recovery attempts")
+            }
+            _ => false,
+        };
+        assert!(
+            escalated_for_budget,
+            "second attempt should require escalation, got: {second:?}"
+        );
+        assert_eq!(ctx.attempt_count(&scenario), 1);
+        assert!(ctx.events().contains(&RecoveryEvent::Escalated));
+    }
+
+    /// Failing the second of two steps reports the first as recovered, the
+    /// second as remaining, and logs `RecoveryFailed`.
+    #[test]
+    fn partial_recovery_when_step_fails_midway() {
+        // given — PartialPluginStartup has two steps; fail at step index 1
+        let scenario = FailureScenario::PartialPluginStartup;
+        let mut ctx = RecoveryContext::new().with_fail_at_step(1);
+
+        // when
+        let result = attempt_recovery(&scenario, &mut ctx);
+
+        // then
+        match &result {
+            RecoveryResult::PartialRecovery {
+                recovered,
+                remaining,
+            } => {
+                assert_eq!(recovered.len(), 1, "one step should have succeeded");
+                assert_eq!(remaining.len(), 1, "one step should remain");
+                assert!(matches!(
+                    recovered.first(),
+                    Some(RecoveryStep::RestartPlugin { .. })
+                ));
+                assert!(matches!(
+                    remaining.first(),
+                    Some(RecoveryStep::RetryMcpHandshake { .. })
+                ));
+            }
+            other => panic!("expected PartialRecovery, got {other:?}"),
+        }
+        assert!(ctx.events().contains(&RecoveryEvent::RecoveryFailed));
+    }
+
+    /// Failing the very first step escalates rather than reporting a partial
+    /// recovery, and logs `Escalated`.
+    #[test]
+    fn first_step_failure_escalates_immediately() {
+        // given — fail at step index 0
+        let scenario = FailureScenario::CompileRedCrossCrate;
+        let mut ctx = RecoveryContext::new().with_fail_at_step(0);
+
+        // when
+        let result = attempt_recovery(&scenario, &mut ctx);
+
+        // then
+        let escalated_at_first_step = match &result {
+            RecoveryResult::EscalationRequired { reason } => {
+                reason.contains("failed at first step")
+            }
+            _ => false,
+        };
+        assert!(
+            escalated_at_first_step,
+            "zero-step failure should escalate, got: {result:?}"
+        );
+        assert!(ctx.events().contains(&RecoveryEvent::Escalated));
+    }
+
+    /// The `RecoveryAttempted` event carries the scenario, recipe, and result,
+    /// and serializes to JSON containing the snake_case scenario name.
+    #[test]
+    fn emitted_events_include_structured_attempt_data() {
+        // given
+        let scenario = FailureScenario::McpHandshakeFailure;
+        let mut ctx = RecoveryContext::new();
+
+        // when
+        let _ = attempt_recovery(&scenario, &mut ctx);
+
+        // then — pull the payload out of the first RecoveryAttempted event
+        let (logged_scenario, logged_recipe, logged_result) = ctx
+            .events()
+            .iter()
+            .find_map(|event| match event {
+                RecoveryEvent::RecoveryAttempted {
+                    scenario,
+                    recipe,
+                    result,
+                } => Some((scenario, recipe, result)),
+                _ => None,
+            })
+            .expect("should have emitted RecoveryAttempted event");
+
+        assert_eq!(*logged_scenario, scenario);
+        assert_eq!(logged_recipe.scenario, scenario);
+        assert!(!logged_recipe.steps.is_empty());
+        assert!(matches!(logged_result, RecoveryResult::Recovered { .. }));
+
+        // Verify the event is serializable as structured JSON
+        let json = serde_json::to_string(&ctx.events()[0])
+            .expect("recovery event should be serializable to JSON");
+        assert!(
+            json.contains("mcp_handshake_failure"),
+            "serialized event should contain scenario name"
+        );
+    }
+
+    /// Attempt counts are kept per scenario: attempting one scenario leaves
+    /// the count of another at zero.
+    #[test]
+    fn recovery_context_tracks_attempts_per_scenario() {
+        // given
+        let attempted = FailureScenario::StaleBranch;
+        let untouched = FailureScenario::PromptMisdelivery;
+        let mut ctx = RecoveryContext::new();
+        assert_eq!(ctx.attempt_count(&attempted), 0);
+
+        // when
+        let _ = attempt_recovery(&attempted, &mut ctx);
+
+        // then
+        assert_eq!(ctx.attempt_count(&attempted), 1);
+        assert_eq!(ctx.attempt_count(&untouched), 0);
+    }
+
+    /// The stale-branch recipe is exactly: rebase, then clean build.
+    #[test]
+    fn stale_branch_recipe_has_rebase_then_clean_build() {
+        // given
+        let expected_steps = vec![RecoveryStep::RebaseBranch, RecoveryStep::CleanBuild];
+
+        // when
+        let recipe = recipe_for(&FailureScenario::StaleBranch);
+
+        // then — comparing whole vectors checks both length and order
+        assert_eq!(recipe.steps, expected_steps);
+    }
+
+    /// The partial-plugin-startup recipe restarts the plugin, retries the MCP
+    /// handshake with a timeout of 3000, and uses `LogAndContinue`.
+    #[test]
+    fn partial_plugin_startup_recipe_has_restart_then_handshake() {
+        // given
+        let RecoveryRecipe {
+            steps,
+            escalation_policy,
+            ..
+        } = recipe_for(&FailureScenario::PartialPluginStartup);
+
+        // then
+        assert_eq!(steps.len(), 2);
+        assert!(matches!(
+            steps.as_slice(),
+            [
+                RecoveryStep::RestartPlugin { .. },
+                RecoveryStep::RetryMcpHandshake { timeout: 3000 },
+            ]
+        ));
+        assert_eq!(escalation_policy, EscalationPolicy::LogAndContinue);
+    }
+
+    /// `Display` renders every variant as its snake_case name.
+    #[test]
+    fn failure_scenario_display_all_variants() {
+        // given — expected rendering for each variant
+        let expectations = [
+            (
+                "trust_prompt_unresolved",
+                FailureScenario::TrustPromptUnresolved,
+            ),
+            ("prompt_misdelivery", FailureScenario::PromptMisdelivery),
+            ("stale_branch", FailureScenario::StaleBranch),
+            (
+                "compile_red_cross_crate",
+                FailureScenario::CompileRedCrossCrate,
+            ),
+            (
+                "mcp_handshake_failure",
+                FailureScenario::McpHandshakeFailure,
+            ),
+            (
+                "partial_plugin_startup",
+                FailureScenario::PartialPluginStartup,
+            ),
+        ];
+
+        // when / then
+        for (expected, scenario) in expectations {
+            assert_eq!(scenario.to_string(), expected);
+        }
+    }
+
+    /// A fully successful two-step recipe reports `steps_taken: 2`.
+    #[test]
+    fn multi_step_success_reports_correct_steps_taken() {
+        // given — StaleBranch has 2 steps, no simulated failure
+        let mut ctx = RecoveryContext::new();
+        let expected = RecoveryResult::Recovered { steps_taken: 2 };
+
+        // when
+        let outcome = attempt_recovery(&FailureScenario::StaleBranch, &mut ctx);
+
+        // then
+        assert_eq!(outcome, expected);
+    }
+
+    /// The MCP handshake recipe aborts on exhaustion and allows one attempt.
+    #[test]
+    fn mcp_handshake_recipe_uses_abort_escalation_policy() {
+        // given
+        let RecoveryRecipe {
+            escalation_policy,
+            max_attempts,
+            ..
+        } = recipe_for(&FailureScenario::McpHandshakeFailure);
+
+        // then
+        assert_eq!(escalation_policy, EscalationPolicy::Abort);
+        assert_eq!(max_attempts, 1);
+    }
+}