//! mcp_lifecycle_hardened.rs — MCP lifecycle phases, structured error surfaces,
//! phase-transition validation, and degraded-mode reporting.
  1. use std::collections::{BTreeMap, BTreeSet};
  2. use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
  3. use serde::{Deserialize, Serialize};
  4. fn now_secs() -> u64 {
  5. SystemTime::now()
  6. .duration_since(UNIX_EPOCH)
  7. .unwrap_or_default()
  8. .as_secs()
  9. }
/// A phase in the MCP lifecycle tracked by this module.
///
/// Serialized with `snake_case` variant names. The derived `Ord` follows the
/// declaration order below, which is also the order returned by
/// [`McpLifecyclePhase::all`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum McpLifecyclePhase {
    ConfigLoad,
    ServerRegistration,
    SpawnConnect,
    InitializeHandshake,
    ToolDiscovery,
    ResourceDiscovery,
    Ready,
    Invocation,
    ErrorSurfacing,
    Shutdown,
    Cleanup,
}
  25. impl McpLifecyclePhase {
  26. #[must_use]
  27. pub fn all() -> [Self; 11] {
  28. [
  29. Self::ConfigLoad,
  30. Self::ServerRegistration,
  31. Self::SpawnConnect,
  32. Self::InitializeHandshake,
  33. Self::ToolDiscovery,
  34. Self::ResourceDiscovery,
  35. Self::Ready,
  36. Self::Invocation,
  37. Self::ErrorSurfacing,
  38. Self::Shutdown,
  39. Self::Cleanup,
  40. ]
  41. }
  42. }
  43. impl std::fmt::Display for McpLifecyclePhase {
  44. fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  45. match self {
  46. Self::ConfigLoad => write!(f, "config_load"),
  47. Self::ServerRegistration => write!(f, "server_registration"),
  48. Self::SpawnConnect => write!(f, "spawn_connect"),
  49. Self::InitializeHandshake => write!(f, "initialize_handshake"),
  50. Self::ToolDiscovery => write!(f, "tool_discovery"),
  51. Self::ResourceDiscovery => write!(f, "resource_discovery"),
  52. Self::Ready => write!(f, "ready"),
  53. Self::Invocation => write!(f, "invocation"),
  54. Self::ErrorSurfacing => write!(f, "error_surfacing"),
  55. Self::Shutdown => write!(f, "shutdown"),
  56. Self::Cleanup => write!(f, "cleanup"),
  57. }
  58. }
  59. }
/// A structured lifecycle error attributed to a phase and, optionally, a server.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct McpErrorSurface {
    /// Phase the error is attributed to.
    pub phase: McpLifecyclePhase,
    /// Name of the server involved, when one is known.
    pub server_name: Option<String>,
    /// Human-readable description of the error.
    pub message: String,
    /// Additional key/value details attached to the error.
    pub context: BTreeMap<String, String>,
    /// Recoverability flag; the lifecycle state consults it when deciding
    /// whether a return to `Ready` is allowed after an error.
    pub recoverable: bool,
    /// Seconds since the Unix epoch; `McpErrorSurface::new` fills this from `now_secs()`.
    pub timestamp: u64,
}
  69. impl McpErrorSurface {
  70. #[must_use]
  71. pub fn new(
  72. phase: McpLifecyclePhase,
  73. server_name: Option<String>,
  74. message: impl Into<String>,
  75. context: BTreeMap<String, String>,
  76. recoverable: bool,
  77. ) -> Self {
  78. Self {
  79. phase,
  80. server_name,
  81. message: message.into(),
  82. context,
  83. recoverable,
  84. timestamp: now_secs(),
  85. }
  86. }
  87. }
  88. impl std::fmt::Display for McpErrorSurface {
  89. fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  90. write!(
  91. f,
  92. "MCP lifecycle error during {}: {}",
  93. self.phase, self.message
  94. )?;
  95. if let Some(server_name) = &self.server_name {
  96. write!(f, " (server: {server_name})")?;
  97. }
  98. if !self.context.is_empty() {
  99. write!(f, " with context {:?}", self.context)?;
  100. }
  101. if self.recoverable {
  102. write!(f, " [recoverable]")?;
  103. }
  104. Ok(())
  105. }
  106. }
  107. impl std::error::Error for McpErrorSurface {}
  108. #[derive(Debug, Clone, PartialEq, Eq)]
  109. pub enum McpPhaseResult {
  110. Success {
  111. phase: McpLifecyclePhase,
  112. duration: Duration,
  113. },
  114. Failure {
  115. phase: McpLifecyclePhase,
  116. error: McpErrorSurface,
  117. },
  118. Timeout {
  119. phase: McpLifecyclePhase,
  120. waited: Duration,
  121. error: McpErrorSurface,
  122. },
  123. }
  124. impl McpPhaseResult {
  125. #[must_use]
  126. pub fn phase(&self) -> McpLifecyclePhase {
  127. match self {
  128. Self::Success { phase, .. }
  129. | Self::Failure { phase, .. }
  130. | Self::Timeout { phase, .. } => *phase,
  131. }
  132. }
  133. }
  134. #[derive(Debug, Clone, Default)]
  135. pub struct McpLifecycleState {
  136. current_phase: Option<McpLifecyclePhase>,
  137. phase_errors: BTreeMap<McpLifecyclePhase, Vec<McpErrorSurface>>,
  138. phase_timestamps: BTreeMap<McpLifecyclePhase, u64>,
  139. phase_results: Vec<McpPhaseResult>,
  140. }
  141. impl McpLifecycleState {
  142. #[must_use]
  143. pub fn new() -> Self {
  144. Self::default()
  145. }
  146. #[must_use]
  147. pub fn current_phase(&self) -> Option<McpLifecyclePhase> {
  148. self.current_phase
  149. }
  150. #[must_use]
  151. pub fn errors_for_phase(&self, phase: McpLifecyclePhase) -> &[McpErrorSurface] {
  152. self.phase_errors
  153. .get(&phase)
  154. .map(Vec::as_slice)
  155. .unwrap_or(&[])
  156. }
  157. #[must_use]
  158. pub fn results(&self) -> &[McpPhaseResult] {
  159. &self.phase_results
  160. }
  161. #[must_use]
  162. pub fn phase_timestamps(&self) -> &BTreeMap<McpLifecyclePhase, u64> {
  163. &self.phase_timestamps
  164. }
  165. #[must_use]
  166. pub fn phase_timestamp(&self, phase: McpLifecyclePhase) -> Option<u64> {
  167. self.phase_timestamps.get(&phase).copied()
  168. }
  169. fn record_phase(&mut self, phase: McpLifecyclePhase) {
  170. self.current_phase = Some(phase);
  171. self.phase_timestamps.insert(phase, now_secs());
  172. }
  173. fn record_error(&mut self, error: McpErrorSurface) {
  174. self.phase_errors
  175. .entry(error.phase)
  176. .or_default()
  177. .push(error);
  178. }
  179. fn record_result(&mut self, result: McpPhaseResult) {
  180. self.phase_results.push(result);
  181. }
  182. fn can_resume_after_error(&self) -> bool {
  183. match self.phase_results.last() {
  184. Some(McpPhaseResult::Failure { error, .. } | McpPhaseResult::Timeout { error, .. }) => {
  185. error.recoverable
  186. }
  187. _ => false,
  188. }
  189. }
  190. }
/// A server listed as failed in an [`McpDegradedReport`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct McpFailedServer {
    /// Name of the failed server.
    pub server_name: String,
    /// Lifecycle phase associated with the failure.
    pub phase: McpLifecyclePhase,
    /// Structured error describing the failure.
    pub error: McpErrorSurface,
}
/// Summary of which servers and tools are usable and which are not.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct McpDegradedReport {
    /// Working server names; `McpDegradedReport::new` sorts and de-duplicates them.
    pub working_servers: Vec<String>,
    /// Failed servers, stored by `McpDegradedReport::new` exactly as supplied.
    pub failed_servers: Vec<McpFailedServer>,
    /// Available tool names; `McpDegradedReport::new` sorts and de-duplicates them.
    pub available_tools: Vec<String>,
    /// Expected tool names that `McpDegradedReport::new` did not find among the available tools.
    pub missing_tools: Vec<String>,
}
  204. impl McpDegradedReport {
  205. #[must_use]
  206. pub fn new(
  207. working_servers: Vec<String>,
  208. failed_servers: Vec<McpFailedServer>,
  209. available_tools: Vec<String>,
  210. expected_tools: Vec<String>,
  211. ) -> Self {
  212. let working_servers = dedupe_sorted(working_servers);
  213. let available_tools = dedupe_sorted(available_tools);
  214. let available_tool_set: BTreeSet<_> = available_tools.iter().cloned().collect();
  215. let expected_tools = dedupe_sorted(expected_tools);
  216. let missing_tools = expected_tools
  217. .into_iter()
  218. .filter(|tool| !available_tool_set.contains(tool))
  219. .collect();
  220. Self {
  221. working_servers,
  222. failed_servers,
  223. available_tools,
  224. missing_tools,
  225. }
  226. }
  227. }
  228. #[derive(Debug, Clone, Default)]
  229. pub struct McpLifecycleValidator {
  230. state: McpLifecycleState,
  231. }
  232. impl McpLifecycleValidator {
  233. #[must_use]
  234. pub fn new() -> Self {
  235. Self::default()
  236. }
  237. #[must_use]
  238. pub fn state(&self) -> &McpLifecycleState {
  239. &self.state
  240. }
  241. #[must_use]
  242. pub fn validate_phase_transition(from: McpLifecyclePhase, to: McpLifecyclePhase) -> bool {
  243. match (from, to) {
  244. (McpLifecyclePhase::ConfigLoad, McpLifecyclePhase::ServerRegistration)
  245. | (McpLifecyclePhase::ServerRegistration, McpLifecyclePhase::SpawnConnect)
  246. | (McpLifecyclePhase::SpawnConnect, McpLifecyclePhase::InitializeHandshake)
  247. | (McpLifecyclePhase::InitializeHandshake, McpLifecyclePhase::ToolDiscovery)
  248. | (McpLifecyclePhase::ToolDiscovery, McpLifecyclePhase::ResourceDiscovery)
  249. | (McpLifecyclePhase::ToolDiscovery, McpLifecyclePhase::Ready)
  250. | (McpLifecyclePhase::ResourceDiscovery, McpLifecyclePhase::Ready)
  251. | (McpLifecyclePhase::Ready, McpLifecyclePhase::Invocation)
  252. | (McpLifecyclePhase::Invocation, McpLifecyclePhase::Ready)
  253. | (McpLifecyclePhase::ErrorSurfacing, McpLifecyclePhase::Ready)
  254. | (McpLifecyclePhase::ErrorSurfacing, McpLifecyclePhase::Shutdown)
  255. | (McpLifecyclePhase::Shutdown, McpLifecyclePhase::Cleanup) => true,
  256. (_, McpLifecyclePhase::Shutdown) => from != McpLifecyclePhase::Cleanup,
  257. (_, McpLifecyclePhase::ErrorSurfacing) => {
  258. from != McpLifecyclePhase::Cleanup && from != McpLifecyclePhase::Shutdown
  259. }
  260. _ => false,
  261. }
  262. }
  263. pub fn run_phase(&mut self, phase: McpLifecyclePhase) -> McpPhaseResult {
  264. let started = Instant::now();
  265. if let Some(current_phase) = self.state.current_phase() {
  266. if current_phase == McpLifecyclePhase::ErrorSurfacing
  267. && phase == McpLifecyclePhase::Ready
  268. && !self.state.can_resume_after_error()
  269. {
  270. return self.record_failure(McpErrorSurface::new(
  271. phase,
  272. None,
  273. "cannot return to ready after a non-recoverable MCP lifecycle failure",
  274. BTreeMap::from([
  275. ("from".to_string(), current_phase.to_string()),
  276. ("to".to_string(), phase.to_string()),
  277. ]),
  278. false,
  279. ));
  280. }
  281. if !Self::validate_phase_transition(current_phase, phase) {
  282. return self.record_failure(McpErrorSurface::new(
  283. phase,
  284. None,
  285. format!("invalid MCP lifecycle transition from {current_phase} to {phase}"),
  286. BTreeMap::from([
  287. ("from".to_string(), current_phase.to_string()),
  288. ("to".to_string(), phase.to_string()),
  289. ]),
  290. false,
  291. ));
  292. }
  293. } else if phase != McpLifecyclePhase::ConfigLoad {
  294. return self.record_failure(McpErrorSurface::new(
  295. phase,
  296. None,
  297. format!("invalid initial MCP lifecycle phase {phase}"),
  298. BTreeMap::from([("phase".to_string(), phase.to_string())]),
  299. false,
  300. ));
  301. }
  302. self.state.record_phase(phase);
  303. let result = McpPhaseResult::Success {
  304. phase,
  305. duration: started.elapsed(),
  306. };
  307. self.state.record_result(result.clone());
  308. result
  309. }
  310. pub fn record_failure(&mut self, error: McpErrorSurface) -> McpPhaseResult {
  311. let phase = error.phase;
  312. self.state.record_error(error.clone());
  313. self.state.record_phase(McpLifecyclePhase::ErrorSurfacing);
  314. let result = McpPhaseResult::Failure { phase, error };
  315. self.state.record_result(result.clone());
  316. result
  317. }
  318. pub fn record_timeout(
  319. &mut self,
  320. phase: McpLifecyclePhase,
  321. waited: Duration,
  322. server_name: Option<String>,
  323. mut context: BTreeMap<String, String>,
  324. ) -> McpPhaseResult {
  325. context.insert("waited_ms".to_string(), waited.as_millis().to_string());
  326. let error = McpErrorSurface::new(
  327. phase,
  328. server_name,
  329. format!(
  330. "MCP lifecycle phase {phase} timed out after {} ms",
  331. waited.as_millis()
  332. ),
  333. context,
  334. true,
  335. );
  336. self.state.record_error(error.clone());
  337. self.state.record_phase(McpLifecyclePhase::ErrorSurfacing);
  338. let result = McpPhaseResult::Timeout {
  339. phase,
  340. waited,
  341. error,
  342. };
  343. self.state.record_result(result.clone());
  344. result
  345. }
  346. }
  347. fn dedupe_sorted(mut values: Vec<String>) -> Vec<String> {
  348. values.sort();
  349. values.dedup();
  350. values
  351. }
  352. #[cfg(test)]
  353. mod tests {
  354. use super::*;
  355. use serde_json::json;
  356. #[test]
  357. fn phase_display_matches_serde_name() {
  358. // given
  359. let phases = McpLifecyclePhase::all();
  360. // when
  361. let serialized = phases
  362. .into_iter()
  363. .map(|phase| {
  364. (
  365. phase.to_string(),
  366. serde_json::to_value(phase).expect("serialize phase"),
  367. )
  368. })
  369. .collect::<Vec<_>>();
  370. // then
  371. for (display, json_value) in serialized {
  372. assert_eq!(json_value, json!(display));
  373. }
  374. }
  375. #[test]
  376. fn given_startup_path_when_running_to_cleanup_then_each_control_transition_succeeds() {
  377. // given
  378. let mut validator = McpLifecycleValidator::new();
  379. let phases = [
  380. McpLifecyclePhase::ConfigLoad,
  381. McpLifecyclePhase::ServerRegistration,
  382. McpLifecyclePhase::SpawnConnect,
  383. McpLifecyclePhase::InitializeHandshake,
  384. McpLifecyclePhase::ToolDiscovery,
  385. McpLifecyclePhase::ResourceDiscovery,
  386. McpLifecyclePhase::Ready,
  387. McpLifecyclePhase::Invocation,
  388. McpLifecyclePhase::Ready,
  389. McpLifecyclePhase::Shutdown,
  390. McpLifecyclePhase::Cleanup,
  391. ];
  392. // when
  393. let results = phases
  394. .into_iter()
  395. .map(|phase| validator.run_phase(phase))
  396. .collect::<Vec<_>>();
  397. // then
  398. assert!(results
  399. .iter()
  400. .all(|result| matches!(result, McpPhaseResult::Success { .. })));
  401. assert_eq!(
  402. validator.state().current_phase(),
  403. Some(McpLifecyclePhase::Cleanup)
  404. );
  405. for phase in [
  406. McpLifecyclePhase::ConfigLoad,
  407. McpLifecyclePhase::ServerRegistration,
  408. McpLifecyclePhase::SpawnConnect,
  409. McpLifecyclePhase::InitializeHandshake,
  410. McpLifecyclePhase::ToolDiscovery,
  411. McpLifecyclePhase::ResourceDiscovery,
  412. McpLifecyclePhase::Ready,
  413. McpLifecyclePhase::Invocation,
  414. McpLifecyclePhase::Shutdown,
  415. McpLifecyclePhase::Cleanup,
  416. ] {
  417. assert!(validator.state().phase_timestamp(phase).is_some());
  418. }
  419. }
  420. #[test]
  421. fn given_tool_discovery_when_resource_discovery_is_skipped_then_ready_is_still_allowed() {
  422. // given
  423. let mut validator = McpLifecycleValidator::new();
  424. for phase in [
  425. McpLifecyclePhase::ConfigLoad,
  426. McpLifecyclePhase::ServerRegistration,
  427. McpLifecyclePhase::SpawnConnect,
  428. McpLifecyclePhase::InitializeHandshake,
  429. McpLifecyclePhase::ToolDiscovery,
  430. ] {
  431. let result = validator.run_phase(phase);
  432. assert!(matches!(result, McpPhaseResult::Success { .. }));
  433. }
  434. // when
  435. let result = validator.run_phase(McpLifecyclePhase::Ready);
  436. // then
  437. assert!(matches!(result, McpPhaseResult::Success { .. }));
  438. assert_eq!(
  439. validator.state().current_phase(),
  440. Some(McpLifecyclePhase::Ready)
  441. );
  442. }
  443. #[test]
  444. fn validates_expected_phase_transitions() {
  445. // given
  446. let valid_transitions = [
  447. (
  448. McpLifecyclePhase::ConfigLoad,
  449. McpLifecyclePhase::ServerRegistration,
  450. ),
  451. (
  452. McpLifecyclePhase::ServerRegistration,
  453. McpLifecyclePhase::SpawnConnect,
  454. ),
  455. (
  456. McpLifecyclePhase::SpawnConnect,
  457. McpLifecyclePhase::InitializeHandshake,
  458. ),
  459. (
  460. McpLifecyclePhase::InitializeHandshake,
  461. McpLifecyclePhase::ToolDiscovery,
  462. ),
  463. (
  464. McpLifecyclePhase::ToolDiscovery,
  465. McpLifecyclePhase::ResourceDiscovery,
  466. ),
  467. (McpLifecyclePhase::ToolDiscovery, McpLifecyclePhase::Ready),
  468. (
  469. McpLifecyclePhase::ResourceDiscovery,
  470. McpLifecyclePhase::Ready,
  471. ),
  472. (McpLifecyclePhase::Ready, McpLifecyclePhase::Invocation),
  473. (McpLifecyclePhase::Invocation, McpLifecyclePhase::Ready),
  474. (McpLifecyclePhase::Ready, McpLifecyclePhase::Shutdown),
  475. (
  476. McpLifecyclePhase::Invocation,
  477. McpLifecyclePhase::ErrorSurfacing,
  478. ),
  479. (
  480. McpLifecyclePhase::ErrorSurfacing,
  481. McpLifecyclePhase::Shutdown,
  482. ),
  483. (McpLifecyclePhase::Shutdown, McpLifecyclePhase::Cleanup),
  484. ];
  485. // when / then
  486. for (from, to) in valid_transitions {
  487. assert!(McpLifecycleValidator::validate_phase_transition(from, to));
  488. }
  489. assert!(!McpLifecycleValidator::validate_phase_transition(
  490. McpLifecyclePhase::Ready,
  491. McpLifecyclePhase::ConfigLoad,
  492. ));
  493. assert!(!McpLifecycleValidator::validate_phase_transition(
  494. McpLifecyclePhase::Cleanup,
  495. McpLifecyclePhase::Ready,
  496. ));
  497. }
  498. #[test]
  499. fn given_invalid_transition_when_running_phase_then_structured_failure_is_recorded() {
  500. // given
  501. let mut validator = McpLifecycleValidator::new();
  502. let _ = validator.run_phase(McpLifecyclePhase::ConfigLoad);
  503. let _ = validator.run_phase(McpLifecyclePhase::ServerRegistration);
  504. // when
  505. let result = validator.run_phase(McpLifecyclePhase::Ready);
  506. // then
  507. match result {
  508. McpPhaseResult::Failure { phase, error } => {
  509. assert_eq!(phase, McpLifecyclePhase::Ready);
  510. assert!(!error.recoverable);
  511. assert_eq!(error.phase, McpLifecyclePhase::Ready);
  512. assert_eq!(
  513. error.context.get("from").map(String::as_str),
  514. Some("server_registration")
  515. );
  516. assert_eq!(error.context.get("to").map(String::as_str), Some("ready"));
  517. }
  518. other => panic!("expected failure result, got {other:?}"),
  519. }
  520. assert_eq!(
  521. validator.state().current_phase(),
  522. Some(McpLifecyclePhase::ErrorSurfacing)
  523. );
  524. assert_eq!(
  525. validator
  526. .state()
  527. .errors_for_phase(McpLifecyclePhase::Ready)
  528. .len(),
  529. 1
  530. );
  531. }
  532. #[test]
  533. fn given_each_phase_when_failure_is_recorded_then_error_is_tracked_per_phase() {
  534. // given
  535. let mut validator = McpLifecycleValidator::new();
  536. // when / then
  537. for phase in McpLifecyclePhase::all() {
  538. let result = validator.record_failure(McpErrorSurface::new(
  539. phase,
  540. Some("alpha".to_string()),
  541. format!("failure at {phase}"),
  542. BTreeMap::from([("server".to_string(), "alpha".to_string())]),
  543. phase == McpLifecyclePhase::ResourceDiscovery,
  544. ));
  545. match result {
  546. McpPhaseResult::Failure { phase: failed_phase, error } => {
  547. assert_eq!(failed_phase, phase);
  548. assert_eq!(error.phase, phase);
  549. assert_eq!(
  550. error.recoverable,
  551. phase == McpLifecyclePhase::ResourceDiscovery
  552. );
  553. }
  554. other => panic!("expected failure result, got {other:?}"),
  555. }
  556. assert_eq!(validator.state().errors_for_phase(phase).len(), 1);
  557. }
  558. }
  559. #[test]
  560. fn given_spawn_connect_timeout_when_recorded_then_waited_duration_is_preserved() {
  561. // given
  562. let mut validator = McpLifecycleValidator::new();
  563. let waited = Duration::from_millis(250);
  564. // when
  565. let result = validator.record_timeout(
  566. McpLifecyclePhase::SpawnConnect,
  567. waited,
  568. Some("alpha".to_string()),
  569. BTreeMap::from([("attempt".to_string(), "1".to_string())]),
  570. );
  571. // then
  572. match result {
  573. McpPhaseResult::Timeout {
  574. phase,
  575. waited: actual,
  576. error,
  577. } => {
  578. assert_eq!(phase, McpLifecyclePhase::SpawnConnect);
  579. assert_eq!(actual, waited);
  580. assert!(error.recoverable);
  581. assert_eq!(error.server_name.as_deref(), Some("alpha"));
  582. }
  583. other => panic!("expected timeout result, got {other:?}"),
  584. }
  585. let errors = validator
  586. .state()
  587. .errors_for_phase(McpLifecyclePhase::SpawnConnect);
  588. assert_eq!(errors.len(), 1);
  589. assert_eq!(
  590. errors[0].context.get("waited_ms").map(String::as_str),
  591. Some("250")
  592. );
  593. assert_eq!(
  594. validator.state().current_phase(),
  595. Some(McpLifecyclePhase::ErrorSurfacing)
  596. );
  597. }
  598. #[test]
  599. fn given_partial_server_health_when_building_degraded_report_then_missing_tools_are_reported() {
  600. // given
  601. let failed = vec![McpFailedServer {
  602. server_name: "broken".to_string(),
  603. phase: McpLifecyclePhase::InitializeHandshake,
  604. error: McpErrorSurface::new(
  605. McpLifecyclePhase::InitializeHandshake,
  606. Some("broken".to_string()),
  607. "initialize failed",
  608. BTreeMap::from([("reason".to_string(), "broken pipe".to_string())]),
  609. false,
  610. ),
  611. }];
  612. // when
  613. let report = McpDegradedReport::new(
  614. vec!["alpha".to_string(), "beta".to_string(), "alpha".to_string()],
  615. failed,
  616. vec![
  617. "alpha.echo".to_string(),
  618. "beta.search".to_string(),
  619. "alpha.echo".to_string(),
  620. ],
  621. vec![
  622. "alpha.echo".to_string(),
  623. "beta.search".to_string(),
  624. "broken.fetch".to_string(),
  625. ],
  626. );
  627. // then
  628. assert_eq!(
  629. report.working_servers,
  630. vec!["alpha".to_string(), "beta".to_string()]
  631. );
  632. assert_eq!(report.failed_servers.len(), 1);
  633. assert_eq!(report.failed_servers[0].server_name, "broken");
  634. assert_eq!(
  635. report.available_tools,
  636. vec!["alpha.echo".to_string(), "beta.search".to_string()]
  637. );
  638. assert_eq!(report.missing_tools, vec!["broken.fetch".to_string()]);
  639. }
  640. #[test]
  641. fn given_failure_during_resource_discovery_when_shutting_down_then_cleanup_still_succeeds() {
  642. // given
  643. let mut validator = McpLifecycleValidator::new();
  644. for phase in [
  645. McpLifecyclePhase::ConfigLoad,
  646. McpLifecyclePhase::ServerRegistration,
  647. McpLifecyclePhase::SpawnConnect,
  648. McpLifecyclePhase::InitializeHandshake,
  649. McpLifecyclePhase::ToolDiscovery,
  650. ] {
  651. let result = validator.run_phase(phase);
  652. assert!(matches!(result, McpPhaseResult::Success { .. }));
  653. }
  654. let _ = validator.record_failure(McpErrorSurface::new(
  655. McpLifecyclePhase::ResourceDiscovery,
  656. Some("alpha".to_string()),
  657. "resource listing failed",
  658. BTreeMap::from([("reason".to_string(), "timeout".to_string())]),
  659. true,
  660. ));
  661. // when
  662. let shutdown = validator.run_phase(McpLifecyclePhase::Shutdown);
  663. let cleanup = validator.run_phase(McpLifecyclePhase::Cleanup);
  664. // then
  665. assert!(matches!(shutdown, McpPhaseResult::Success { .. }));
  666. assert!(matches!(cleanup, McpPhaseResult::Success { .. }));
  667. assert_eq!(
  668. validator.state().current_phase(),
  669. Some(McpLifecyclePhase::Cleanup)
  670. );
  671. assert!(validator
  672. .state()
  673. .phase_timestamp(McpLifecyclePhase::ErrorSurfacing)
  674. .is_some());
  675. }
  676. #[test]
  677. fn error_surface_display_includes_phase_server_and_recoverable_flag() {
  678. // given
  679. let error = McpErrorSurface::new(
  680. McpLifecyclePhase::SpawnConnect,
  681. Some("alpha".to_string()),
  682. "process exited early",
  683. BTreeMap::from([("exit_code".to_string(), "1".to_string())]),
  684. true,
  685. );
  686. // when
  687. let rendered = error.to_string();
  688. // then
  689. assert!(rendered.contains("spawn_connect"));
  690. assert!(rendered.contains("process exited early"));
  691. assert!(rendered.contains("server: alpha"));
  692. assert!(rendered.contains("recoverable"));
  693. let trait_object: &dyn std::error::Error = &error;
  694. assert_eq!(trait_object.to_string(), rendered);
  695. }
  696. #[test]
  697. fn given_nonrecoverable_failure_when_returning_to_ready_then_validator_rejects_resume() {
  698. // given
  699. let mut validator = McpLifecycleValidator::new();
  700. for phase in [
  701. McpLifecyclePhase::ConfigLoad,
  702. McpLifecyclePhase::ServerRegistration,
  703. McpLifecyclePhase::SpawnConnect,
  704. McpLifecyclePhase::InitializeHandshake,
  705. McpLifecyclePhase::ToolDiscovery,
  706. McpLifecyclePhase::Ready,
  707. ] {
  708. let result = validator.run_phase(phase);
  709. assert!(matches!(result, McpPhaseResult::Success { .. }));
  710. }
  711. let _ = validator.record_failure(McpErrorSurface::new(
  712. McpLifecyclePhase::Invocation,
  713. Some("alpha".to_string()),
  714. "tool call corrupted the session",
  715. BTreeMap::from([("reason".to_string(), "invalid frame".to_string())]),
  716. false,
  717. ));
  718. // when
  719. let result = validator.run_phase(McpLifecyclePhase::Ready);
  720. // then
  721. match result {
  722. McpPhaseResult::Failure { phase, error } => {
  723. assert_eq!(phase, McpLifecyclePhase::Ready);
  724. assert!(!error.recoverable);
  725. assert!(error.message.contains("non-recoverable"));
  726. }
  727. other => panic!("expected failure result, got {other:?}"),
  728. }
  729. assert_eq!(
  730. validator.state().current_phase(),
  731. Some(McpLifecyclePhase::ErrorSurfacing)
  732. );
  733. }
  734. #[test]
  735. fn given_recoverable_failure_when_returning_to_ready_then_validator_allows_resume() {
  736. // given
  737. let mut validator = McpLifecycleValidator::new();
  738. for phase in [
  739. McpLifecyclePhase::ConfigLoad,
  740. McpLifecyclePhase::ServerRegistration,
  741. McpLifecyclePhase::SpawnConnect,
  742. McpLifecyclePhase::InitializeHandshake,
  743. McpLifecyclePhase::ToolDiscovery,
  744. McpLifecyclePhase::Ready,
  745. ] {
  746. let result = validator.run_phase(phase);
  747. assert!(matches!(result, McpPhaseResult::Success { .. }));
  748. }
  749. let _ = validator.record_failure(McpErrorSurface::new(
  750. McpLifecyclePhase::Invocation,
  751. Some("alpha".to_string()),
  752. "tool call failed but can be retried",
  753. BTreeMap::from([("reason".to_string(), "upstream timeout".to_string())]),
  754. true,
  755. ));
  756. // when
  757. let result = validator.run_phase(McpLifecyclePhase::Ready);
  758. // then
  759. assert!(matches!(result, McpPhaseResult::Success { .. }));
  760. assert_eq!(
  761. validator.state().current_phase(),
  762. Some(McpLifecyclePhase::Ready)
  763. );
  764. }
  765. }