diff options
| author | soryu <soryu@soryu.co> | 2026-02-01 00:52:02 +0000 |
|---|---|---|
| committer | soryu <soryu@soryu.co> | 2026-02-01 00:52:02 +0000 |
| commit | c076455ea651a6be3764af7392ff41175ec0bed1 (patch) | |
| tree | ee30a9055ac2be46379ddf6dd245b069ea26521c | |
| parent | 999ecf644f58af7de0b0a36b22a69897d8056a1c (diff) | |
| download | soryu-makima/task-task-8958634f-8958634f.tar.gz soryu-makima/task-task-8958634f-8958634f.zip | |
[WIP] Heartbeat checkpoint - 2026-02-01 00:52:02 UTCmakima/task-task-8958634f-8958634f
| -rw-r--r-- | makima/src/db/models.rs | 2 | ||||
| -rw-r--r-- | makima/src/server/handlers/mesh_daemon.rs | 51 | ||||
| -rw-r--r-- | makima/src/server/handlers/mesh_supervisor.rs | 82 |
3 files changed, 111 insertions, 24 deletions
diff --git a/makima/src/db/models.rs b/makima/src/db/models.rs index a7d2cda..ca6409c 100644 --- a/makima/src/db/models.rs +++ b/makima/src/db/models.rs @@ -2155,7 +2155,7 @@ pub enum SupervisorSavePoint { /// Supervisor restoration context /// Contains all information needed to restore a supervisor after crash -#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] +#[derive(Debug, Clone, Serialize, ToSchema)] #[serde(rename_all = "camelCase")] pub struct SupervisorRestorationContext { /// The restored supervisor state diff --git a/makima/src/server/handlers/mesh_daemon.rs b/makima/src/server/handlers/mesh_daemon.rs index 4c6a045..b7398a8 100644 --- a/makima/src/server/handlers/mesh_daemon.rs +++ b/makima/src/server/handlers/mesh_daemon.rs @@ -536,6 +536,57 @@ pub enum DaemonMessage { #[serde(rename = "baseSha")] base_sha: String, }, + /// Supervisor state update for crash recovery. + /// Sent periodically or at key save points to persist state. + SupervisorStateUpdate { + /// Task ID of the supervisor. + #[serde(rename = "taskId")] + task_id: Uuid, + /// Contract ID. + #[serde(rename = "contractId")] + contract_id: Uuid, + /// Save point type that triggered this update. + #[serde(rename = "savePoint")] + save_point: String, + /// Current supervisor activity state. + state: Option<String>, + /// Human-readable current activity. + #[serde(rename = "currentActivity")] + current_activity: Option<String>, + /// Progress percentage (0-100). + progress: Option<i32>, + /// Last LLM response for context restoration. + #[serde(rename = "lastLlmResponse")] + last_llm_response: Option<String>, + /// Task that was just spawned (if save_point is "task_spawn"). + #[serde(rename = "spawnedTaskId")] + spawned_task_id: Option<Uuid>, + /// Question ID (if save_point is "question_asked"). + #[serde(rename = "questionId")] + question_id: Option<Uuid>, + /// Error message (if state is "error"). + #[serde(rename = "errorMessage")] + error_message: Option<String>, + /// Updated conversation history (sent on llm_response save points). + #[serde(rename = "conversationHistory")] + conversation_history: Option<serde_json::Value>, + }, + /// Supervisor heartbeat for lightweight state updates. + SupervisorHeartbeat { + /// Task ID of the supervisor. + #[serde(rename = "taskId")] + task_id: Uuid, + /// Contract ID. + #[serde(rename = "contractId")] + contract_id: Uuid, + /// Current state (optional). + state: Option<String>, + /// Current activity description (optional). + #[serde(rename = "currentActivity")] + current_activity: Option<String>, + /// Progress percentage (optional). + progress: Option<i32>, + }, } /// Validated daemon authentication result. diff --git a/makima/src/server/handlers/mesh_supervisor.rs b/makima/src/server/handlers/mesh_supervisor.rs index b33c1c9..0e4b18b 100644 --- a/makima/src/server/handlers/mesh_supervisor.rs +++ b/makima/src/server/handlers/mesh_supervisor.rs @@ -2988,9 +2988,7 @@ pub async fn get_restoration_context( }), ).into_response() } - repository::StateValidationResult::Valid(supervisor_state) | - repository::StateValidationResult::PartiallyValid { state: supervisor_state, .. } | - repository::StateValidationResult::PhaseStale { state: supervisor_state, .. } => { + repository::StateValidationResult::Valid(supervisor_state) => { // Get full restoration context let (pending_tasks, pending_question) = get_restoration_details( pool, @@ -2999,35 +2997,73 @@ pub async fn get_restoration_context( auth_info.owner_id, ).await; - let restoration_type = match validation { - repository::StateValidationResult::Valid(_) => "full_restore", - repository::StateValidationResult::PartiallyValid { .. } => "partial_restore", - repository::StateValidationResult::PhaseStale { .. } => "checkpoint_restore", - _ => "unknown", - }; + let restoration_count = supervisor_state.restoration_count.unwrap_or(0); + let last_llm_response = supervisor_state.last_llm_response.clone(); + let message = format!("Supervisor state found. Last activity: {}. Restoring from {} phase.", + supervisor_state.last_activity.format("%Y-%m-%d %H:%M:%S UTC"), + supervisor_state.phase); - let message = match validation { - repository::StateValidationResult::Valid(_) => - format!("Supervisor state found. Last activity: {}. Restoring from {} phase.", - supervisor_state.last_activity.format("%Y-%m-%d %H:%M:%S UTC"), - supervisor_state.phase), - repository::StateValidationResult::PartiallyValid { invalid_task_ids, .. } => - format!("Partial state found. {} task(s) no longer exist. Restoring with available context.", - invalid_task_ids.len()), - repository::StateValidationResult::PhaseStale { current_phase, .. } => - format!("State found but phase changed from {} to {}. Restoring with updated phase.", - supervisor_state.phase, current_phase), - _ => "Unknown restoration type".to_string(), - }; + ( + StatusCode::OK, + Json(RestorationContextResponse { + found: true, + restoration_type: "full_restore".to_string(), + message, + state: Some(supervisor_state), + pending_tasks, + pending_question, + last_llm_response, + restoration_count, + }), + ).into_response() + } + repository::StateValidationResult::PartiallyValid { state: supervisor_state, invalid_task_ids } => { + // Get full restoration context + let (pending_tasks, pending_question) = get_restoration_details( + pool, + &state, + &supervisor_state, + auth_info.owner_id, + ).await; + + let restoration_count = supervisor_state.restoration_count.unwrap_or(0); + let last_llm_response = supervisor_state.last_llm_response.clone(); + let message = format!("Partial state found. {} task(s) no longer exist. Restoring with available context.", + invalid_task_ids.len()); + + ( + StatusCode::OK, + Json(RestorationContextResponse { + found: true, + restoration_type: "partial_restore".to_string(), + message, + state: Some(supervisor_state), + pending_tasks, + pending_question, + last_llm_response, + restoration_count, + }), + ).into_response() + } + repository::StateValidationResult::PhaseStale { state: supervisor_state, current_phase } => { + // Get full restoration context + let (pending_tasks, pending_question) = get_restoration_details( + pool, + &state, + &supervisor_state, + auth_info.owner_id, + ).await; let restoration_count = supervisor_state.restoration_count.unwrap_or(0); let last_llm_response = supervisor_state.last_llm_response.clone(); + let message = format!("State found but phase changed from {} to {}. Restoring with updated phase.", + supervisor_state.phase, current_phase); ( StatusCode::OK, Json(RestorationContextResponse { found: true, - restoration_type: restoration_type.to_string(), + restoration_type: "checkpoint_restore".to_string(), message, state: Some(supervisor_state), pending_tasks, |
