Add resume to daemon tasks

author: soryu <soryu@soryu.co> 2026-01-23 23:52:35 +0000
committer: soryu <soryu@soryu.co> 2026-01-23 23:52:35 +0000
commit: 579c983d3efb8f1414ffb45b9e031f741cce5f76 (patch)
tree: 1a0060f19a4f4eea8fb9cff9eb52a46cedcdc152
parent: f6f0790217d4098ffb6d2b3df08b0cf83ff61727 (diff)
download: soryu-579c983d3efb8f1414ffb45b9e031f741cce5f76.tar.gz
soryu-579c983d3efb8f1414ffb45b9e031f741cce5f76.zip
8 files changed, 583 insertions, 12 deletions
diff --git a/makima/src/bin/makima.rs b/makima/src/bin/makima.rs
index 1a307d1..96dc252 100644
--- a/makima/src/bin/makima.rs
+++ b/makima/src/bin/makima.rs
@@ -165,7 +165,7 @@ async fn run_daemon(
         "[3/5] Opening local database: {}",
         config.local_db.path.display()
     );
-    let _local_db = LocalDb::open(&config.local_db.path)?;
+    let local_db = Arc::new(std::sync::Mutex::new(LocalDb::open(&config.local_db.path)?));
     eprintln!("      Database opened");
 
     // Initialize worktree directories
@@ -244,8 +244,14 @@ async fn run_daemon(
         checkpoint_patches: config.process.checkpoint_patches.clone(),
     };
 
-    // Create task manager
-    let task_manager = Arc::new(TaskManager::new(task_config, ws_tx.clone()));
+    // Create task manager with local database for crash recovery
+    let task_manager = Arc::new(TaskManager::new(task_config, ws_tx.clone(), local_db));
+
+    // Recover any orphaned tasks from previous daemon run
+    let recovered = task_manager.recover_orphaned_tasks().await;
+    if !recovered.is_empty() {
+        eprintln!("      Recovered {} orphaned tasks with intact worktrees", recovered.len());
+    }
 
     // Spawn command handler
     let task_manager_clone = task_manager.clone();
@@ -260,6 +266,22 @@ async fn run_daemon(
         tracing::info!("Command handler stopped");
     });
 
+    // Spawn periodic worktree health check (every 60 seconds)
+    let health_check_manager = task_manager.clone();
+    tokio::spawn(async move {
+        let mut interval = tokio::time::interval(std::time::Duration::from_secs(60));
+        loop {
+            interval.tick().await;
+            let affected = health_check_manager.check_worktree_health().await;
+            if !affected.is_empty() {
+                tracing::info!(
+                    count = affected.len(),
+                    "Worktree health check detected missing worktrees - tasks marked for retry"
+                );
+            }
+        }
+    });
+
     // Handle shutdown signals
     let shutdown_signal = async {
         tokio::signal::ctrl_c()
diff --git a/makima/src/daemon/task/manager.rs b/makima/src/daemon/task/manager.rs
index cb4bde2..3fdde9b 100644
--- a/makima/src/daemon/task/manager.rs
+++ b/makima/src/daemon/task/manager.rs
@@ -21,6 +21,7 @@ use crate::daemon::process::{ClaudeInputMessage, ProcessManager};
 use crate::daemon::storage;
 use crate::daemon::temp::TempManager;
 use crate::daemon::worktree::{is_new_repo_request, ConflictResolution, WorktreeInfo, WorktreeManager};
+use crate::daemon::db::local::LocalDb;
 use crate::daemon::ws::{BranchInfo, DaemonCommand, DaemonMessage};
 
 /// Generate a secure random API key for orchestrator tool access.
@@ -1045,11 +1046,17 @@ pub struct TaskManager {
     git_user_email: Arc<RwLock<Option<String>>>,
     /// Inherited git user.name for worktrees.
     git_user_name: Arc<RwLock<Option<String>>>,
+    /// Local SQLite database for crash recovery.
+    local_db: Arc<std::sync::Mutex<LocalDb>>,
 }
 
 impl TaskManager {
-    /// Create a new task manager.
-    pub fn new(config: TaskConfig, ws_tx: mpsc::Sender<DaemonMessage>) -> Self {
+    /// Create a new task manager with local database for crash recovery.
+    pub fn new(
+        config: TaskConfig,
+        ws_tx: mpsc::Sender<DaemonMessage>,
+        local_db: Arc<std::sync::Mutex<LocalDb>>,
+    ) -> Self {
         let worktree_manager = Arc::new(WorktreeManager::new(config.worktree_base_dir.clone()));
         let process_manager = Arc::new(
             ProcessManager::with_command(config.claude_command.clone())
@@ -1075,9 +1082,239 @@ impl TaskManager {
             active_pids: Arc::new(RwLock::new(HashMap::new())),
             git_user_email: Arc::new(RwLock::new(None)),
             git_user_name: Arc::new(RwLock::new(None)),
+            local_db,
         }
     }
 
+    /// Persist task state to local SQLite database for crash recovery.
+    fn persist_task_to_local_db(&self, task: &ManagedTask) {
+        use crate::daemon::db::local::LocalTask;
+
+        let local_task = LocalTask {
+            id: task.id,
+            server_task_id: task.id, // Same as task id
+            state: task.state.clone(),
+            container_id: None,
+            overlay_path: task.worktree.as_ref().map(|w| w.path.to_string_lossy().to_string()),
+            repo_url: task.repo_source.clone(),
+            base_branch: task.base_branch.clone(),
+            plan: task.plan.clone(),
+            created_at: chrono::Utc::now(),
+            started_at: task.started_at.map(|_| chrono::Utc::now()),
+            completed_at: task.completed_at.map(|_| chrono::Utc::now()),
+            error_message: task.error.clone(),
+        };
+
+        if let Ok(db) = self.local_db.lock() {
+            if let Err(e) = db.save_task(&local_task) {
+                tracing::warn!(task_id = %task.id, error = %e, "Failed to persist task to local database");
+            } else {
+                tracing::debug!(task_id = %task.id, state = ?task.state, "Persisted task to local database");
+            }
+        }
+    }
+
+    /// Remove completed/failed task from local database.
+    fn remove_task_from_local_db(&self, task_id: Uuid) {
+        if let Ok(db) = self.local_db.lock() {
+            if let Err(e) = db.delete_task(task_id) {
+                tracing::warn!(task_id = %task_id, error = %e, "Failed to remove task from local database");
+            } else {
+                tracing::debug!(task_id = %task_id, "Removed task from local database");
+            }
+        }
+    }
+
+    /// Recover orphaned tasks from local database after daemon restart.
+    /// Returns list of task IDs that have worktrees and can potentially be recovered.
+    pub async fn recover_orphaned_tasks(&self) -> Vec<Uuid> {
+        tracing::info!("=== STARTING ORPHANED TASK RECOVERY ===");
+
+        let active_tasks = {
+            let db = match self.local_db.lock() {
+                Ok(db) => db,
+                Err(e) => {
+                    tracing::error!(error = %e, "Failed to lock local database for recovery");
+                    return Vec::new();
+                }
+            };
+
+            match db.get_active_tasks() {
+                Ok(tasks) => tasks,
+                Err(e) => {
+                    tracing::error!(error = %e, "Failed to load active tasks from local database");
+                    return Vec::new();
+                }
+            }
+        };
+
+        if active_tasks.is_empty() {
+            tracing::info!("No orphaned tasks found in local database");
+            return Vec::new();
+        }
+
+        tracing::info!(count = active_tasks.len(), "Found orphaned tasks in local database");
+
+        let mut recoverable_task_ids = Vec::new();
+
+        for local_task in active_tasks {
+            tracing::info!(
+                task_id = %local_task.id,
+                state = ?local_task.state,
+                overlay_path = ?local_task.overlay_path,
+                "Checking orphaned task"
+            );
+
+            // Check if worktree exists on filesystem
+            let worktree_exists = if let Some(ref path) = local_task.overlay_path {
+                let path = std::path::PathBuf::from(path);
+                path.exists() && path.join(".git").exists()
+            } else {
+                // Try to find worktree by task ID pattern (scan worktrees directory)
+                let short_id = &local_task.id.to_string()[..8];
+                let worktrees_dir = self.worktree_manager.base_dir();
+                let mut found = false;
+
+                if let Ok(mut entries) = tokio::fs::read_dir(worktrees_dir).await {
+                    while let Ok(Some(entry)) = entries.next_entry().await {
+                        let name = entry.file_name();
+                        let name_str = name.to_string_lossy();
+                        if name_str.starts_with(short_id) {
+                            let path = entry.path();
+                            if path.join(".git").exists() {
+                                found = true;
+                                break;
+                            }
+                        }
+                    }
+                }
+                found
+            };
+
+            if worktree_exists {
+                tracing::info!(
+                    task_id = %local_task.id,
+                    "Found worktree for orphaned task - can be recovered"
+                );
+                recoverable_task_ids.push(local_task.id);
+
+                // Send structured recovery notification to server
+                let msg = DaemonMessage::task_recovery_detected(
+                    local_task.id,
+                    local_task.state.as_str(),
+                    true, // worktree intact
+                    local_task.overlay_path.clone(),
+                    false, // doesn't need patch since worktree is intact
+                );
+                let _ = self.ws_tx.send(msg).await;
+            } else {
+                tracing::warn!(
+                    task_id = %local_task.id,
+                    "Worktree missing for orphaned task - marking as lost"
+                );
+
+                // Update local db to mark as failed
+                if let Ok(db) = self.local_db.lock() {
+                    let _ = db.update_task_state(local_task.id, TaskState::Failed);
+                }
+            }
+        }
+
+        tracing::info!(
+            recoverable = recoverable_task_ids.len(),
+            "=== ORPHANED TASK RECOVERY COMPLETE ==="
+        );
+
+        recoverable_task_ids
+    }
+
+    /// Check worktree health for all tasks currently in the Running or Starting state.
+    /// If a worktree is missing, marks the task as interrupted and notifies the server.
+    /// This allows the retry orchestrator to pick up the task and restore it from checkpoint.
+    pub async fn check_worktree_health(&self) -> Vec<Uuid> {
+        let mut affected_task_ids = Vec::new();
+
+        // Get all running tasks
+        let tasks_snapshot: Vec<(Uuid, Option<PathBuf>)> = {
+            let tasks = self.tasks.read().await;
+            tasks
+                .iter()
+                .filter(|(_, t)| matches!(t.state, TaskState::Running | TaskState::Starting))
+                .map(|(id, t)| (*id, t.worktree.as_ref().map(|w| w.path.clone())))
+                .collect()
+        };
+
+        if tasks_snapshot.is_empty() {
+            return affected_task_ids;
+        }
+
+        for (task_id, worktree_path) in tasks_snapshot {
+            let worktree_exists = if let Some(ref path) = worktree_path {
+                path.exists() && path.join(".git").exists()
+            } else {
+                // No worktree set - scan by task ID
+                let short_id = &task_id.to_string()[..8];
+                let worktrees_dir = self.worktree_manager.base_dir();
+                let mut found = false;
+
+                if let Ok(mut entries) = tokio::fs::read_dir(worktrees_dir).await {
+                    while let Ok(Some(entry)) = entries.next_entry().await {
+                        let name = entry.file_name();
+                        let name_str = name.to_string_lossy();
+                        if name_str.starts_with(short_id) {
+                            let path = entry.path();
+                            if path.join(".git").exists() {
+                                found = true;
+                                break;
+                            }
+                        }
+                    }
+                }
+                found
+            };
+
+            if !worktree_exists {
+                tracing::warn!(
+                    task_id = %task_id,
+                    worktree_path = ?worktree_path,
+                    "Worktree missing for running task - marking as interrupted for retry"
+                );
+
+                affected_task_ids.push(task_id);
+
+                // Update task state to interrupted
+                {
+                    let mut tasks = self.tasks.write().await;
+                    if let Some(task) = tasks.get_mut(&task_id) {
+                        task.state = TaskState::Interrupted;
+                        task.error = Some("Worktree directory was deleted".to_string());
+                        task.completed_at = Some(Instant::now());
+                    }
+                }
+
+                // Notify server - task needs recovery/retry
+                let msg = DaemonMessage::task_complete(
+                    task_id,
+                    false,
+                    Some("Worktree deleted - task interrupted for recovery".to_string()),
+                );
+                let _ = self.ws_tx.send(msg).await;
+
+                // Remove from local db since server will handle retry
+                self.remove_task_from_local_db(task_id);
+            }
+        }
+
+        if !affected_task_ids.is_empty() {
+            tracing::info!(
+                count = affected_task_ids.len(),
+                "Worktree health check found missing worktrees"
+            );
+        }
+
+        affected_task_ids
+    }
+
     /// Check if a task can be spawned given contract-based concurrency limits.
     /// Returns the concurrency key to use (contract_id or task_id for standalone).
     async fn try_acquire_concurrency_slot(
@@ -1823,6 +2060,9 @@ impl TaskManager {
             error: None,
         };
 
+        // Persist task to local database for crash recovery
+        self.persist_task_to_local_db(&task);
+
         self.tasks.write().await.insert(task_id, task);
         tracing::info!(task_id = %task_id, "Task entry created and stored");
 
@@ -1871,6 +2111,7 @@ impl TaskManager {
             heartbeat_commit_interval_secs: self.config.heartbeat_commit_interval_secs,
             contract_task_counts: self.contract_task_counts.clone(),
             checkpoint_patches: self.config.checkpoint_patches.clone(),
+            local_db: self.local_db.clone(),
         }
     }
 
@@ -3190,6 +3431,8 @@ struct TaskManagerInner {
     contract_task_counts: Arc<RwLock<HashMap<Uuid, usize>>>,
     /// Checkpoint patch storage configuration.
     checkpoint_patches: CheckpointPatchConfig,
+    /// Local SQLite database for crash recovery.
+    local_db: Arc<std::sync::Mutex<LocalDb>>,
 }
 
 impl TaskManagerInner {
@@ -3210,6 +3453,17 @@ impl TaskManagerInner {
         }
     }
 
+    /// Remove completed/failed task from local database.
+    fn remove_task_from_local_db(&self, task_id: Uuid) {
+        if let Ok(db) = self.local_db.lock() {
+            if let Err(e) = db.delete_task(task_id) {
+                tracing::warn!(task_id = %task_id, error = %e, "Failed to remove task from local database");
+            } else {
+                tracing::debug!(task_id = %task_id, "Removed task from local database");
+            }
+        }
+    }
+
     /// Run a task to completion.
     #[allow(clippy::too_many_arguments)]
     async fn run_task(
@@ -4375,6 +4629,9 @@ impl TaskManagerInner {
             tracing::info!(task_id = %task_id, success = success, "Notifying server of task completion");
             let msg = DaemonMessage::task_complete(task_id, success, error);
             let _ = self.ws_tx.send(msg).await;
+
+            // Remove completed task from local database (no longer needs crash recovery)
+            self.remove_task_from_local_db(task_id);
         }
 
         // Note: Worktrees are kept until explicitly deleted (per user preference)
@@ -4578,6 +4835,9 @@ impl TaskManagerInner {
         // Notify server
         let msg = DaemonMessage::task_complete(task_id, false, Some(error.to_string()));
         let _ = self.ws_tx.send(msg).await;
+
+        // Remove failed task from local database
+        self.remove_task_from_local_db(task_id);
     }
 
     /// Apply inherited git config to a worktree directory.
@@ -4837,6 +5097,7 @@ impl Clone for TaskManagerInner {
             heartbeat_commit_interval_secs: self.heartbeat_commit_interval_secs,
             contract_task_counts: self.contract_task_counts.clone(),
             checkpoint_patches: self.checkpoint_patches.clone(),
+            local_db: self.local_db.clone(),
         }
     }
 }
diff --git a/makima/src/daemon/ws/protocol.rs b/makima/src/daemon/ws/protocol.rs
index ec9b09e..2e7caef 100644
--- a/makima/src/daemon/ws/protocol.rs
+++ b/makima/src/daemon/ws/protocol.rs
@@ -60,6 +60,25 @@ pub enum DaemonMessage {
         error: Option<String>,
     },
 
+    /// Task recovery detected after daemon restart.
+    /// Sent when daemon finds orphaned tasks that can be recovered.
+    TaskRecoveryDetected {
+        #[serde(rename = "taskId")]
+        task_id: Uuid,
+        /// Previous state of the task before daemon restart.
+        #[serde(rename = "previousState")]
+        previous_state: String,
+        /// Whether the worktree is still intact.
+        #[serde(rename = "worktreeIntact")]
+        worktree_intact: bool,
+        /// Path to the worktree if available.
+        #[serde(rename = "worktreePath")]
+        worktree_path: Option<String>,
+        /// Whether the task needs a checkpoint patch for recovery.
+        #[serde(rename = "needsPatch")]
+        needs_patch: bool,
+    },
+
     /// Register a tool key for orchestrator API access.
     RegisterToolKey {
         #[serde(rename = "taskId")]
@@ -698,6 +717,23 @@ impl DaemonMessage {
         }
     }
 
+    /// Create a task recovery detected message.
+    pub fn task_recovery_detected(
+        task_id: Uuid,
+        previous_state: &str,
+        worktree_intact: bool,
+        worktree_path: Option<String>,
+        needs_patch: bool,
+    ) -> Self {
+        Self::TaskRecoveryDetected {
+            task_id,
+            previous_state: previous_state.to_string(),
+            worktree_intact,
+            worktree_path,
+            needs_patch,
+        }
+    }
+
     /// Create a register tool key message.
     pub fn register_tool_key(task_id: Uuid, key: String) -> Self {
         Self::RegisterToolKey { task_id, key }
diff --git a/makima/src/db/repository.rs b/makima/src/db/repository.rs
index da44899..d3e4c56 100644
--- a/makima/src/db/repository.rs
+++ b/makima/src/db/repository.rs
@@ -823,6 +823,26 @@ pub async fn get_pending_tasks_for_contract(
     .await
 }
 
+/// Get all contracts that have pending, non-supervisor tasks still below their retry limit.
+/// Returns tuples of (contract_id, owner_id) for contracts with retryable tasks.
+pub async fn get_all_pending_task_contracts(
+    pool: &PgPool,
+) -> Result<Vec<(Uuid, Uuid)>, sqlx::Error> {
+    sqlx::query_as::<_, (Uuid, Uuid)>(
+        r#"
+        SELECT DISTINCT contract_id, owner_id
+        FROM tasks
+        WHERE contract_id IS NOT NULL
+          AND status = 'pending'
+          AND is_supervisor = false
+          AND retry_count < max_retries
+        ORDER BY owner_id, contract_id
+        "#,
+    )
+    .fetch_all(pool)
+    .await
+}
+
 /// Mark a task as pending for retry after daemon failure.
 /// Increments retry count and adds the failed daemon to exclusion list.
 pub async fn mark_task_for_retry(
diff --git a/makima/src/server/handlers/mesh.rs b/makima/src/server/handlers/mesh.rs
index 240e1f7..3d05f35 100644
--- a/makima/src/server/handlers/mesh.rs
+++ b/makima/src/server/handlers/mesh.rs
@@ -6,6 +6,7 @@ use axum::{
     response::IntoResponse,
     Json,
 };
+use base64::Engine;
 use uuid::Uuid;
 
 use crate::db::models::{
@@ -2265,6 +2266,30 @@ pub async fn reassign_task(
         }
     };
 
+    // Fetch latest checkpoint patch for worktree recovery during reassignment
+    let (patch_data, patch_base_sha) = match repository::get_latest_checkpoint_patch(pool, id).await {
+        Ok(Some(patch)) => {
+            tracing::info!(
+                old_task_id = %id,
+                new_task_id = %new_task.id,
+                patch_size = patch.patch_size_bytes,
+                base_sha = %patch.base_commit_sha,
+                files_count = patch.files_count,
+                "Including checkpoint patch for task reassignment recovery"
+            );
+            let encoded = base64::engine::general_purpose::STANDARD.encode(&patch.patch_data);
+            (Some(encoded), Some(patch.base_commit_sha))
+        }
+        Ok(None) => {
+            tracing::debug!(old_task_id = %id, "No checkpoint patch found for reassignment");
+            (None, None)
+        }
+        Err(e) => {
+            tracing::warn!(old_task_id = %id, error = %e, "Failed to fetch checkpoint patch for reassignment");
+            (None, None)
+        }
+    };
+
     // Send SpawnTask command to daemon for the new task
     let command = DaemonCommand::SpawnTask {
         task_id: new_task.id,
@@ -2285,8 +2310,8 @@ pub async fn reassign_task(
         autonomous_loop: false,
         resume_session: false,
         conversation_history: None,
-        patch_data: None,
-        patch_base_sha: None,
+        patch_data,
+        patch_base_sha,
     };
 
     tracing::info!(
diff --git a/makima/src/server/handlers/mesh_daemon.rs b/makima/src/server/handlers/mesh_daemon.rs
index 65db373..53ee806 100644
--- a/makima/src/server/handlers/mesh_daemon.rs
+++ b/makima/src/server/handlers/mesh_daemon.rs
@@ -291,6 +291,19 @@ pub enum DaemonMessage {
         success: bool,
         error: Option<String>,
     },
+    /// Task recovery detected after daemon restart
+    TaskRecoveryDetected {
+        #[serde(rename = "taskId")]
+        task_id: Uuid,
+        #[serde(rename = "previousState")]
+        previous_state: String,
+        #[serde(rename = "worktreeIntact")]
+        worktree_intact: bool,
+        #[serde(rename = "worktreePath")]
+        worktree_path: Option<String>,
+        #[serde(rename = "needsPatch")]
+        needs_patch: bool,
+    },
     /// Register a tool key for orchestrator API access
     RegisterToolKey {
         #[serde(rename = "taskId")]
@@ -990,6 +1003,110 @@ async fn handle_daemon_connection(socket: WebSocket, state: SharedState, auth_re
                                     });
                                 }
                             }
+                            Ok(DaemonMessage::TaskRecoveryDetected {
+                                task_id,
+                                previous_state,
+                                worktree_intact,
+                                worktree_path,
+                                needs_patch,
+                            }) => {
+                                tracing::info!(
+                                    task_id = %task_id,
+                                    previous_state = %previous_state,
+                                    worktree_intact = worktree_intact,
+                                    worktree_path = ?worktree_path,
+                                    needs_patch = needs_patch,
+                                    "Task recovery detected after daemon restart"
+                                );
+
+                                // Update task in database based on recovery state
+                                if let Some(ref pool) = state.db_pool {
+                                    let pool = pool.clone();
+                                    let state = state.clone();
+                                    tokio::spawn(async move {
+                                        if worktree_intact {
+                                            // Worktree exists - task can be resumed on this daemon
+                                            // Update task status to 'pending' so it can be picked up
+                                            match sqlx::query(
+                                                r#"
+                                                UPDATE tasks
+                                                SET status = 'pending',
+                                                    daemon_id = NULL,
+                                                    error_message = 'Daemon restarted - task ready for resumption',
+                                                    interrupted_at = NOW(),
+                                                    updated_at = NOW()
+                                                WHERE id = $1 AND owner_id = $2
+                                                RETURNING id
+                                                "#,
+                                            )
+                                            .bind(task_id)
+                                            .bind(owner_id)
+                                            .fetch_optional(&pool)
+                                            .await
+                                            {
+                                                Ok(Some(_)) => {
+                                                    tracing::info!(
+                                                        task_id = %task_id,
+                                                        "Task marked as pending for resumption"
+                                                    );
+                                                    state.broadcast_task_update(TaskUpdateNotification {
+                                                        task_id,
+                                                        owner_id: Some(owner_id),
+                                                        version: 0,
+                                                        status: "pending".into(),
+                                                        updated_fields: vec![
+                                                            "status".into(),
+                                                            "daemon_id".into(),
+                                                            "interrupted_at".into(),
+                                                        ],
+                                                        updated_by: "daemon_recovery".into(),
+                                                    });
+                                                }
+                                                Ok(None) => {
+                                                    tracing::warn!(
+                                                        task_id = %task_id,
+                                                        "Task not found during recovery update"
+                                                    );
+                                                }
+                                                Err(e) => {
+                                                    tracing::error!(
+                                                        task_id = %task_id,
+                                                        error = %e,
+                                                        "Failed to update task during recovery"
+                                                    );
+                                                }
+                                            }
+                                        } else {
+                                            // Worktree missing - mark for retry with patch restoration
+                                            match repository::mark_task_for_retry(
+                                                &pool,
+                                                task_id,
+                                                daemon_uuid, // Mark this daemon as failed
+                                            ).await {
+                                                Ok(Some(_)) => {
+                                                    tracing::info!(
+                                                        task_id = %task_id,
+                                                        "Task marked for retry (worktree missing)"
+                                                    );
+                                                }
+                                                Ok(None) => {
+                                                    tracing::warn!(
+                                                        task_id = %task_id,
+                                                        "Task not found or exceeded retries"
+                                                    );
+                                                }
+                                                Err(e) => {
+                                                    tracing::error!(
+                                                        task_id = %task_id,
+                                                        error = %e,
+                                                        "Failed to mark task for retry"
+                                                    );
+                                                }
+                                            }
+                                        }
+                                    });
+                                }
+                            }
                             Ok(DaemonMessage::Authenticate { .. }) => {
                                 // Already authenticated, ignore
                             }
diff --git a/makima/src/server/handlers/mesh_supervisor.rs b/makima/src/server/handlers/mesh_supervisor.rs
index 21c9515..1b5e376 100644
--- a/makima/src/server/handlers/mesh_supervisor.rs
+++ b/makima/src/server/handlers/mesh_supervisor.rs
@@ -279,8 +279,9 @@ async fn verify_supervisor_auth(
 
 /// Try to start a pending task on an available daemon.
 /// Returns Ok(Some(task)) if a task was started, Ok(None) if no tasks could be started.
-/// For retried tasks, excludes daemons that previously failed the task.
-async fn try_start_pending_task(
+/// For retried tasks, excludes daemons that previously failed the task and includes
+/// checkpoint patch data for worktree recovery.
+pub async fn try_start_pending_task(
     state: &SharedState,
     contract_id: Uuid,
     owner_id: Uuid,
@@ -348,6 +349,34 @@ async fn try_start_pending_task(
             }
         };
 
+        // For retried tasks, fetch checkpoint patch for worktree recovery
+        let (patch_data, patch_base_sha) = if task.retry_count > 0 {
+            // This is a retry - try to restore from checkpoint
+            match repository::get_latest_checkpoint_patch(pool, task.id).await {
+                Ok(Some(patch)) => {
+                    tracing::info!(
+                        task_id = %task.id,
+                        retry_count = task.retry_count,
+                        patch_size = patch.patch_size_bytes,
+                        base_sha = %patch.base_commit_sha,
+                        "Including checkpoint patch for task retry recovery"
+                    );
+                    let encoded = base64::engine::general_purpose::STANDARD.encode(&patch.patch_data);
+                    (Some(encoded), Some(patch.base_commit_sha))
+                }
+                Ok(None) => {
+                    tracing::debug!(task_id = %task.id, "No checkpoint patch found for retry");
+                    (None, None)
+                }
+                Err(e) => {
+                    tracing::warn!(task_id = %task.id, error = %e, "Failed to fetch checkpoint patch for retry");
+                    (None, None)
+                }
+            }
+        } else {
+            (None, None)
+        };
+
         // Send spawn command
         let cmd = DaemonCommand::SpawnTask {
             task_id: updated_task.id,
@@ -366,10 +395,10 @@ async fn try_start_pending_task(
             contract_id: updated_task.contract_id,
             is_supervisor: false,
             autonomous_loop: false,
-            resume_session: false,
+            resume_session: task.retry_count > 0, // Use --continue for retried tasks
             conversation_history: None,
-            patch_data: None,
-            patch_base_sha: None,
+            patch_data,
+            patch_base_sha,
         };
 
         if let Err(e) = state.send_daemon_command(daemon.id, cmd).await {
diff --git a/makima/src/server/mod.rs b/makima/src/server/mod.rs
index 3a27513..de20569 100644
--- a/makima/src/server/mod.rs
+++ b/makima/src/server/mod.rs
@@ -251,6 +251,9 @@ const ANONYMOUS_TASK_MAX_AGE_DAYS: i32 = 7;
 /// Interval for checkpoint patch cleanup (hourly)
 const CHECKPOINT_PATCH_CLEANUP_INTERVAL_SECS: u64 = 3600;
 
+/// Interval between retry orchestrator scans for pending tasks (every 30 seconds)
+const RETRY_ORCHESTRATOR_INTERVAL_SECS: u64 = 30;
+
 /// Run the HTTP server with graceful shutdown support.
 ///
 /// # Arguments
@@ -387,6 +390,64 @@ pub async fn run_server(state: SharedState, addr: &str) -> anyhow::Result<()> {
                 }
             }
         });
+
+        // Clone state and pool for retry orchestrator
+        let retry_pool = pool.clone();
+        let retry_state = state.clone();
+
+        // Spawn retry orchestrator - periodically retries pending tasks on available daemons
+        tokio::spawn(async move {
+            let mut interval = tokio::time::interval(
+                std::time::Duration::from_secs(RETRY_ORCHESTRATOR_INTERVAL_SECS)
+            );
+            loop {
+                interval.tick().await;
+
+                // Get all contracts with pending tasks awaiting retry
+                match crate::db::repository::get_all_pending_task_contracts(&retry_pool).await {
+                    Ok(contract_owners) => {
+                        for (contract_id, owner_id) in contract_owners {
+                            // Try to start a pending task for this contract
+                            match handlers::mesh_supervisor::try_start_pending_task(
+                                &retry_state,
+                                contract_id,
+                                owner_id,
+                            ).await {
+                                Ok(Some(task)) => {
+                                    tracing::info!(
+                                        task_id = %task.id,
+                                        contract_id = %contract_id,
+                                        retry_count = task.retry_count,
+                                        "Retry orchestrator started pending task"
+                                    );
+                                }
+                                Ok(None) => {
+                                    // No tasks could be started (no available daemons, etc.)
+                                }
+                                Err(e) => {
+                                    tracing::warn!(
+                                        contract_id = %contract_id,
+                                        error = %e,
+                                        "Retry orchestrator failed to start pending task"
+                                    );
+                                }
+                            }
+                        }
+                    }
+                    Err(e) => {
+                        tracing::warn!(
+                            error = %e,
+                            "Retry orchestrator failed to query pending task contracts"
+                        );
+                    }
+                }
+            }
+        });
+
+        tracing::info!(
+            "Retry orchestrator started (interval: {}s)",
+            RETRY_ORCHESTRATOR_INTERVAL_SECS
+        );
     }
 
     let app = make_router(state);
author	soryu <soryu@soryu.co>	2026-01-23 23:52:35 +0000
committer	soryu <soryu@soryu.co>	2026-01-23 23:52:35 +0000
commit	579c983d3efb8f1414ffb45b9e031f741cce5f76 (patch)
tree	1a0060f19a4f4eea8fb9cff9eb52a46cedcdc152
parent	f6f0790217d4098ffb6d2b3df08b0cf83ff61727 (diff)
download	soryu-579c983d3efb8f1414ffb45b9e031f741cce5f76.tar.gz soryu-579c983d3efb8f1414ffb45b9e031f741cce5f76.zip