diff options
| author | soryu <soryu@soryu.co> | 2026-01-16 19:50:27 +0000 |
|---|---|---|
| committer | soryu <soryu@soryu.co> | 2026-01-17 05:38:07 +0000 |
| commit | 75d9644d44ba998a32ed14c072e883a75145ab72 (patch) | |
| tree | b82dee94632fd40764a92a9b11da24ef21600ed5 /makima/src | |
| parent | 6b94b5895ed27e3aef052a1843fb3f334397d1b4 (diff) | |
| download | soryu-75d9644d44ba998a32ed14c072e883a75145ab72.tar.gz soryu-75d9644d44ba998a32ed14c072e883a75145ab72.zip | |
Add autopilot panel and retry system
Diffstat (limited to 'makima/src')
| -rw-r--r-- | makima/src/db/models.rs | 19 | ||||
| -rw-r--r-- | makima/src/db/repository.rs | 92 | ||||
| -rw-r--r-- | makima/src/server/handlers/mesh_daemon.rs | 83 | ||||
| -rw-r--r-- | makima/src/server/handlers/mesh_supervisor.rs | 155 |
4 files changed, 255 insertions, 94 deletions
diff --git a/makima/src/db/models.rs b/makima/src/db/models.rs index 0e1303c..72ba6f2 100644 --- a/makima/src/db/models.rs +++ b/makima/src/db/models.rs @@ -6,6 +6,11 @@ use sqlx::FromRow; use utoipa::ToSchema; use uuid::Uuid; +/// Default max retries for task daemon failover (3 attempts) +fn default_max_retries() -> i32 { + 3 +} + /// Flexible datetime deserialization module. /// Accepts both date-only ("2026-01-15") and full ISO 8601 datetime ("2026-01-15T00:00:00Z") formats. pub mod flexible_datetime { @@ -500,6 +505,20 @@ pub struct Task { /// Files to copy from parent task's worktree when starting. #[serde(skip_serializing_if = "Option::is_none")] pub copy_files: Option<serde_json::Value>, + + // Retry tracking for daemon failover + /// Number of times this task has been retried after daemon failure + #[serde(default)] + pub retry_count: i32, + /// Maximum retry attempts before marking as permanently failed + #[serde(default = "default_max_retries")] + pub max_retries: i32, + /// Array of daemon IDs that have failed this task (excluded from retry) + #[serde(skip_serializing_if = "Option::is_none")] + pub failed_daemon_ids: Option<Vec<Uuid>>, + /// When the task was last interrupted due to daemon disconnect + #[serde(skip_serializing_if = "Option::is_none")] + pub interrupted_at: Option<DateTime<Utc>>, } impl Task { diff --git a/makima/src/db/repository.rs b/makima/src/db/repository.rs index 2b069d5..43b8e3a 100644 --- a/makima/src/db/repository.rs +++ b/makima/src/db/repository.rs @@ -790,6 +790,8 @@ pub async fn list_tasks_by_contract( } /// Get pending tasks for a contract (non-supervisor tasks only). +/// Includes tasks that were interrupted (retry candidates). +/// Prioritizes interrupted tasks and excludes those that exceeded max_retries. pub async fn get_pending_tasks_for_contract( pool: &PgPool, contract_id: Uuid, @@ -801,7 +803,11 @@ pub async fn get_pending_tasks_for_contract( WHERE contract_id = $1 AND owner_id = $2 AND status = 'pending' AND is_supervisor = false - ORDER BY priority DESC, created_at ASC + AND retry_count < max_retries + ORDER BY + interrupted_at DESC NULLS LAST, + priority DESC, + created_at ASC "#, ) .bind(contract_id) @@ -810,6 +816,61 @@ pub async fn get_pending_tasks_for_contract( .await } +/// Mark a task as pending for retry after daemon failure. +/// Increments retry count and adds the failed daemon to exclusion list. +pub async fn mark_task_for_retry( + pool: &PgPool, + task_id: Uuid, + failed_daemon_id: Uuid, +) -> Result<Option<Task>, sqlx::Error> { + sqlx::query_as::<_, Task>( + r#" + UPDATE tasks + SET status = 'pending', + daemon_id = NULL, + retry_count = retry_count + 1, + failed_daemon_ids = array_append(COALESCE(failed_daemon_ids, '{}'), $2), + last_active_daemon_id = $2, + interrupted_at = NOW(), + error_message = 'Daemon disconnected, awaiting retry', + updated_at = NOW() + WHERE id = $1 + AND retry_count < max_retries + RETURNING * + "#, + ) + .bind(task_id) + .bind(failed_daemon_id) + .fetch_optional(pool) + .await +} + +/// Mark a task as permanently failed (exceeded retry limit). +pub async fn mark_task_permanently_failed( + pool: &PgPool, + task_id: Uuid, + failed_daemon_id: Uuid, +) -> Result<(), sqlx::Error> { + sqlx::query( + r#" + UPDATE tasks + SET status = 'failed', + daemon_id = NULL, + retry_count = retry_count + 1, + failed_daemon_ids = array_append(COALESCE(failed_daemon_ids, '{}'), $2), + last_active_daemon_id = $2, + error_message = 'Task failed: exceeded maximum retry attempts', + updated_at = NOW() + WHERE id = $1 + "#, + ) + .bind(task_id) + .bind(failed_daemon_id) + .execute(pool) + .await?; + Ok(()) +} + /// Update a task by ID with optimistic locking. pub async fn update_task( pool: &PgPool, @@ -3008,6 +3069,35 @@ pub async fn get_available_daemons( .await } +/// Get daemons with capacity info for selection, excluding specified daemon IDs. +/// Used for task retry to avoid reassigning to daemons that have already failed. +pub async fn get_available_daemons_excluding( + pool: &PgPool, + owner_id: Uuid, + exclude_daemon_ids: &[Uuid], +) -> Result<Vec<DaemonWithCapacity>, sqlx::Error> { + sqlx::query_as::<_, DaemonWithCapacity>( + r#" + SELECT id, owner_id, connection_id, hostname, machine_id, + max_concurrent_tasks, current_task_count, + capacity_score, task_queue_length, supports_migration, + status, last_heartbeat_at, connected_at + FROM daemons + WHERE owner_id = $1 + AND status = 'connected' + AND id != ALL($2) + ORDER BY + COALESCE(capacity_score, 100) DESC, + (max_concurrent_tasks - current_task_count) DESC, + COALESCE(task_queue_length, 0) ASC + "#, + ) + .bind(owner_id) + .bind(exclude_daemon_ids) + .fetch_all(pool) + .await +} + /// Create a daemon task assignment. pub async fn create_daemon_task_assignment( pool: &PgPool, diff --git a/makima/src/server/handlers/mesh_daemon.rs b/makima/src/server/handlers/mesh_daemon.rs index 4bcb5cd..beb676e 100644 --- a/makima/src/server/handlers/mesh_daemon.rs +++ b/makima/src/server/handlers/mesh_daemon.rs @@ -20,6 +20,7 @@ use sqlx::Row; use tokio::sync::mpsc; use uuid::Uuid; +use crate::db::models::Task; use crate::db::repository; use crate::server::auth::{hash_api_key, API_KEY_HEADER}; use crate::server::messages::ApiError; @@ -1334,42 +1335,86 @@ async fn handle_daemon_connection(socket: WebSocket, state: SharedState, auth_re ); } - // Find tasks assigned to this daemon that are still active - if let Err(e) = clear_daemon_from_tasks(&pool, daemon_uuid).await { + // Find tasks assigned to this daemon and mark for retry or fail permanently + if let Err(e) = handle_daemon_disconnect_tasks(&pool, daemon_uuid).await { tracing::error!( daemon_id = %daemon_uuid, error = %e, - "Failed to clear daemon from tasks on disconnect" + "Failed to handle daemon disconnect for tasks" ); } }); } } -/// Clear daemon_id from tasks when daemon disconnects -async fn clear_daemon_from_tasks(pool: &sqlx::PgPool, daemon_id: Uuid) -> Result<(), sqlx::Error> { - // Update tasks that were running on this daemon to failed state - let result = sqlx::query( +/// Handle tasks when daemon disconnects - mark for retry or fail permanently. +async fn handle_daemon_disconnect_tasks(pool: &sqlx::PgPool, daemon_id: Uuid) -> Result<(), sqlx::Error> { + // Get all active tasks on this daemon + let active_tasks: Vec<Task> = sqlx::query_as( r#" - UPDATE tasks - SET daemon_id = NULL, - status = 'failed', - error_message = 'Daemon disconnected', - updated_at = NOW() + SELECT * FROM tasks WHERE daemon_id = $1 AND status IN ('starting', 'running', 'initializing') "#, ) .bind(daemon_id) - .execute(pool) + .fetch_all(pool) .await?; - if result.rows_affected() > 0 { - tracing::warn!( - daemon_id = %daemon_id, - tasks_affected = result.rows_affected(), - "Marked tasks as failed due to daemon disconnect" - ); + if active_tasks.is_empty() { + return Ok(()); + } + + tracing::info!( + daemon_id = %daemon_id, + task_count = active_tasks.len(), + "Processing tasks for disconnected daemon" + ); + + for task in active_tasks { + if task.retry_count < task.max_retries { + // Mark for retry + match repository::mark_task_for_retry(pool, task.id, daemon_id).await { + Ok(Some(updated_task)) => { + tracing::info!( + task_id = %task.id, + task_name = %task.name, + retry_count = updated_task.retry_count, + max_retries = updated_task.max_retries, + "Task marked for retry after daemon disconnect" + ); + } + Ok(None) => { + tracing::warn!( + task_id = %task.id, + "Task not found or already at max retries" + ); + } + Err(e) => { + tracing::error!( + task_id = %task.id, + error = %e, + "Failed to mark task for retry" + ); + } + } + } else { + // Exceeded retries, mark as permanently failed + if let Err(e) = repository::mark_task_permanently_failed(pool, task.id, daemon_id).await { + tracing::error!( + task_id = %task.id, + error = %e, + "Failed to mark task as permanently failed" + ); + } else { + tracing::warn!( + task_id = %task.id, + task_name = %task.name, + retry_count = task.retry_count + 1, + "Task permanently failed: exceeded maximum retries" + ); + } + } } Ok(()) diff --git a/makima/src/server/handlers/mesh_supervisor.rs b/makima/src/server/handlers/mesh_supervisor.rs index 1014fdc..754d086 100644 --- a/makima/src/server/handlers/mesh_supervisor.rs +++ b/makima/src/server/handlers/mesh_supervisor.rs @@ -256,6 +256,7 @@ async fn verify_supervisor_auth( /// Try to start a pending task on an available daemon. /// Returns Ok(Some(task)) if a task was started, Ok(None) if no tasks could be started. +/// For retried tasks, excludes daemons that previously failed the task. async fn try_start_pending_task( state: &SharedState, contract_id: Uuid, @@ -263,7 +264,7 @@ async fn try_start_pending_task( ) -> Result<Option<Task>, String> { let pool = state.db_pool.as_ref().ok_or("Database not configured")?; - // Get pending tasks for this contract + // Get pending tasks for this contract (includes interrupted tasks awaiting retry) let pending_tasks = repository::get_pending_tasks_for_contract(pool, contract_id, owner_id) .await .map_err(|e| format!("Failed to get pending tasks: {}", e))?; @@ -272,89 +273,95 @@ async fn try_start_pending_task( return Ok(None); } - // Get available daemons with capacity - let daemons = repository::get_available_daemons(pool, owner_id) - .await - .map_err(|e| format!("Failed to get available daemons: {}", e))?; - - // Find a daemon with capacity - let available_daemon = daemons.iter().find(|d| { - d.current_task_count < d.max_concurrent_tasks - && state.daemon_connections.contains_key(&d.connection_id) - }); + // Try each pending task until we find one we can start + for task in &pending_tasks { + // Get excluded daemon IDs for this task (daemons that have already failed it) + let exclude_ids: Vec<Uuid> = task.failed_daemon_ids.clone().unwrap_or_default(); - let daemon = match available_daemon { - Some(d) => d, - None => return Ok(None), // No daemon with capacity - }; + // Get available daemons excluding failed ones for this task + let daemons = repository::get_available_daemons_excluding(pool, owner_id, &exclude_ids) + .await + .map_err(|e| format!("Failed to get available daemons: {}", e))?; - // Try to start the first pending task - let task = &pending_tasks[0]; + // Find a daemon with capacity + let available_daemon = daemons.iter().find(|d| { + d.current_task_count < d.max_concurrent_tasks + && state.daemon_connections.contains_key(&d.connection_id) + }); - // Get repo URL from task or contract - let repo_url = if let Some(url) = &task.repository_url { - Some(url.clone()) - } else { - match repository::list_contract_repositories(pool, contract_id).await { - Ok(repos) => repos - .iter() - .find(|r| r.is_primary) - .or(repos.first()) - .and_then(|r| r.repository_url.clone().or_else(|| r.local_path.clone())), - Err(_) => None, - } - }; + let daemon = match available_daemon { + Some(d) => d, + None => continue, // Try next task + }; - // Update task with daemon assignment - let update_req = UpdateTaskRequest { - status: Some("starting".to_string()), - daemon_id: Some(daemon.id), - version: Some(task.version), - ..Default::default() - }; + // Get repo URL from task or contract + let repo_url = if let Some(url) = &task.repository_url { + Some(url.clone()) + } else { + match repository::list_contract_repositories(pool, contract_id).await { + Ok(repos) => repos + .iter() + .find(|r| r.is_primary) + .or(repos.first()) + .and_then(|r| r.repository_url.clone().or_else(|| r.local_path.clone())), + Err(_) => None, + } + }; - let updated_task = match repository::update_task_for_owner(pool, task.id, owner_id, update_req).await { - Ok(Some(t)) => t, - Ok(None) => return Ok(None), - Err(e) => { - tracing::warn!(task_id = %task.id, error = %e, "Failed to update task for daemon assignment"); - return Ok(None); - } - }; + // Update task with daemon assignment + let update_req = UpdateTaskRequest { + status: Some("starting".to_string()), + daemon_id: Some(daemon.id), + version: Some(task.version), + ..Default::default() + }; - // Send spawn command - let cmd = DaemonCommand::SpawnTask { - task_id: updated_task.id, - task_name: updated_task.name.clone(), - plan: updated_task.plan.clone(), - repo_url, - base_branch: updated_task.base_branch.clone(), - target_branch: updated_task.target_branch.clone(), - parent_task_id: updated_task.parent_task_id, - depth: updated_task.depth, - is_orchestrator: false, - target_repo_path: updated_task.target_repo_path.clone(), - completion_action: updated_task.completion_action.clone(), - continue_from_task_id: updated_task.continue_from_task_id, - copy_files: updated_task.copy_files.as_ref().and_then(|v| serde_json::from_value(v.clone()).ok()), - contract_id: updated_task.contract_id, - is_supervisor: false, - }; + let updated_task = match repository::update_task_for_owner(pool, task.id, owner_id, update_req).await { + Ok(Some(t)) => t, + Ok(None) => continue, // Task was modified concurrently, try next + Err(e) => { + tracing::warn!(task_id = %task.id, error = %e, "Failed to update task for daemon assignment"); + continue; // Try next task + } + }; - if let Err(e) = state.send_daemon_command(daemon.id, cmd).await { - tracing::warn!(error = %e, daemon_id = %daemon.id, task_id = %task.id, "Failed to send spawn command"); - // Rollback - let rollback_req = UpdateTaskRequest { - status: Some("pending".to_string()), - clear_daemon_id: true, - ..Default::default() + // Send spawn command + let cmd = DaemonCommand::SpawnTask { + task_id: updated_task.id, + task_name: updated_task.name.clone(), + plan: updated_task.plan.clone(), + repo_url, + base_branch: updated_task.base_branch.clone(), + target_branch: updated_task.target_branch.clone(), + parent_task_id: updated_task.parent_task_id, + depth: updated_task.depth, + is_orchestrator: false, + target_repo_path: updated_task.target_repo_path.clone(), + completion_action: updated_task.completion_action.clone(), + continue_from_task_id: updated_task.continue_from_task_id, + copy_files: updated_task.copy_files.as_ref().and_then(|v| serde_json::from_value(v.clone()).ok()), + contract_id: updated_task.contract_id, + is_supervisor: false, }; - let _ = repository::update_task_for_owner(pool, task.id, owner_id, rollback_req).await; - return Ok(None); + + if let Err(e) = state.send_daemon_command(daemon.id, cmd).await { + tracing::warn!(error = %e, daemon_id = %daemon.id, task_id = %task.id, "Failed to send spawn command"); + // Rollback + let rollback_req = UpdateTaskRequest { + status: Some("pending".to_string()), + clear_daemon_id: true, + ..Default::default() + }; + let _ = repository::update_task_for_owner(pool, task.id, owner_id, rollback_req).await; + continue; // Try next task + } + + tracing::info!(task_id = %task.id, daemon_id = %daemon.id, "Started pending task from wait loop"); + return Ok(Some(updated_task)); } - tracing::info!(task_id = %task.id, daemon_id = %daemon.id, "Started pending task from wait loop"); - Ok(Some(updated_task)) + // No tasks could be started + Ok(None) } // ============================================================================= |
