summaryrefslogtreecommitdiff
path: root/makima/src/orchestration/directive.rs
diff options
context:
space:
mode:
authorsoryu <soryu@soryu.co>2026-04-30 10:43:31 +0100
committerGitHub <noreply@github.com>2026-04-30 10:43:31 +0100
commitc3e97bbcc32bd18d9344dd44cc54dfcdce32100b (patch)
tree7fe772669a614968fff9510abbc054091baf75e2 /makima/src/orchestration/directive.rs
parent2b695485753d55f956746b73c31c2deba0ed0a29 (diff)
downloadsoryu-c3e97bbcc32bd18d9344dd44cc54dfcdce32100b.tar.gz
soryu-c3e97bbcc32bd18d9344dd44cc54dfcdce32100b.zip
fix(directive): cancel orphaned planner and kick reconciler on goal update (#104)
Resolves the user-visible bug where editing a directive's goal mid-flight shows "saved" but does not actually replan: the running planner kept emitting add-step calls based on the OLD goal while a fresh planner was supposed to take over, and the user had to wait up to 15s for the next reconciler tick before any replanning even started. ## What was happening PUT /api/v1/directives/{id}/goal already had two paths: - Small change + planner running → SendMessage interrupt + KEEP orchestrator. - Everything else → clear orchestrator_task_id and let phase_replanning spawn a new planner on the next 15s tick. The "everything else" path cleared the directive's pointer to the planner task but never cancelled the task itself. The task kept executing and could race the new planner by adding more steps from the stale plan. Worse, those new steps could push MAX(steps.created_at) past the just-bumped goal_updated_at, suppressing phase_replanning entirely. ## Fix 1. New helper `try_cancel_running_planner()` (orchestration/directive.rs): sends `InterruptTask { graceful: true }` to the daemon owning the orchestrator task and marks the task `interrupted` in the DB. All errors are logged and swallowed so the goal update still completes. 2. update_goal handler calls the helper whenever it is about to take the "clear orchestrator_task_id" branch, so the orphaned planner stops producing stale-plan steps before its DB linkage is cut. 3. New `AppState::directive_kick` (tokio::sync::Notify) lets the handler signal the reconciler to run a tick immediately. The reconciler loop in server/mod.rs now selects between its 15s interval and the notify, so the user no longer waits up to 15s after editing a goal before replanning actually starts. update_goal calls `kick_directive_reconciler()` after the goal is persisted (both paths). ## Why not also loosen `get_directives_needing_replanning` The query already covers the common cases once the orphan-cancel lands — without a still-running orphan adding fresh steps, goal_updated_at reliably exceeds MAX(steps.created_at) after a goal edit. Loosening the predicate risked spurious replans for directives that legitimately have no steps yet (those are handled by `phase_planning`). Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Diffstat (limited to 'makima/src/orchestration/directive.rs')
-rw-r--r--makima/src/orchestration/directive.rs82
1 files changed, 82 insertions, 0 deletions
diff --git a/makima/src/orchestration/directive.rs b/makima/src/orchestration/directive.rs
index 22279e8..7dbfe65 100644
--- a/makima/src/orchestration/directive.rs
+++ b/makima/src/orchestration/directive.rs
@@ -2547,6 +2547,88 @@ pub enum GoalEditInterruptResult {
Skipped,
}
+/// Best-effort cancellation of a directive's currently-running orchestrator
+/// (planner) task. Used by the goal-update path: when we are about to clear
+/// `orchestrator_task_id` from the directive, the still-running task would
+/// otherwise keep emitting `add-step` calls based on the OLD goal, racing the
+/// freshly-spawned replanner. We send an `InterruptTask` daemon command and
+/// mark the task `interrupted` in the DB so monitoring sees a clean state.
+///
+/// All errors are logged and translated into `Ok(false)` — the caller's path
+/// (clearing orchestrator_task_id, kicking the reconciler) is still safe to
+/// proceed.
+pub async fn try_cancel_running_planner(
+ pool: &PgPool,
+ state: &SharedState,
+ directive_id: Uuid,
+ orchestrator_task_id: Uuid,
+) -> Result<bool, anyhow::Error> {
+ let task = match repository::get_task(pool, orchestrator_task_id).await? {
+ Some(t) => t,
+ None => return Ok(false),
+ };
+
+ let cancellable = matches!(
+ task.status.as_str(),
+ "queued" | "pending" | "starting" | "running"
+ );
+ if !cancellable {
+ return Ok(false);
+ }
+
+ if let Some(daemon_id) = task.daemon_id {
+ let command = DaemonCommand::InterruptTask {
+ task_id: orchestrator_task_id,
+ graceful: true,
+ };
+ if let Err(e) = state.send_daemon_command(daemon_id, command).await {
+ tracing::warn!(
+ directive_id = %directive_id,
+ task_id = %orchestrator_task_id,
+ daemon_id = %daemon_id,
+ error = %e,
+ "Failed to dispatch InterruptTask to orphaned planner; will mark interrupted in DB"
+ );
+ } else {
+ tracing::info!(
+ directive_id = %directive_id,
+ task_id = %orchestrator_task_id,
+ daemon_id = %daemon_id,
+ "Sent InterruptTask to orphaned planner before clearing orchestrator"
+ );
+ }
+ } else {
+ tracing::debug!(
+ directive_id = %directive_id,
+ task_id = %orchestrator_task_id,
+ "Orphaned planner has no daemon assignment; skipping daemon command"
+ );
+ }
+
+ // Mark the task interrupted regardless of whether the daemon was reachable
+ // — this is the source of truth the monitor consults next tick.
+ let upd = crate::db::models::UpdateTaskRequest {
+ status: Some("interrupted".to_string()),
+ version: Some(task.version),
+ error_message: Some(
+ "Cancelled because the directive's goal was updated".to_string(),
+ ),
+ ..Default::default()
+ };
+ if let Err(e) =
+ repository::update_task_for_owner(pool, orchestrator_task_id, task.owner_id, upd).await
+ {
+ tracing::warn!(
+ directive_id = %directive_id,
+ task_id = %orchestrator_task_id,
+ error = %e,
+ "Failed to mark orphaned planner interrupted"
+ );
+ }
+
+ Ok(true)
+}
+
/// Attempt to interrupt a directive's currently-running planner with a goal
/// edit summary instead of replanning from scratch.
///