From 88a4f15ce1310f8ee8693835be14aa5280233f17 Mon Sep 17 00:00:00 2001
From: soryu <soryu@soryu.co>
Date: Thu, 5 Feb 2026 23:42:48 +0000
Subject: Add directive-first chain system redesign

Redesigns the chain system with a directive-first architecture where
Directive is the top-level entity (the "why/what") and Chains are
generated execution plans (the "how") that can be dynamically modified.

Backend:
- Add database migration for directive system tables
- Add Directive, DirectiveChain, ChainStep, DirectiveEvent models
- Add DirectiveVerifier and DirectiveApproval models
- Add orchestration module with engine, planner, and verifier
- Add comprehensive API handlers for directives
- Add daemon CLI commands for directive management
- Add directive skill documentation
- Integrate contract completion with directive engine
- Add SSE endpoint for real-time directive events

Frontend:
- Add directives route with split-view layout
- Add 6-tab detail view (Overview, Chain, Events, Evaluations, Approvals, Verifiers)
- Add React Flow DAG visualization for chain steps
- Add SSE subscription hook for real-time event updates
- Add useDirectives and useDirectiveEventSubscription hooks
- Add directive types and API functions

Fixes:
- Fix test failures in ws/protocol, task_output, completion_gate, patch
- Fix word boundary matching in looks_like_task()
- Fix parse_last() to find actual last completion gate
- Fix create_export_patch when merge-base equals HEAD
- Clean up clippy warnings in new code

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 makima/src/llm/contract_evaluator.rs | 564 ++++-------------------------------
 1 file changed, 53 insertions(+), 511 deletions(-)

(limited to 'makima/src/llm/contract_evaluator.rs')
diff --git a/makima/src/llm/contract_evaluator.rs b/makima/src/llm/contract_evaluator.rs
index fcc4826..e63bbfa 100644
--- a/makima/src/llm/contract_evaluator.rs
+++ b/makima/src/llm/contract_evaluator.rs
@@ -1,25 +1,19 @@
 //! Contract Evaluator - LLM-based evaluation of completed contracts against directive.
 //!
-//! This module provides functionality for:
-//! - Gathering deliverables, files, and task outputs from completed contracts
-//! - Building evaluation prompts using directive and acceptance criteria
-//! - Calling LLM to evaluate work against requirements
-//! - Parsing evaluation responses
+//! Currently a stub: this module will be reimplemented as part of the directive verification engine.
+//! See the orchestration module for the new evaluation system.
+//!
+//! The new evaluation system will provide:
+//! - Tiered verification (programmatic verifiers first, then LLM evaluation)
+//! - Composite confidence scoring (weighted combination of results)
+//! - Pluggable verifier interface (test runner, linter, build, type checker)
+//! - Proper integration with the directive chain steps
 
 use serde::{Deserialize, Serialize};
 use sqlx::PgPool;
 use uuid::Uuid;
 
-use crate::db::{
-    models::{
-        ChainContract, ChainDirective, Contract, ContractEvaluation, CreateContractEvaluationRequest,
-        DirectiveAcceptanceCriterion, DirectiveRequirement, EvaluationCriterionResult,
-    },
-    repository,
-};
-
-use super::claude::{ClaudeClient, ClaudeModel, Message, MessageContent};
-use super::tools::Tool;
+// Disabled while this module is a stub: use crate::db::models::{Contract, DirectiveAcceptanceCriterion, DirectiveRequirement};
 
 /// Result of contract evaluation
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -30,526 +24,74 @@ pub struct ContractEvaluationResult {
     /// Overall score from 0.0 to 1.0
     pub overall_score: f64,
     /// Results for each acceptance criterion
-    pub criteria_results: Vec<EvaluationCriterionResult>,
+    pub criteria_results: Vec<EvaluationCriterionResultLegacy>,
     /// Summary feedback from the evaluator
     pub summary_feedback: String,
     /// Instructions for rework if failed
     pub rework_instructions: Option<String>,
 }
 
-/// Context gathered for evaluation
-#[derive(Debug, Clone)]
-pub struct EvaluationContext {
-    /// The contract being evaluated
-    pub contract: Contract,
-    /// The chain contract record
-    pub chain_contract: ChainContract,
-    /// The directive document
-    pub directive: ChainDirective,
-    /// Files associated with the contract
-    pub files: Vec<FileContent>,
-    /// Task outputs from the contract
-    pub task_outputs: Vec<TaskOutput>,
-    /// Deliverables marked as complete
-    pub deliverables: Vec<DeliverableInfo>,
-    /// Acceptance criteria specific to this contract
-    pub acceptance_criteria: Vec<DirectiveAcceptanceCriterion>,
-    /// Requirements mapped to this contract
-    pub requirements: Vec<DirectiveRequirement>,
+/// Per-criterion evaluation result (legacy - kept for compatibility)
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct EvaluationCriterionResultLegacy {
+    pub criterion_id: String,
+    pub criterion_text: String,
+    pub passed: bool,
+    /// Score (0.0-1.0)
+    pub score: f64,
+    pub feedback: String,
+    /// Evidence supporting the evaluation
+    pub evidence: Vec<String>,
 }
 
-/// File content for evaluation
-#[derive(Debug, Clone, Serialize)]
+/// File content for evaluation context
+#[derive(Debug, Clone)]
 pub struct FileContent {
     pub path: String,
-    pub description: Option<String>,
     pub content: String,
-    pub is_deliverable: bool,
 }
 
-/// Task output for evaluation
-#[derive(Debug, Clone, Serialize)]
-pub struct TaskOutput {
-    pub task_name: String,
-    pub output_summary: String,
-    pub exit_code: Option<i32>,
-}
-
-/// Deliverable info for evaluation
-#[derive(Debug, Clone, Serialize)]
-pub struct DeliverableInfo {
-    pub name: String,
-    pub status: String,
-    pub file_path: Option<String>,
-}
-
-/// Error types for evaluation
-#[derive(Debug, thiserror::Error)]
-pub enum EvaluationError {
-    #[error("Database error: {0}")]
-    Database(#[from] sqlx::Error),
-
-    #[error("Contract not found: {0}")]
-    ContractNotFound(Uuid),
-
-    #[error("Chain contract not found for contract: {0}")]
-    ChainContractNotFound(Uuid),
-
-    #[error("Directive not found for chain: {0}")]
-    DirectiveNotFound(Uuid),
-
-    #[error("LLM evaluation failed: {0}")]
-    LlmError(String),
-
-    #[error("Failed to parse evaluation response: {0}")]
-    ParseError(String),
-}
-
-/// Contract evaluator for directive-driven evaluation
+/// Contract evaluator for LLM-based assessment.
+///
+/// NOTE: This is a stub implementation. The full evaluation system will be
+/// implemented as part of the orchestration/verifier module.
 pub struct ContractEvaluator {
-    pool: PgPool,
-    claude_client: ClaudeClient,
-    model: ClaudeModel,
-    /// Minimum score required to pass (default 0.8)
-    pass_threshold: f64,
+    _pool: PgPool,
 }
 
 impl ContractEvaluator {
-    /// Create a new evaluator
-    pub fn new(pool: PgPool, claude_client: ClaudeClient) -> Self {
-        Self {
-            pool,
-            claude_client,
-            model: ClaudeModel::Sonnet,
-            pass_threshold: 0.8,
-        }
-    }
-
-    /// Set the LLM model to use for evaluation
-    pub fn with_model(mut self, model: ClaudeModel) -> Self {
-        self.model = model;
-        self
-    }
-
-    /// Set the pass threshold
-    pub fn with_pass_threshold(mut self, threshold: f64) -> Self {
-        self.pass_threshold = threshold;
-        self
+    /// Create a new contract evaluator.
+    pub fn new(pool: PgPool) -> Self {
+        Self { _pool: pool }
     }
 
-    /// Evaluate a completed contract against the directive
+    /// Evaluate a contract - stub that always returns `ContractEvaluatorError::NotImplemented`.
+    ///
+    /// This will be reimplemented in the orchestration module with:
+    /// - Programmatic verification (tests, lint, build)
+    /// - LLM evaluation
+    /// - Composite scoring
     pub async fn evaluate_contract(
         &self,
-        contract_id: Uuid,
-        owner_id: Uuid,
-    ) -> Result<ContractEvaluationResult, EvaluationError> {
-        // Gather evaluation context
-        let context = self.gather_context(contract_id, owner_id).await?;
-
-        // Build evaluation prompt
-        let prompt = self.build_evaluation_prompt(&context);
-
-        // Call LLM for evaluation
-        let response = self.call_llm_for_evaluation(&prompt).await?;
-
-        // Parse the response
-        let result = self.parse_evaluation_response(&response, &context)?;
-
-        Ok(result)
-    }
-
-    /// Gather all context needed for evaluation
-    async fn gather_context(
-        &self,
-        contract_id: Uuid,
-        owner_id: Uuid,
-    ) -> Result<EvaluationContext, EvaluationError> {
-        // Get contract
-        let contract = repository::get_contract_for_owner(&self.pool, contract_id, owner_id)
-            .await?
-            .ok_or(EvaluationError::ContractNotFound(contract_id))?;
-
-        // Get chain contract
-        let chain_contract = repository::get_chain_contract_by_contract_id(&self.pool, contract_id)
-            .await?
-            .ok_or(EvaluationError::ChainContractNotFound(contract_id))?;
-
-        // Get directive
-        let directive = repository::get_chain_directive(&self.pool, chain_contract.chain_id)
-            .await?
-            .ok_or(EvaluationError::DirectiveNotFound(chain_contract.chain_id))?;
-
-        // Get files directly from repository
-        let contract_files = repository::list_files_in_contract(&self.pool, contract_id, owner_id)
-            .await
-            .unwrap_or_default();
-
-        // Get tasks directly from repository
-        let contract_tasks = repository::list_tasks_in_contract(&self.pool, contract_id, owner_id)
-            .await
-            .unwrap_or_default();
-
-        // Build file contents from FileSummary
-        // Note: FileSummary doesn't have content, so we use name and description
-        let files: Vec<FileContent> = contract_files.iter().map(|f| {
-            FileContent {
-                path: f.repo_file_path.clone().unwrap_or_else(|| f.name.clone()),
-                description: f.description.clone(),
-                content: format!("[File: {} - content not loaded in summary view]", f.name),
-                is_deliverable: false, // FileSummary doesn't track deliverable status
-            }
-        }).collect();
-
-        // Build task outputs from TaskSummary
-        let task_outputs: Vec<TaskOutput> = contract_tasks.iter().map(|t| {
-            TaskOutput {
-                task_name: t.name.clone(),
-                output_summary: t.progress_summary.clone().unwrap_or_else(|| format!("Status: {}", t.status)),
-                exit_code: None,
-            }
-        }).collect();
-
-        // Build deliverables info from files marked as deliverables
-        // Since FileSummary doesn't have deliverable info, we treat all files as potential deliverables
-        let deliverables: Vec<DeliverableInfo> = contract_files.iter()
-            .map(|f| DeliverableInfo {
-                name: f.name.clone(),
-                status: "complete".to_string(),
-                file_path: f.repo_file_path.clone(),
-            })
-            .collect();
-
-        // Parse requirements and acceptance criteria from directive
-        let requirements: Vec<DirectiveRequirement> =
-            serde_json::from_value(directive.requirements.clone()).unwrap_or_default();
-
-        let all_criteria: Vec<DirectiveAcceptanceCriterion> =
-            serde_json::from_value(directive.acceptance_criteria.clone()).unwrap_or_default();
-
-        // Get contract definition to find mapped requirements
-        // For now, use all acceptance criteria
-        let acceptance_criteria = all_criteria;
-
-        Ok(EvaluationContext {
-            contract,
-            chain_contract,
-            directive,
-            files,
-            task_outputs,
-            deliverables,
-            acceptance_criteria,
-            requirements,
-        })
+        _contract_id: Uuid,
+    ) -> Result<ContractEvaluationResult, ContractEvaluatorError> {
+        // TODO: Implement using the new directive evaluation system
+        Err(ContractEvaluatorError::NotImplemented(
+            "Contract evaluator will be reimplemented with directive system".to_string(),
+        ))
     }
-
-    /// Build the evaluation prompt
-    fn build_evaluation_prompt(&self, context: &EvaluationContext) -> String {
-        let mut prompt = String::new();
-
-        prompt.push_str("# Contract Completion Evaluation\n\n");
-        prompt.push_str("You are evaluating whether a contract has been completed successfully against its requirements.\n\n");
-
-        // Contract info
-        prompt.push_str("## Contract Information\n\n");
-        prompt.push_str(&format!("**Name:** {}\n", context.contract.name));
-        if let Some(ref desc) = context.contract.description {
-            prompt.push_str(&format!("**Description:** {}\n", desc));
-        }
-        prompt.push_str(&format!("**Type:** {}\n", context.contract.contract_type));
-        prompt.push_str(&format!("**Phase:** {}\n", context.contract.phase));
-        prompt.push_str("\n");
-
-        // Requirements
-        if !context.requirements.is_empty() {
-            prompt.push_str("## Requirements\n\n");
-            for req in &context.requirements {
-                prompt.push_str(&format!("- **{}** ({}): {}\n", req.id, req.priority, req.title));
-                if !req.description.is_empty() {
-                    prompt.push_str(&format!("  {}\n", req.description));
-                }
-            }
-            prompt.push_str("\n");
-        }
-
-        // Acceptance criteria
-        if !context.acceptance_criteria.is_empty() {
-            prompt.push_str("## Acceptance Criteria\n\n");
-            for (i, criterion) in context.acceptance_criteria.iter().enumerate() {
-                prompt.push_str(&format!("{}. **{}**\n", i + 1, criterion.description));
-                prompt.push_str(&format!("   - Testable: {}\n", criterion.testable));
-                if !criterion.requirement_ids.is_empty() {
-                    prompt.push_str(&format!("   - Covers: {}\n", criterion.requirement_ids.join(", ")));
-                }
-            }
-            prompt.push_str("\n");
-        }
-
-        // Deliverables
-        if !context.deliverables.is_empty() {
-            prompt.push_str("## Deliverables\n\n");
-            for d in &context.deliverables {
-                prompt.push_str(&format!("- {} ({})\n", d.name, d.status));
-            }
-            prompt.push_str("\n");
-        }
-
-        // Files
-        if !context.files.is_empty() {
-            prompt.push_str("## Files Created/Modified\n\n");
-            for file in &context.files {
-                prompt.push_str(&format!("### {}", file.path));
-                if file.is_deliverable {
-                    prompt.push_str(" [DELIVERABLE]");
-                }
-                prompt.push_str("\n");
-                if let Some(ref desc) = file.description {
-                    prompt.push_str(&format!("*{}*\n", desc));
-                }
-                // Truncate content if too long
-                let content = if file.content.len() > 5000 {
-                    format!("{}...\n[Content truncated - {} chars total]",
-                        &file.content[..5000], file.content.len())
-                } else {
-                    file.content.clone()
-                };
-                prompt.push_str("```\n");
-                prompt.push_str(&content);
-                prompt.push_str("\n```\n\n");
-            }
-        }
-
-        // Task outputs
-        if !context.task_outputs.is_empty() {
-            prompt.push_str("## Task Outputs\n\n");
-            for task in &context.task_outputs {
-                prompt.push_str(&format!("### {}\n", task.task_name));
-                prompt.push_str(&format!("{}\n\n", task.output_summary));
-            }
-        }
-
-        // Evaluation instructions
-        prompt.push_str("## Evaluation Instructions\n\n");
-        prompt.push_str("Please evaluate the completed work against the requirements and acceptance criteria.\n\n");
-        prompt.push_str("For each acceptance criterion, determine if it has been met and provide a brief explanation.\n\n");
-        prompt.push_str("Respond with a JSON object in the following format:\n\n");
-        prompt.push_str("```json\n");
-        prompt.push_str(r#"{
-  "passed": true/false,
-  "overallScore": 0.0-1.0,
-  "criteriaResults": [
-    {
-      "criterionId": "criterion identifier or index",
-      "met": true/false,
-      "score": 0.0-1.0,
-      "feedback": "explanation of why criterion was/wasn't met"
-    }
-  ],
-  "summaryFeedback": "overall summary of the evaluation",
-  "reworkInstructions": "if failed, specific instructions for what needs to be fixed (null if passed)"
-}
-"#);
-        prompt.push_str("```\n\n");
-        prompt.push_str(&format!("The pass threshold is {}. ", self.pass_threshold));
-        prompt.push_str("A contract passes if the overall score is >= the threshold AND all critical criteria are met.\n");
-
-        prompt
-    }
-
-    /// Call LLM for evaluation
-    async fn call_llm_for_evaluation(&self, prompt: &str) -> Result<String, EvaluationError> {
-        let messages = vec![Message {
-            role: "user".to_string(),
-            content: MessageContent::Text(prompt.to_string()),
-        }];
-
-        // Use chat_with_tools with empty tools array for simple chat
-        let empty_tools: Vec<Tool> = vec![];
-        let result = self
-            .claude_client
-            .chat_with_tools(messages, &empty_tools)
-            .await
-            .map_err(|e| EvaluationError::LlmError(e.to_string()))?;
-
-        // ChatResult.content is already an Option<String>
-        let text = result.content.unwrap_or_default();
-
-        Ok(text)
-    }
-
-    /// Parse the LLM response into an evaluation result
-    fn parse_evaluation_response(
-        &self,
-        response: &str,
-        context: &EvaluationContext,
-    ) -> Result<ContractEvaluationResult, EvaluationError> {
-        // Extract JSON from response (may be wrapped in markdown code blocks)
-        let json_str = extract_json_from_response(response)?;
-
-        // Parse the JSON
-        let parsed: EvaluationResponseJson = serde_json::from_str(&json_str)
-            .map_err(|e| EvaluationError::ParseError(format!("JSON parse error: {}", e)))?;
-
-        // Convert to our result type
-        let criteria_results: Vec<EvaluationCriterionResult> = parsed
-            .criteria_results
-            .into_iter()
-            .map(|cr| EvaluationCriterionResult {
-                criterion_id: cr.criterion_id.clone(),
-                criterion_text: cr.criterion_id, // Use ID as text if not provided
-                passed: cr.passed,
-                score: cr.score,
-                feedback: cr.feedback,
-                evidence: vec![],
-            })
-            .collect();
-
-        // Determine pass/fail based on threshold and results
-        let passed = parsed.passed && parsed.overall_score >= self.pass_threshold;
-
-        Ok(ContractEvaluationResult {
-            passed,
-            overall_score: parsed.overall_score,
-            criteria_results,
-            summary_feedback: parsed.summary_feedback,
-            rework_instructions: if passed { None } else { parsed.rework_instructions },
-        })
-    }
-
-    /// Save evaluation result to database
-    pub async fn save_evaluation(
-        &self,
-        contract_id: Uuid,
-        chain_id: Uuid,
-        chain_contract_id: Uuid,
-        result: &ContractEvaluationResult,
-    ) -> Result<ContractEvaluation, EvaluationError> {
-        let req = CreateContractEvaluationRequest {
-            contract_id,
-            chain_id: Some(chain_id),
-            chain_contract_id: Some(chain_contract_id),
-            evaluator_model: Some(format!("{:?}", self.model)),
-            passed: result.passed,
-            overall_score: Some(result.overall_score),
-            criteria_results: result.criteria_results.clone(),
-            summary_feedback: result.summary_feedback.clone(),
-            rework_instructions: result.rework_instructions.clone(),
-        };
-
-        let evaluation = repository::create_contract_evaluation(&self.pool, req).await?;
-
-        // Update chain contract status
-        let status = if result.passed { "passed" } else { "failed" };
-        repository::update_chain_contract_evaluation_status(
-            &self.pool,
-            chain_contract_id,
-            status,
-            Some(evaluation.id),
-            result.rework_instructions.as_deref(),
-        )
-        .await?;
-
-        Ok(evaluation)
-    }
-}
-
-/// JSON structure for parsing LLM response
-#[derive(Debug, Deserialize)]
-#[serde(rename_all = "camelCase")]
-struct EvaluationResponseJson {
-    passed: bool,
-    overall_score: f64,
-    criteria_results: Vec<CriterionResultJson>,
-    summary_feedback: String,
-    rework_instructions: Option<String>,
-}
-
-#[derive(Debug, Deserialize)]
-#[serde(rename_all = "camelCase")]
-struct CriterionResultJson {
-    criterion_id: String,
-    #[serde(alias = "met")]
-    passed: bool,
-    #[serde(default)]
-    score: f64,
-    feedback: String,
-}
-
-/// Extract JSON from a response that may contain markdown code blocks
-fn extract_json_from_response(response: &str) -> Result<String, EvaluationError> {
-    // Try to find JSON in code blocks first
-    if let Some(start) = response.find("```json") {
-        let json_start = start + 7;
-        if let Some(end) = response[json_start..].find("```") {
-            return Ok(response[json_start..json_start + end].trim().to_string());
-        }
-    }
-
-    // Try plain code blocks
-    if let Some(start) = response.find("```") {
-        let json_start = start + 3;
-        // Skip any language identifier on the same line
-        let actual_start = response[json_start..]
-            .find('\n')
-            .map(|i| json_start + i + 1)
-            .unwrap_or(json_start);
-        if let Some(end) = response[actual_start..].find("```") {
-            return Ok(response[actual_start..actual_start + end].trim().to_string());
-        }
-    }
-
-    // Try to find raw JSON (starts with {)
-    if let Some(start) = response.find('{') {
-        // Find matching closing brace
-        let mut depth = 0;
-        let mut end = start;
-        for (i, c) in response[start..].char_indices() {
-            match c {
-                '{' => depth += 1,
-                '}' => {
-                    depth -= 1;
-                    if depth == 0 {
-                        end = start + i + 1;
-                        break;
-                    }
-                }
-                _ => {}
-            }
-        }
-        if end > start {
-            return Ok(response[start..end].to_string());
-        }
-    }
-
-    Err(EvaluationError::ParseError(
-        "Could not find JSON in response".to_string(),
-    ))
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_extract_json_from_code_block() {
-        let response = r#"Here is the evaluation:
-
-```json
-{
-  "passed": true,
-  "overallScore": 0.85
 }
-```
 
-Done."#;
+/// Error types for contract evaluation.
+#[derive(Debug, thiserror::Error)]
+pub enum ContractEvaluatorError {
+    #[error("Database error: {0}")]
+    Database(#[from] sqlx::Error),
 
-        let json = extract_json_from_response(response).unwrap();
-        assert!(json.contains("\"passed\": true"));
-    }
+    #[error("LLM error: {0}")]
+    Llm(String),
 
-    #[test]
-    fn test_extract_json_raw() {
-        let response = r#"The result is {"passed": false, "overallScore": 0.5}"#;
-        let json = extract_json_from_response(response).unwrap();
-        assert!(json.contains("\"passed\": false"));
-    }
+    #[error("Not implemented: {0}")]
+    NotImplemented(String),
 }
-- 
cgit v1.2.3