From 88a4f15ce1310f8ee8693835be14aa5280233f17 Mon Sep 17 00:00:00 2001 From: soryu Date: Thu, 5 Feb 2026 23:42:48 +0000 Subject: Add directive-first chain system redesign Redesigns the chain system with a directive-first architecture where Directive is the top-level entity (the "why/what") and Chains are generated execution plans (the "how") that can be dynamically modified. Backend: - Add database migration for directive system tables - Add Directive, DirectiveChain, ChainStep, DirectiveEvent models - Add DirectiveVerifier and DirectiveApproval models - Add orchestration module with engine, planner, and verifier - Add comprehensive API handlers for directives - Add daemon CLI commands for directive management - Add directive skill documentation - Integrate contract completion with directive engine - Add SSE endpoint for real-time directive events Frontend: - Add directives route with split-view layout - Add 6-tab detail view (Overview, Chain, Events, Evaluations, Approvals, Verifiers) - Add React Flow DAG visualization for chain steps - Add SSE subscription hook for real-time event updates - Add useDirectives and useDirectiveEventSubscription hooks - Add directive types and API functions Fixes: - Fix test failures in ws/protocol, task_output, completion_gate, patch - Fix word boundary matching in looks_like_task() - Fix parse_last() to find actual last completion gate - Fix create_export_patch when merge-base equals HEAD - Clean up clippy warnings in new code Co-Authored-By: Claude Opus 4.5 --- makima/src/llm/contract_evaluator.rs | 564 ++++------------------------------- 1 file changed, 53 insertions(+), 511 deletions(-) (limited to 'makima/src/llm/contract_evaluator.rs') diff --git a/makima/src/llm/contract_evaluator.rs b/makima/src/llm/contract_evaluator.rs index fcc4826..e63bbfa 100644 --- a/makima/src/llm/contract_evaluator.rs +++ b/makima/src/llm/contract_evaluator.rs @@ -1,25 +1,19 @@ //! 
Contract Evaluator - LLM-based evaluation of completed contracts against directive. //! -//! This module provides functionality for: -//! - Gathering deliverables, files, and task outputs from completed contracts -//! - Building evaluation prompts using directive and acceptance criteria -//! - Calling LLM to evaluate work against requirements -//! - Parsing evaluation responses +//! This module will be reimplemented as part of the directive verification engine. +//! See the orchestration module for the new evaluation system. +//! +//! The new evaluation system will provide: +//! - Tiered verification (programmatic verifiers first, then LLM evaluation) +//! - Composite confidence scoring (weighted combination of results) +//! - Pluggable verifier interface (test runner, linter, build, type checker) +//! - Proper integration with the directive chain steps use serde::{Deserialize, Serialize}; use sqlx::PgPool; use uuid::Uuid; -use crate::db::{ - models::{ - ChainContract, ChainDirective, Contract, ContractEvaluation, CreateContractEvaluationRequest, - DirectiveAcceptanceCriterion, DirectiveRequirement, EvaluationCriterionResult, - }, - repository, -}; - -use super::claude::{ClaudeClient, ClaudeModel, Message, MessageContent}; -use super::tools::Tool; +// use crate::db::models::{Contract, DirectiveAcceptanceCriterion, DirectiveRequirement}; /// Result of contract evaluation #[derive(Debug, Clone, Serialize, Deserialize)] @@ -30,526 +24,74 @@ pub struct ContractEvaluationResult { /// Overall score from 0.0 to 1.0 pub overall_score: f64, /// Results for each acceptance criterion - pub criteria_results: Vec<EvaluationCriterionResult>, + pub criteria_results: Vec<EvaluationCriterionResultLegacy>, /// Summary feedback from the evaluator pub summary_feedback: String, /// Instructions for rework if failed pub rework_instructions: Option<String>, } -/// Context gathered for evaluation -#[derive(Debug, Clone)] -pub struct EvaluationContext { - /// The contract being evaluated - pub contract: Contract, - /// The chain contract record - pub
chain_contract: ChainContract, - /// The directive document - pub directive: ChainDirective, - /// Files associated with the contract - pub files: Vec, - /// Task outputs from the contract - pub task_outputs: Vec, - /// Deliverables marked as complete - pub deliverables: Vec, - /// Acceptance criteria specific to this contract - pub acceptance_criteria: Vec, - /// Requirements mapped to this contract - pub requirements: Vec, +/// Per-criterion evaluation result (legacy - kept for compatibility) +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct EvaluationCriterionResultLegacy { + pub criterion_id: String, + pub criterion_text: String, + pub passed: bool, + /// Score (0.0-1.0) + pub score: f64, + pub feedback: String, + /// Evidence supporting the evaluation + pub evidence: Vec, } -/// File content for evaluation -#[derive(Debug, Clone, Serialize)] +/// File content for evaluation context +#[derive(Debug, Clone)] pub struct FileContent { pub path: String, - pub description: Option, pub content: String, - pub is_deliverable: bool, } -/// Task output for evaluation -#[derive(Debug, Clone, Serialize)] -pub struct TaskOutput { - pub task_name: String, - pub output_summary: String, - pub exit_code: Option, -} - -/// Deliverable info for evaluation -#[derive(Debug, Clone, Serialize)] -pub struct DeliverableInfo { - pub name: String, - pub status: String, - pub file_path: Option, -} - -/// Error types for evaluation -#[derive(Debug, thiserror::Error)] -pub enum EvaluationError { - #[error("Database error: {0}")] - Database(#[from] sqlx::Error), - - #[error("Contract not found: {0}")] - ContractNotFound(Uuid), - - #[error("Chain contract not found for contract: {0}")] - ChainContractNotFound(Uuid), - - #[error("Directive not found for chain: {0}")] - DirectiveNotFound(Uuid), - - #[error("LLM evaluation failed: {0}")] - LlmError(String), - - #[error("Failed to parse evaluation response: {0}")] - ParseError(String), -} - -/// 
Contract evaluator for directive-driven evaluation +/// Contract evaluator for LLM-based assessment. +/// +/// NOTE: This is a stub implementation. The full evaluation system will be +/// implemented as part of the orchestration/verifier module. pub struct ContractEvaluator { - pool: PgPool, - claude_client: ClaudeClient, - model: ClaudeModel, - /// Minimum score required to pass (default 0.8) - pass_threshold: f64, + _pool: PgPool, } impl ContractEvaluator { - /// Create a new evaluator - pub fn new(pool: PgPool, claude_client: ClaudeClient) -> Self { - Self { - pool, - claude_client, - model: ClaudeModel::Sonnet, - pass_threshold: 0.8, - } - } - - /// Set the LLM model to use for evaluation - pub fn with_model(mut self, model: ClaudeModel) -> Self { - self.model = model; - self - } - - /// Set the pass threshold - pub fn with_pass_threshold(mut self, threshold: f64) -> Self { - self.pass_threshold = threshold; - self + /// Create a new contract evaluator. + pub fn new(pool: PgPool) -> Self { + Self { _pool: pool } } - /// Evaluate a completed contract against the directive + /// Evaluate a contract - stub implementation. 
+ /// + /// This will be reimplemented in the orchestration module with: + /// - Programmatic verification (tests, lint, build) + /// - LLM evaluation + /// - Composite scoring pub async fn evaluate_contract( &self, - contract_id: Uuid, - owner_id: Uuid, - ) -> Result<ContractEvaluationResult, EvaluationError> { - // Gather evaluation context - let context = self.gather_context(contract_id, owner_id).await?; - - // Build evaluation prompt - let prompt = self.build_evaluation_prompt(&context); - - // Call LLM for evaluation - let response = self.call_llm_for_evaluation(&prompt).await?; - - // Parse the response - let result = self.parse_evaluation_response(&response, &context)?; - - Ok(result) - } - - /// Gather all context needed for evaluation - async fn gather_context( - &self, - contract_id: Uuid, - owner_id: Uuid, - ) -> Result<EvaluationContext, EvaluationError> { - // Get contract - let contract = repository::get_contract_for_owner(&self.pool, contract_id, owner_id) - .await? - .ok_or(EvaluationError::ContractNotFound(contract_id))?; - - // Get chain contract - let chain_contract = repository::get_chain_contract_by_contract_id(&self.pool, contract_id) - .await? - .ok_or(EvaluationError::ChainContractNotFound(contract_id))?; - - // Get directive - let directive = repository::get_chain_directive(&self.pool, chain_contract.chain_id) - .await?
- .ok_or(EvaluationError::DirectiveNotFound(chain_contract.chain_id))?; - - // Get files directly from repository - let contract_files = repository::list_files_in_contract(&self.pool, contract_id, owner_id) - .await - .unwrap_or_default(); - - // Get tasks directly from repository - let contract_tasks = repository::list_tasks_in_contract(&self.pool, contract_id, owner_id) - .await - .unwrap_or_default(); - - // Build file contents from FileSummary - // Note: FileSummary doesn't have content, so we use name and description - let files: Vec = contract_files.iter().map(|f| { - FileContent { - path: f.repo_file_path.clone().unwrap_or_else(|| f.name.clone()), - description: f.description.clone(), - content: format!("[File: {} - content not loaded in summary view]", f.name), - is_deliverable: false, // FileSummary doesn't track deliverable status - } - }).collect(); - - // Build task outputs from TaskSummary - let task_outputs: Vec = contract_tasks.iter().map(|t| { - TaskOutput { - task_name: t.name.clone(), - output_summary: t.progress_summary.clone().unwrap_or_else(|| format!("Status: {}", t.status)), - exit_code: None, - } - }).collect(); - - // Build deliverables info from files marked as deliverables - // Since FileSummary doesn't have deliverable info, we treat all files as potential deliverables - let deliverables: Vec = contract_files.iter() - .map(|f| DeliverableInfo { - name: f.name.clone(), - status: "complete".to_string(), - file_path: f.repo_file_path.clone(), - }) - .collect(); - - // Parse requirements and acceptance criteria from directive - let requirements: Vec = - serde_json::from_value(directive.requirements.clone()).unwrap_or_default(); - - let all_criteria: Vec = - serde_json::from_value(directive.acceptance_criteria.clone()).unwrap_or_default(); - - // Get contract definition to find mapped requirements - // For now, use all acceptance criteria - let acceptance_criteria = all_criteria; - - Ok(EvaluationContext { - contract, - chain_contract, - 
- directive, - files, - task_outputs, - deliverables, - acceptance_criteria, - requirements, - }) + _contract_id: Uuid, + ) -> Result<ContractEvaluationResult, ContractEvaluatorError> { + // TODO: Implement using the new directive evaluation system + Err(ContractEvaluatorError::NotImplemented( + "Contract evaluator will be reimplemented with directive system".to_string(), + )) } - - /// Build the evaluation prompt - fn build_evaluation_prompt(&self, context: &EvaluationContext) -> String { - let mut prompt = String::new(); - - prompt.push_str("# Contract Completion Evaluation\n\n"); - prompt.push_str("You are evaluating whether a contract has been completed successfully against its requirements.\n\n"); - - // Contract info - prompt.push_str("## Contract Information\n\n"); - prompt.push_str(&format!("**Name:** {}\n", context.contract.name)); - if let Some(ref desc) = context.contract.description { - prompt.push_str(&format!("**Description:** {}\n", desc)); - } - prompt.push_str(&format!("**Type:** {}\n", context.contract.contract_type)); - prompt.push_str(&format!("**Phase:** {}\n", context.contract.phase)); - prompt.push_str("\n"); - - // Requirements - if !context.requirements.is_empty() { - prompt.push_str("## Requirements\n\n"); - for req in &context.requirements { - prompt.push_str(&format!("- **{}** ({}): {}\n", req.id, req.priority, req.title)); - if !req.description.is_empty() { - prompt.push_str(&format!(" {}\n", req.description)); - } - } - prompt.push_str("\n"); - } - - // Acceptance criteria - if !context.acceptance_criteria.is_empty() { - prompt.push_str("## Acceptance Criteria\n\n"); - for (i, criterion) in context.acceptance_criteria.iter().enumerate() { - prompt.push_str(&format!("{}. 
**{}**\n", i + 1, criterion.description)); - prompt.push_str(&format!(" - Testable: {}\n", criterion.testable)); - if !criterion.requirement_ids.is_empty() { - prompt.push_str(&format!(" - Covers: {}\n", criterion.requirement_ids.join(", "))); - } - } - prompt.push_str("\n"); - } - - // Deliverables - if !context.deliverables.is_empty() { - prompt.push_str("## Deliverables\n\n"); - for d in &context.deliverables { - prompt.push_str(&format!("- {} ({})\n", d.name, d.status)); - } - prompt.push_str("\n"); - } - - // Files - if !context.files.is_empty() { - prompt.push_str("## Files Created/Modified\n\n"); - for file in &context.files { - prompt.push_str(&format!("### {}", file.path)); - if file.is_deliverable { - prompt.push_str(" [DELIVERABLE]"); - } - prompt.push_str("\n"); - if let Some(ref desc) = file.description { - prompt.push_str(&format!("*{}*\n", desc)); - } - // Truncate content if too long - let content = if file.content.len() > 5000 { - format!("{}...\n[Content truncated - {} chars total]", - &file.content[..5000], file.content.len()) - } else { - file.content.clone() - }; - prompt.push_str("```\n"); - prompt.push_str(&content); - prompt.push_str("\n```\n\n"); - } - } - - // Task outputs - if !context.task_outputs.is_empty() { - prompt.push_str("## Task Outputs\n\n"); - for task in &context.task_outputs { - prompt.push_str(&format!("### {}\n", task.task_name)); - prompt.push_str(&format!("{}\n\n", task.output_summary)); - } - } - - // Evaluation instructions - prompt.push_str("## Evaluation Instructions\n\n"); - prompt.push_str("Please evaluate the completed work against the requirements and acceptance criteria.\n\n"); - prompt.push_str("For each acceptance criterion, determine if it has been met and provide a brief explanation.\n\n"); - prompt.push_str("Respond with a JSON object in the following format:\n\n"); - prompt.push_str("```json\n"); - prompt.push_str(r#"{ - "passed": true/false, - "overallScore": 0.0-1.0, - "criteriaResults": [ - { - 
"criterionId": "criterion identifier or index", - "met": true/false, - "score": 0.0-1.0, - "feedback": "explanation of why criterion was/wasn't met" - } - ], - "summaryFeedback": "overall summary of the evaluation", - "reworkInstructions": "if failed, specific instructions for what needs to be fixed (null if passed)" -} -"#); - prompt.push_str("```\n\n"); - prompt.push_str(&format!("The pass threshold is {}. ", self.pass_threshold)); - prompt.push_str("A contract passes if the overall score is >= the threshold AND all critical criteria are met.\n"); - - prompt - } - - /// Call LLM for evaluation - async fn call_llm_for_evaluation(&self, prompt: &str) -> Result { - let messages = vec![Message { - role: "user".to_string(), - content: MessageContent::Text(prompt.to_string()), - }]; - - // Use chat_with_tools with empty tools array for simple chat - let empty_tools: Vec = vec![]; - let result = self - .claude_client - .chat_with_tools(messages, &empty_tools) - .await - .map_err(|e| EvaluationError::LlmError(e.to_string()))?; - - // ChatResult.content is already an Option - let text = result.content.unwrap_or_default(); - - Ok(text) - } - - /// Parse the LLM response into an evaluation result - fn parse_evaluation_response( - &self, - response: &str, - context: &EvaluationContext, - ) -> Result { - // Extract JSON from response (may be wrapped in markdown code blocks) - let json_str = extract_json_from_response(response)?; - - // Parse the JSON - let parsed: EvaluationResponseJson = serde_json::from_str(&json_str) - .map_err(|e| EvaluationError::ParseError(format!("JSON parse error: {}", e)))?; - - // Convert to our result type - let criteria_results: Vec = parsed - .criteria_results - .into_iter() - .map(|cr| EvaluationCriterionResult { - criterion_id: cr.criterion_id.clone(), - criterion_text: cr.criterion_id, // Use ID as text if not provided - passed: cr.passed, - score: cr.score, - feedback: cr.feedback, - evidence: vec![], - }) - .collect(); - - // Determine 
pass/fail based on threshold and results - let passed = parsed.passed && parsed.overall_score >= self.pass_threshold; - - Ok(ContractEvaluationResult { - passed, - overall_score: parsed.overall_score, - criteria_results, - summary_feedback: parsed.summary_feedback, - rework_instructions: if passed { None } else { parsed.rework_instructions }, - }) - } - - /// Save evaluation result to database - pub async fn save_evaluation( - &self, - contract_id: Uuid, - chain_id: Uuid, - chain_contract_id: Uuid, - result: &ContractEvaluationResult, - ) -> Result { - let req = CreateContractEvaluationRequest { - contract_id, - chain_id: Some(chain_id), - chain_contract_id: Some(chain_contract_id), - evaluator_model: Some(format!("{:?}", self.model)), - passed: result.passed, - overall_score: Some(result.overall_score), - criteria_results: result.criteria_results.clone(), - summary_feedback: result.summary_feedback.clone(), - rework_instructions: result.rework_instructions.clone(), - }; - - let evaluation = repository::create_contract_evaluation(&self.pool, req).await?; - - // Update chain contract status - let status = if result.passed { "passed" } else { "failed" }; - repository::update_chain_contract_evaluation_status( - &self.pool, - chain_contract_id, - status, - Some(evaluation.id), - result.rework_instructions.as_deref(), - ) - .await?; - - Ok(evaluation) - } -} - -/// JSON structure for parsing LLM response -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -struct EvaluationResponseJson { - passed: bool, - overall_score: f64, - criteria_results: Vec, - summary_feedback: String, - rework_instructions: Option, -} - -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -struct CriterionResultJson { - criterion_id: String, - #[serde(alias = "met")] - passed: bool, - #[serde(default)] - score: f64, - feedback: String, -} - -/// Extract JSON from a response that may contain markdown code blocks -fn extract_json_from_response(response: &str) -> 
Result { - // Try to find JSON in code blocks first - if let Some(start) = response.find("```json") { - let json_start = start + 7; - if let Some(end) = response[json_start..].find("```") { - return Ok(response[json_start..json_start + end].trim().to_string()); - } - } - - // Try plain code blocks - if let Some(start) = response.find("```") { - let json_start = start + 3; - // Skip any language identifier on the same line - let actual_start = response[json_start..] - .find('\n') - .map(|i| json_start + i + 1) - .unwrap_or(json_start); - if let Some(end) = response[actual_start..].find("```") { - return Ok(response[actual_start..actual_start + end].trim().to_string()); - } - } - - // Try to find raw JSON (starts with {) - if let Some(start) = response.find('{') { - // Find matching closing brace - let mut depth = 0; - let mut end = start; - for (i, c) in response[start..].char_indices() { - match c { - '{' => depth += 1, - '}' => { - depth -= 1; - if depth == 0 { - end = start + i + 1; - break; - } - } - _ => {} - } - } - if end > start { - return Ok(response[start..end].to_string()); - } - } - - Err(EvaluationError::ParseError( - "Could not find JSON in response".to_string(), - )) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_extract_json_from_code_block() { - let response = r#"Here is the evaluation: - -```json -{ - "passed": true, - "overallScore": 0.85 } -``` -Done."#; +/// Error types for contract evaluation. 
+#[derive(Debug, thiserror::Error)] +pub enum ContractEvaluatorError { + #[error("Database error: {0}")] + Database(#[from] sqlx::Error), - let json = extract_json_from_response(response).unwrap(); - assert!(json.contains("\"passed\": true")); - } + #[error("LLM error: {0}")] + Llm(String), - #[test] - fn test_extract_json_raw() { - let response = r#"The result is {"passed": false, "overallScore": 0.5}"#; - let json = extract_json_from_response(response).unwrap(); - assert!(json.contains("\"passed\": false")); - } + #[error("Not implemented: {0}")] + NotImplemented(String), } -- cgit v1.2.3