diff options
| author | soryu <soryu@soryu.co> | 2026-02-05 23:42:48 +0000 |
|---|---|---|
| committer | soryu <soryu@soryu.co> | 2026-02-05 23:42:48 +0000 |
| commit | 88a4f15ce1310f8ee8693835be14aa5280233f17 (patch) | |
| tree | 5c1a0417e02071d2198d13478ffa85533b19f891 /makima/src/orchestration/verifier.rs | |
| parent | f1a50b80f3969d150bd1c31edde0aff05369157e (diff) | |
| download | soryu-88a4f15ce1310f8ee8693835be14aa5280233f17.tar.gz soryu-88a4f15ce1310f8ee8693835be14aa5280233f17.zip | |
Add directive-first chain system redesign
Redesigns the chain system with a directive-first architecture where
Directive is the top-level entity (the "why/what") and Chains are
generated execution plans (the "how") that can be dynamically modified.
Backend:
- Add database migration for directive system tables
- Add Directive, DirectiveChain, ChainStep, DirectiveEvent models
- Add DirectiveVerifier and DirectiveApproval models
- Add orchestration module with engine, planner, and verifier
- Add comprehensive API handlers for directives
- Add daemon CLI commands for directive management
- Add directive skill documentation
- Integrate contract completion with directive engine
- Add SSE endpoint for real-time directive events
Frontend:
- Add directives route with split-view layout
- Add 6-tab detail view (Overview, Chain, Events, Evaluations, Approvals, Verifiers)
- Add React Flow DAG visualization for chain steps
- Add SSE subscription hook for real-time event updates
- Add useDirectives and useDirectiveEventSubscription hooks
- Add directive types and API functions
Fixes:
- Fix test failures in ws/protocol, task_output, completion_gate, patch
- Fix word boundary matching in looks_like_task()
- Fix parse_last() to find actual last completion gate
- Fix create_export_patch when merge-base equals HEAD
- Clean up clippy warnings in new code
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Diffstat (limited to 'makima/src/orchestration/verifier.rs')
| -rw-r--r-- | makima/src/orchestration/verifier.rs | 806 |
1 file changed, 806 insertions, 0 deletions
diff --git a/makima/src/orchestration/verifier.rs b/makima/src/orchestration/verifier.rs new file mode 100644 index 0000000..e98da50 --- /dev/null +++ b/makima/src/orchestration/verifier.rs @@ -0,0 +1,806 @@ +//! Verification system for directive step evaluation. +//! +//! Provides tiered verification: programmatic verifiers run first, +//! then LLM evaluation if programmatic checks pass. Composite scoring +//! combines results with configurable weights. + +use async_trait::async_trait; +use serde::{Deserialize, Serialize}; +use serde_json::Value as JsonValue; +use std::path::Path; +use thiserror::Error; +use uuid::Uuid; + +/// Confidence level based on composite score and thresholds. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ConfidenceLevel { + /// High confidence (score >= green threshold) + Green, + /// Medium confidence (score >= yellow threshold but < green) + Yellow, + /// Low confidence (score < yellow threshold) + Red, +} + +impl ConfidenceLevel { + /// Compute confidence level from score and thresholds. + pub fn from_score(score: f64, green_threshold: f64, yellow_threshold: f64) -> Self { + if score >= green_threshold { + ConfidenceLevel::Green + } else if score >= yellow_threshold { + ConfidenceLevel::Yellow + } else { + ConfidenceLevel::Red + } + } + + /// Convert to string for database storage. + pub fn as_str(&self) -> &'static str { + match self { + ConfidenceLevel::Green => "green", + ConfidenceLevel::Yellow => "yellow", + ConfidenceLevel::Red => "red", + } + } +} + +impl std::fmt::Display for ConfidenceLevel { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +/// Type of verifier for categorization. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum VerifierType { + /// Run test suite (npm test, cargo test, pytest, etc.) 
+ TestRunner, + /// Run linter (eslint, clippy, ruff, etc.) + Linter, + /// Run type checker (tsc, mypy, etc.) + TypeChecker, + /// Run build command (npm build, cargo build, etc.) + Build, + /// Custom command verifier + Custom, + /// LLM-based semantic evaluation + Llm, +} + +impl VerifierType { + pub fn as_str(&self) -> &'static str { + match self { + VerifierType::TestRunner => "test_runner", + VerifierType::Linter => "linter", + VerifierType::TypeChecker => "type_checker", + VerifierType::Build => "build", + VerifierType::Custom => "custom", + VerifierType::Llm => "llm", + } + } +} + +/// Result of a single verifier run. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VerifierResult { + /// Name of the verifier + pub name: String, + /// Type of verifier + pub verifier_type: VerifierType, + /// Whether the verification passed + pub passed: bool, + /// Score from 0.0 to 1.0 (1.0 = perfect, 0.0 = complete failure) + pub score: f64, + /// Weight for composite scoring (default 1.0 for programmatic, 2.0 for LLM) + pub weight: f64, + /// Whether this verifier is required (failure = automatic red confidence) + pub required: bool, + /// Human-readable output/feedback + pub output: String, + /// Structured details (test counts, lint errors, etc.) + pub details: Option<JsonValue>, + /// Execution time in milliseconds + pub duration_ms: u64, +} + +impl VerifierResult { + /// Create a passed result with full score. + pub fn passed(name: String, verifier_type: VerifierType, output: String) -> Self { + Self { + name, + verifier_type, + passed: true, + score: 1.0, + weight: 1.0, + required: false, + output, + details: None, + duration_ms: 0, + } + } + + /// Create a failed result with zero score. 
+ pub fn failed(name: String, verifier_type: VerifierType, output: String) -> Self { + Self { + name, + verifier_type, + passed: false, + score: 0.0, + weight: 1.0, + required: false, + output, + details: None, + duration_ms: 0, + } + } + + /// Set the weight for this result. + pub fn with_weight(mut self, weight: f64) -> Self { + self.weight = weight; + self + } + + /// Mark this verifier as required. + pub fn as_required(mut self) -> Self { + self.required = true; + self + } + + /// Set the score explicitly. + pub fn with_score(mut self, score: f64) -> Self { + self.score = score.clamp(0.0, 1.0); + self + } + + /// Set structured details. + pub fn with_details(mut self, details: JsonValue) -> Self { + self.details = Some(details); + self + } + + /// Set execution duration. + pub fn with_duration(mut self, duration_ms: u64) -> Self { + self.duration_ms = duration_ms; + self + } +} + +/// Composite evaluation result combining multiple verifier results. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct EvaluationResult { + /// Unique ID for this evaluation + pub id: Uuid, + /// Step ID being evaluated + pub step_id: Uuid, + /// Whether all required verifiers passed + pub passed: bool, + /// Weighted composite score (0.0-1.0) + pub composite_score: f64, + /// Confidence level derived from score + pub confidence_level: ConfidenceLevel, + /// Individual verifier results + pub verifier_results: Vec<VerifierResult>, + /// Summary feedback for the step + pub summary: String, + /// Rework instructions if failed + pub rework_instructions: Option<String>, + /// Total evaluation duration in milliseconds + pub total_duration_ms: u64, +} + +impl EvaluationResult { + /// Create a new evaluation result from verifier results. 
+ pub fn from_verifiers( + step_id: Uuid, + results: Vec<VerifierResult>, + green_threshold: f64, + yellow_threshold: f64, + ) -> Self { + let id = Uuid::new_v4(); + + // Check if any required verifier failed + let any_required_failed = results.iter().any(|r| r.required && !r.passed); + + // Calculate weighted composite score + let (total_weighted_score, total_weight) = + results + .iter() + .fold((0.0, 0.0), |(score_acc, weight_acc), r| { + (score_acc + r.score * r.weight, weight_acc + r.weight) + }); + + let composite_score = if total_weight > 0.0 { + total_weighted_score / total_weight + } else { + 0.0 + }; + + // Override confidence to red if any required verifier failed + let confidence_level = if any_required_failed { + ConfidenceLevel::Red + } else { + ConfidenceLevel::from_score(composite_score, green_threshold, yellow_threshold) + }; + + let passed = !any_required_failed && confidence_level != ConfidenceLevel::Red; + + // Generate summary + let passed_count = results.iter().filter(|r| r.passed).count(); + let total_count = results.len(); + let summary = format!( + "{}/{} verifiers passed, composite score: {:.2}, confidence: {}", + passed_count, total_count, composite_score, confidence_level + ); + + // Generate rework instructions if failed + let rework_instructions = if !passed { + let failed_verifiers: Vec<&str> = results + .iter() + .filter(|r| !r.passed) + .map(|r| r.name.as_str()) + .collect(); + Some(format!( + "Fix issues identified by: {}", + failed_verifiers.join(", ") + )) + } else { + None + }; + + let total_duration_ms = results.iter().map(|r| r.duration_ms).sum(); + + Self { + id, + step_id, + passed, + composite_score, + confidence_level, + verifier_results: results, + summary, + rework_instructions, + total_duration_ms, + } + } +} + +/// Error type for verification operations. 
+#[derive(Error, Debug)] +pub enum VerifierError { + #[error("Command execution failed: {0}")] + CommandFailed(String), + + #[error("Command timed out after {0}ms")] + Timeout(u64), + + #[error("Working directory not found: {0}")] + WorkingDirectoryNotFound(String), + + #[error("Verifier not configured: {0}")] + NotConfigured(String), + + #[error("Parse error: {0}")] + ParseError(String), + + #[error("LLM error: {0}")] + LlmError(String), + + #[error("IO error: {0}")] + Io(#[from] std::io::Error), +} + +/// Verifier trait for pluggable verification implementations. +#[async_trait] +pub trait Verifier: Send + Sync { + /// Get the name of this verifier. + fn name(&self) -> &str; + + /// Get the type of this verifier. + fn verifier_type(&self) -> VerifierType; + + /// Check if this verifier is applicable to the given repository. + async fn is_applicable(&self, repo_path: &Path) -> bool; + + /// Run verification and return result. + async fn verify(&self, repo_path: &Path, context: &VerificationContext) + -> Result<VerifierResult, VerifierError>; +} + +/// Context provided to verifiers during execution. +#[derive(Debug, Clone)] +pub struct VerificationContext { + /// Step ID being verified + pub step_id: Uuid, + /// Contract ID if step has been instantiated + pub contract_id: Option<Uuid>, + /// Files that were modified in this step + pub modified_files: Vec<String>, + /// Step description for LLM context + pub step_description: String, + /// Acceptance criteria for LLM evaluation + pub acceptance_criteria: Vec<String>, + /// Additional context from directive + pub directive_context: String, +} + +/// Command-based verifier for running shell commands. 
+pub struct CommandVerifier { + name: String, + verifier_type: VerifierType, + command: String, + #[allow(dead_code)] + working_dir: Option<String>, + #[allow(dead_code)] + timeout_ms: u64, + required: bool, + /// Files/patterns that indicate this verifier is applicable + applicable_patterns: Vec<String>, +} + +impl CommandVerifier { + /// Create a new command verifier. + pub fn new( + name: impl Into<String>, + verifier_type: VerifierType, + command: impl Into<String>, + ) -> Self { + Self { + name: name.into(), + verifier_type, + command: command.into(), + working_dir: None, + timeout_ms: 300_000, // 5 minute default + required: false, + applicable_patterns: Vec::new(), + } + } + + /// Set the working directory. + #[allow(dead_code)] + pub fn with_working_dir(mut self, dir: impl Into<String>) -> Self { + self.working_dir = Some(dir.into()); + self + } + + /// Set the timeout in milliseconds. + #[allow(dead_code)] + pub fn with_timeout(mut self, timeout_ms: u64) -> Self { + self.timeout_ms = timeout_ms; + self + } + + /// Mark as required verifier. + pub fn as_required(mut self) -> Self { + self.required = true; + self + } + + /// Add applicability patterns (files that must exist). 
+ pub fn with_patterns(mut self, patterns: Vec<String>) -> Self { + self.applicable_patterns = patterns; + self + } +} + +#[async_trait] +impl Verifier for CommandVerifier { + fn name(&self) -> &str { + &self.name + } + + fn verifier_type(&self) -> VerifierType { + self.verifier_type.clone() + } + + async fn is_applicable(&self, repo_path: &Path) -> bool { + if self.applicable_patterns.is_empty() { + return true; + } + + for pattern in &self.applicable_patterns { + let check_path = repo_path.join(pattern); + if check_path.exists() { + return true; + } + } + false + } + + async fn verify( + &self, + repo_path: &Path, + _context: &VerificationContext, + ) -> Result<VerifierResult, VerifierError> { + let start = std::time::Instant::now(); + + let work_dir = self + .working_dir + .as_ref() + .map(|d| repo_path.join(d)) + .unwrap_or_else(|| repo_path.to_path_buf()); + + if !work_dir.exists() { + return Err(VerifierError::WorkingDirectoryNotFound( + work_dir.display().to_string(), + )); + } + + // Parse command into program and args + let parts: Vec<&str> = self.command.split_whitespace().collect(); + if parts.is_empty() { + return Err(VerifierError::CommandFailed( + "Empty command".to_string(), + )); + } + + let program = parts[0]; + let args = &parts[1..]; + + // Execute command + let output = tokio::process::Command::new(program) + .args(args) + .current_dir(&work_dir) + .output() + .await?; + + let duration_ms = start.elapsed().as_millis() as u64; + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + let combined_output = format!("{}\n{}", stdout, stderr); + + let passed = output.status.success(); + let score = if passed { 1.0 } else { 0.0 }; + + let mut result = VerifierResult { + name: self.name.clone(), + verifier_type: self.verifier_type.clone(), + passed, + score, + weight: 1.0, + required: self.required, + output: combined_output, + details: Some(serde_json::json!({ + "exit_code": output.status.code(), 
+ "command": self.command, + "working_dir": work_dir.display().to_string(), + })), + duration_ms, + }; + + // Try to extract more detailed scoring from output + result = self.enhance_result(result, &stdout); + + Ok(result) + } +} + +impl CommandVerifier { + /// Enhance result with parsed details from output. + fn enhance_result(&self, mut result: VerifierResult, stdout: &str) -> VerifierResult { + match self.verifier_type { + VerifierType::TestRunner => { + // Try to parse test counts from common formats + if let Some((passed, failed, total)) = parse_test_output(stdout) { + result.details = Some(serde_json::json!({ + "tests_passed": passed, + "tests_failed": failed, + "tests_total": total, + "command": self.command, + })); + if total > 0 { + result.score = passed as f64 / total as f64; + } + } + } + VerifierType::Linter => { + // Try to parse lint error counts + if let Some(error_count) = parse_lint_output(stdout) { + result.details = Some(serde_json::json!({ + "errors": error_count, + "command": self.command, + })); + // Score decreases with more errors (up to 10 errors = 0) + result.score = (1.0 - (error_count as f64 / 10.0)).max(0.0); + } + } + _ => {} + } + result + } +} + +/// Parse test output for common formats (Jest, pytest, cargo test). +fn parse_test_output(output: &str) -> Option<(u32, u32, u32)> { + // Jest format: "Tests: X passed, Y failed, Z total" + if let Some(caps) = regex::Regex::new(r"Tests:\s*(\d+)\s*passed,\s*(\d+)\s*failed,\s*(\d+)\s*total") + .ok()? + .captures(output) + { + let passed: u32 = caps.get(1)?.as_str().parse().ok()?; + let failed: u32 = caps.get(2)?.as_str().parse().ok()?; + let total: u32 = caps.get(3)?.as_str().parse().ok()?; + return Some((passed, failed, total)); + } + + // pytest format: "X passed, Y failed" + if let Some(caps) = regex::Regex::new(r"(\d+)\s*passed(?:,\s*(\d+)\s*failed)?") + .ok()? 
+ .captures(output) + { + let passed: u32 = caps.get(1)?.as_str().parse().ok()?; + let failed: u32 = caps.get(2).map(|m| m.as_str().parse().ok()).flatten().unwrap_or(0); + let total = passed + failed; + return Some((passed, failed, total)); + } + + // cargo test format: "test result: ok. X passed; Y failed;" + if let Some(caps) = regex::Regex::new(r"test result:.*?(\d+)\s*passed;\s*(\d+)\s*failed") + .ok()? + .captures(output) + { + let passed: u32 = caps.get(1)?.as_str().parse().ok()?; + let failed: u32 = caps.get(2)?.as_str().parse().ok()?; + let total = passed + failed; + return Some((passed, failed, total)); + } + + None +} + +/// Parse lint output for error counts. +fn parse_lint_output(output: &str) -> Option<u32> { + // ESLint format: "X problems (Y errors, Z warnings)" + if let Some(caps) = regex::Regex::new(r"(\d+)\s*problems?\s*\((\d+)\s*errors?") + .ok()? + .captures(output) + { + return caps.get(2)?.as_str().parse().ok(); + } + + // Clippy format: "warning: X warnings emitted" + if let Some(caps) = regex::Regex::new(r"warning:\s*(\d+)\s*warnings?\s*emitted") + .ok()? + .captures(output) + { + return caps.get(1)?.as_str().parse().ok(); + } + + None +} + +/// Auto-detect applicable verifiers for a repository. 
+pub async fn auto_detect_verifiers(repo_path: &Path) -> Vec<Box<dyn Verifier>> { + let mut verifiers: Vec<Box<dyn Verifier>> = Vec::new(); + + // Check for package.json (Node.js) + let package_json = repo_path.join("package.json"); + if package_json.exists() { + if let Ok(content) = tokio::fs::read_to_string(&package_json).await { + if let Ok(pkg) = serde_json::from_str::<serde_json::Value>(&content) { + if let Some(scripts) = pkg.get("scripts").and_then(|s| s.as_object()) { + // Test runner + if scripts.contains_key("test") { + verifiers.push(Box::new( + CommandVerifier::new("npm-test", VerifierType::TestRunner, "npm test") + .with_patterns(vec!["package.json".to_string()]) + .as_required(), + )); + } + + // Linter + if scripts.contains_key("lint") { + verifiers.push(Box::new( + CommandVerifier::new("npm-lint", VerifierType::Linter, "npm run lint") + .with_patterns(vec!["package.json".to_string()]), + )); + } + + // Build + if scripts.contains_key("build") { + verifiers.push(Box::new( + CommandVerifier::new("npm-build", VerifierType::Build, "npm run build") + .with_patterns(vec!["package.json".to_string()]) + .as_required(), + )); + } + + // Type check (for TypeScript projects) + if scripts.contains_key("typecheck") || scripts.contains_key("type-check") { + let cmd = if scripts.contains_key("typecheck") { + "npm run typecheck" + } else { + "npm run type-check" + }; + verifiers.push(Box::new( + CommandVerifier::new("npm-typecheck", VerifierType::TypeChecker, cmd) + .with_patterns(vec!["tsconfig.json".to_string()]), + )); + } + } + } + } + } + + // Check for Cargo.toml (Rust) + let cargo_toml = repo_path.join("Cargo.toml"); + if cargo_toml.exists() { + verifiers.push(Box::new( + CommandVerifier::new("cargo-test", VerifierType::TestRunner, "cargo test") + .with_patterns(vec!["Cargo.toml".to_string()]) + .as_required(), + )); + + verifiers.push(Box::new( + CommandVerifier::new("cargo-clippy", VerifierType::Linter, "cargo clippy -- -D warnings") + 
.with_patterns(vec!["Cargo.toml".to_string()]), + )); + + verifiers.push(Box::new( + CommandVerifier::new("cargo-build", VerifierType::Build, "cargo build") + .with_patterns(vec!["Cargo.toml".to_string()]) + .as_required(), + )); + } + + // Check for pyproject.toml or setup.py (Python) + let pyproject = repo_path.join("pyproject.toml"); + let setup_py = repo_path.join("setup.py"); + if pyproject.exists() || setup_py.exists() { + verifiers.push(Box::new( + CommandVerifier::new("pytest", VerifierType::TestRunner, "pytest") + .with_patterns(vec![ + "pyproject.toml".to_string(), + "setup.py".to_string(), + ]) + .as_required(), + )); + + verifiers.push(Box::new( + CommandVerifier::new("ruff", VerifierType::Linter, "ruff check .") + .with_patterns(vec!["pyproject.toml".to_string()]), + )); + } + + verifiers +} + +/// Composite evaluator that runs multiple verifiers and combines results. +pub struct CompositeEvaluator { + verifiers: Vec<Box<dyn Verifier>>, + green_threshold: f64, + yellow_threshold: f64, +} + +impl CompositeEvaluator { + /// Create a new composite evaluator with default thresholds. + pub fn new(verifiers: Vec<Box<dyn Verifier>>) -> Self { + Self { + verifiers, + green_threshold: 0.8, + yellow_threshold: 0.5, + } + } + + /// Set confidence thresholds. + pub fn with_thresholds(mut self, green: f64, yellow: f64) -> Self { + self.green_threshold = green; + self.yellow_threshold = yellow; + self + } + + /// Add a verifier. + pub fn add_verifier(mut self, verifier: Box<dyn Verifier>) -> Self { + self.verifiers.push(verifier); + self + } + + /// Run all applicable verifiers and return composite result. 
+ pub async fn evaluate( + &self, + repo_path: &Path, + context: &VerificationContext, + ) -> EvaluationResult { + let mut results = Vec::new(); + + for verifier in &self.verifiers { + if !verifier.is_applicable(repo_path).await { + continue; + } + + match verifier.verify(repo_path, context).await { + Ok(result) => results.push(result), + Err(e) => { + // Convert error to failed result + results.push(VerifierResult::failed( + verifier.name().to_string(), + verifier.verifier_type(), + format!("Verifier error: {}", e), + )); + } + } + } + + EvaluationResult::from_verifiers( + context.step_id, + results, + self.green_threshold, + self.yellow_threshold, + ) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_confidence_level_from_score() { + assert_eq!( + ConfidenceLevel::from_score(0.9, 0.8, 0.5), + ConfidenceLevel::Green + ); + assert_eq!( + ConfidenceLevel::from_score(0.8, 0.8, 0.5), + ConfidenceLevel::Green + ); + assert_eq!( + ConfidenceLevel::from_score(0.6, 0.8, 0.5), + ConfidenceLevel::Yellow + ); + assert_eq!( + ConfidenceLevel::from_score(0.5, 0.8, 0.5), + ConfidenceLevel::Yellow + ); + assert_eq!( + ConfidenceLevel::from_score(0.4, 0.8, 0.5), + ConfidenceLevel::Red + ); + } + + #[test] + fn test_evaluation_result_composite_score() { + let results = vec![ + VerifierResult::passed("test1".into(), VerifierType::TestRunner, "OK".into()) + .with_weight(1.0), + VerifierResult::failed("test2".into(), VerifierType::Linter, "Failed".into()) + .with_weight(1.0), + ]; + + let eval = EvaluationResult::from_verifiers(Uuid::new_v4(), results, 0.8, 0.5); + assert!((eval.composite_score - 0.5).abs() < 0.001); + assert_eq!(eval.confidence_level, ConfidenceLevel::Yellow); + } + + #[test] + fn test_required_verifier_override() { + let results = vec![ + VerifierResult::passed("test1".into(), VerifierType::TestRunner, "OK".into()), + VerifierResult::failed("build".into(), VerifierType::Build, "Failed".into()) + .as_required(), + ]; + + let eval = 
EvaluationResult::from_verifiers(Uuid::new_v4(), results, 0.8, 0.5); + // Even though composite score is 0.5, required failure overrides to red + assert_eq!(eval.confidence_level, ConfidenceLevel::Red); + assert!(!eval.passed); + } + + #[test] + fn test_parse_test_output_jest() { + let output = "Tests: 10 passed, 2 failed, 12 total"; + let (passed, failed, total) = parse_test_output(output).unwrap(); + assert_eq!(passed, 10); + assert_eq!(failed, 2); + assert_eq!(total, 12); + } + + #[test] + fn test_parse_test_output_cargo() { + let output = "test result: ok. 25 passed; 0 failed;"; + let (passed, failed, total) = parse_test_output(output).unwrap(); + assert_eq!(passed, 25); + assert_eq!(failed, 0); + assert_eq!(total, 25); + } +} |
