//! Verification system for directive step evaluation. //! //! Provides tiered verification: programmatic verifiers run first, //! then LLM evaluation if programmatic checks pass. Composite scoring //! combines results with configurable weights. use async_trait::async_trait; use serde::{Deserialize, Serialize}; use serde_json::Value as JsonValue; use std::path::Path; use thiserror::Error; use uuid::Uuid; /// Confidence level based on composite score and thresholds. #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub enum ConfidenceLevel { /// High confidence (score >= green threshold) Green, /// Medium confidence (score >= yellow threshold but < green) Yellow, /// Low confidence (score < yellow threshold) Red, } impl ConfidenceLevel { /// Compute confidence level from score and thresholds. pub fn from_score(score: f64, green_threshold: f64, yellow_threshold: f64) -> Self { if score >= green_threshold { ConfidenceLevel::Green } else if score >= yellow_threshold { ConfidenceLevel::Yellow } else { ConfidenceLevel::Red } } /// Convert to string for database storage. pub fn as_str(&self) -> &'static str { match self { ConfidenceLevel::Green => "green", ConfidenceLevel::Yellow => "yellow", ConfidenceLevel::Red => "red", } } } impl std::fmt::Display for ConfidenceLevel { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.as_str()) } } /// Type of verifier for categorization. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub enum VerifierType { /// Run test suite (npm test, cargo test, pytest, etc.) TestRunner, /// Run linter (eslint, clippy, ruff, etc.) Linter, /// Run type checker (tsc, mypy, etc.) TypeChecker, /// Run build command (npm build, cargo build, etc.) 
Build, /// Custom command verifier Custom, /// LLM-based semantic evaluation Llm, } impl VerifierType { pub fn as_str(&self) -> &'static str { match self { VerifierType::TestRunner => "test_runner", VerifierType::Linter => "linter", VerifierType::TypeChecker => "type_checker", VerifierType::Build => "build", VerifierType::Custom => "custom", VerifierType::Llm => "llm", } } } /// Result of a single verifier run. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct VerifierResult { /// Name of the verifier pub name: String, /// Type of verifier pub verifier_type: VerifierType, /// Whether the verification passed pub passed: bool, /// Score from 0.0 to 1.0 (1.0 = perfect, 0.0 = complete failure) pub score: f64, /// Weight for composite scoring (default 1.0 for programmatic, 2.0 for LLM) pub weight: f64, /// Whether this verifier is required (failure = automatic red confidence) pub required: bool, /// Human-readable output/feedback pub output: String, /// Structured details (test counts, lint errors, etc.) pub details: Option, /// Execution time in milliseconds pub duration_ms: u64, } impl VerifierResult { /// Create a passed result with full score. pub fn passed(name: String, verifier_type: VerifierType, output: String) -> Self { Self { name, verifier_type, passed: true, score: 1.0, weight: 1.0, required: false, output, details: None, duration_ms: 0, } } /// Create a failed result with zero score. pub fn failed(name: String, verifier_type: VerifierType, output: String) -> Self { Self { name, verifier_type, passed: false, score: 0.0, weight: 1.0, required: false, output, details: None, duration_ms: 0, } } /// Set the weight for this result. pub fn with_weight(mut self, weight: f64) -> Self { self.weight = weight; self } /// Mark this verifier as required. pub fn as_required(mut self) -> Self { self.required = true; self } /// Set the score explicitly. 
pub fn with_score(mut self, score: f64) -> Self { self.score = score.clamp(0.0, 1.0); self } /// Set structured details. pub fn with_details(mut self, details: JsonValue) -> Self { self.details = Some(details); self } /// Set execution duration. pub fn with_duration(mut self, duration_ms: u64) -> Self { self.duration_ms = duration_ms; self } } /// Composite evaluation result combining multiple verifier results. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct EvaluationResult { /// Unique ID for this evaluation pub id: Uuid, /// Step ID being evaluated pub step_id: Uuid, /// Whether all required verifiers passed pub passed: bool, /// Weighted composite score (0.0-1.0) pub composite_score: f64, /// Confidence level derived from score pub confidence_level: ConfidenceLevel, /// Individual verifier results pub verifier_results: Vec, /// Summary feedback for the step pub summary: String, /// Rework instructions if failed pub rework_instructions: Option, /// Total evaluation duration in milliseconds pub total_duration_ms: u64, } impl EvaluationResult { /// Create a new evaluation result from verifier results. 
pub fn from_verifiers( step_id: Uuid, results: Vec, green_threshold: f64, yellow_threshold: f64, ) -> Self { let id = Uuid::new_v4(); // Check if any required verifier failed let any_required_failed = results.iter().any(|r| r.required && !r.passed); // Calculate weighted composite score let (total_weighted_score, total_weight) = results .iter() .fold((0.0, 0.0), |(score_acc, weight_acc), r| { (score_acc + r.score * r.weight, weight_acc + r.weight) }); let composite_score = if total_weight > 0.0 { total_weighted_score / total_weight } else { 0.0 }; // Override confidence to red if any required verifier failed let confidence_level = if any_required_failed { ConfidenceLevel::Red } else { ConfidenceLevel::from_score(composite_score, green_threshold, yellow_threshold) }; let passed = !any_required_failed && confidence_level != ConfidenceLevel::Red; // Generate summary let passed_count = results.iter().filter(|r| r.passed).count(); let total_count = results.len(); let summary = format!( "{}/{} verifiers passed, composite score: {:.2}, confidence: {}", passed_count, total_count, composite_score, confidence_level ); // Generate rework instructions if failed let rework_instructions = if !passed { let failed_verifiers: Vec<&str> = results .iter() .filter(|r| !r.passed) .map(|r| r.name.as_str()) .collect(); Some(format!( "Fix issues identified by: {}", failed_verifiers.join(", ") )) } else { None }; let total_duration_ms = results.iter().map(|r| r.duration_ms).sum(); Self { id, step_id, passed, composite_score, confidence_level, verifier_results: results, summary, rework_instructions, total_duration_ms, } } } /// Error type for verification operations. 
#[derive(Error, Debug)]
pub enum VerifierError {
    /// The command could not be run (for example, the command string was empty).
    #[error("Command execution failed: {0}")]
    CommandFailed(String),

    /// The command exceeded its time budget; the payload is the budget in milliseconds.
    #[error("Command timed out after {0}ms")]
    Timeout(u64),

    /// The resolved working directory does not exist; the payload is its path.
    #[error("Working directory not found: {0}")]
    WorkingDirectoryNotFound(String),

    /// A verifier was requested that has not been configured.
    #[error("Verifier not configured: {0}")]
    NotConfigured(String),

    /// Verifier output could not be parsed.
    #[error("Parse error: {0}")]
    ParseError(String),

    /// The LLM-based evaluation failed.
    #[error("LLM error: {0}")]
    LlmError(String),

    /// Underlying I/O failure; converted automatically from `std::io::Error` by `?`.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
}

/// Information about a verifier for serialization and database storage.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VerifierInfo {
    /// Name of the verifier
    pub name: String,
    /// Verifier type as its string identifier
    pub verifier_type: String,
    /// Command line the verifier runs
    pub command: String,
    /// Working directory for the command, if one was configured
    pub working_directory: Option<String>,
    /// Files whose presence makes the verifier applicable
    pub detect_files: Vec<String>,
    /// Weight for composite scoring
    pub weight: f64,
    /// Whether a failure of this verifier forces red confidence
    pub required: bool,
}

/// Verifier trait for pluggable verification implementations.
#[async_trait]
pub trait Verifier: Send + Sync {
    /// Get the name of this verifier.
    fn name(&self) -> &str;

    /// Get the type of this verifier.
    fn verifier_type(&self) -> VerifierType;

    /// Get serializable info about this verifier.
    fn info(&self) -> VerifierInfo;

    /// Check if this verifier is applicable to the given repository.
    async fn is_applicable(&self, repo_path: &Path) -> bool;

    /// Run verification and return result.
    ///
    /// # Errors
    ///
    /// Returns a [`VerifierError`] when the verification itself could not be
    /// carried out. A verification that ran and found problems is reported as
    /// `Ok` with `passed == false`.
    async fn verify(
        &self,
        repo_path: &Path,
        context: &VerificationContext,
    ) -> Result<VerifierResult, VerifierError>;
}

/// Context provided to verifiers during execution.
// NOTE(review): the generic arguments of `contract_id`, `modified_files` and
// `acceptance_criteria` were reconstructed (`Uuid` / `String`) — confirm
// against the code that builds this context.
#[derive(Debug, Clone)]
pub struct VerificationContext {
    /// Step ID being verified
    pub step_id: Uuid,
    /// Contract ID if step has been instantiated
    pub contract_id: Option<Uuid>,
    /// Files that were modified in this step
    pub modified_files: Vec<String>,
    /// Step description for LLM context
    pub step_description: String,
    /// Acceptance criteria for LLM evaluation
    pub acceptance_criteria: Vec<String>,
    /// Additional context from directive
    pub directive_context: String,
}

/// Command-based verifier for running shell commands.
pub struct CommandVerifier { name: String, verifier_type: VerifierType, command: String, #[allow(dead_code)] working_dir: Option, #[allow(dead_code)] timeout_ms: u64, required: bool, /// Files/patterns that indicate this verifier is applicable applicable_patterns: Vec, } impl CommandVerifier { /// Create a new command verifier. pub fn new( name: impl Into, verifier_type: VerifierType, command: impl Into, ) -> Self { Self { name: name.into(), verifier_type, command: command.into(), working_dir: None, timeout_ms: 300_000, // 5 minute default required: false, applicable_patterns: Vec::new(), } } /// Set the working directory. #[allow(dead_code)] pub fn with_working_dir(mut self, dir: impl Into) -> Self { self.working_dir = Some(dir.into()); self } /// Set the timeout in milliseconds. #[allow(dead_code)] pub fn with_timeout(mut self, timeout_ms: u64) -> Self { self.timeout_ms = timeout_ms; self } /// Mark as required verifier. pub fn as_required(mut self) -> Self { self.required = true; self } /// Add applicability patterns (files that must exist). 
pub fn with_patterns(mut self, patterns: Vec) -> Self { self.applicable_patterns = patterns; self } } #[async_trait] impl Verifier for CommandVerifier { fn name(&self) -> &str { &self.name } fn verifier_type(&self) -> VerifierType { self.verifier_type.clone() } fn info(&self) -> VerifierInfo { VerifierInfo { name: self.name.clone(), verifier_type: self.verifier_type.as_str().to_string(), command: self.command.clone(), working_directory: self.working_dir.clone(), detect_files: self.applicable_patterns.clone(), weight: 1.0, required: self.required, } } async fn is_applicable(&self, repo_path: &Path) -> bool { if self.applicable_patterns.is_empty() { return true; } for pattern in &self.applicable_patterns { let check_path = repo_path.join(pattern); if check_path.exists() { return true; } } false } async fn verify( &self, repo_path: &Path, _context: &VerificationContext, ) -> Result { let start = std::time::Instant::now(); let work_dir = self .working_dir .as_ref() .map(|d| repo_path.join(d)) .unwrap_or_else(|| repo_path.to_path_buf()); if !work_dir.exists() { return Err(VerifierError::WorkingDirectoryNotFound( work_dir.display().to_string(), )); } // Parse command into program and args let parts: Vec<&str> = self.command.split_whitespace().collect(); if parts.is_empty() { return Err(VerifierError::CommandFailed( "Empty command".to_string(), )); } let program = parts[0]; let args = &parts[1..]; // Execute command let output = tokio::process::Command::new(program) .args(args) .current_dir(&work_dir) .output() .await?; let duration_ms = start.elapsed().as_millis() as u64; let stdout = String::from_utf8_lossy(&output.stdout); let stderr = String::from_utf8_lossy(&output.stderr); let combined_output = format!("{}\n{}", stdout, stderr); let passed = output.status.success(); let score = if passed { 1.0 } else { 0.0 }; let mut result = VerifierResult { name: self.name.clone(), verifier_type: self.verifier_type.clone(), passed, score, weight: 1.0, required: self.required, 
output: combined_output, details: Some(serde_json::json!({ "exit_code": output.status.code(), "command": self.command, "working_dir": work_dir.display().to_string(), })), duration_ms, }; // Try to extract more detailed scoring from output result = self.enhance_result(result, &stdout); Ok(result) } } impl CommandVerifier { /// Enhance result with parsed details from output. fn enhance_result(&self, mut result: VerifierResult, stdout: &str) -> VerifierResult { match self.verifier_type { VerifierType::TestRunner => { // Try to parse test counts from common formats if let Some((passed, failed, total)) = parse_test_output(stdout) { result.details = Some(serde_json::json!({ "tests_passed": passed, "tests_failed": failed, "tests_total": total, "command": self.command, })); if total > 0 { result.score = passed as f64 / total as f64; } } } VerifierType::Linter => { // Try to parse lint error counts if let Some(error_count) = parse_lint_output(stdout) { result.details = Some(serde_json::json!({ "errors": error_count, "command": self.command, })); // Score decreases with more errors (up to 10 errors = 0) result.score = (1.0 - (error_count as f64 / 10.0)).max(0.0); } } _ => {} } result } } /// Parse test output for common formats (Jest, pytest, cargo test). fn parse_test_output(output: &str) -> Option<(u32, u32, u32)> { // Jest format: "Tests: X passed, Y failed, Z total" if let Some(caps) = regex::Regex::new(r"Tests:\s*(\d+)\s*passed,\s*(\d+)\s*failed,\s*(\d+)\s*total") .ok()? .captures(output) { let passed: u32 = caps.get(1)?.as_str().parse().ok()?; let failed: u32 = caps.get(2)?.as_str().parse().ok()?; let total: u32 = caps.get(3)?.as_str().parse().ok()?; return Some((passed, failed, total)); } // pytest format: "X passed, Y failed" if let Some(caps) = regex::Regex::new(r"(\d+)\s*passed(?:,\s*(\d+)\s*failed)?") .ok()? 
.captures(output) { let passed: u32 = caps.get(1)?.as_str().parse().ok()?; let failed: u32 = caps.get(2).map(|m| m.as_str().parse().ok()).flatten().unwrap_or(0); let total = passed + failed; return Some((passed, failed, total)); } // cargo test format: "test result: ok. X passed; Y failed;" if let Some(caps) = regex::Regex::new(r"test result:.*?(\d+)\s*passed;\s*(\d+)\s*failed") .ok()? .captures(output) { let passed: u32 = caps.get(1)?.as_str().parse().ok()?; let failed: u32 = caps.get(2)?.as_str().parse().ok()?; let total = passed + failed; return Some((passed, failed, total)); } None } /// Parse lint output for error counts. fn parse_lint_output(output: &str) -> Option { // ESLint format: "X problems (Y errors, Z warnings)" if let Some(caps) = regex::Regex::new(r"(\d+)\s*problems?\s*\((\d+)\s*errors?") .ok()? .captures(output) { return caps.get(2)?.as_str().parse().ok(); } // Clippy format: "warning: X warnings emitted" if let Some(caps) = regex::Regex::new(r"warning:\s*(\d+)\s*warnings?\s*emitted") .ok()? .captures(output) { return caps.get(1)?.as_str().parse().ok(); } None } /// Auto-detect applicable verifiers for a repository. 
pub async fn auto_detect_verifiers(repo_path: &Path) -> Vec> { let mut verifiers: Vec> = Vec::new(); // Check for package.json (Node.js) let package_json = repo_path.join("package.json"); if package_json.exists() { if let Ok(content) = tokio::fs::read_to_string(&package_json).await { if let Ok(pkg) = serde_json::from_str::(&content) { if let Some(scripts) = pkg.get("scripts").and_then(|s| s.as_object()) { // Test runner if scripts.contains_key("test") { verifiers.push(Box::new( CommandVerifier::new("npm-test", VerifierType::TestRunner, "npm test") .with_patterns(vec!["package.json".to_string()]) .as_required(), )); } // Linter if scripts.contains_key("lint") { verifiers.push(Box::new( CommandVerifier::new("npm-lint", VerifierType::Linter, "npm run lint") .with_patterns(vec!["package.json".to_string()]), )); } // Build if scripts.contains_key("build") { verifiers.push(Box::new( CommandVerifier::new("npm-build", VerifierType::Build, "npm run build") .with_patterns(vec!["package.json".to_string()]) .as_required(), )); } // Type check (for TypeScript projects) if scripts.contains_key("typecheck") || scripts.contains_key("type-check") { let cmd = if scripts.contains_key("typecheck") { "npm run typecheck" } else { "npm run type-check" }; verifiers.push(Box::new( CommandVerifier::new("npm-typecheck", VerifierType::TypeChecker, cmd) .with_patterns(vec!["tsconfig.json".to_string()]), )); } } } } } // Check for Cargo.toml (Rust) let cargo_toml = repo_path.join("Cargo.toml"); if cargo_toml.exists() { verifiers.push(Box::new( CommandVerifier::new("cargo-test", VerifierType::TestRunner, "cargo test") .with_patterns(vec!["Cargo.toml".to_string()]) .as_required(), )); verifiers.push(Box::new( CommandVerifier::new("cargo-clippy", VerifierType::Linter, "cargo clippy -- -D warnings") .with_patterns(vec!["Cargo.toml".to_string()]), )); verifiers.push(Box::new( CommandVerifier::new("cargo-build", VerifierType::Build, "cargo build") .with_patterns(vec!["Cargo.toml".to_string()]) 
.as_required(), )); } // Check for pyproject.toml or setup.py (Python) let pyproject = repo_path.join("pyproject.toml"); let setup_py = repo_path.join("setup.py"); if pyproject.exists() || setup_py.exists() { verifiers.push(Box::new( CommandVerifier::new("pytest", VerifierType::TestRunner, "pytest") .with_patterns(vec![ "pyproject.toml".to_string(), "setup.py".to_string(), ]) .as_required(), )); verifiers.push(Box::new( CommandVerifier::new("ruff", VerifierType::Linter, "ruff check .") .with_patterns(vec!["pyproject.toml".to_string()]), )); } verifiers } /// Composite evaluator that runs multiple verifiers and combines results. pub struct CompositeEvaluator { verifiers: Vec>, green_threshold: f64, yellow_threshold: f64, } impl CompositeEvaluator { /// Create a new composite evaluator with default thresholds. pub fn new(verifiers: Vec>) -> Self { Self { verifiers, green_threshold: 0.8, yellow_threshold: 0.5, } } /// Set confidence thresholds. pub fn with_thresholds(mut self, green: f64, yellow: f64) -> Self { self.green_threshold = green; self.yellow_threshold = yellow; self } /// Add a verifier. pub fn add_verifier(mut self, verifier: Box) -> Self { self.verifiers.push(verifier); self } /// Run all applicable verifiers and return composite result. 
pub async fn evaluate( &self, repo_path: &Path, context: &VerificationContext, ) -> EvaluationResult { let mut results = Vec::new(); for verifier in &self.verifiers { if !verifier.is_applicable(repo_path).await { continue; } match verifier.verify(repo_path, context).await { Ok(result) => results.push(result), Err(e) => { // Convert error to failed result results.push(VerifierResult::failed( verifier.name().to_string(), verifier.verifier_type(), format!("Verifier error: {}", e), )); } } } EvaluationResult::from_verifiers( context.step_id, results, self.green_threshold, self.yellow_threshold, ) } } #[cfg(test)] mod tests { use super::*; #[test] fn test_confidence_level_from_score() { assert_eq!( ConfidenceLevel::from_score(0.9, 0.8, 0.5), ConfidenceLevel::Green ); assert_eq!( ConfidenceLevel::from_score(0.8, 0.8, 0.5), ConfidenceLevel::Green ); assert_eq!( ConfidenceLevel::from_score(0.6, 0.8, 0.5), ConfidenceLevel::Yellow ); assert_eq!( ConfidenceLevel::from_score(0.5, 0.8, 0.5), ConfidenceLevel::Yellow ); assert_eq!( ConfidenceLevel::from_score(0.4, 0.8, 0.5), ConfidenceLevel::Red ); } #[test] fn test_evaluation_result_composite_score() { let results = vec![ VerifierResult::passed("test1".into(), VerifierType::TestRunner, "OK".into()) .with_weight(1.0), VerifierResult::failed("test2".into(), VerifierType::Linter, "Failed".into()) .with_weight(1.0), ]; let eval = EvaluationResult::from_verifiers(Uuid::new_v4(), results, 0.8, 0.5); assert!((eval.composite_score - 0.5).abs() < 0.001); assert_eq!(eval.confidence_level, ConfidenceLevel::Yellow); } #[test] fn test_required_verifier_override() { let results = vec![ VerifierResult::passed("test1".into(), VerifierType::TestRunner, "OK".into()), VerifierResult::failed("build".into(), VerifierType::Build, "Failed".into()) .as_required(), ]; let eval = EvaluationResult::from_verifiers(Uuid::new_v4(), results, 0.8, 0.5); // Even though composite score is 0.5, required failure overrides to red assert_eq!(eval.confidence_level, 
ConfidenceLevel::Red); assert!(!eval.passed); } #[test] fn test_parse_test_output_jest() { let output = "Tests: 10 passed, 2 failed, 12 total"; let (passed, failed, total) = parse_test_output(output).unwrap(); assert_eq!(passed, 10); assert_eq!(failed, 2); assert_eq!(total, 12); } #[test] fn test_parse_test_output_cargo() { let output = "test result: ok. 25 passed; 0 failed;"; let (passed, failed, total) = parse_test_output(output).unwrap(); assert_eq!(passed, 25); assert_eq!(failed, 0); assert_eq!(total, 25); } }