diff options
Diffstat (limited to 'makima/src/orchestration/verifier.rs')
| -rw-r--r-- | makima/src/orchestration/verifier.rs | 833 |
1 files changed, 0 insertions, 833 deletions
diff --git a/makima/src/orchestration/verifier.rs b/makima/src/orchestration/verifier.rs deleted file mode 100644 index bc29e47..0000000 --- a/makima/src/orchestration/verifier.rs +++ /dev/null @@ -1,833 +0,0 @@ -//! Verification system for directive step evaluation. -//! -//! Provides tiered verification: programmatic verifiers run first, -//! then LLM evaluation if programmatic checks pass. Composite scoring -//! combines results with configurable weights. - -use async_trait::async_trait; -use serde::{Deserialize, Serialize}; -use serde_json::Value as JsonValue; -use std::path::Path; -use thiserror::Error; -use uuid::Uuid; - -/// Confidence level based on composite score and thresholds. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "snake_case")] -pub enum ConfidenceLevel { - /// High confidence (score >= green threshold) - Green, - /// Medium confidence (score >= yellow threshold but < green) - Yellow, - /// Low confidence (score < yellow threshold) - Red, -} - -impl ConfidenceLevel { - /// Compute confidence level from score and thresholds. - pub fn from_score(score: f64, green_threshold: f64, yellow_threshold: f64) -> Self { - if score >= green_threshold { - ConfidenceLevel::Green - } else if score >= yellow_threshold { - ConfidenceLevel::Yellow - } else { - ConfidenceLevel::Red - } - } - - /// Convert to string for database storage. - pub fn as_str(&self) -> &'static str { - match self { - ConfidenceLevel::Green => "green", - ConfidenceLevel::Yellow => "yellow", - ConfidenceLevel::Red => "red", - } - } -} - -impl std::fmt::Display for ConfidenceLevel { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.as_str()) - } -} - -/// Type of verifier for categorization. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "snake_case")] -pub enum VerifierType { - /// Run test suite (npm test, cargo test, pytest, etc.) - TestRunner, - /// Run linter (eslint, clippy, ruff, etc.) - Linter, - /// Run type checker (tsc, mypy, etc.) - TypeChecker, - /// Run build command (npm build, cargo build, etc.) - Build, - /// Custom command verifier - Custom, - /// LLM-based semantic evaluation - Llm, -} - -impl VerifierType { - pub fn as_str(&self) -> &'static str { - match self { - VerifierType::TestRunner => "test_runner", - VerifierType::Linter => "linter", - VerifierType::TypeChecker => "type_checker", - VerifierType::Build => "build", - VerifierType::Custom => "custom", - VerifierType::Llm => "llm", - } - } -} - -/// Result of a single verifier run. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct VerifierResult { - /// Name of the verifier - pub name: String, - /// Type of verifier - pub verifier_type: VerifierType, - /// Whether the verification passed - pub passed: bool, - /// Score from 0.0 to 1.0 (1.0 = perfect, 0.0 = complete failure) - pub score: f64, - /// Weight for composite scoring (default 1.0 for programmatic, 2.0 for LLM) - pub weight: f64, - /// Whether this verifier is required (failure = automatic red confidence) - pub required: bool, - /// Human-readable output/feedback - pub output: String, - /// Structured details (test counts, lint errors, etc.) - pub details: Option<JsonValue>, - /// Execution time in milliseconds - pub duration_ms: u64, -} - -impl VerifierResult { - /// Create a passed result with full score. - pub fn passed(name: String, verifier_type: VerifierType, output: String) -> Self { - Self { - name, - verifier_type, - passed: true, - score: 1.0, - weight: 1.0, - required: false, - output, - details: None, - duration_ms: 0, - } - } - - /// Create a failed result with zero score. - pub fn failed(name: String, verifier_type: VerifierType, output: String) -> Self { - Self { - name, - verifier_type, - passed: false, - score: 0.0, - weight: 1.0, - required: false, - output, - details: None, - duration_ms: 0, - } - } - - /// Set the weight for this result. - pub fn with_weight(mut self, weight: f64) -> Self { - self.weight = weight; - self - } - - /// Mark this verifier as required. - pub fn as_required(mut self) -> Self { - self.required = true; - self - } - - /// Set the score explicitly. - pub fn with_score(mut self, score: f64) -> Self { - self.score = score.clamp(0.0, 1.0); - self - } - - /// Set structured details. - pub fn with_details(mut self, details: JsonValue) -> Self { - self.details = Some(details); - self - } - - /// Set execution duration. - pub fn with_duration(mut self, duration_ms: u64) -> Self { - self.duration_ms = duration_ms; - self - } -} - -/// Composite evaluation result combining multiple verifier results. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct EvaluationResult { - /// Unique ID for this evaluation - pub id: Uuid, - /// Step ID being evaluated - pub step_id: Uuid, - /// Whether all required verifiers passed - pub passed: bool, - /// Weighted composite score (0.0-1.0) - pub composite_score: f64, - /// Confidence level derived from score - pub confidence_level: ConfidenceLevel, - /// Individual verifier results - pub verifier_results: Vec<VerifierResult>, - /// Summary feedback for the step - pub summary: String, - /// Rework instructions if failed - pub rework_instructions: Option<String>, - /// Total evaluation duration in milliseconds - pub total_duration_ms: u64, -} - -impl EvaluationResult { - /// Create a new evaluation result from verifier results. - pub fn from_verifiers( - step_id: Uuid, - results: Vec<VerifierResult>, - green_threshold: f64, - yellow_threshold: f64, - ) -> Self { - let id = Uuid::new_v4(); - - // Check if any required verifier failed - let any_required_failed = results.iter().any(|r| r.required && !r.passed); - - // Calculate weighted composite score - let (total_weighted_score, total_weight) = - results - .iter() - .fold((0.0, 0.0), |(score_acc, weight_acc), r| { - (score_acc + r.score * r.weight, weight_acc + r.weight) - }); - - let composite_score = if total_weight > 0.0 { - total_weighted_score / total_weight - } else { - 0.0 - }; - - // Override confidence to red if any required verifier failed - let confidence_level = if any_required_failed { - ConfidenceLevel::Red - } else { - ConfidenceLevel::from_score(composite_score, green_threshold, yellow_threshold) - }; - - let passed = !any_required_failed && confidence_level != ConfidenceLevel::Red; - - // Generate summary - let passed_count = results.iter().filter(|r| r.passed).count(); - let total_count = results.len(); - let summary = format!( - "{}/{} verifiers passed, composite score: {:.2}, confidence: {}", - passed_count, total_count, composite_score, confidence_level - ); - - // Generate rework instructions if failed - let rework_instructions = if !passed { - let failed_verifiers: Vec<&str> = results - .iter() - .filter(|r| !r.passed) - .map(|r| r.name.as_str()) - .collect(); - Some(format!( - "Fix issues identified by: {}", - failed_verifiers.join(", ") - )) - } else { - None - }; - - let total_duration_ms = results.iter().map(|r| r.duration_ms).sum(); - - Self { - id, - step_id, - passed, - composite_score, - confidence_level, - verifier_results: results, - summary, - rework_instructions, - total_duration_ms, - } - } -} - -/// Error type for verification operations. -#[derive(Error, Debug)] -pub enum VerifierError { - #[error("Command execution failed: {0}")] - CommandFailed(String), - - #[error("Command timed out after {0}ms")] - Timeout(u64), - - #[error("Working directory not found: {0}")] - WorkingDirectoryNotFound(String), - - #[error("Verifier not configured: {0}")] - NotConfigured(String), - - #[error("Parse error: {0}")] - ParseError(String), - - #[error("LLM error: {0}")] - LlmError(String), - - #[error("IO error: {0}")] - Io(#[from] std::io::Error), -} - -/// Information about a verifier for serialization and database storage. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct VerifierInfo { - pub name: String, - pub verifier_type: String, - pub command: String, - pub working_directory: Option<String>, - pub detect_files: Vec<String>, - pub weight: f64, - pub required: bool, -} - -/// Verifier trait for pluggable verification implementations. -#[async_trait] -pub trait Verifier: Send + Sync { - /// Get the name of this verifier. - fn name(&self) -> &str; - - /// Get the type of this verifier. - fn verifier_type(&self) -> VerifierType; - - /// Get serializable info about this verifier. - fn info(&self) -> VerifierInfo; - - /// Check if this verifier is applicable to the given repository. - async fn is_applicable(&self, repo_path: &Path) -> bool; - - /// Run verification and return result. - async fn verify(&self, repo_path: &Path, context: &VerificationContext) - -> Result<VerifierResult, VerifierError>; -} - -/// Context provided to verifiers during execution. -#[derive(Debug, Clone)] -pub struct VerificationContext { - /// Step ID being verified - pub step_id: Uuid, - /// Contract ID if step has been instantiated - pub contract_id: Option<Uuid>, - /// Files that were modified in this step - pub modified_files: Vec<String>, - /// Step description for LLM context - pub step_description: String, - /// Acceptance criteria for LLM evaluation - pub acceptance_criteria: Vec<String>, - /// Additional context from directive - pub directive_context: String, -} - -/// Command-based verifier for running shell commands. -pub struct CommandVerifier { - name: String, - verifier_type: VerifierType, - command: String, - #[allow(dead_code)] - working_dir: Option<String>, - #[allow(dead_code)] - timeout_ms: u64, - required: bool, - /// Files/patterns that indicate this verifier is applicable - applicable_patterns: Vec<String>, -} - -impl CommandVerifier { - /// Create a new command verifier. - pub fn new( - name: impl Into<String>, - verifier_type: VerifierType, - command: impl Into<String>, - ) -> Self { - Self { - name: name.into(), - verifier_type, - command: command.into(), - working_dir: None, - timeout_ms: 300_000, // 5 minute default - required: false, - applicable_patterns: Vec::new(), - } - } - - /// Set the working directory. - #[allow(dead_code)] - pub fn with_working_dir(mut self, dir: impl Into<String>) -> Self { - self.working_dir = Some(dir.into()); - self - } - - /// Set the timeout in milliseconds. - #[allow(dead_code)] - pub fn with_timeout(mut self, timeout_ms: u64) -> Self { - self.timeout_ms = timeout_ms; - self - } - - /// Mark as required verifier. - pub fn as_required(mut self) -> Self { - self.required = true; - self - } - - /// Add applicability patterns (files that must exist). - pub fn with_patterns(mut self, patterns: Vec<String>) -> Self { - self.applicable_patterns = patterns; - self - } -} - -#[async_trait] -impl Verifier for CommandVerifier { - fn name(&self) -> &str { - &self.name - } - - fn verifier_type(&self) -> VerifierType { - self.verifier_type.clone() - } - - fn info(&self) -> VerifierInfo { - VerifierInfo { - name: self.name.clone(), - verifier_type: self.verifier_type.as_str().to_string(), - command: self.command.clone(), - working_directory: self.working_dir.clone(), - detect_files: self.applicable_patterns.clone(), - weight: 1.0, - required: self.required, - } - } - - async fn is_applicable(&self, repo_path: &Path) -> bool { - if self.applicable_patterns.is_empty() { - return true; - } - - for pattern in &self.applicable_patterns { - let check_path = repo_path.join(pattern); - if check_path.exists() { - return true; - } - } - false - } - - async fn verify( - &self, - repo_path: &Path, - _context: &VerificationContext, - ) -> Result<VerifierResult, VerifierError> { - let start = std::time::Instant::now(); - - let work_dir = self - .working_dir - .as_ref() - .map(|d| repo_path.join(d)) - .unwrap_or_else(|| repo_path.to_path_buf()); - - if !work_dir.exists() { - return Err(VerifierError::WorkingDirectoryNotFound( - work_dir.display().to_string(), - )); - } - - // Parse command into program and args - let parts: Vec<&str> = self.command.split_whitespace().collect(); - if parts.is_empty() { - return Err(VerifierError::CommandFailed( - "Empty command".to_string(), - )); - } - - let program = parts[0]; - let args = &parts[1..]; - - // Execute command - let output = tokio::process::Command::new(program) - .args(args) - .current_dir(&work_dir) - .output() - .await?; - - let duration_ms = start.elapsed().as_millis() as u64; - let stdout = String::from_utf8_lossy(&output.stdout); - let stderr = String::from_utf8_lossy(&output.stderr); - let combined_output = format!("{}\n{}", stdout, stderr); - - let passed = output.status.success(); - let score = if passed { 1.0 } else { 0.0 }; - - let mut result = VerifierResult { - name: self.name.clone(), - verifier_type: self.verifier_type.clone(), - passed, - score, - weight: 1.0, - required: self.required, - output: combined_output, - details: Some(serde_json::json!({ - "exit_code": output.status.code(), - "command": self.command, - "working_dir": work_dir.display().to_string(), - })), - duration_ms, - }; - - // Try to extract more detailed scoring from output - result = self.enhance_result(result, &stdout); - - Ok(result) - } -} - -impl CommandVerifier { - /// Enhance result with parsed details from output. - fn enhance_result(&self, mut result: VerifierResult, stdout: &str) -> VerifierResult { - match self.verifier_type { - VerifierType::TestRunner => { - // Try to parse test counts from common formats - if let Some((passed, failed, total)) = parse_test_output(stdout) { - result.details = Some(serde_json::json!({ - "tests_passed": passed, - "tests_failed": failed, - "tests_total": total, - "command": self.command, - })); - if total > 0 { - result.score = passed as f64 / total as f64; - } - } - } - VerifierType::Linter => { - // Try to parse lint error counts - if let Some(error_count) = parse_lint_output(stdout) { - result.details = Some(serde_json::json!({ - "errors": error_count, - "command": self.command, - })); - // Score decreases with more errors (up to 10 errors = 0) - result.score = (1.0 - (error_count as f64 / 10.0)).max(0.0); - } - } - _ => {} - } - result - } -} - -/// Parse test output for common formats (Jest, pytest, cargo test). -fn parse_test_output(output: &str) -> Option<(u32, u32, u32)> { - // Jest format: "Tests: X passed, Y failed, Z total" - if let Some(caps) = regex::Regex::new(r"Tests:\s*(\d+)\s*passed,\s*(\d+)\s*failed,\s*(\d+)\s*total") - .ok()? - .captures(output) - { - let passed: u32 = caps.get(1)?.as_str().parse().ok()?; - let failed: u32 = caps.get(2)?.as_str().parse().ok()?; - let total: u32 = caps.get(3)?.as_str().parse().ok()?; - return Some((passed, failed, total)); - } - - // pytest format: "X passed, Y failed" - if let Some(caps) = regex::Regex::new(r"(\d+)\s*passed(?:,\s*(\d+)\s*failed)?") - .ok()? - .captures(output) - { - let passed: u32 = caps.get(1)?.as_str().parse().ok()?; - let failed: u32 = caps.get(2).map(|m| m.as_str().parse().ok()).flatten().unwrap_or(0); - let total = passed + failed; - return Some((passed, failed, total)); - } - - // cargo test format: "test result: ok. X passed; Y failed;" - if let Some(caps) = regex::Regex::new(r"test result:.*?(\d+)\s*passed;\s*(\d+)\s*failed") - .ok()? - .captures(output) - { - let passed: u32 = caps.get(1)?.as_str().parse().ok()?; - let failed: u32 = caps.get(2)?.as_str().parse().ok()?; - let total = passed + failed; - return Some((passed, failed, total)); - } - - None -} - -/// Parse lint output for error counts. -fn parse_lint_output(output: &str) -> Option<u32> { - // ESLint format: "X problems (Y errors, Z warnings)" - if let Some(caps) = regex::Regex::new(r"(\d+)\s*problems?\s*\((\d+)\s*errors?") - .ok()? - .captures(output) - { - return caps.get(2)?.as_str().parse().ok(); - } - - // Clippy format: "warning: X warnings emitted" - if let Some(caps) = regex::Regex::new(r"warning:\s*(\d+)\s*warnings?\s*emitted") - .ok()? - .captures(output) - { - return caps.get(1)?.as_str().parse().ok(); - } - - None -} - -/// Auto-detect applicable verifiers for a repository. -pub async fn auto_detect_verifiers(repo_path: &Path) -> Vec<Box<dyn Verifier>> { - let mut verifiers: Vec<Box<dyn Verifier>> = Vec::new(); - - // Check for package.json (Node.js) - let package_json = repo_path.join("package.json"); - if package_json.exists() { - if let Ok(content) = tokio::fs::read_to_string(&package_json).await { - if let Ok(pkg) = serde_json::from_str::<serde_json::Value>(&content) { - if let Some(scripts) = pkg.get("scripts").and_then(|s| s.as_object()) { - // Test runner - if scripts.contains_key("test") { - verifiers.push(Box::new( - CommandVerifier::new("npm-test", VerifierType::TestRunner, "npm test") - .with_patterns(vec!["package.json".to_string()]) - .as_required(), - )); - } - - // Linter - if scripts.contains_key("lint") { - verifiers.push(Box::new( - CommandVerifier::new("npm-lint", VerifierType::Linter, "npm run lint") - .with_patterns(vec!["package.json".to_string()]), - )); - } - - // Build - if scripts.contains_key("build") { - verifiers.push(Box::new( - CommandVerifier::new("npm-build", VerifierType::Build, "npm run build") - .with_patterns(vec!["package.json".to_string()]) - .as_required(), - )); - } - - // Type check (for TypeScript projects) - if scripts.contains_key("typecheck") || scripts.contains_key("type-check") { - let cmd = if scripts.contains_key("typecheck") { - "npm run typecheck" - } else { - "npm run type-check" - }; - verifiers.push(Box::new( - CommandVerifier::new("npm-typecheck", VerifierType::TypeChecker, cmd) - .with_patterns(vec!["tsconfig.json".to_string()]), - )); - } - } - } - } - } - - // Check for Cargo.toml (Rust) - let cargo_toml = repo_path.join("Cargo.toml"); - if cargo_toml.exists() { - verifiers.push(Box::new( - CommandVerifier::new("cargo-test", VerifierType::TestRunner, "cargo test") - .with_patterns(vec!["Cargo.toml".to_string()]) - .as_required(), - )); - - verifiers.push(Box::new( - CommandVerifier::new("cargo-clippy", VerifierType::Linter, "cargo clippy -- -D warnings") - .with_patterns(vec!["Cargo.toml".to_string()]), - )); - - verifiers.push(Box::new( - CommandVerifier::new("cargo-build", VerifierType::Build, "cargo build") - .with_patterns(vec!["Cargo.toml".to_string()]) - .as_required(), - )); - } - - // Check for pyproject.toml or setup.py (Python) - let pyproject = repo_path.join("pyproject.toml"); - let setup_py = repo_path.join("setup.py"); - if pyproject.exists() || setup_py.exists() { - verifiers.push(Box::new( - CommandVerifier::new("pytest", VerifierType::TestRunner, "pytest") - .with_patterns(vec![ - "pyproject.toml".to_string(), - "setup.py".to_string(), - ]) - .as_required(), - )); - - verifiers.push(Box::new( - CommandVerifier::new("ruff", VerifierType::Linter, "ruff check .") - .with_patterns(vec!["pyproject.toml".to_string()]), - )); - } - - verifiers -} - -/// Composite evaluator that runs multiple verifiers and combines results. -pub struct CompositeEvaluator { - verifiers: Vec<Box<dyn Verifier>>, - green_threshold: f64, - yellow_threshold: f64, -} - -impl CompositeEvaluator { - /// Create a new composite evaluator with default thresholds. - pub fn new(verifiers: Vec<Box<dyn Verifier>>) -> Self { - Self { - verifiers, - green_threshold: 0.8, - yellow_threshold: 0.5, - } - } - - /// Set confidence thresholds. - pub fn with_thresholds(mut self, green: f64, yellow: f64) -> Self { - self.green_threshold = green; - self.yellow_threshold = yellow; - self - } - - /// Add a verifier. - pub fn add_verifier(mut self, verifier: Box<dyn Verifier>) -> Self { - self.verifiers.push(verifier); - self - } - - /// Run all applicable verifiers and return composite result. - pub async fn evaluate( - &self, - repo_path: &Path, - context: &VerificationContext, - ) -> EvaluationResult { - let mut results = Vec::new(); - - for verifier in &self.verifiers { - if !verifier.is_applicable(repo_path).await { - continue; - } - - match verifier.verify(repo_path, context).await { - Ok(result) => results.push(result), - Err(e) => { - // Convert error to failed result - results.push(VerifierResult::failed( - verifier.name().to_string(), - verifier.verifier_type(), - format!("Verifier error: {}", e), - )); - } - } - } - - EvaluationResult::from_verifiers( - context.step_id, - results, - self.green_threshold, - self.yellow_threshold, - ) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_confidence_level_from_score() { - assert_eq!( - ConfidenceLevel::from_score(0.9, 0.8, 0.5), - ConfidenceLevel::Green - ); - assert_eq!( - ConfidenceLevel::from_score(0.8, 0.8, 0.5), - ConfidenceLevel::Green - ); - assert_eq!( - ConfidenceLevel::from_score(0.6, 0.8, 0.5), - ConfidenceLevel::Yellow - ); - assert_eq!( - ConfidenceLevel::from_score(0.5, 0.8, 0.5), - ConfidenceLevel::Yellow - ); - assert_eq!( - ConfidenceLevel::from_score(0.4, 0.8, 0.5), - ConfidenceLevel::Red - ); - } - - #[test] - fn test_evaluation_result_composite_score() { - let results = vec![ - VerifierResult::passed("test1".into(), VerifierType::TestRunner, "OK".into()) - .with_weight(1.0), - VerifierResult::failed("test2".into(), VerifierType::Linter, "Failed".into()) - .with_weight(1.0), - ]; - - let eval = EvaluationResult::from_verifiers(Uuid::new_v4(), results, 0.8, 0.5); - assert!((eval.composite_score - 0.5).abs() < 0.001); - assert_eq!(eval.confidence_level, ConfidenceLevel::Yellow); - } - - #[test] - fn test_required_verifier_override() { - let results = vec![ - VerifierResult::passed("test1".into(), VerifierType::TestRunner, "OK".into()), - VerifierResult::failed("build".into(), VerifierType::Build, "Failed".into()) - .as_required(), - ]; - - let eval = EvaluationResult::from_verifiers(Uuid::new_v4(), results, 0.8, 0.5); - // Even though composite score is 0.5, required failure overrides to red - assert_eq!(eval.confidence_level, ConfidenceLevel::Red); - assert!(!eval.passed); - } - - #[test] - fn test_parse_test_output_jest() { - let output = "Tests: 10 passed, 2 failed, 12 total"; - let (passed, failed, total) = parse_test_output(output).unwrap(); - assert_eq!(passed, 10); - assert_eq!(failed, 2); - assert_eq!(total, 12); - } - - #[test] - fn test_parse_test_output_cargo() { - let output = "test result: ok. 25 passed; 0 failed;"; - let (passed, failed, total) = parse_test_output(output).unwrap(); - assert_eq!(passed, 25); - assert_eq!(failed, 0); - assert_eq!(total, 25); - } -} |
