path: root/makima/src/orchestration/verifier.rs



//! Verification system for directive step evaluation.
//!
//! Provides tiered verification: programmatic verifiers run first,
//! then LLM evaluation if programmatic checks pass. Composite scoring
//! combines results with configurable weights.

use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use serde_json::Value as JsonValue;
use std::path::Path;
use thiserror::Error;
use uuid::Uuid;

/// Confidence level based on composite score and thresholds.
///
/// Serialized and deserialized by serde using `snake_case` variant names.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ConfidenceLevel {
    /// High confidence (score >= green threshold)
    Green,
    /// Medium confidence (score >= yellow threshold but < green)
    Yellow,
    /// Low confidence (score < yellow threshold)
    Red,
}

impl ConfidenceLevel {
    /// Compute confidence level from score and thresholds.
    pub fn from_score(score: f64, green_threshold: f64, yellow_threshold: f64) -> Self {
        if score >= green_threshold {
            ConfidenceLevel::Green
        } else if score >= yellow_threshold {
            ConfidenceLevel::Yellow
        } else {
            ConfidenceLevel::Red
        }
    }

    /// Convert to string for database storage.
    pub fn as_str(&self) -> &'static str {
        match self {
            ConfidenceLevel::Green => "green",
            ConfidenceLevel::Yellow => "yellow",
            ConfidenceLevel::Red => "red",
        }
    }
}

impl std::fmt::Display for ConfidenceLevel {
    /// Writes the same lowercase name that `as_str` returns.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}

/// Type of verifier for categorization.
///
/// Serialized and deserialized by serde using `snake_case` variant names.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum VerifierType {
    /// Run test suite (npm test, cargo test, pytest, etc.)
    TestRunner,
    /// Run linter (eslint, clippy, ruff, etc.)
    Linter,
    /// Run type checker (tsc, mypy, etc.)
    TypeChecker,
    /// Run build command (npm build, cargo build, etc.)
    Build,
    /// Custom command verifier
    Custom,
    /// LLM-based semantic evaluation
    Llm,
}

impl VerifierType {
    pub fn as_str(&self) -> &'static str {
        match self {
            VerifierType::TestRunner => "test_runner",
            VerifierType::Linter => "linter",
            VerifierType::TypeChecker => "type_checker",
            VerifierType::Build => "build",
            VerifierType::Custom => "custom",
            VerifierType::Llm => "llm",
        }
    }
}

/// Result of a single verifier run.
///
/// Instances are combined by `EvaluationResult::from_verifiers`, which reads
/// `score`, `weight`, `required`, `passed`, `name` and `duration_ms`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VerifierResult {
    /// Name of the verifier
    pub name: String,
    /// Type of verifier
    pub verifier_type: VerifierType,
    /// Whether the verification passed
    pub passed: bool,
    /// Score from 0.0 to 1.0 (1.0 = perfect, 0.0 = complete failure)
    pub score: f64,
    /// Weight for composite scoring (default 1.0 for programmatic, 2.0 for LLM)
    pub weight: f64,
    /// Whether this verifier is required (failure = automatic red confidence)
    pub required: bool,
    /// Human-readable output/feedback
    pub output: String,
    /// Structured details (test counts, lint errors, etc.)
    pub details: Option<JsonValue>,
    /// Execution time in milliseconds
    pub duration_ms: u64,
}

impl VerifierResult {
    /// Create a passed result with full score.
    ///
    /// Defaults: `score` 1.0, `weight` 1.0, not required, no details and a
    /// duration of 0 ms. Use the `with_*` / `as_required` builders to adjust.
    pub fn passed(name: String, verifier_type: VerifierType, output: String) -> Self {
        Self {
            name,
            verifier_type,
            passed: true,
            score: 1.0,
            weight: 1.0,
            required: false,
            output,
            details: None,
            duration_ms: 0,
        }
    }

    /// Create a failed result with zero score.
    ///
    /// Shares every other default with [`VerifierResult::passed`].
    pub fn failed(name: String, verifier_type: VerifierType, output: String) -> Self {
        Self {
            passed: false,
            score: 0.0,
            ..Self::passed(name, verifier_type, output)
        }
    }

    /// Set the weight for this result.
    ///
    /// The value is stored as given; no range check is applied here.
    pub fn with_weight(mut self, weight: f64) -> Self {
        self.weight = weight;
        self
    }

    /// Mark this verifier as required.
    pub fn as_required(mut self) -> Self {
        self.required = true;
        self
    }

    /// Set the score explicitly.
    ///
    /// The score is clamped into `0.0..=1.0`. `NaN` is stored as `0.0`:
    /// `f64::clamp` would pass it through unchanged, and a `NaN` score would
    /// then turn every weighted average it takes part in into `NaN`.
    pub fn with_score(mut self, score: f64) -> Self {
        self.score = if score.is_nan() {
            0.0
        } else {
            score.clamp(0.0, 1.0)
        };
        self
    }

    /// Set structured details.
    pub fn with_details(mut self, details: JsonValue) -> Self {
        self.details = Some(details);
        self
    }

    /// Set execution duration.
    pub fn with_duration(mut self, duration_ms: u64) -> Self {
        self.duration_ms = duration_ms;
        self
    }
}

/// Composite evaluation result combining multiple verifier results.
///
/// Built by `EvaluationResult::from_verifiers`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluationResult {
    /// Unique ID for this evaluation
    pub id: Uuid,
    /// Step ID being evaluated
    pub step_id: Uuid,
    /// Whether the step passed: no required verifier failed and the
    /// confidence level is not red
    pub passed: bool,
    /// Weighted composite score (0.0-1.0)
    pub composite_score: f64,
    /// Confidence level derived from score (red whenever a required verifier failed)
    pub confidence_level: ConfidenceLevel,
    /// Individual verifier results
    pub verifier_results: Vec<VerifierResult>,
    /// Summary feedback for the step
    pub summary: String,
    /// Rework instructions if failed
    pub rework_instructions: Option<String>,
    /// Total evaluation duration in milliseconds (sum of the verifier durations)
    pub total_duration_ms: u64,
}

impl EvaluationResult {
    /// Create a new evaluation result from verifier results.
    ///
    /// * `step_id` - step the results belong to; copied into the result.
    /// * `results` - individual verifier outcomes; stored unchanged.
    /// * `green_threshold` / `yellow_threshold` - inclusive lower bounds passed
    ///   to [`ConfidenceLevel::from_score`].
    ///
    /// The composite score is the weight-weighted mean of the verifier scores.
    /// Results whose weight is negative, zero, or not finite do not take part
    /// in the mean, and each score is read as a value in `0.0..=1.0` (`NaN`
    /// counts as `0.0`), so the composite always stays within `0.0..=1.0`.
    /// With no usable weight at all the composite is `0.0`.
    ///
    /// A failed required verifier forces red confidence regardless of score.
    /// The evaluation passes only when no required verifier failed and the
    /// confidence is not red; otherwise `rework_instructions` is populated.
    pub fn from_verifiers(
        step_id: Uuid,
        results: Vec<VerifierResult>,
        green_threshold: f64,
        yellow_threshold: f64,
    ) -> Self {
        let id = Uuid::new_v4();

        // Check if any required verifier failed
        let any_required_failed = results.iter().any(|r| r.required && !r.passed);

        // Weighted mean over results with a usable weight. Invalid weights are
        // skipped rather than summed: a negative weight could cancel the total
        // out (division by ~0) or push the mean outside 0..=1.
        let (total_weighted_score, total_weight) = results
            .iter()
            .filter(|r| r.weight.is_finite() && r.weight > 0.0)
            .fold((0.0_f64, 0.0_f64), |(score_acc, weight_acc), r| {
                let score = if r.score.is_nan() {
                    0.0
                } else {
                    r.score.clamp(0.0, 1.0)
                };
                (score_acc + score * r.weight, weight_acc + r.weight)
            });

        // `is_finite` guards against the sum of huge weights overflowing to infinity.
        let composite_score = if total_weight > 0.0 && total_weight.is_finite() {
            total_weighted_score / total_weight
        } else {
            0.0
        };

        // Override confidence to red if any required verifier failed
        let confidence_level = if any_required_failed {
            ConfidenceLevel::Red
        } else {
            ConfidenceLevel::from_score(composite_score, green_threshold, yellow_threshold)
        };

        let passed = !any_required_failed && confidence_level != ConfidenceLevel::Red;

        // Generate summary
        let passed_count = results.iter().filter(|r| r.passed).count();
        let total_count = results.len();
        let summary = format!(
            "{}/{} verifiers passed, composite score: {:.2}, confidence: {}",
            passed_count, total_count, composite_score, confidence_level
        );

        // Generate rework instructions if failed
        let rework_instructions = if passed {
            None
        } else {
            let failed_verifiers: Vec<&str> = results
                .iter()
                .filter(|r| !r.passed)
                .map(|r| r.name.as_str())
                .collect();

            // The evaluation can fail without any verifier being marked as
            // failed (no results at all, or only low scores); avoid emitting a
            // dangling "Fix issues identified by: " in those cases.
            let instructions = if !failed_verifiers.is_empty() {
                format!("Fix issues identified by: {}", failed_verifiers.join(", "))
            } else if results.is_empty() {
                "No verifier results were produced, so the step could not be verified".to_string()
            } else {
                format!(
                    "No verifier reported a failure, but the composite score {:.2} is below the passing threshold {:.2}",
                    composite_score, yellow_threshold
                )
            };
            Some(instructions)
        };

        let total_duration_ms = results.iter().map(|r| r.duration_ms).sum();

        Self {
            id,
            step_id,
            passed,
            composite_score,
            confidence_level,
            verifier_results: results,
            summary,
            rework_instructions,
            total_duration_ms,
        }
    }
}

/// Error type for verification operations.
///
/// `Display` messages come from the `#[error(...)]` attributes.
#[derive(Error, Debug)]
pub enum VerifierError {
    /// A command could not be run; the payload describes why.
    #[error("Command execution failed: {0}")]
    CommandFailed(String),

    /// A command exceeded its time budget; the payload is that budget in milliseconds.
    #[error("Command timed out after {0}ms")]
    Timeout(u64),

    /// The directory a command should run in does not exist; the payload is its path.
    #[error("Working directory not found: {0}")]
    WorkingDirectoryNotFound(String),

    /// A verifier was requested but has not been configured.
    #[error("Verifier not configured: {0}")]
    NotConfigured(String),

    /// Output could not be parsed.
    #[error("Parse error: {0}")]
    ParseError(String),

    /// The LLM evaluation step failed.
    #[error("LLM error: {0}")]
    LlmError(String),

    /// Underlying I/O failure; converted automatically from `std::io::Error` via `?`.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
}

/// Verifier trait for pluggable verification implementations.
///
/// Implementations must be `Send + Sync`; they are stored as
/// `Box<dyn Verifier>` and driven by `CompositeEvaluator::evaluate`.
#[async_trait]
pub trait Verifier: Send + Sync {
    /// Get the name of this verifier.
    fn name(&self) -> &str;

    /// Get the type of this verifier.
    fn verifier_type(&self) -> VerifierType;

    /// Check if this verifier is applicable to the given repository.
    ///
    /// `CompositeEvaluator::evaluate` skips verifiers that return `false`.
    async fn is_applicable(&self, repo_path: &Path) -> bool;

    /// Run verification and return result.
    ///
    /// An `Err` is converted by `CompositeEvaluator::evaluate` into a failed
    /// `VerifierResult` carrying the error text.
    async fn verify(&self, repo_path: &Path, context: &VerificationContext)
        -> Result<VerifierResult, VerifierError>;
}

/// Context provided to verifiers during execution.
///
/// `CompositeEvaluator::evaluate` copies `step_id` into the resulting
/// `EvaluationResult`; `CommandVerifier` does not read the context at all.
#[derive(Debug, Clone)]
pub struct VerificationContext {
    /// Step ID being verified
    pub step_id: Uuid,
    /// Contract ID if step has been instantiated
    pub contract_id: Option<Uuid>,
    /// Files that were modified in this step
    pub modified_files: Vec<String>,
    /// Step description for LLM context
    pub step_description: String,
    /// Acceptance criteria for LLM evaluation
    pub acceptance_criteria: Vec<String>,
    /// Additional context from directive
    pub directive_context: String,
}

/// Command-based verifier for running shell commands.
///
/// Built with `CommandVerifier::new` and the `with_*` / `as_required`
/// builder methods.
pub struct CommandVerifier {
    /// Verifier name reported in results
    name: String,
    /// Category reported in results; also selects how output is parsed
    verifier_type: VerifierType,
    /// Command line; split on whitespace into program and arguments when run
    command: String,
    /// Optional directory, joined onto the repository path, to run the command in
    #[allow(dead_code)]
    working_dir: Option<String>,
    /// Time budget for the command in milliseconds
    #[allow(dead_code)]
    timeout_ms: u64,
    /// Whether a failure of this verifier is marked as required in its result
    required: bool,
    /// Files/patterns that indicate this verifier is applicable
    applicable_patterns: Vec<String>,
}

impl CommandVerifier {
    /// Build a verifier that runs `command`.
    ///
    /// Starts out optional, with no working-directory override, no
    /// applicability patterns and a five minute timeout.
    pub fn new(
        name: impl Into<String>,
        verifier_type: VerifierType,
        command: impl Into<String>,
    ) -> Self {
        // 5 minute default
        const DEFAULT_TIMEOUT_MS: u64 = 300_000;

        let name = name.into();
        let command = command.into();
        Self {
            name,
            verifier_type,
            command,
            working_dir: None,
            timeout_ms: DEFAULT_TIMEOUT_MS,
            required: false,
            applicable_patterns: Vec::new(),
        }
    }

    /// Run the command in `dir` instead of the default location.
    #[allow(dead_code)]
    pub fn with_working_dir(self, dir: impl Into<String>) -> Self {
        Self {
            working_dir: Some(dir.into()),
            ..self
        }
    }

    /// Replace the timeout, given in milliseconds.
    #[allow(dead_code)]
    pub fn with_timeout(self, timeout_ms: u64) -> Self {
        Self { timeout_ms, ..self }
    }

    /// Flag this verifier as required.
    pub fn as_required(self) -> Self {
        Self {
            required: true,
            ..self
        }
    }

    /// Replace the applicability patterns (files that must exist).
    pub fn with_patterns(self, patterns: Vec<String>) -> Self {
        Self {
            applicable_patterns: patterns,
            ..self
        }
    }
}

#[async_trait]
impl Verifier for CommandVerifier {
    /// Name given at construction time.
    fn name(&self) -> &str {
        &self.name
    }

    /// Category given at construction time.
    fn verifier_type(&self) -> VerifierType {
        self.verifier_type.clone()
    }

    /// Returns `true` when no applicability patterns are configured, or when
    /// at least one pattern, joined onto `repo_path`, exists on disk.
    async fn is_applicable(&self, repo_path: &Path) -> bool {
        self.applicable_patterns.is_empty()
            || self
                .applicable_patterns
                .iter()
                .any(|pattern| repo_path.join(pattern).exists())
    }

    /// Run the configured command and turn its outcome into a result.
    ///
    /// The command runs in `repo_path` (or the configured working directory
    /// joined onto it) and must finish within `timeout_ms`. The command line
    /// is split on whitespace, so quoted arguments containing spaces are not
    /// supported. A zero exit status yields `passed` with score 1.0, anything
    /// else a failure with score 0.0; test-runner and linter output may then
    /// refine the score.
    ///
    /// # Errors
    ///
    /// * `WorkingDirectoryNotFound` - the working directory does not exist.
    /// * `CommandFailed` - the command string is empty.
    /// * `Timeout` - the command did not finish within `timeout_ms`.
    /// * `Io` - the process could not be spawned or its output not collected.
    async fn verify(
        &self,
        repo_path: &Path,
        _context: &VerificationContext,
    ) -> Result<VerifierResult, VerifierError> {
        let start = std::time::Instant::now();

        let work_dir = match &self.working_dir {
            Some(dir) => repo_path.join(dir),
            None => repo_path.to_path_buf(),
        };

        if !work_dir.exists() {
            return Err(VerifierError::WorkingDirectoryNotFound(
                work_dir.display().to_string(),
            ));
        }

        // Parse command into program and args
        let mut parts = self.command.split_whitespace();
        let program = parts
            .next()
            .ok_or_else(|| VerifierError::CommandFailed("Empty command".to_string()))?;

        let mut command = tokio::process::Command::new(program);
        command
            .args(parts)
            .current_dir(&work_dir)
            // When the timeout below fires the pending future is dropped;
            // without this the child process would keep running unattended.
            .kill_on_drop(true);

        // Execute command, bounded by the configured timeout
        let output = tokio::time::timeout(
            std::time::Duration::from_millis(self.timeout_ms),
            command.output(),
        )
        .await
        .map_err(|_| VerifierError::Timeout(self.timeout_ms))??;

        let duration_ms = start.elapsed().as_millis() as u64;
        let stdout = String::from_utf8_lossy(&output.stdout);
        let stderr = String::from_utf8_lossy(&output.stderr);
        let combined_output = format!("{}\n{}", stdout, stderr);

        let passed = output.status.success();
        let score = if passed { 1.0 } else { 0.0 };

        let result = VerifierResult {
            name: self.name.clone(),
            verifier_type: self.verifier_type.clone(),
            passed,
            score,
            weight: 1.0,
            required: self.required,
            // Filled in below, once the text is no longer needed for parsing.
            output: String::new(),
            details: Some(serde_json::json!({
                "exit_code": output.status.code(),
                "command": self.command,
                "working_dir": work_dir.display().to_string(),
            })),
            duration_ms,
        };

        // Try to extract more detailed scoring from output. Both streams are
        // searched because several tools (Jest, cargo/clippy diagnostics)
        // write their summary to stderr rather than stdout.
        let mut result = self.enhance_result(result, &combined_output);
        result.output = combined_output;

        Ok(result)
    }
}

impl CommandVerifier {
    /// Enhance result with parsed details from output.
    fn enhance_result(&self, mut result: VerifierResult, stdout: &str) -> VerifierResult {
        match self.verifier_type {
            VerifierType::TestRunner => {
                // Try to parse test counts from common formats
                if let Some((passed, failed, total)) = parse_test_output(stdout) {
                    result.details = Some(serde_json::json!({
                        "tests_passed": passed,
                        "tests_failed": failed,
                        "tests_total": total,
                        "command": self.command,
                    }));
                    if total > 0 {
                        result.score = passed as f64 / total as f64;
                    }
                }
            }
            VerifierType::Linter => {
                // Try to parse lint error counts
                if let Some(error_count) = parse_lint_output(stdout) {
                    result.details = Some(serde_json::json!({
                        "errors": error_count,
                        "command": self.command,
                    }));
                    // Score decreases with more errors (up to 10 errors = 0)
                    result.score = (1.0 - (error_count as f64 / 10.0)).max(0.0);
                }
            }
            _ => {}
        }
        result
    }
}

/// Parse test output for common formats (Jest, cargo test, pytest).
///
/// Returns `(passed, failed, total)` or `None` when no known summary is found.
/// Formats are tried from most to least specific:
///
/// 1. Jest: a `Tests: ...` summary line containing `N total`. The `passed`
///    and `failed` counts may appear in either order and default to 0 when
///    absent (e.g. `Tests: 1 failed, 2 passed, 3 total`).
/// 2. cargo test: every `test result: ... X passed; Y failed` line is summed,
///    since cargo prints one such line per test binary.
/// 3. pytest-style: the last line containing `N passed` and/or `N failed`,
///    in either order (e.g. `1 failed, 2 passed in 0.05s`).
///
/// The cargo format is checked before the generic one because the generic
/// `N passed` pattern also matches cargo's summary and would hide its
/// failure count.
fn parse_test_output(output: &str) -> Option<(u32, u32, u32)> {
    let passed_re = regex::Regex::new(r"(\d+)\s*passed").ok()?;
    let failed_re = regex::Regex::new(r"(\d+)\s*failed").ok()?;
    let total_re = regex::Regex::new(r"(\d+)\s*total").ok()?;

    // First number captured by `re` in `text`, if any.
    let count = |re: &regex::Regex, text: &str| -> Option<u32> {
        re.captures(text)?.get(1)?.as_str().parse().ok()
    };

    // Jest format: "Tests: X failed, Y passed, Z total" (failed/passed optional)
    let jest_summary_re = regex::Regex::new(r"Tests:[^\n]*").ok()?;
    let jest = jest_summary_re.find_iter(output).find_map(|summary| {
        let line = summary.as_str();
        let total = count(&total_re, line)?;
        let passed = count(&passed_re, line).unwrap_or(0);
        let failed = count(&failed_re, line).unwrap_or(0);
        Some((passed, failed, total))
    });
    if jest.is_some() {
        return jest;
    }

    // cargo test format: "test result: ok. X passed; Y failed;" (one line per test binary)
    let cargo_re =
        regex::Regex::new(r"test result:.*?(\d+)\s*passed;\s*(\d+)\s*failed").ok()?;
    let mut cargo_counts: Option<(u32, u32)> = None;
    for caps in cargo_re.captures_iter(output) {
        let passed: u32 = caps.get(1)?.as_str().parse().ok()?;
        let failed: u32 = caps.get(2)?.as_str().parse().ok()?;
        let (passed_so_far, failed_so_far) = cargo_counts.unwrap_or((0, 0));
        cargo_counts = Some((
            passed_so_far.saturating_add(passed),
            failed_so_far.saturating_add(failed),
        ));
    }
    if let Some((passed, failed)) = cargo_counts {
        return Some((passed, failed, passed.saturating_add(failed)));
    }

    // pytest format: "X passed, Y failed" / "Y failed, X passed in 0.05s".
    // The summary is the last thing pytest prints, so scan from the end.
    output.lines().rev().find_map(|line| {
        let passed = count(&passed_re, line);
        let failed = count(&failed_re, line);
        if passed.is_none() && failed.is_none() {
            return None;
        }
        let passed = passed.unwrap_or(0);
        let failed = failed.unwrap_or(0);
        Some((passed, failed, passed.saturating_add(failed)))
    })
}

/// Parse lint output for error counts.
fn parse_lint_output(output: &str) -> Option<u32> {
    // ESLint format: "X problems (Y errors, Z warnings)"
    if let Some(caps) = regex::Regex::new(r"(\d+)\s*problems?\s*\((\d+)\s*errors?")
        .ok()?
        .captures(output)
    {
        return caps.get(2)?.as_str().parse().ok();
    }

    // Clippy format: "warning: X warnings emitted"
    if let Some(caps) = regex::Regex::new(r"warning:\s*(\d+)\s*warnings?\s*emitted")
        .ok()?
        .captures(output)
    {
        return caps.get(1)?.as_str().parse().ok();
    }

    None
}

/// Auto-detect applicable verifiers for a repository.
pub async fn auto_detect_verifiers(repo_path: &Path) -> Vec<Box<dyn Verifier>> {
    let mut verifiers: Vec<Box<dyn Verifier>> = Vec::new();

    // Check for package.json (Node.js)
    let package_json = repo_path.join("package.json");
    if package_json.exists() {
        if let Ok(content) = tokio::fs::read_to_string(&package_json).await {
            if let Ok(pkg) = serde_json::from_str::<serde_json::Value>(&content) {
                if let Some(scripts) = pkg.get("scripts").and_then(|s| s.as_object()) {
                    // Test runner
                    if scripts.contains_key("test") {
                        verifiers.push(Box::new(
                            CommandVerifier::new("npm-test", VerifierType::TestRunner, "npm test")
                                .with_patterns(vec!["package.json".to_string()])
                                .as_required(),
                        ));
                    }

                    // Linter
                    if scripts.contains_key("lint") {
                        verifiers.push(Box::new(
                            CommandVerifier::new("npm-lint", VerifierType::Linter, "npm run lint")
                                .with_patterns(vec!["package.json".to_string()]),
                        ));
                    }

                    // Build
                    if scripts.contains_key("build") {
                        verifiers.push(Box::new(
                            CommandVerifier::new("npm-build", VerifierType::Build, "npm run build")
                                .with_patterns(vec!["package.json".to_string()])
                                .as_required(),
                        ));
                    }

                    // Type check (for TypeScript projects)
                    if scripts.contains_key("typecheck") || scripts.contains_key("type-check") {
                        let cmd = if scripts.contains_key("typecheck") {
                            "npm run typecheck"
                        } else {
                            "npm run type-check"
                        };
                        verifiers.push(Box::new(
                            CommandVerifier::new("npm-typecheck", VerifierType::TypeChecker, cmd)
                                .with_patterns(vec!["tsconfig.json".to_string()]),
                        ));
                    }
                }
            }
        }
    }

    // Check for Cargo.toml (Rust)
    let cargo_toml = repo_path.join("Cargo.toml");
    if cargo_toml.exists() {
        verifiers.push(Box::new(
            CommandVerifier::new("cargo-test", VerifierType::TestRunner, "cargo test")
                .with_patterns(vec!["Cargo.toml".to_string()])
                .as_required(),
        ));

        verifiers.push(Box::new(
            CommandVerifier::new("cargo-clippy", VerifierType::Linter, "cargo clippy -- -D warnings")
                .with_patterns(vec!["Cargo.toml".to_string()]),
        ));

        verifiers.push(Box::new(
            CommandVerifier::new("cargo-build", VerifierType::Build, "cargo build")
                .with_patterns(vec!["Cargo.toml".to_string()])
                .as_required(),
        ));
    }

    // Check for pyproject.toml or setup.py (Python)
    let pyproject = repo_path.join("pyproject.toml");
    let setup_py = repo_path.join("setup.py");
    if pyproject.exists() || setup_py.exists() {
        verifiers.push(Box::new(
            CommandVerifier::new("pytest", VerifierType::TestRunner, "pytest")
                .with_patterns(vec![
                    "pyproject.toml".to_string(),
                    "setup.py".to_string(),
                ])
                .as_required(),
        ));

        verifiers.push(Box::new(
            CommandVerifier::new("ruff", VerifierType::Linter, "ruff check .")
                .with_patterns(vec!["pyproject.toml".to_string()]),
        ));
    }

    verifiers
}

/// Composite evaluator that runs multiple verifiers and combines results.
pub struct CompositeEvaluator {
    /// Verifiers to run, in order
    verifiers: Vec<Box<dyn Verifier>>,
    /// Minimum composite score for green confidence (0.8 by default)
    green_threshold: f64,
    /// Minimum composite score for yellow confidence (0.5 by default)
    yellow_threshold: f64,
}

impl CompositeEvaluator {
    /// Build an evaluator over `verifiers` using the default thresholds
    /// (green at 0.8, yellow at 0.5).
    pub fn new(verifiers: Vec<Box<dyn Verifier>>) -> Self {
        Self {
            verifiers,
            green_threshold: 0.8,
            yellow_threshold: 0.5,
        }
    }

    /// Replace both confidence thresholds. The values are stored as given.
    pub fn with_thresholds(self, green: f64, yellow: f64) -> Self {
        Self {
            green_threshold: green,
            yellow_threshold: yellow,
            ..self
        }
    }

    /// Append one more verifier; it runs after those already registered.
    pub fn add_verifier(mut self, verifier: Box<dyn Verifier>) -> Self {
        self.verifiers.push(verifier);
        self
    }

    /// Run every applicable verifier in registration order and combine the
    /// outcomes into a single evaluation for `context.step_id`.
    ///
    /// A verifier that returns an error is recorded as a failed result whose
    /// output is `"Verifier error: <error>"`.
    ///
    /// NOTE(review): that substitute result is built with
    /// `VerifierResult::failed`, so it is never marked as required even when
    /// the erroring verifier was configured as required - confirm whether
    /// that is intended.
    pub async fn evaluate(
        &self,
        repo_path: &Path,
        context: &VerificationContext,
    ) -> EvaluationResult {
        let mut collected = Vec::new();

        for verifier in self.verifiers.iter() {
            if verifier.is_applicable(repo_path).await {
                let outcome = verifier
                    .verify(repo_path, context)
                    .await
                    .unwrap_or_else(|err| {
                        VerifierResult::failed(
                            verifier.name().to_string(),
                            verifier.verifier_type(),
                            format!("Verifier error: {}", err),
                        )
                    });
                collected.push(outcome);
            }
        }

        EvaluationResult::from_verifiers(
            context.step_id,
            collected,
            self.green_threshold,
            self.yellow_threshold,
        )
    }
}

// Unit tests for the pure parts of the module: score banding, composite
// scoring and summary-line parsing. Nothing here spawns a process.
#[cfg(test)]
mod tests {
    use super::*;

    // Thresholds are inclusive: a score equal to a threshold lands in that band.
    #[test]
    fn test_confidence_level_from_score() {
        assert_eq!(
            ConfidenceLevel::from_score(0.9, 0.8, 0.5),
            ConfidenceLevel::Green
        );
        assert_eq!(
            ConfidenceLevel::from_score(0.8, 0.8, 0.5),
            ConfidenceLevel::Green
        );
        assert_eq!(
            ConfidenceLevel::from_score(0.6, 0.8, 0.5),
            ConfidenceLevel::Yellow
        );
        assert_eq!(
            ConfidenceLevel::from_score(0.5, 0.8, 0.5),
            ConfidenceLevel::Yellow
        );
        assert_eq!(
            ConfidenceLevel::from_score(0.4, 0.8, 0.5),
            ConfidenceLevel::Red
        );
    }

    // One pass and one failure with equal weights average to 0.5 (yellow).
    #[test]
    fn test_evaluation_result_composite_score() {
        let results = vec![
            VerifierResult::passed("test1".into(), VerifierType::TestRunner, "OK".into())
                .with_weight(1.0),
            VerifierResult::failed("test2".into(), VerifierType::Linter, "Failed".into())
                .with_weight(1.0),
        ];

        let eval = EvaluationResult::from_verifiers(Uuid::new_v4(), results, 0.8, 0.5);
        assert!((eval.composite_score - 0.5).abs() < 0.001);
        assert_eq!(eval.confidence_level, ConfidenceLevel::Yellow);
    }

    // A failed required verifier forces red confidence and a failed evaluation.
    #[test]
    fn test_required_verifier_override() {
        let results = vec![
            VerifierResult::passed("test1".into(), VerifierType::TestRunner, "OK".into()),
            VerifierResult::failed("build".into(), VerifierType::Build, "Failed".into())
                .as_required(),
        ];

        let eval = EvaluationResult::from_verifiers(Uuid::new_v4(), results, 0.8, 0.5);
        // Even though composite score is 0.5, required failure overrides to red
        assert_eq!(eval.confidence_level, ConfidenceLevel::Red);
        assert!(!eval.passed);
    }

    // Jest-style summary with passed, failed and total counts in that order.
    #[test]
    fn test_parse_test_output_jest() {
        let output = "Tests: 10 passed, 2 failed, 12 total";
        let (passed, failed, total) = parse_test_output(output).unwrap();
        assert_eq!(passed, 10);
        assert_eq!(failed, 2);
        assert_eq!(total, 12);
    }

    // cargo-style summary; this sample has zero failures.
    #[test]
    fn test_parse_test_output_cargo() {
        let output = "test result: ok. 25 passed; 0 failed;";
        let (passed, failed, total) = parse_test_output(output).unwrap();
        assert_eq!(passed, 25);
        assert_eq!(failed, 0);
        assert_eq!(total, 25);
    }
}