//! Verification system for directive step evaluation.
//!
//! Provides tiered verification: programmatic verifiers run first,
//! then LLM evaluation if programmatic checks pass. Composite scoring
//! combines results with configurable weights.
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use serde_json::Value as JsonValue;
use std::path::Path;
use thiserror::Error;
use uuid::Uuid;
/// Confidence bucket assigned to an evaluation from its composite score.
///
/// Serialized by serde in `snake_case`, i.e. `"green"`, `"yellow"`, `"red"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ConfidenceLevel {
/// High confidence: the score is at or above the green threshold.
Green,
/// Medium confidence: the score is at or above the yellow threshold but below green.
Yellow,
/// Low confidence: the score is below the yellow threshold.
Red,
}
impl ConfidenceLevel {
/// Compute confidence level from score and thresholds.
pub fn from_score(score: f64, green_threshold: f64, yellow_threshold: f64) -> Self {
if score >= green_threshold {
ConfidenceLevel::Green
} else if score >= yellow_threshold {
ConfidenceLevel::Yellow
} else {
ConfidenceLevel::Red
}
}
/// Convert to string for database storage.
pub fn as_str(&self) -> &'static str {
match self {
ConfidenceLevel::Green => "green",
ConfidenceLevel::Yellow => "yellow",
ConfidenceLevel::Red => "red",
}
}
}
impl std::fmt::Display for ConfidenceLevel {
    /// Writes the same lower-case label that `as_str` returns.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}
/// Category of a verifier, used to label results and to pick output parsing.
///
/// Serialized by serde in `snake_case` (for example `"test_runner"`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum VerifierType {
/// Runs a test suite (npm test, cargo test, pytest, etc.).
TestRunner,
/// Runs a linter (eslint, clippy, ruff, etc.).
Linter,
/// Runs a type checker (tsc, mypy, etc.).
TypeChecker,
/// Runs a build command (npm build, cargo build, etc.).
Build,
/// User-supplied command that fits none of the other categories.
Custom,
/// LLM-based semantic evaluation.
Llm,
}
impl VerifierType {
pub fn as_str(&self) -> &'static str {
match self {
VerifierType::TestRunner => "test_runner",
VerifierType::Linter => "linter",
VerifierType::TypeChecker => "type_checker",
VerifierType::Build => "build",
VerifierType::Custom => "custom",
VerifierType::Llm => "llm",
}
}
}
/// Outcome of one verifier run, including the inputs needed for composite scoring.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VerifierResult {
/// Name of the verifier that produced this result.
pub name: String,
/// Category of the verifier.
pub verifier_type: VerifierType,
/// Whether the verification passed.
pub passed: bool,
/// Score from 0.0 to 1.0 (1.0 = perfect, 0.0 = complete failure).
pub score: f64,
/// Weight of this result in the composite score. The constructors in this
/// file use 1.0; the intended LLM default of 2.0 is not set by any code
/// visible here (NOTE(review): confirm where LLM results get their weight).
pub weight: f64,
/// Whether this verifier is required (a failure forces red confidence).
pub required: bool,
/// Human-readable output or feedback from the verifier.
pub output: String,
/// Optional structured details (test counts, lint errors, etc.).
pub details: Option<JsonValue>,
/// Execution time in milliseconds.
pub duration_ms: u64,
}
impl VerifierResult {
    /// Shared constructor behind [`Self::passed`] and [`Self::failed`].
    ///
    /// The score is 1.0 for a pass and 0.0 for a failure; every other field
    /// starts at its neutral default (weight 1.0, optional, no details, 0 ms).
    fn with_outcome(
        name: String,
        verifier_type: VerifierType,
        output: String,
        passed: bool,
    ) -> Self {
        Self {
            name,
            verifier_type,
            passed,
            score: if passed { 1.0 } else { 0.0 },
            weight: 1.0,
            required: false,
            output,
            details: None,
            duration_ms: 0,
        }
    }

    /// Build a passing result with a full score of 1.0.
    pub fn passed(name: String, verifier_type: VerifierType, output: String) -> Self {
        Self::with_outcome(name, verifier_type, output, true)
    }

    /// Build a failing result with a score of 0.0.
    pub fn failed(name: String, verifier_type: VerifierType, output: String) -> Self {
        Self::with_outcome(name, verifier_type, output, false)
    }

    /// Replace the composite-scoring weight.
    pub fn with_weight(self, weight: f64) -> Self {
        Self { weight, ..self }
    }

    /// Flag the verifier behind this result as required.
    pub fn as_required(self) -> Self {
        Self {
            required: true,
            ..self
        }
    }

    /// Replace the score, clamped into `0.0..=1.0`.
    pub fn with_score(self, score: f64) -> Self {
        Self {
            score: score.clamp(0.0, 1.0),
            ..self
        }
    }

    /// Attach structured details.
    pub fn with_details(self, details: JsonValue) -> Self {
        Self {
            details: Some(details),
            ..self
        }
    }

    /// Record how long the verifier ran, in milliseconds.
    pub fn with_duration(self, duration_ms: u64) -> Self {
        Self {
            duration_ms,
            ..self
        }
    }
}
/// Composite evaluation of one step, aggregated from individual verifier results.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluationResult {
/// Unique ID for this evaluation.
pub id: Uuid,
/// ID of the step being evaluated.
pub step_id: Uuid,
/// Overall verdict: true only when no required verifier failed and the
/// confidence level is not red.
pub passed: bool,
/// Weight-averaged score across all verifier results (0.0-1.0).
pub composite_score: f64,
/// Confidence level derived from the composite score and thresholds.
pub confidence_level: ConfidenceLevel,
/// The individual verifier results that were aggregated.
pub verifier_results: Vec<VerifierResult>,
/// One-line summary: pass count, composite score and confidence.
pub summary: String,
/// Rework instructions; present only when the evaluation did not pass.
pub rework_instructions: Option<String>,
/// Sum of the verifier durations in milliseconds.
pub total_duration_ms: u64,
}
impl EvaluationResult {
    /// Aggregate individual verifier results into one evaluation.
    ///
    /// * The composite score is the weight-averaged score of all results
    ///   (0.0 when there is nothing to average).
    /// * The confidence level comes from [`ConfidenceLevel::from_score`],
    ///   except that a failed *required* verifier always forces `Red`.
    /// * The evaluation passes when the confidence level is not `Red`.
    /// * When it does not pass, `rework_instructions` names the verifiers to
    ///   look at: the ones that failed, or — if none failed outright — the
    ///   ones that scored below 1.0.
    ///
    /// A fresh random `id` is generated for every call.
    pub fn from_verifiers(
        step_id: Uuid,
        results: Vec<VerifierResult>,
        green_threshold: f64,
        yellow_threshold: f64,
    ) -> Self {
        let id = Uuid::new_v4();

        let any_required_failed = results.iter().any(|r| r.required && !r.passed);

        // `score` and `weight` are public fields, so they can hold values the
        // builders would never produce. Ignore unusable weights and clamp
        // scores so one bad result cannot turn the composite into NaN or push
        // it outside 0.0..=1.0.
        let (weighted_sum, weight_sum) =
            results
                .iter()
                .fold((0.0_f64, 0.0_f64), |(score_acc, weight_acc), r| {
                    let weight = if r.weight.is_finite() && r.weight > 0.0 {
                        r.weight
                    } else {
                        0.0
                    };
                    let score = if r.score.is_finite() {
                        r.score.clamp(0.0, 1.0)
                    } else {
                        0.0
                    };
                    (score_acc + score * weight, weight_acc + weight)
                });
        let composite_score = if weight_sum > 0.0 && weight_sum.is_finite() {
            weighted_sum / weight_sum
        } else {
            0.0
        };

        // A failed required verifier overrides whatever the score says.
        let confidence_level = if any_required_failed {
            ConfidenceLevel::Red
        } else {
            ConfidenceLevel::from_score(composite_score, green_threshold, yellow_threshold)
        };
        // `any_required_failed` already implies Red, so this single check
        // covers both failure conditions.
        let passed = confidence_level != ConfidenceLevel::Red;

        let passed_count = results.iter().filter(|r| r.passed).count();
        let summary = format!(
            "{}/{} verifiers passed, composite score: {:.2}, confidence: {}",
            passed_count,
            results.len(),
            composite_score,
            confidence_level
        );

        let rework_instructions = if passed {
            None
        } else {
            let mut culprits: Vec<&str> = results
                .iter()
                .filter(|r| !r.passed)
                .map(|r| r.name.as_str())
                .collect();
            if culprits.is_empty() {
                // Nothing failed outright, so the low composite must come from
                // partial scores; point at the verifiers that lost points.
                culprits = results
                    .iter()
                    .filter(|r| r.score.is_nan() || r.score < 1.0)
                    .map(|r| r.name.as_str())
                    .collect();
            }
            Some(if !culprits.is_empty() {
                format!("Fix issues identified by: {}", culprits.join(", "))
            } else if results.is_empty() {
                "No verifiers ran; at least one verifier result is needed to pass".to_string()
            } else {
                format!(
                    "Composite score {:.2} is below the passing threshold {:.2}",
                    composite_score, yellow_threshold
                )
            })
        };

        // Saturate instead of overflowing on absurd durations.
        let total_duration_ms = results
            .iter()
            .fold(0_u64, |acc, r| acc.saturating_add(r.duration_ms));

        Self {
            id,
            step_id,
            passed,
            composite_score,
            confidence_level,
            verifier_results: results,
            summary,
            rework_instructions,
            total_duration_ms,
        }
    }
}
/// Errors a verifier can return instead of a [`VerifierResult`].
#[derive(Error, Debug)]
pub enum VerifierError {
/// The command could not be run; the payload describes why.
#[error("Command execution failed: {0}")]
CommandFailed(String),
/// The command exceeded its time budget; the payload is the budget in milliseconds.
#[error("Command timed out after {0}ms")]
Timeout(u64),
/// The directory the command should run in does not exist; the payload is its path.
#[error("Working directory not found: {0}")]
WorkingDirectoryNotFound(String),
/// A verifier was requested that has not been configured; the payload names it.
#[error("Verifier not configured: {0}")]
NotConfigured(String),
/// Verifier output or configuration could not be parsed.
#[error("Parse error: {0}")]
ParseError(String),
/// LLM-based evaluation failed.
#[error("LLM error: {0}")]
LlmError(String),
/// Underlying I/O failure; converted automatically from `std::io::Error` via `?`.
#[error("IO error: {0}")]
Io(#[from] std::io::Error),
}
/// Pluggable verification step.
///
/// Implementations must be `Send + Sync` so they can be stored as
/// `Box<dyn Verifier>` and shared by an evaluator.
#[async_trait]
pub trait Verifier: Send + Sync {
/// Name of this verifier, used to label its results.
fn name(&self) -> &str;
/// Category of this verifier.
fn verifier_type(&self) -> VerifierType;
/// Whether this verifier applies to the repository at `repo_path`.
/// Verifiers that are not applicable are skipped by `CompositeEvaluator::evaluate`.
async fn is_applicable(&self, repo_path: &Path) -> bool;
/// Run the verification against `repo_path`.
///
/// Returns the verifier's result, or a [`VerifierError`] when the
/// verification itself could not be carried out.
async fn verify(&self, repo_path: &Path, context: &VerificationContext)
-> Result<VerifierResult, VerifierError>;
}
/// Information about the step under verification, handed to every verifier.
#[derive(Debug, Clone)]
pub struct VerificationContext {
/// ID of the step being verified; becomes the `step_id` of the evaluation.
pub step_id: Uuid,
/// Contract ID, if the step has been instantiated.
pub contract_id: Option<Uuid>,
/// Files that were modified in this step.
pub modified_files: Vec<String>,
/// Step description, for LLM context.
pub step_description: String,
/// Acceptance criteria, for LLM evaluation.
pub acceptance_criteria: Vec<String>,
/// Additional context from the directive.
pub directive_context: String,
}
/// Verifier that runs an external command and judges it by its exit status.
pub struct CommandVerifier {
/// Name reported in results.
name: String,
/// Category; also selects which output parser is applied to the result.
verifier_type: VerifierType,
/// Command line to run; it is split on whitespace into program and arguments.
command: String,
/// Optional directory, joined onto the repository path, in which to run the command.
// NOTE(review): `verify` reads this field, so the `allow(dead_code)` looks stale.
#[allow(dead_code)]
working_dir: Option<String>,
/// Time budget for the command in milliseconds (`new` sets 300_000).
#[allow(dead_code)]
timeout_ms: u64,
/// Whether a failure of this verifier forces red confidence.
required: bool,
/// Repository-relative paths; the verifier is applicable when any of them
/// exists (or when the list is empty).
applicable_patterns: Vec<String>,
}
impl CommandVerifier {
    /// Build a verifier for `command` with the defaults: repository root as
    /// working directory, five-minute timeout, optional, always applicable.
    pub fn new(
        name: impl Into<String>,
        verifier_type: VerifierType,
        command: impl Into<String>,
    ) -> Self {
        let name = name.into();
        let command = command.into();
        Self {
            name,
            verifier_type,
            command,
            working_dir: None,
            timeout_ms: 300_000, // five minutes
            required: false,
            applicable_patterns: Vec::new(),
        }
    }

    /// Run the command in `dir` (resolved against the repository path).
    #[allow(dead_code)]
    pub fn with_working_dir(self, dir: impl Into<String>) -> Self {
        Self {
            working_dir: Some(dir.into()),
            ..self
        }
    }

    /// Replace the timeout, given in milliseconds.
    #[allow(dead_code)]
    pub fn with_timeout(self, timeout_ms: u64) -> Self {
        Self { timeout_ms, ..self }
    }

    /// Flag this verifier as required.
    pub fn as_required(self) -> Self {
        Self {
            required: true,
            ..self
        }
    }

    /// Replace the applicability patterns (paths of which one must exist).
    pub fn with_patterns(self, patterns: Vec<String>) -> Self {
        Self {
            applicable_patterns: patterns,
            ..self
        }
    }
}
#[async_trait]
impl Verifier for CommandVerifier {
    /// Name reported in results.
    fn name(&self) -> &str {
        &self.name
    }

    /// Category of this verifier.
    fn verifier_type(&self) -> VerifierType {
        self.verifier_type.clone()
    }

    /// Applicable when no patterns are configured, or when at least one of the
    /// configured paths exists under `repo_path`.
    async fn is_applicable(&self, repo_path: &Path) -> bool {
        self.applicable_patterns.is_empty()
            || self
                .applicable_patterns
                .iter()
                .any(|pattern| repo_path.join(pattern).exists())
    }

    /// Run the configured command and turn its outcome into a result.
    ///
    /// The command passes when it exits successfully. Its stdout and stderr
    /// are concatenated into `output`, and the exit code, command line and
    /// working directory are recorded in `details` before `enhance_result`
    /// refines the score from the tool's output.
    ///
    /// # Errors
    ///
    /// * `WorkingDirectoryNotFound` if the resolved working directory is missing.
    /// * `CommandFailed` if the command string is empty.
    /// * `Timeout` if the command runs longer than `timeout_ms`.
    /// * `Io` if the process cannot be spawned or awaited.
    async fn verify(
        &self,
        repo_path: &Path,
        _context: &VerificationContext,
    ) -> Result<VerifierResult, VerifierError> {
        let start = std::time::Instant::now();

        let work_dir = self
            .working_dir
            .as_ref()
            .map(|d| repo_path.join(d))
            .unwrap_or_else(|| repo_path.to_path_buf());
        if !work_dir.exists() {
            return Err(VerifierError::WorkingDirectoryNotFound(
                work_dir.display().to_string(),
            ));
        }

        // NOTE: the command line is split on whitespace only; quoted
        // arguments containing spaces are not supported.
        let mut parts = self.command.split_whitespace();
        let program = match parts.next() {
            Some(program) => program,
            None => {
                return Err(VerifierError::CommandFailed(
                    "Empty command".to_string(),
                ))
            }
        };

        let mut command = tokio::process::Command::new(program);
        command
            .args(parts)
            .current_dir(&work_dir)
            // When the timeout below drops the pending future, the child must
            // be killed rather than left running in the background.
            .kill_on_drop(true);

        // Enforce the configured time budget so a hung command cannot block
        // the evaluation forever.
        let budget = std::time::Duration::from_millis(self.timeout_ms);
        let output = match tokio::time::timeout(budget, command.output()).await {
            Ok(finished) => finished?,
            Err(_elapsed) => return Err(VerifierError::Timeout(self.timeout_ms)),
        };

        let duration_ms = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX);
        let stdout = String::from_utf8_lossy(&output.stdout);
        let stderr = String::from_utf8_lossy(&output.stderr);
        let combined_output = format!("{}\n{}", stdout, stderr);
        let passed = output.status.success();

        let result = VerifierResult {
            name: self.name.clone(),
            verifier_type: self.verifier_type.clone(),
            passed,
            score: if passed { 1.0 } else { 0.0 },
            weight: 1.0,
            required: self.required,
            output: combined_output.clone(),
            details: Some(serde_json::json!({
                "exit_code": output.status.code(),
                "command": self.command,
                "working_dir": work_dir.display().to_string(),
            })),
            duration_ms,
        };

        // Parse the combined streams: several tools (cargo, clippy) write
        // their summaries to stderr, which a stdout-only parse never sees.
        Ok(self.enhance_result(result, &combined_output))
    }
}
impl CommandVerifier {
/// Enhance result with parsed details from output.
fn enhance_result(&self, mut result: VerifierResult, stdout: &str) -> VerifierResult {
match self.verifier_type {
VerifierType::TestRunner => {
// Try to parse test counts from common formats
if let Some((passed, failed, total)) = parse_test_output(stdout) {
result.details = Some(serde_json::json!({
"tests_passed": passed,
"tests_failed": failed,
"tests_total": total,
"command": self.command,
}));
if total > 0 {
result.score = passed as f64 / total as f64;
}
}
}
VerifierType::Linter => {
// Try to parse lint error counts
if let Some(error_count) = parse_lint_output(stdout) {
result.details = Some(serde_json::json!({
"errors": error_count,
"command": self.command,
}));
// Score decreases with more errors (up to 10 errors = 0)
result.score = (1.0 - (error_count as f64 / 10.0)).max(0.0);
}
}
_ => {}
}
result
}
}
/// Parse `(passed, failed, total)` test counts from a test runner's output.
///
/// Recognized formats, tried in this order:
///
/// 1. Jest summary line: `Tests: 1 failed, 2 passed, 3 total` (counts may
///    appear in any order; `total` is taken from the line when present).
/// 2. cargo test: every `test result: ... X passed; Y failed` line, summed,
///    because cargo prints one such line per test binary.
/// 3. pytest-style summary: the last line containing `N passed` and/or
///    `N failed`, in either order.
///
/// The cargo format is checked before the generic one because the generic
/// `N passed` pattern also matches cargo's line and would drop its failed
/// count. Returns `None` when no format matches.
fn parse_test_output(output: &str) -> Option<(u32, u32, u32)> {
    // First capture group of `re` in `text`, parsed as a count.
    let count = |re: &regex::Regex, text: &str| -> Option<u32> {
        re.captures(text)?.get(1)?.as_str().parse().ok()
    };
    let passed_re = regex::Regex::new(r"(\d+)\s*passed\b").ok()?;
    let failed_re = regex::Regex::new(r"(\d+)\s*failed\b").ok()?;
    let total_re = regex::Regex::new(r"(\d+)\s*total\b").ok()?;

    // Jest: "Tests:       1 failed, 2 passed, 3 total"
    if let Some(line) = regex::Regex::new(r"Tests:[^\n]*").ok()?.find(output) {
        let line = line.as_str();
        let passed = count(&passed_re, line);
        let failed = count(&failed_re, line);
        let total = count(&total_re, line);
        if passed.is_some() || failed.is_some() || total.is_some() {
            let passed = passed.unwrap_or(0);
            let failed = failed.unwrap_or(0);
            let total = total.unwrap_or_else(|| passed.saturating_add(failed));
            return Some((passed, failed, total));
        }
    }

    // cargo test: "test result: ok. X passed; Y failed; ..." once per binary.
    let cargo_re =
        regex::Regex::new(r"test result:.*?(\d+)\s*passed;\s*(\d+)\s*failed").ok()?;
    let mut cargo_counts: Option<(u32, u32)> = None;
    for caps in cargo_re.captures_iter(output) {
        let passed: u32 = caps.get(1)?.as_str().parse().ok()?;
        let failed: u32 = caps.get(2)?.as_str().parse().ok()?;
        let (passed_so_far, failed_so_far) = cargo_counts.unwrap_or((0, 0));
        cargo_counts = Some((
            passed_so_far.saturating_add(passed),
            failed_so_far.saturating_add(failed),
        ));
    }
    if let Some((passed, failed)) = cargo_counts {
        return Some((passed, failed, passed.saturating_add(failed)));
    }

    // pytest: "== 1 failed, 2 passed in 0.12s ==". The summary is the last
    // such line, so scan from the end.
    output.lines().rev().find_map(|line| {
        let passed = count(&passed_re, line);
        let failed = count(&failed_re, line);
        if passed.is_none() && failed.is_none() {
            return None;
        }
        let passed = passed.unwrap_or(0);
        let failed = failed.unwrap_or(0);
        Some((passed, failed, passed.saturating_add(failed)))
    })
}
/// Parse lint output for error counts.
fn parse_lint_output(output: &str) -> Option<u32> {
// ESLint format: "X problems (Y errors, Z warnings)"
if let Some(caps) = regex::Regex::new(r"(\d+)\s*problems?\s*\((\d+)\s*errors?")
.ok()?
.captures(output)
{
return caps.get(2)?.as_str().parse().ok();
}
// Clippy format: "warning: X warnings emitted"
if let Some(caps) = regex::Regex::new(r"warning:\s*(\d+)\s*warnings?\s*emitted")
.ok()?
.captures(output)
{
return caps.get(1)?.as_str().parse().ok();
}
None
}
/// Auto-detect applicable verifiers for a repository.
pub async fn auto_detect_verifiers(repo_path: &Path) -> Vec<Box<dyn Verifier>> {
let mut verifiers: Vec<Box<dyn Verifier>> = Vec::new();
// Check for package.json (Node.js)
let package_json = repo_path.join("package.json");
if package_json.exists() {
if let Ok(content) = tokio::fs::read_to_string(&package_json).await {
if let Ok(pkg) = serde_json::from_str::<serde_json::Value>(&content) {
if let Some(scripts) = pkg.get("scripts").and_then(|s| s.as_object()) {
// Test runner
if scripts.contains_key("test") {
verifiers.push(Box::new(
CommandVerifier::new("npm-test", VerifierType::TestRunner, "npm test")
.with_patterns(vec!["package.json".to_string()])
.as_required(),
));
}
// Linter
if scripts.contains_key("lint") {
verifiers.push(Box::new(
CommandVerifier::new("npm-lint", VerifierType::Linter, "npm run lint")
.with_patterns(vec!["package.json".to_string()]),
));
}
// Build
if scripts.contains_key("build") {
verifiers.push(Box::new(
CommandVerifier::new("npm-build", VerifierType::Build, "npm run build")
.with_patterns(vec!["package.json".to_string()])
.as_required(),
));
}
// Type check (for TypeScript projects)
if scripts.contains_key("typecheck") || scripts.contains_key("type-check") {
let cmd = if scripts.contains_key("typecheck") {
"npm run typecheck"
} else {
"npm run type-check"
};
verifiers.push(Box::new(
CommandVerifier::new("npm-typecheck", VerifierType::TypeChecker, cmd)
.with_patterns(vec!["tsconfig.json".to_string()]),
));
}
}
}
}
}
// Check for Cargo.toml (Rust)
let cargo_toml = repo_path.join("Cargo.toml");
if cargo_toml.exists() {
verifiers.push(Box::new(
CommandVerifier::new("cargo-test", VerifierType::TestRunner, "cargo test")
.with_patterns(vec!["Cargo.toml".to_string()])
.as_required(),
));
verifiers.push(Box::new(
CommandVerifier::new("cargo-clippy", VerifierType::Linter, "cargo clippy -- -D warnings")
.with_patterns(vec!["Cargo.toml".to_string()]),
));
verifiers.push(Box::new(
CommandVerifier::new("cargo-build", VerifierType::Build, "cargo build")
.with_patterns(vec!["Cargo.toml".to_string()])
.as_required(),
));
}
// Check for pyproject.toml or setup.py (Python)
let pyproject = repo_path.join("pyproject.toml");
let setup_py = repo_path.join("setup.py");
if pyproject.exists() || setup_py.exists() {
verifiers.push(Box::new(
CommandVerifier::new("pytest", VerifierType::TestRunner, "pytest")
.with_patterns(vec![
"pyproject.toml".to_string(),
"setup.py".to_string(),
])
.as_required(),
));
verifiers.push(Box::new(
CommandVerifier::new("ruff", VerifierType::Linter, "ruff check .")
.with_patterns(vec!["pyproject.toml".to_string()]),
));
}
verifiers
}
/// Runs a set of verifiers and combines their results into one evaluation.
pub struct CompositeEvaluator {
/// Verifiers to run, in order; inapplicable ones are skipped at evaluation time.
verifiers: Vec<Box<dyn Verifier>>,
/// Composite score at or above which confidence is green (`new` sets 0.8).
green_threshold: f64,
/// Composite score at or above which confidence is yellow (`new` sets 0.5).
yellow_threshold: f64,
}
impl CompositeEvaluator {
    /// Build an evaluator over `verifiers` with the default thresholds
    /// (green at 0.8, yellow at 0.5).
    pub fn new(verifiers: Vec<Box<dyn Verifier>>) -> Self {
        Self {
            verifiers,
            green_threshold: 0.8,
            yellow_threshold: 0.5,
        }
    }

    /// Replace both confidence thresholds.
    pub fn with_thresholds(self, green: f64, yellow: f64) -> Self {
        Self {
            green_threshold: green,
            yellow_threshold: yellow,
            ..self
        }
    }

    /// Append one more verifier to the end of the list.
    pub fn add_verifier(self, verifier: Box<dyn Verifier>) -> Self {
        let mut evaluator = self;
        evaluator.verifiers.push(verifier);
        evaluator
    }

    /// Run every applicable verifier in order and aggregate the outcomes.
    ///
    /// A verifier that returns an error is recorded as a failed result whose
    /// output is `Verifier error: <error>`, so one broken verifier never
    /// aborts the whole evaluation.
    pub async fn evaluate(
        &self,
        repo_path: &Path,
        context: &VerificationContext,
    ) -> EvaluationResult {
        let mut collected = Vec::new();

        for verifier in self.verifiers.iter() {
            if verifier.is_applicable(repo_path).await {
                // NOTE(review): `VerifierResult::failed` builds a non-required
                // result, so a required verifier that errors out does not
                // force red confidence here.
                let outcome = verifier
                    .verify(repo_path, context)
                    .await
                    .unwrap_or_else(|err| {
                        VerifierResult::failed(
                            verifier.name().to_string(),
                            verifier.verifier_type(),
                            format!("Verifier error: {}", err),
                        )
                    });
                collected.push(outcome);
            }
        }

        EvaluationResult::from_verifiers(
            context.step_id,
            collected,
            self.green_threshold,
            self.yellow_threshold,
        )
    }
}
// Unit tests for confidence bucketing, composite scoring and output parsing.
#[cfg(test)]
mod tests {
use super::*;
// Thresholds are inclusive: a score equal to a threshold lands in that bucket.
#[test]
fn test_confidence_level_from_score() {
assert_eq!(
ConfidenceLevel::from_score(0.9, 0.8, 0.5),
ConfidenceLevel::Green
);
assert_eq!(
ConfidenceLevel::from_score(0.8, 0.8, 0.5),
ConfidenceLevel::Green
);
assert_eq!(
ConfidenceLevel::from_score(0.6, 0.8, 0.5),
ConfidenceLevel::Yellow
);
assert_eq!(
ConfidenceLevel::from_score(0.5, 0.8, 0.5),
ConfidenceLevel::Yellow
);
assert_eq!(
ConfidenceLevel::from_score(0.4, 0.8, 0.5),
ConfidenceLevel::Red
);
}
// One pass and one failure with equal weights average to 0.5, i.e. yellow.
#[test]
fn test_evaluation_result_composite_score() {
let results = vec![
VerifierResult::passed("test1".into(), VerifierType::TestRunner, "OK".into())
.with_weight(1.0),
VerifierResult::failed("test2".into(), VerifierType::Linter, "Failed".into())
.with_weight(1.0),
];
let eval = EvaluationResult::from_verifiers(Uuid::new_v4(), results, 0.8, 0.5);
assert!((eval.composite_score - 0.5).abs() < 0.001);
assert_eq!(eval.confidence_level, ConfidenceLevel::Yellow);
}
// A failed required verifier forces red regardless of the composite score.
#[test]
fn test_required_verifier_override() {
let results = vec![
VerifierResult::passed("test1".into(), VerifierType::TestRunner, "OK".into()),
VerifierResult::failed("build".into(), VerifierType::Build, "Failed".into())
.as_required(),
];
let eval = EvaluationResult::from_verifiers(Uuid::new_v4(), results, 0.8, 0.5);
// Even though composite score is 0.5, required failure overrides to red
assert_eq!(eval.confidence_level, ConfidenceLevel::Red);
assert!(!eval.passed);
}
// Jest-style summary line with explicit passed, failed and total counts.
#[test]
fn test_parse_test_output_jest() {
let output = "Tests: 10 passed, 2 failed, 12 total";
let (passed, failed, total) = parse_test_output(output).unwrap();
assert_eq!(passed, 10);
assert_eq!(failed, 2);
assert_eq!(total, 12);
}
// cargo test summary line; the total is derived as passed + failed.
#[test]
fn test_parse_test_output_cargo() {
let output = "test result: ok. 25 passed; 0 failed;";
let (passed, failed, total) = parse_test_output(output).unwrap();
assert_eq!(passed, 25);
assert_eq!(failed, 0);
assert_eq!(total, 25);
}
}