//! Contract Evaluator - LLM-based evaluation of completed contracts against directive. //! //! This module provides functionality for: //! - Gathering deliverables, files, and task outputs from completed contracts //! - Building evaluation prompts using directive and acceptance criteria //! - Calling LLM to evaluate work against requirements //! - Parsing evaluation responses use serde::{Deserialize, Serialize}; use sqlx::PgPool; use uuid::Uuid; use crate::db::{ models::{ ChainContract, ChainDirective, Contract, ContractEvaluation, CreateContractEvaluationRequest, DirectiveAcceptanceCriterion, DirectiveRequirement, EvaluationCriterionResult, }, repository, }; use super::claude::{ClaudeClient, ClaudeModel, Message, MessageContent}; use super::tools::Tool; /// Result of contract evaluation #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct ContractEvaluationResult { /// Whether the contract passed evaluation pub passed: bool, /// Overall score from 0.0 to 1.0 pub overall_score: f64, /// Results for each acceptance criterion pub criteria_results: Vec, /// Summary feedback from the evaluator pub summary_feedback: String, /// Instructions for rework if failed pub rework_instructions: Option, } /// Context gathered for evaluation #[derive(Debug, Clone)] pub struct EvaluationContext { /// The contract being evaluated pub contract: Contract, /// The chain contract record pub chain_contract: ChainContract, /// The directive document pub directive: ChainDirective, /// Files associated with the contract pub files: Vec, /// Task outputs from the contract pub task_outputs: Vec, /// Deliverables marked as complete pub deliverables: Vec, /// Acceptance criteria specific to this contract pub acceptance_criteria: Vec, /// Requirements mapped to this contract pub requirements: Vec, } /// File content for evaluation #[derive(Debug, Clone, Serialize)] pub struct FileContent { pub path: String, pub description: Option, pub content: String, pub is_deliverable: bool, } /// Task output for evaluation #[derive(Debug, Clone, Serialize)] pub struct TaskOutput { pub task_name: String, pub output_summary: String, pub exit_code: Option, } /// Deliverable info for evaluation #[derive(Debug, Clone, Serialize)] pub struct DeliverableInfo { pub name: String, pub status: String, pub file_path: Option, } /// Error types for evaluation #[derive(Debug, thiserror::Error)] pub enum EvaluationError { #[error("Database error: {0}")] Database(#[from] sqlx::Error), #[error("Contract not found: {0}")] ContractNotFound(Uuid), #[error("Chain contract not found for contract: {0}")] ChainContractNotFound(Uuid), #[error("Directive not found for chain: {0}")] DirectiveNotFound(Uuid), #[error("LLM evaluation failed: {0}")] LlmError(String), #[error("Failed to parse evaluation response: {0}")] ParseError(String), } /// Contract evaluator for directive-driven evaluation pub struct ContractEvaluator { pool: PgPool, claude_client: ClaudeClient, model: ClaudeModel, /// Minimum score required to pass (default 0.8) pass_threshold: f64, } impl ContractEvaluator { /// Create a new evaluator pub fn new(pool: PgPool, claude_client: ClaudeClient) -> Self { Self { pool, claude_client, model: ClaudeModel::Sonnet, pass_threshold: 0.8, } } /// Set the LLM model to use for evaluation pub fn with_model(mut self, model: ClaudeModel) -> Self { self.model = model; self } /// Set the pass threshold pub fn with_pass_threshold(mut self, threshold: f64) -> Self { self.pass_threshold = threshold; self } /// Evaluate a completed contract against the directive pub async fn evaluate_contract( &self, contract_id: Uuid, owner_id: Uuid, ) -> Result { // Gather evaluation context let context = self.gather_context(contract_id, owner_id).await?; // Build evaluation prompt let prompt = self.build_evaluation_prompt(&context); // Call LLM for evaluation let response = self.call_llm_for_evaluation(&prompt).await?; // Parse the response let result = self.parse_evaluation_response(&response, &context)?; Ok(result) } /// Gather all context needed for evaluation async fn gather_context( &self, contract_id: Uuid, owner_id: Uuid, ) -> Result { // Get contract let contract = repository::get_contract_for_owner(&self.pool, contract_id, owner_id) .await? .ok_or(EvaluationError::ContractNotFound(contract_id))?; // Get chain contract let chain_contract = repository::get_chain_contract_by_contract_id(&self.pool, contract_id) .await? .ok_or(EvaluationError::ChainContractNotFound(contract_id))?; // Get directive let directive = repository::get_chain_directive(&self.pool, chain_contract.chain_id) .await? .ok_or(EvaluationError::DirectiveNotFound(chain_contract.chain_id))?; // Get files directly from repository let contract_files = repository::list_files_in_contract(&self.pool, contract_id, owner_id) .await .unwrap_or_default(); // Get tasks directly from repository let contract_tasks = repository::list_tasks_in_contract(&self.pool, contract_id, owner_id) .await .unwrap_or_default(); // Build file contents from FileSummary // Note: FileSummary doesn't have content, so we use name and description let files: Vec = contract_files.iter().map(|f| { FileContent { path: f.repo_file_path.clone().unwrap_or_else(|| f.name.clone()), description: f.description.clone(), content: format!("[File: {} - content not loaded in summary view]", f.name), is_deliverable: false, // FileSummary doesn't track deliverable status } }).collect(); // Build task outputs from TaskSummary let task_outputs: Vec = contract_tasks.iter().map(|t| { TaskOutput { task_name: t.name.clone(), output_summary: t.progress_summary.clone().unwrap_or_else(|| format!("Status: {}", t.status)), exit_code: None, } }).collect(); // Build deliverables info from files marked as deliverables // Since FileSummary doesn't have deliverable info, we treat all files as potential deliverables let deliverables: Vec = contract_files.iter() .map(|f| DeliverableInfo { name: f.name.clone(), status: "complete".to_string(), file_path: f.repo_file_path.clone(), }) .collect(); // Parse requirements and acceptance criteria from directive let requirements: Vec = serde_json::from_value(directive.requirements.clone()).unwrap_or_default(); let all_criteria: Vec = serde_json::from_value(directive.acceptance_criteria.clone()).unwrap_or_default(); // Get contract definition to find mapped requirements // For now, use all acceptance criteria let acceptance_criteria = all_criteria; Ok(EvaluationContext { contract, chain_contract, directive, files, task_outputs, deliverables, acceptance_criteria, requirements, }) } /// Build the evaluation prompt fn build_evaluation_prompt(&self, context: &EvaluationContext) -> String { let mut prompt = String::new(); prompt.push_str("# Contract Completion Evaluation\n\n"); prompt.push_str("You are evaluating whether a contract has been completed successfully against its requirements.\n\n"); // Contract info prompt.push_str("## Contract Information\n\n"); prompt.push_str(&format!("**Name:** {}\n", context.contract.name)); if let Some(ref desc) = context.contract.description { prompt.push_str(&format!("**Description:** {}\n", desc)); } prompt.push_str(&format!("**Type:** {}\n", context.contract.contract_type)); prompt.push_str(&format!("**Phase:** {}\n", context.contract.phase)); prompt.push_str("\n"); // Requirements if !context.requirements.is_empty() { prompt.push_str("## Requirements\n\n"); for req in &context.requirements { prompt.push_str(&format!("- **{}** ({}): {}\n", req.id, req.priority, req.title)); if !req.description.is_empty() { prompt.push_str(&format!(" {}\n", req.description)); } } prompt.push_str("\n"); } // Acceptance criteria if !context.acceptance_criteria.is_empty() { prompt.push_str("## Acceptance Criteria\n\n"); for (i, criterion) in context.acceptance_criteria.iter().enumerate() { prompt.push_str(&format!("{}. **{}**\n", i + 1, criterion.description)); prompt.push_str(&format!(" - Testable: {}\n", criterion.testable)); if !criterion.requirement_ids.is_empty() { prompt.push_str(&format!(" - Covers: {}\n", criterion.requirement_ids.join(", "))); } } prompt.push_str("\n"); } // Deliverables if !context.deliverables.is_empty() { prompt.push_str("## Deliverables\n\n"); for d in &context.deliverables { prompt.push_str(&format!("- {} ({})\n", d.name, d.status)); } prompt.push_str("\n"); } // Files if !context.files.is_empty() { prompt.push_str("## Files Created/Modified\n\n"); for file in &context.files { prompt.push_str(&format!("### {}", file.path)); if file.is_deliverable { prompt.push_str(" [DELIVERABLE]"); } prompt.push_str("\n"); if let Some(ref desc) = file.description { prompt.push_str(&format!("*{}*\n", desc)); } // Truncate content if too long let content = if file.content.len() > 5000 { format!("{}...\n[Content truncated - {} chars total]", &file.content[..5000], file.content.len()) } else { file.content.clone() }; prompt.push_str("```\n"); prompt.push_str(&content); prompt.push_str("\n```\n\n"); } } // Task outputs if !context.task_outputs.is_empty() { prompt.push_str("## Task Outputs\n\n"); for task in &context.task_outputs { prompt.push_str(&format!("### {}\n", task.task_name)); prompt.push_str(&format!("{}\n\n", task.output_summary)); } } // Evaluation instructions prompt.push_str("## Evaluation Instructions\n\n"); prompt.push_str("Please evaluate the completed work against the requirements and acceptance criteria.\n\n"); prompt.push_str("For each acceptance criterion, determine if it has been met and provide a brief explanation.\n\n"); prompt.push_str("Respond with a JSON object in the following format:\n\n"); prompt.push_str("```json\n"); prompt.push_str(r#"{ "passed": true/false, "overallScore": 0.0-1.0, "criteriaResults": [ { "criterionId": "criterion identifier or index", "met": true/false, "score": 0.0-1.0, "feedback": "explanation of why criterion was/wasn't met" } ], "summaryFeedback": "overall summary of the evaluation", "reworkInstructions": "if failed, specific instructions for what needs to be fixed (null if passed)" } "#); prompt.push_str("```\n\n"); prompt.push_str(&format!("The pass threshold is {}. ", self.pass_threshold)); prompt.push_str("A contract passes if the overall score is >= the threshold AND all critical criteria are met.\n"); prompt } /// Call LLM for evaluation async fn call_llm_for_evaluation(&self, prompt: &str) -> Result { let messages = vec![Message { role: "user".to_string(), content: MessageContent::Text(prompt.to_string()), }]; // Use chat_with_tools with empty tools array for simple chat let empty_tools: Vec = vec![]; let result = self .claude_client .chat_with_tools(messages, &empty_tools) .await .map_err(|e| EvaluationError::LlmError(e.to_string()))?; // ChatResult.content is already an Option let text = result.content.unwrap_or_default(); Ok(text) } /// Parse the LLM response into an evaluation result fn parse_evaluation_response( &self, response: &str, context: &EvaluationContext, ) -> Result { // Extract JSON from response (may be wrapped in markdown code blocks) let json_str = extract_json_from_response(response)?; // Parse the JSON let parsed: EvaluationResponseJson = serde_json::from_str(&json_str) .map_err(|e| EvaluationError::ParseError(format!("JSON parse error: {}", e)))?; // Convert to our result type let criteria_results: Vec = parsed .criteria_results .into_iter() .map(|cr| EvaluationCriterionResult { criterion_id: cr.criterion_id.clone(), criterion_text: cr.criterion_id, // Use ID as text if not provided passed: cr.passed, score: cr.score, feedback: cr.feedback, evidence: vec![], }) .collect(); // Determine pass/fail based on threshold and results let passed = parsed.passed && parsed.overall_score >= self.pass_threshold; Ok(ContractEvaluationResult { passed, overall_score: parsed.overall_score, criteria_results, summary_feedback: parsed.summary_feedback, rework_instructions: if passed { None } else { parsed.rework_instructions }, }) } /// Save evaluation result to database pub async fn save_evaluation( &self, contract_id: Uuid, chain_id: Uuid, chain_contract_id: Uuid, result: &ContractEvaluationResult, ) -> Result { let req = CreateContractEvaluationRequest { contract_id, chain_id: Some(chain_id), chain_contract_id: Some(chain_contract_id), evaluator_model: Some(format!("{:?}", self.model)), passed: result.passed, overall_score: Some(result.overall_score), criteria_results: result.criteria_results.clone(), summary_feedback: result.summary_feedback.clone(), rework_instructions: result.rework_instructions.clone(), }; let evaluation = repository::create_contract_evaluation(&self.pool, req).await?; // Update chain contract status let status = if result.passed { "passed" } else { "failed" }; repository::update_chain_contract_evaluation_status( &self.pool, chain_contract_id, status, Some(evaluation.id), result.rework_instructions.as_deref(), ) .await?; Ok(evaluation) } } /// JSON structure for parsing LLM response #[derive(Debug, Deserialize)] #[serde(rename_all = "camelCase")] struct EvaluationResponseJson { passed: bool, overall_score: f64, criteria_results: Vec, summary_feedback: String, rework_instructions: Option, } #[derive(Debug, Deserialize)] #[serde(rename_all = "camelCase")] struct CriterionResultJson { criterion_id: String, #[serde(alias = "met")] passed: bool, #[serde(default)] score: f64, feedback: String, } /// Extract JSON from a response that may contain markdown code blocks fn extract_json_from_response(response: &str) -> Result { // Try to find JSON in code blocks first if let Some(start) = response.find("```json") { let json_start = start + 7; if let Some(end) = response[json_start..].find("```") { return Ok(response[json_start..json_start + end].trim().to_string()); } } // Try plain code blocks if let Some(start) = response.find("```") { let json_start = start + 3; // Skip any language identifier on the same line let actual_start = response[json_start..] .find('\n') .map(|i| json_start + i + 1) .unwrap_or(json_start); if let Some(end) = response[actual_start..].find("```") { return Ok(response[actual_start..actual_start + end].trim().to_string()); } } // Try to find raw JSON (starts with {) if let Some(start) = response.find('{') { // Find matching closing brace let mut depth = 0; let mut end = start; for (i, c) in response[start..].char_indices() { match c { '{' => depth += 1, '}' => { depth -= 1; if depth == 0 { end = start + i + 1; break; } } _ => {} } } if end > start { return Ok(response[start..end].to_string()); } } Err(EvaluationError::ParseError( "Could not find JSON in response".to_string(), )) } #[cfg(test)] mod tests { use super::*; #[test] fn test_extract_json_from_code_block() { let response = r#"Here is the evaluation: ```json { "passed": true, "overallScore": 0.85 } ``` Done."#; let json = extract_json_from_response(response).unwrap(); assert!(json.contains("\"passed\": true")); } #[test] fn test_extract_json_raw() { let response = r#"The result is {"passed": false, "overallScore": 0.5}"#; let json = extract_json_from_response(response).unwrap(); assert!(json.contains("\"passed\": false")); } }