//! Contract Evaluator - LLM-based evaluation of completed contracts against their directive.
//!
//! This module provides functionality for:
//! - Gathering deliverables, files, and task outputs from completed contracts
//! - Building evaluation prompts using directive and acceptance criteria
//! - Calling LLM to evaluate work against requirements
//! - Parsing evaluation responses
use serde::{Deserialize, Serialize};
use sqlx::PgPool;
use uuid::Uuid;
use crate::db::{
models::{
ChainContract, ChainDirective, Contract, ContractEvaluation, CreateContractEvaluationRequest,
DirectiveAcceptanceCriterion, DirectiveRequirement, EvaluationCriterionResult,
},
repository,
};
use super::claude::{ClaudeClient, ClaudeModel, Message, MessageContent};
use super::tools::Tool;
/// Result of contract evaluation
///
/// Produced by `ContractEvaluator::evaluate_contract` and persisted by
/// `ContractEvaluator::save_evaluation`. Serialized with camelCase keys.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ContractEvaluationResult {
/// Whether the contract passed evaluation
///
/// True only when the LLM reported a pass AND the overall score reached the
/// evaluator's pass threshold.
pub passed: bool,
/// Overall score from 0.0 to 1.0
pub overall_score: f64,
/// Results for each acceptance criterion
pub criteria_results: Vec<EvaluationCriterionResult>,
/// Summary feedback from the evaluator
pub summary_feedback: String,
/// Instructions for rework if failed
///
/// Always `None` when `passed` is true; otherwise whatever the LLM supplied
/// (which may itself be `None`).
pub rework_instructions: Option<String>,
}
/// Context gathered for evaluation
///
/// Assembled by `ContractEvaluator::gather_context` and consumed when building the
/// evaluation prompt and parsing the LLM response.
#[derive(Debug, Clone)]
pub struct EvaluationContext {
/// The contract being evaluated
pub contract: Contract,
/// The chain contract record
pub chain_contract: ChainContract,
/// The directive document
pub directive: ChainDirective,
/// Files associated with the contract
///
/// Built from file summaries, so `content` currently holds a placeholder string
/// rather than the real file body.
pub files: Vec<FileContent>,
/// Task outputs from the contract
pub task_outputs: Vec<TaskOutput>,
/// Deliverables marked as complete
///
/// Currently one entry per listed file, each reported with status "complete".
pub deliverables: Vec<DeliverableInfo>,
/// Acceptance criteria specific to this contract
///
/// For now this is every acceptance criterion parsed from the directive.
pub acceptance_criteria: Vec<DirectiveAcceptanceCriterion>,
/// Requirements mapped to this contract
///
/// Parsed from the directive's `requirements` JSON; empty when that JSON does not
/// deserialize.
pub requirements: Vec<DirectiveRequirement>,
}
/// File content for evaluation
///
/// One entry is rendered per file under the "Files Created/Modified" prompt section.
#[derive(Debug, Clone, Serialize)]
pub struct FileContent {
/// Repository path when the file has one, otherwise the file's name.
pub path: String,
/// Optional human-readable description, rendered in italics in the prompt.
pub description: Option<String>,
/// Text placed inside the fenced block for this file in the prompt.
pub content: String,
/// When true the prompt heading is tagged with `[DELIVERABLE]`.
pub is_deliverable: bool,
}
/// Task output for evaluation
///
/// One entry is rendered per task under the "Task Outputs" prompt section.
#[derive(Debug, Clone, Serialize)]
pub struct TaskOutput {
/// Task name, used as the section heading in the prompt.
pub task_name: String,
/// The task's progress summary, or `Status: <status>` when no summary exists.
pub output_summary: String,
/// Process exit code; the context gatherer in this file always sets `None`.
pub exit_code: Option<i32>,
}
/// Deliverable info for evaluation
///
/// Rendered in the prompt's "Deliverables" section as `- <name> (<status>)`.
#[derive(Debug, Clone, Serialize)]
pub struct DeliverableInfo {
/// Deliverable name (taken from the file name when gathered in this module).
pub name: String,
/// Free-form status label; the context gatherer in this file always uses "complete".
pub status: String,
/// Repository path of the backing file, when known. Not rendered in the prompt.
pub file_path: Option<String>,
}
/// Error types for evaluation
///
/// Returned by every fallible operation on `ContractEvaluator`.
#[derive(Debug, thiserror::Error)]
pub enum EvaluationError {
/// A repository query failed; converted automatically from `sqlx::Error` via `?`.
#[error("Database error: {0}")]
Database(#[from] sqlx::Error),
/// The owner-scoped contract lookup returned nothing for this contract ID.
#[error("Contract not found: {0}")]
ContractNotFound(Uuid),
/// No chain contract record exists for this contract ID.
#[error("Chain contract not found for contract: {0}")]
ChainContractNotFound(Uuid),
/// No directive exists for this chain ID.
#[error("Directive not found for chain: {0}")]
DirectiveNotFound(Uuid),
/// The LLM client call failed; carries the client error rendered as a string.
#[error("LLM evaluation failed: {0}")]
LlmError(String),
/// No JSON could be located in the LLM response, or it did not match the expected shape.
#[error("Failed to parse evaluation response: {0}")]
ParseError(String),
}
/// Contract evaluator for directive-driven evaluation
///
/// Construct with `new`, optionally adjust with `with_model` / `with_pass_threshold`,
/// then call `evaluate_contract` followed by `save_evaluation`.
pub struct ContractEvaluator {
// Database pool handed to every repository call.
pool: PgPool,
// Client used to send the evaluation prompt to the LLM.
claude_client: ClaudeClient,
// Model label; in this file it is only read when recording `evaluator_model` on save.
model: ClaudeModel,
/// Minimum score required to pass (default 0.8)
pass_threshold: f64,
}
impl ContractEvaluator {
/// Build an evaluator with the defaults: `ClaudeModel::Sonnet` and a 0.8 pass threshold.
pub fn new(pool: PgPool, claude_client: ClaudeClient) -> Self {
    let model = ClaudeModel::Sonnet;
    let pass_threshold = 0.8;
    Self {
        pool,
        claude_client,
        model,
        pass_threshold,
    }
}
/// Set the LLM model to use for evaluation
pub fn with_model(mut self, model: ClaudeModel) -> Self {
self.model = model;
self
}
/// Set the pass threshold
pub fn with_pass_threshold(mut self, threshold: f64) -> Self {
self.pass_threshold = threshold;
self
}
/// Run the full evaluation pipeline for one contract.
///
/// Steps: gather context, render the prompt, query the LLM, parse its reply.
/// The result is returned to the caller; nothing is persisted here.
///
/// # Errors
/// Propagates any `EvaluationError` raised by one of the four steps.
pub async fn evaluate_contract(
    &self,
    contract_id: Uuid,
    owner_id: Uuid,
) -> Result<ContractEvaluationResult, EvaluationError> {
    let ctx = self.gather_context(contract_id, owner_id).await?;
    let prompt_text = self.build_evaluation_prompt(&ctx);
    let llm_reply = self.call_llm_for_evaluation(&prompt_text).await?;
    self.parse_evaluation_response(&llm_reply, &ctx)
}
/// Load everything the evaluation prompt needs for one contract.
///
/// The contract (scoped to `owner_id`), its chain record and the chain's directive are
/// mandatory. File and task listings, and the directive's requirement / criteria JSON,
/// are best-effort and fall back to empty collections.
///
/// # Errors
/// - `ContractNotFound`, `ChainContractNotFound`, `DirectiveNotFound` when the matching
///   lookup returns no row.
/// - `Database` when one of those three lookups fails.
async fn gather_context(
    &self,
    contract_id: Uuid,
    owner_id: Uuid,
) -> Result<EvaluationContext, EvaluationError> {
    let db = &self.pool;

    // Mandatory records: bail out with a specific error as soon as one is missing.
    let contract = match repository::get_contract_for_owner(db, contract_id, owner_id).await? {
        Some(row) => row,
        None => return Err(EvaluationError::ContractNotFound(contract_id)),
    };
    let chain_contract =
        match repository::get_chain_contract_by_contract_id(db, contract_id).await? {
            Some(row) => row,
            None => return Err(EvaluationError::ChainContractNotFound(contract_id)),
        };
    let chain_id = chain_contract.chain_id;
    let directive = match repository::get_chain_directive(db, chain_id).await? {
        Some(row) => row,
        None => return Err(EvaluationError::DirectiveNotFound(chain_id)),
    };

    // NOTE(review): listing failures are swallowed and treated as "no files" / "no tasks",
    // so a database error here silently produces a thinner prompt — confirm this is intended.
    let file_summaries = repository::list_files_in_contract(db, contract_id, owner_id)
        .await
        .unwrap_or_default();
    let task_summaries = repository::list_tasks_in_contract(db, contract_id, owner_id)
        .await
        .unwrap_or_default();

    // The summaries carry no file body and no deliverable flag, so each file gets a
    // placeholder body and is also listed as a completed deliverable.
    let mut files: Vec<FileContent> = Vec::new();
    let mut deliverables: Vec<DeliverableInfo> = Vec::new();
    for summary in file_summaries.iter() {
        let path = match &summary.repo_file_path {
            Some(repo_path) => repo_path.clone(),
            None => summary.name.clone(),
        };
        files.push(FileContent {
            path,
            description: summary.description.clone(),
            content: format!(
                "[File: {} - content not loaded in summary view]",
                summary.name
            ),
            is_deliverable: false,
        });
        deliverables.push(DeliverableInfo {
            name: summary.name.clone(),
            status: "complete".to_string(),
            file_path: summary.repo_file_path.clone(),
        });
    }

    // A task without a progress summary is described by its status instead.
    let mut task_outputs: Vec<TaskOutput> = Vec::new();
    for task in task_summaries.iter() {
        let output_summary = match task.progress_summary.clone() {
            Some(summary) => summary,
            None => format!("Status: {}", task.status),
        };
        task_outputs.push(TaskOutput {
            task_name: task.name.clone(),
            output_summary,
            exit_code: None,
        });
    }

    // NOTE(review): malformed directive JSON degrades to empty lists rather than an error.
    let requirements: Vec<DirectiveRequirement> =
        serde_json::from_value(directive.requirements.clone()).unwrap_or_default();
    // No per-contract mapping yet: every acceptance criterion of the directive is used.
    let acceptance_criteria: Vec<DirectiveAcceptanceCriterion> =
        serde_json::from_value(directive.acceptance_criteria.clone()).unwrap_or_default();

    Ok(EvaluationContext {
        contract,
        chain_contract,
        directive,
        files,
        task_outputs,
        deliverables,
        acceptance_criteria,
        requirements,
    })
}
/// Build the evaluation prompt
///
/// Renders a Markdown document with, in order: contract information, requirements,
/// acceptance criteria, deliverables, files, task outputs, and finally the instructions
/// describing the JSON object the model must return. A section is omitted when its
/// source collection is empty.
///
/// File content longer than 5000 bytes is truncated. The cut is moved back to the nearest
/// UTF-8 character boundary so multi-byte text can never cause a slicing panic.
fn build_evaluation_prompt(&self, context: &EvaluationContext) -> String {
    /// Maximum number of bytes of a single file's content included in the prompt.
    const MAX_FILE_CONTENT_BYTES: usize = 5000;

    let mut prompt = String::new();
    prompt.push_str("# Contract Completion Evaluation\n\n");
    prompt.push_str("You are evaluating whether a contract has been completed successfully against its requirements.\n\n");

    // Contract info
    prompt.push_str("## Contract Information\n\n");
    prompt.push_str(&format!("**Name:** {}\n", context.contract.name));
    if let Some(ref desc) = context.contract.description {
        prompt.push_str(&format!("**Description:** {}\n", desc));
    }
    prompt.push_str(&format!("**Type:** {}\n", context.contract.contract_type));
    prompt.push_str(&format!("**Phase:** {}\n", context.contract.phase));
    prompt.push('\n');

    // Requirements
    if !context.requirements.is_empty() {
        prompt.push_str("## Requirements\n\n");
        for req in &context.requirements {
            prompt.push_str(&format!("- **{}** ({}): {}\n", req.id, req.priority, req.title));
            if !req.description.is_empty() {
                prompt.push_str(&format!(" {}\n", req.description));
            }
        }
        prompt.push('\n');
    }

    // Acceptance criteria (numbered from 1 so the model can refer to them by index)
    if !context.acceptance_criteria.is_empty() {
        prompt.push_str("## Acceptance Criteria\n\n");
        for (i, criterion) in context.acceptance_criteria.iter().enumerate() {
            prompt.push_str(&format!("{}. **{}**\n", i + 1, criterion.description));
            prompt.push_str(&format!(" - Testable: {}\n", criterion.testable));
            if !criterion.requirement_ids.is_empty() {
                prompt.push_str(&format!(" - Covers: {}\n", criterion.requirement_ids.join(", ")));
            }
        }
        prompt.push('\n');
    }

    // Deliverables
    if !context.deliverables.is_empty() {
        prompt.push_str("## Deliverables\n\n");
        for d in &context.deliverables {
            prompt.push_str(&format!("- {} ({})\n", d.name, d.status));
        }
        prompt.push('\n');
    }

    // Files
    if !context.files.is_empty() {
        prompt.push_str("## Files Created/Modified\n\n");
        for file in &context.files {
            prompt.push_str(&format!("### {}", file.path));
            if file.is_deliverable {
                prompt.push_str(" [DELIVERABLE]");
            }
            prompt.push('\n');
            if let Some(ref desc) = file.description {
                prompt.push_str(&format!("*{}*\n", desc));
            }
            prompt.push_str("```\n");
            if file.content.len() > MAX_FILE_CONTENT_BYTES {
                // Slicing a `str` at a byte offset inside a multi-byte character panics,
                // so back off to the closest boundary at or below the limit. Offset 0 is
                // always a boundary, which guarantees the loop terminates.
                let mut cut = MAX_FILE_CONTENT_BYTES;
                while !file.content.is_char_boundary(cut) {
                    cut -= 1;
                }
                // The reported total is the byte length of the content.
                prompt.push_str(&format!(
                    "{}...\n[Content truncated - {} chars total]",
                    &file.content[..cut],
                    file.content.len()
                ));
            } else {
                prompt.push_str(&file.content);
            }
            prompt.push_str("\n```\n\n");
        }
    }

    // Task outputs
    if !context.task_outputs.is_empty() {
        prompt.push_str("## Task Outputs\n\n");
        for task in &context.task_outputs {
            prompt.push_str(&format!("### {}\n", task.task_name));
            prompt.push_str(&format!("{}\n\n", task.output_summary));
        }
    }

    // Evaluation instructions
    prompt.push_str("## Evaluation Instructions\n\n");
    prompt.push_str("Please evaluate the completed work against the requirements and acceptance criteria.\n\n");
    prompt.push_str("For each acceptance criterion, determine if it has been met and provide a brief explanation.\n\n");
    prompt.push_str("Respond with a JSON object in the following format:\n\n");
    prompt.push_str("```json\n");
    prompt.push_str(r#"{
"passed": true/false,
"overallScore": 0.0-1.0,
"criteriaResults": [
{
"criterionId": "criterion identifier or index",
"met": true/false,
"score": 0.0-1.0,
"feedback": "explanation of why criterion was/wasn't met"
}
],
"summaryFeedback": "overall summary of the evaluation",
"reworkInstructions": "if failed, specific instructions for what needs to be fixed (null if passed)"
}
"#);
    prompt.push_str("```\n\n");
    prompt.push_str(&format!("The pass threshold is {}. ", self.pass_threshold));
    prompt.push_str("A contract passes if the overall score is >= the threshold AND all critical criteria are met.\n");
    prompt
}
/// Send `prompt` to the LLM as a single user message and return the reply text.
///
/// The request goes through `chat_with_tools` with an empty tool list, i.e. a plain chat.
/// A reply without text content yields an empty string. This call does not reference
/// `self.model`; whether the client applies a model of its own is not visible here.
///
/// # Errors
/// `EvaluationError::LlmError` carrying the client error's message when the call fails.
async fn call_llm_for_evaluation(&self, prompt: &str) -> Result<String, EvaluationError> {
    let user_message = Message {
        role: "user".to_string(),
        content: MessageContent::Text(prompt.to_string()),
    };
    let no_tools: Vec<Tool> = Vec::new();

    match self
        .claude_client
        .chat_with_tools(vec![user_message], &no_tools)
        .await
    {
        Ok(chat) => Ok(chat.content.unwrap_or_default()),
        Err(err) => Err(EvaluationError::LlmError(err.to_string())),
    }
}
/// Parse the LLM response into an evaluation result
///
/// The JSON payload is located with `extract_json_from_response` and deserialized into
/// `EvaluationResponseJson`. Afterwards:
///
/// - Each criterion's text is resolved from `context.acceptance_criteria` when the model
///   identified it by the 1-based index used in the prompt; otherwise the identifier is
///   reused as the text.
/// - Scores are clamped to the documented `0.0..=1.0` range.
/// - The contract passes only if the model reported a pass AND the overall score reaches
///   `pass_threshold`; rework instructions are dropped for a passing contract.
///
/// # Errors
/// `EvaluationError::ParseError` when no JSON is found or it does not match the expected
/// shape.
fn parse_evaluation_response(
    &self,
    response: &str,
    context: &EvaluationContext,
) -> Result<ContractEvaluationResult, EvaluationError> {
    // Extract JSON from response (may be wrapped in markdown code blocks)
    let json_str = extract_json_from_response(response)?;

    let parsed: EvaluationResponseJson = serde_json::from_str(&json_str)
        .map_err(|e| EvaluationError::ParseError(format!("JSON parse error: {}", e)))?;

    let criteria = &context.acceptance_criteria;
    let criteria_results: Vec<EvaluationCriterionResult> = parsed
        .criteria_results
        .into_iter()
        .map(|cr| {
            // The prompt numbers criteria starting at 1, so a purely numeric identifier is
            // treated as that index. Anything else (or an out-of-range index) falls back
            // to the identifier itself.
            let criterion_text = cr
                .criterion_id
                .trim()
                .parse::<usize>()
                .ok()
                .and_then(|number| number.checked_sub(1))
                .and_then(|index| criteria.get(index))
                .map(|criterion| criterion.description.to_string())
                .unwrap_or_else(|| cr.criterion_id.clone());

            EvaluationCriterionResult {
                criterion_id: cr.criterion_id,
                criterion_text,
                passed: cr.passed,
                score: cr.score.clamp(0.0, 1.0),
                feedback: cr.feedback,
                evidence: vec![],
            }
        })
        .collect();

    // The model is asked for a score in 0.0..=1.0 but nothing forces it to comply.
    let overall_score = parsed.overall_score.clamp(0.0, 1.0);

    // Determine pass/fail based on threshold and results
    let passed = parsed.passed && overall_score >= self.pass_threshold;

    Ok(ContractEvaluationResult {
        passed,
        overall_score,
        criteria_results,
        summary_feedback: parsed.summary_feedback,
        rework_instructions: if passed { None } else { parsed.rework_instructions },
    })
}
/// Persist an evaluation and reflect its outcome on the chain contract.
///
/// First inserts the evaluation record (tagged with the `Debug` rendering of the
/// configured model), then sets the chain contract's evaluation status to `"passed"` or
/// `"failed"`, linking the new evaluation ID and any rework instructions.
///
/// NOTE(review): the two writes are separate repository calls; no transaction is visible
/// here, so a failure of the second leaves the evaluation row in place — confirm.
///
/// # Errors
/// `EvaluationError::Database` when either repository call fails.
pub async fn save_evaluation(
    &self,
    contract_id: Uuid,
    chain_id: Uuid,
    chain_contract_id: Uuid,
    result: &ContractEvaluationResult,
) -> Result<ContractEvaluation, EvaluationError> {
    let evaluator_model = format!("{:?}", self.model);
    let request = CreateContractEvaluationRequest {
        contract_id,
        chain_id: Some(chain_id),
        chain_contract_id: Some(chain_contract_id),
        evaluator_model: Some(evaluator_model),
        passed: result.passed,
        overall_score: Some(result.overall_score),
        criteria_results: result.criteria_results.clone(),
        summary_feedback: result.summary_feedback.clone(),
        rework_instructions: result.rework_instructions.clone(),
    };
    let saved = repository::create_contract_evaluation(&self.pool, request).await?;

    // Mirror the verdict onto the chain contract.
    let outcome = match result.passed {
        true => "passed",
        false => "failed",
    };
    let rework = result.rework_instructions.as_deref();
    repository::update_chain_contract_evaluation_status(
        &self.pool,
        chain_contract_id,
        outcome,
        Some(saved.id),
        rework,
    )
    .await?;

    Ok(saved)
}
}
/// JSON structure for parsing LLM response
///
/// Keys are read in camelCase (`overallScore`, `criteriaResults`, `summaryFeedback`,
/// `reworkInstructions`), matching the response format spelled out in the evaluation
/// prompt. Every field except `rework_instructions` must be present.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct EvaluationResponseJson {
/// The model's own pass/fail verdict (combined with the threshold check by the parser).
passed: bool,
/// Overall score reported by the model; the prompt asks for a value in 0.0-1.0.
overall_score: f64,
/// One entry per acceptance criterion the model assessed.
criteria_results: Vec<CriterionResultJson>,
/// Free-text summary of the evaluation.
summary_feedback: String,
/// What to fix when the contract failed; `null` or absent becomes `None`.
rework_instructions: Option<String>,
}
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct CriterionResultJson {
criterion_id: String,
#[serde(alias = "met")]
passed: bool,
#[serde(default)]
score: f64,
feedback: String,
}
/// Extract JSON from a response that may contain markdown code blocks
///
/// Three strategies are tried in order; the first that succeeds wins:
///
/// 1. the body of the first ```` ```json ```` fenced block,
/// 2. the body of the first plain fenced block (its first line is skipped as a possible
///    language tag),
/// 3. the first balanced `{ ... }` object found in the raw text.
///
/// The raw-text scan is aware of JSON string literals, so braces that appear inside
/// quoted text (for example in feedback messages) do not affect the balance.
///
/// # Errors
/// `EvaluationError::ParseError` when none of the strategies finds a candidate.
fn extract_json_from_response(response: &str) -> Result<String, EvaluationError> {
    const FENCE: &str = "```";
    const JSON_FENCE: &str = "```json";

    // Try to find JSON in code blocks first
    if let Some(open) = response.find(JSON_FENCE) {
        let body_start = open + JSON_FENCE.len();
        if let Some(body_len) = response[body_start..].find(FENCE) {
            return Ok(response[body_start..body_start + body_len].trim().to_string());
        }
    }

    // Try plain code blocks
    if let Some(open) = response.find(FENCE) {
        let after_fence = open + FENCE.len();
        // Skip any language identifier on the same line
        let body_start = response[after_fence..]
            .find('\n')
            .map(|newline| after_fence + newline + 1)
            .unwrap_or(after_fence);
        if let Some(body_len) = response[body_start..].find(FENCE) {
            return Ok(response[body_start..body_start + body_len].trim().to_string());
        }
    }

    // Try to find raw JSON (starts with {)
    if let Some(object) = find_balanced_json_object(response) {
        return Ok(object.to_string());
    }

    Err(EvaluationError::ParseError(
        "Could not find JSON in response".to_string(),
    ))
}

/// Return the first balanced `{ ... }` span in `text`, ignoring braces inside JSON strings.
///
/// Yields `None` when `text` contains no `{` or the object starting there is never closed.
fn find_balanced_json_object(text: &str) -> Option<&str> {
    let start = text.find('{')?;

    let mut depth: usize = 0;
    let mut in_string = false;
    let mut escaped = false;

    for (offset, ch) in text[start..].char_indices() {
        if in_string {
            // Inside a string only an unescaped quote is significant.
            if escaped {
                escaped = false;
            } else if ch == '\\' {
                escaped = true;
            } else if ch == '"' {
                in_string = false;
            }
            continue;
        }

        match ch {
            '"' => in_string = true,
            '{' => depth += 1,
            '}' => {
                // The scan starts on '{', so depth is at least 1 whenever a '}' is seen
                // and we return as soon as it drops back to zero: no underflow possible.
                depth -= 1;
                if depth == 0 {
                    // '}' is a single byte, so the end of the object is offset + 1.
                    return Some(&text[start..start + offset + 1]);
                }
            }
            _ => {}
        }
    }

    None
}
#[cfg(test)]
mod tests {
    use super::extract_json_from_response;

    /// The body of a ```json fenced block is returned without the surrounding prose.
    #[test]
    fn test_extract_json_from_code_block() {
        let llm_reply = r#"Here is the evaluation:
```json
{
"passed": true,
"overallScore": 0.85
}
```
Done."#;
        let extracted = extract_json_from_response(llm_reply).unwrap();
        assert!(extracted.contains(r#""passed": true"#));
    }

    /// An object embedded directly in prose is found by brace matching.
    #[test]
    fn test_extract_json_raw() {
        let llm_reply = r#"The result is {"passed": false, "overallScore": 0.5}"#;
        let extracted = extract_json_from_response(llm_reply).unwrap();
        assert!(extracted.contains(r#""passed": false"#));
    }
}