summaryrefslogtreecommitdiff
path: root/makima/src/llm/contract_evaluator.rs
diff options
context:
space:
mode:
Diffstat (limited to 'makima/src/llm/contract_evaluator.rs')
-rw-r--r--makima/src/llm/contract_evaluator.rs555
1 files changed, 555 insertions, 0 deletions
diff --git a/makima/src/llm/contract_evaluator.rs b/makima/src/llm/contract_evaluator.rs
new file mode 100644
index 0000000..fcc4826
--- /dev/null
+++ b/makima/src/llm/contract_evaluator.rs
@@ -0,0 +1,555 @@
+//! Contract Evaluator - LLM-based evaluation of completed contracts against their directive.
+//!
+//! This module provides functionality for:
+//! - Gathering deliverables, files, and task outputs from completed contracts
+//! - Building evaluation prompts using directive and acceptance criteria
+//! - Calling LLM to evaluate work against requirements
+//! - Parsing evaluation responses
+
+use serde::{Deserialize, Serialize};
+use sqlx::PgPool;
+use uuid::Uuid;
+
+use crate::db::{
+ models::{
+ ChainContract, ChainDirective, Contract, ContractEvaluation, CreateContractEvaluationRequest,
+ DirectiveAcceptanceCriterion, DirectiveRequirement, EvaluationCriterionResult,
+ },
+ repository,
+};
+
+use super::claude::{ClaudeClient, ClaudeModel, Message, MessageContent};
+use super::tools::Tool;
+
+/// Result of contract evaluation
+///
+/// Serialized and deserialized with camelCase field names (`overallScore`,
+/// `criteriaResults`, ...).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct ContractEvaluationResult {
+ /// Whether the contract passed evaluation. As computed in this file, this is
+ /// true only when the LLM reported a pass AND the overall score reached the
+ /// evaluator's pass threshold.
+ pub passed: bool,
+ /// Overall score as reported by the LLM. The prompt requests a value from
+ /// 0.0 to 1.0, but the range is not validated when the response is parsed.
+ pub overall_score: f64,
+ /// Results for each acceptance criterion
+ pub criteria_results: Vec<EvaluationCriterionResult>,
+ /// Summary feedback from the evaluator
+ pub summary_feedback: String,
+ /// Instructions for rework if failed; set to `None` when `passed` is true.
+ pub rework_instructions: Option<String>,
+}
+
+/// Context gathered for evaluation
+///
+/// Built by `ContractEvaluator::gather_context` and consumed when the
+/// evaluation prompt is rendered.
+#[derive(Debug, Clone)]
+pub struct EvaluationContext {
+ /// The contract being evaluated
+ pub contract: Contract,
+ /// The chain contract record
+ pub chain_contract: ChainContract,
+ /// The directive document (looked up via the chain contract's `chain_id`)
+ pub directive: ChainDirective,
+ /// Files associated with the contract
+ pub files: Vec<FileContent>,
+ /// Task outputs from the contract
+ pub task_outputs: Vec<TaskOutput>,
+ /// Deliverables. NOTE: `gather_context` currently lists every contract file
+ /// here with status "complete" because the file summaries it reads carry no
+ /// deliverable information.
+ pub deliverables: Vec<DeliverableInfo>,
+ /// Acceptance criteria used for this contract. NOTE: `gather_context`
+ /// currently passes through ALL of the directive's criteria unfiltered.
+ pub acceptance_criteria: Vec<DirectiveAcceptanceCriterion>,
+ /// Requirements parsed from the directive
+ pub requirements: Vec<DirectiveRequirement>,
+}
+
+/// File content for evaluation
+#[derive(Debug, Clone, Serialize)]
+pub struct FileContent {
+ /// Path shown in the prompt heading for this file. `gather_context` uses the
+ /// repository file path when present, otherwise the file name.
+ pub path: String,
+ /// Optional human-readable description, rendered in italics in the prompt.
+ pub description: Option<String>,
+ /// Text embedded in the prompt's code fence. NOTE: `gather_context` currently
+ /// fills this with a "content not loaded" placeholder, not the real content.
+ pub content: String,
+ /// When true, the prompt heading is tagged `[DELIVERABLE]`.
+ /// `gather_context` currently always sets this to false.
+ pub is_deliverable: bool,
+}
+
+/// Task output for evaluation
+#[derive(Debug, Clone, Serialize)]
+pub struct TaskOutput {
+ /// Task name, used as the section heading in the prompt.
+ pub task_name: String,
+ /// Text shown under the heading. `gather_context` uses the task's progress
+ /// summary, falling back to "Status: <status>" when there is none.
+ pub output_summary: String,
+ /// Exit code of the task, if known. `gather_context` currently always sets
+ /// this to `None`, and the prompt builder does not read it.
+ pub exit_code: Option<i32>,
+}
+
+/// Deliverable info for evaluation
+#[derive(Debug, Clone, Serialize)]
+pub struct DeliverableInfo {
+ /// Deliverable name as listed in the prompt.
+ pub name: String,
+ /// Free-form status string shown in parentheses after the name in the prompt
+ /// (`gather_context` currently hard-codes "complete").
+ pub status: String,
+ /// Repository path of the backing file, if any. Not rendered in the prompt.
+ pub file_path: Option<String>,
+}
+
+/// Error types for evaluation
+#[derive(Debug, thiserror::Error)]
+pub enum EvaluationError {
+ /// A database call failed. The `#[from]` attribute lets `?` convert a
+ /// `sqlx::Error` into this variant.
+ #[error("Database error: {0}")]
+ Database(#[from] sqlx::Error),
+
+ /// No contract with this id was found for the given owner (carries the
+ /// contract id).
+ #[error("Contract not found: {0}")]
+ ContractNotFound(Uuid),
+
+ /// The contract has no chain contract record (carries the contract id).
+ #[error("Chain contract not found for contract: {0}")]
+ ChainContractNotFound(Uuid),
+
+ /// The chain has no directive (carries the chain id).
+ #[error("Directive not found for chain: {0}")]
+ DirectiveNotFound(Uuid),
+
+ /// The LLM client returned an error; carries its string rendering.
+ #[error("LLM evaluation failed: {0}")]
+ LlmError(String),
+
+ /// No JSON could be located in the LLM response, or it did not deserialize
+ /// into the expected shape.
+ #[error("Failed to parse evaluation response: {0}")]
+ ParseError(String),
+}
+
+/// Contract evaluator for directive-driven evaluation
+///
+/// Holds the database pool used to load contract context and persist results,
+/// plus the LLM client that performs the actual evaluation.
+pub struct ContractEvaluator {
+ /// Connection pool passed to every `repository` call.
+ pool: PgPool,
+ /// Client used to send the evaluation prompt.
+ claude_client: ClaudeClient,
+ /// Model recorded as `evaluator_model` when an evaluation is saved.
+ /// NOTE(review): the LLM call in this file does not pass this value to the
+ /// client — confirm the client is configured with the same model.
+ model: ClaudeModel,
+ /// Minimum score required to pass (default 0.8)
+ pass_threshold: f64,
+}
+
+impl ContractEvaluator {
+    /// Maximum number of bytes of a single file's content embedded in the prompt.
+    const MAX_FILE_CONTENT_BYTES: usize = 5000;
+
+    /// Creates an evaluator that uses `ClaudeModel::Sonnet` and a pass
+    /// threshold of `0.8`.
+    pub fn new(pool: PgPool, claude_client: ClaudeClient) -> Self {
+        Self {
+            pool,
+            claude_client,
+            model: ClaudeModel::Sonnet,
+            pass_threshold: 0.8,
+        }
+    }
+
+    /// Sets the model associated with this evaluator (builder style).
+    ///
+    /// NOTE(review): within this impl the model is only read by
+    /// `save_evaluation` (to record `evaluator_model`); `call_llm_for_evaluation`
+    /// does not pass it to the client. Confirm the client is configured with
+    /// the same model, otherwise the recorded model may not match the one used.
+    pub fn with_model(mut self, model: ClaudeModel) -> Self {
+        self.model = model;
+        self
+    }
+
+    /// Sets the minimum overall score required to pass (builder style).
+    ///
+    /// The value is stored as given; it is not clamped or validated. It is
+    /// compared with `>=` against the score reported by the LLM.
+    pub fn with_pass_threshold(mut self, threshold: f64) -> Self {
+        self.pass_threshold = threshold;
+        self
+    }
+
+    /// Evaluates a completed contract against the directive.
+    ///
+    /// Gathers the evaluation context, renders the prompt, sends it to the LLM
+    /// and parses the reply. Nothing is persisted; call `save_evaluation` for
+    /// that.
+    ///
+    /// # Errors
+    ///
+    /// Returns an [`EvaluationError`] when the contract, chain contract or
+    /// directive cannot be loaded, when the LLM call fails, or when the reply
+    /// does not contain parseable evaluation JSON.
+    pub async fn evaluate_contract(
+        &self,
+        contract_id: Uuid,
+        owner_id: Uuid,
+    ) -> Result<ContractEvaluationResult, EvaluationError> {
+        // Gather evaluation context
+        let context = self.gather_context(contract_id, owner_id).await?;
+
+        // Build evaluation prompt
+        let prompt = self.build_evaluation_prompt(&context);
+
+        // Call LLM for evaluation
+        let response = self.call_llm_for_evaluation(&prompt).await?;
+
+        // Parse the response
+        self.parse_evaluation_response(&response, &context)
+    }
+
+    /// Gathers all context needed for evaluation.
+    ///
+    /// The contract, chain contract and directive are mandatory: a missing
+    /// record or a failed lookup aborts with an error. Files, tasks,
+    /// requirements and acceptance criteria are best-effort and fall back to
+    /// empty lists.
+    async fn gather_context(
+        &self,
+        contract_id: Uuid,
+        owner_id: Uuid,
+    ) -> Result<EvaluationContext, EvaluationError> {
+        // Get contract
+        let contract = repository::get_contract_for_owner(&self.pool, contract_id, owner_id)
+            .await?
+            .ok_or(EvaluationError::ContractNotFound(contract_id))?;
+
+        // Get chain contract
+        let chain_contract = repository::get_chain_contract_by_contract_id(&self.pool, contract_id)
+            .await?
+            .ok_or(EvaluationError::ChainContractNotFound(contract_id))?;
+
+        // Get directive
+        let directive = repository::get_chain_directive(&self.pool, chain_contract.chain_id)
+            .await?
+            .ok_or(EvaluationError::DirectiveNotFound(chain_contract.chain_id))?;
+
+        // Get files directly from repository.
+        // NOTE(review): a failed listing is silently treated as "no files", so
+        // the LLM may evaluate an incomplete picture — confirm this best-effort
+        // behavior is intended.
+        let contract_files = repository::list_files_in_contract(&self.pool, contract_id, owner_id)
+            .await
+            .unwrap_or_default();
+
+        // Get tasks directly from repository (same best-effort fallback as above).
+        let contract_tasks = repository::list_tasks_in_contract(&self.pool, contract_id, owner_id)
+            .await
+            .unwrap_or_default();
+
+        // Build file contents from FileSummary
+        // Note: FileSummary doesn't have content, so we use name and description
+        let files: Vec<FileContent> = contract_files
+            .iter()
+            .map(|f| FileContent {
+                path: f.repo_file_path.clone().unwrap_or_else(|| f.name.clone()),
+                description: f.description.clone(),
+                content: format!("[File: {} - content not loaded in summary view]", f.name),
+                is_deliverable: false, // FileSummary doesn't track deliverable status
+            })
+            .collect();
+
+        // Build task outputs from TaskSummary
+        let task_outputs: Vec<TaskOutput> = contract_tasks
+            .iter()
+            .map(|t| TaskOutput {
+                task_name: t.name.clone(),
+                output_summary: t
+                    .progress_summary
+                    .clone()
+                    .unwrap_or_else(|| format!("Status: {}", t.status)),
+                exit_code: None,
+            })
+            .collect();
+
+        // Build deliverables info from files marked as deliverables
+        // Since FileSummary doesn't have deliverable info, we treat all files as potential deliverables
+        let deliverables: Vec<DeliverableInfo> = contract_files
+            .iter()
+            .map(|f| DeliverableInfo {
+                name: f.name.clone(),
+                status: "complete".to_string(),
+                file_path: f.repo_file_path.clone(),
+            })
+            .collect();
+
+        // Parse requirements and acceptance criteria from directive.
+        // Malformed JSON degrades to an empty list rather than an error.
+        let requirements: Vec<DirectiveRequirement> =
+            serde_json::from_value(directive.requirements.clone()).unwrap_or_default();
+
+        let all_criteria: Vec<DirectiveAcceptanceCriterion> =
+            serde_json::from_value(directive.acceptance_criteria.clone()).unwrap_or_default();
+
+        // Get contract definition to find mapped requirements
+        // For now, use all acceptance criteria
+        let acceptance_criteria = all_criteria;
+
+        Ok(EvaluationContext {
+            contract,
+            chain_contract,
+            directive,
+            files,
+            task_outputs,
+            deliverables,
+            acceptance_criteria,
+            requirements,
+        })
+    }
+
+    /// Returns `content` ready for embedding in the prompt.
+    ///
+    /// Content longer than `MAX_FILE_CONTENT_BYTES` is cut and followed by a
+    /// truncation notice. The cut point backs off to the nearest UTF-8 char
+    /// boundary, because slicing a `str` inside a multi-byte character panics.
+    fn truncate_for_prompt(content: &str) -> String {
+        if content.len() <= Self::MAX_FILE_CONTENT_BYTES {
+            return content.to_string();
+        }
+
+        // Index 0 is always a char boundary, so this loop terminates.
+        let mut cut = Self::MAX_FILE_CONTENT_BYTES;
+        while !content.is_char_boundary(cut) {
+            cut -= 1;
+        }
+
+        // The reported total is `str::len`, i.e. a byte count.
+        format!(
+            "{}...\n[Content truncated - {} chars total]",
+            &content[..cut],
+            content.len()
+        )
+    }
+
+    /// Builds the evaluation prompt.
+    ///
+    /// The prompt is Markdown with these sections, each omitted when its data
+    /// is empty: contract information (always present), requirements,
+    /// acceptance criteria, deliverables, files, and task outputs. It ends
+    /// with instructions describing the JSON reply format and the pass
+    /// threshold.
+    fn build_evaluation_prompt(&self, context: &EvaluationContext) -> String {
+        let mut prompt = String::new();
+
+        prompt.push_str("# Contract Completion Evaluation\n\n");
+        prompt.push_str("You are evaluating whether a contract has been completed successfully against its requirements.\n\n");
+
+        // Contract info
+        prompt.push_str("## Contract Information\n\n");
+        prompt.push_str(&format!("**Name:** {}\n", context.contract.name));
+        if let Some(ref desc) = context.contract.description {
+            prompt.push_str(&format!("**Description:** {}\n", desc));
+        }
+        prompt.push_str(&format!("**Type:** {}\n", context.contract.contract_type));
+        prompt.push_str(&format!("**Phase:** {}\n", context.contract.phase));
+        prompt.push('\n');
+
+        // Requirements
+        if !context.requirements.is_empty() {
+            prompt.push_str("## Requirements\n\n");
+            for req in &context.requirements {
+                prompt.push_str(&format!("- **{}** ({}): {}\n", req.id, req.priority, req.title));
+                if !req.description.is_empty() {
+                    prompt.push_str(&format!(" {}\n", req.description));
+                }
+            }
+            prompt.push('\n');
+        }
+
+        // Acceptance criteria (numbered from 1 so the LLM can refer to them by index)
+        if !context.acceptance_criteria.is_empty() {
+            prompt.push_str("## Acceptance Criteria\n\n");
+            for (i, criterion) in context.acceptance_criteria.iter().enumerate() {
+                prompt.push_str(&format!("{}. **{}**\n", i + 1, criterion.description));
+                prompt.push_str(&format!(" - Testable: {}\n", criterion.testable));
+                if !criterion.requirement_ids.is_empty() {
+                    prompt.push_str(&format!(" - Covers: {}\n", criterion.requirement_ids.join(", ")));
+                }
+            }
+            prompt.push('\n');
+        }
+
+        // Deliverables
+        if !context.deliverables.is_empty() {
+            prompt.push_str("## Deliverables\n\n");
+            for d in &context.deliverables {
+                prompt.push_str(&format!("- {} ({})\n", d.name, d.status));
+            }
+            prompt.push('\n');
+        }
+
+        // Files
+        if !context.files.is_empty() {
+            prompt.push_str("## Files Created/Modified\n\n");
+            for file in &context.files {
+                prompt.push_str(&format!("### {}", file.path));
+                if file.is_deliverable {
+                    prompt.push_str(" [DELIVERABLE]");
+                }
+                prompt.push('\n');
+                if let Some(ref desc) = file.description {
+                    prompt.push_str(&format!("*{}*\n", desc));
+                }
+                // Truncate content if too long (char-boundary safe)
+                prompt.push_str("```\n");
+                prompt.push_str(&Self::truncate_for_prompt(&file.content));
+                prompt.push_str("\n```\n\n");
+            }
+        }
+
+        // Task outputs
+        if !context.task_outputs.is_empty() {
+            prompt.push_str("## Task Outputs\n\n");
+            for task in &context.task_outputs {
+                prompt.push_str(&format!("### {}\n", task.task_name));
+                prompt.push_str(&format!("{}\n\n", task.output_summary));
+            }
+        }
+
+        // Evaluation instructions
+        prompt.push_str("## Evaluation Instructions\n\n");
+        prompt.push_str("Please evaluate the completed work against the requirements and acceptance criteria.\n\n");
+        prompt.push_str("For each acceptance criterion, determine if it has been met and provide a brief explanation.\n\n");
+        prompt.push_str("Respond with a JSON object in the following format:\n\n");
+        prompt.push_str("```json\n");
+        prompt.push_str(r#"{
+ "passed": true/false,
+ "overallScore": 0.0-1.0,
+ "criteriaResults": [
+ {
+ "criterionId": "criterion identifier or index",
+ "met": true/false,
+ "score": 0.0-1.0,
+ "feedback": "explanation of why criterion was/wasn't met"
+ }
+ ],
+ "summaryFeedback": "overall summary of the evaluation",
+ "reworkInstructions": "if failed, specific instructions for what needs to be fixed (null if passed)"
+}
+"#);
+        prompt.push_str("```\n\n");
+        prompt.push_str(&format!("The pass threshold is {}. ", self.pass_threshold));
+        prompt.push_str("A contract passes if the overall score is >= the threshold AND all critical criteria are met.\n");
+
+        prompt
+    }
+
+    /// Sends the prompt to the LLM as a single user message and returns the
+    /// reply text.
+    ///
+    /// A reply without text content yields an empty string (which later fails
+    /// JSON extraction). Client failures are mapped to
+    /// [`EvaluationError::LlmError`].
+    async fn call_llm_for_evaluation(&self, prompt: &str) -> Result<String, EvaluationError> {
+        let messages = vec![Message {
+            role: "user".to_string(),
+            content: MessageContent::Text(prompt.to_string()),
+        }];
+
+        // Use chat_with_tools with empty tools array for simple chat
+        let empty_tools: Vec<Tool> = vec![];
+        let result = self
+            .claude_client
+            .chat_with_tools(messages, &empty_tools)
+            .await
+            .map_err(|e| EvaluationError::LlmError(e.to_string()))?;
+
+        // ChatResult.content is already an Option<String>
+        let text = result.content.unwrap_or_default();
+
+        Ok(text)
+    }
+
+    /// Parses the LLM response into an evaluation result.
+    ///
+    /// The final `passed` flag requires both the LLM's own verdict and an
+    /// overall score at or above the pass threshold. Rework instructions are
+    /// dropped when the contract passed. `context` is currently not consulted.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`EvaluationError::ParseError`] when no JSON can be located in
+    /// the response or it does not match the expected shape.
+    fn parse_evaluation_response(
+        &self,
+        response: &str,
+        context: &EvaluationContext,
+    ) -> Result<ContractEvaluationResult, EvaluationError> {
+        // Extract JSON from response (may be wrapped in markdown code blocks)
+        let json_str = extract_json_from_response(response)?;
+
+        // Parse the JSON
+        let parsed: EvaluationResponseJson = serde_json::from_str(&json_str)
+            .map_err(|e| EvaluationError::ParseError(format!("JSON parse error: {}", e)))?;
+
+        // Convert to our result type
+        let criteria_results: Vec<EvaluationCriterionResult> = parsed
+            .criteria_results
+            .into_iter()
+            .map(|cr| EvaluationCriterionResult {
+                criterion_id: cr.criterion_id.clone(),
+                criterion_text: cr.criterion_id, // Use ID as text if not provided
+                passed: cr.passed,
+                score: cr.score,
+                feedback: cr.feedback,
+                evidence: vec![],
+            })
+            .collect();
+
+        // Determine pass/fail based on threshold and results
+        let passed = parsed.passed && parsed.overall_score >= self.pass_threshold;
+
+        Ok(ContractEvaluationResult {
+            passed,
+            overall_score: parsed.overall_score,
+            criteria_results,
+            summary_feedback: parsed.summary_feedback,
+            rework_instructions: if passed { None } else { parsed.rework_instructions },
+        })
+    }
+
+    /// Saves an evaluation result to the database and updates the chain
+    /// contract's evaluation status to `"passed"` or `"failed"`.
+    ///
+    /// The evaluator's model is recorded using its `Debug` rendering.
+    ///
+    /// NOTE(review): the two writes are issued as separate repository calls on
+    /// the pool; if the status update fails, the evaluation row has already
+    /// been created. Confirm whether a transaction is wanted here.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`EvaluationError::Database`] when either write fails.
+    pub async fn save_evaluation(
+        &self,
+        contract_id: Uuid,
+        chain_id: Uuid,
+        chain_contract_id: Uuid,
+        result: &ContractEvaluationResult,
+    ) -> Result<ContractEvaluation, EvaluationError> {
+        let req = CreateContractEvaluationRequest {
+            contract_id,
+            chain_id: Some(chain_id),
+            chain_contract_id: Some(chain_contract_id),
+            evaluator_model: Some(format!("{:?}", self.model)),
+            passed: result.passed,
+            overall_score: Some(result.overall_score),
+            criteria_results: result.criteria_results.clone(),
+            summary_feedback: result.summary_feedback.clone(),
+            rework_instructions: result.rework_instructions.clone(),
+        };
+
+        let evaluation = repository::create_contract_evaluation(&self.pool, req).await?;
+
+        // Update chain contract status
+        let status = if result.passed { "passed" } else { "failed" };
+        repository::update_chain_contract_evaluation_status(
+            &self.pool,
+            chain_contract_id,
+            status,
+            Some(evaluation.id),
+            result.rework_instructions.as_deref(),
+        )
+        .await?;
+
+        Ok(evaluation)
+    }
+}
+
+/// JSON structure for parsing LLM response
+///
+/// Mirrors the top-level object requested in the evaluation prompt; keys are
+/// expected in camelCase (`overallScore`, `criteriaResults`, ...).
+#[derive(Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct EvaluationResponseJson {
+ /// The LLM's own pass/fail verdict (combined with the threshold check later).
+ passed: bool,
+ /// Overall score as reported by the LLM; not range-checked on parse.
+ overall_score: f64,
+ /// One entry per criterion the LLM reported on.
+ criteria_results: Vec<CriterionResultJson>,
+ /// Overall summary text.
+ summary_feedback: String,
+ /// Rework instructions; `null` or absent deserializes to `None`.
+ rework_instructions: Option<String>,
+}
+
+/// JSON structure for a single entry of `criteriaResults` in the LLM response.
+#[derive(Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct CriterionResultJson {
+ /// Criterion identifier or index, as echoed back by the LLM.
+ criterion_id: String,
+ /// Whether the criterion was met. The prompt asks for the key `met`, so it
+ /// is accepted as an alias of `passed`.
+ #[serde(alias = "met")]
+ passed: bool,
+ /// Per-criterion score; falls back to the `f64` default (0.0) when omitted.
+ #[serde(default)]
+ score: f64,
+ /// Explanation of the verdict.
+ feedback: String,
+}
+
+/// Extracts the JSON payload from an LLM response that may wrap it in prose or
+/// markdown code fences.
+///
+/// Strategies are tried in order, and the first one that succeeds wins:
+/// 1. the contents of the first ```` ```json ```` fenced block;
+/// 2. the contents of the first fenced block of any kind (the rest of the
+///    opening fence line, e.g. a language tag, is skipped);
+/// 3. the first balanced `{ ... }` object found in the raw text.
+///
+/// Fenced content is returned trimmed. No JSON validation is performed here.
+///
+/// # Errors
+///
+/// Returns [`EvaluationError::ParseError`] when none of the strategies finds a
+/// candidate.
+fn extract_json_from_response(response: &str) -> Result<String, EvaluationError> {
+    const FENCE: &str = "```";
+    const JSON_FENCE: &str = "```json";
+
+    // Try to find JSON in code blocks first
+    if let Some(start) = response.find(JSON_FENCE) {
+        let json_start = start + JSON_FENCE.len();
+        if let Some(end) = response[json_start..].find(FENCE) {
+            return Ok(response[json_start..json_start + end].trim().to_string());
+        }
+    }
+
+    // Try plain code blocks
+    if let Some(start) = response.find(FENCE) {
+        let after_fence = start + FENCE.len();
+        // Skip any language identifier on the same line
+        let body_start = response[after_fence..]
+            .find('\n')
+            .map(|i| after_fence + i + 1)
+            .unwrap_or(after_fence);
+        if let Some(end) = response[body_start..].find(FENCE) {
+            return Ok(response[body_start..body_start + end].trim().to_string());
+        }
+    }
+
+    // Try to find raw JSON (starts with {)
+    if let Some(object) = find_balanced_json_object(response) {
+        return Ok(object.to_string());
+    }
+
+    Err(EvaluationError::ParseError(
+        "Could not find JSON in response".to_string(),
+    ))
+}
+
+/// Returns the first balanced `{ ... }` span in `text`, starting at the first
+/// `{`, or `None` if there is no `{` or the braces never balance.
+///
+/// Braces inside JSON string literals are ignored, and backslash escapes
+/// (e.g. `\"`) are honoured, so a value such as `"use } here"` does not end the
+/// object early.
+fn find_balanced_json_object(text: &str) -> Option<&str> {
+    let start = text.find('{')?;
+
+    let mut depth: usize = 0;
+    let mut in_string = false;
+    let mut escaped = false;
+
+    for (offset, ch) in text[start..].char_indices() {
+        if in_string {
+            if escaped {
+                // The character after a backslash never terminates the string.
+                escaped = false;
+            } else if ch == '\\' {
+                escaped = true;
+            } else if ch == '"' {
+                in_string = false;
+            }
+            continue;
+        }
+
+        match ch {
+            '"' => in_string = true,
+            '{' => depth += 1,
+            '}' => {
+                // `depth` is at least 1 here: the scan starts on a `{` and
+                // returns as soon as the depth drops back to zero.
+                depth -= 1;
+                if depth == 0 {
+                    // `}` is a single byte, so the span ends one byte later.
+                    return Some(&text[start..start + offset + 1]);
+                }
+            }
+            _ => {}
+        }
+    }
+
+    None
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Extraction succeeds for a response that wraps the JSON in a ```json
+    /// fence surrounded by prose.
+    #[test]
+    fn test_extract_json_from_code_block() {
+        let llm_reply = "Here is the evaluation:\n\n```json\n{\n \"passed\": true,\n \"overallScore\": 0.85\n}\n```\n\nDone.";
+
+        let extracted = extract_json_from_response(llm_reply)
+            .expect("fenced JSON block should be extracted");
+
+        assert!(extracted.contains(r#""passed": true"#));
+    }
+
+    /// Extraction succeeds for a bare JSON object embedded in a sentence.
+    #[test]
+    fn test_extract_json_raw() {
+        let llm_reply = "The result is {\"passed\": false, \"overallScore\": 0.5}";
+
+        let extracted = extract_json_from_response(llm_reply)
+            .expect("raw JSON object should be extracted");
+
+        assert!(extracted.contains(r#""passed": false"#));
+    }
+}