diff options
Diffstat (limited to 'makima/src/llm')
| -rw-r--r-- | makima/src/llm/contract_evaluator.rs | 564 | ||||
| -rw-r--r-- | makima/src/llm/mod.rs | 2 | ||||
| -rw-r--r-- | makima/src/llm/task_output.rs | 26 |
3 files changed, 77 insertions, 515 deletions
diff --git a/makima/src/llm/contract_evaluator.rs b/makima/src/llm/contract_evaluator.rs index fcc4826..e63bbfa 100644 --- a/makima/src/llm/contract_evaluator.rs +++ b/makima/src/llm/contract_evaluator.rs @@ -1,25 +1,19 @@ //! Contract Evaluator - LLM-based evaluation of completed contracts against directive. //! -//! This module provides functionality for: -//! - Gathering deliverables, files, and task outputs from completed contracts -//! - Building evaluation prompts using directive and acceptance criteria -//! - Calling LLM to evaluate work against requirements -//! - Parsing evaluation responses +//! This module will be reimplemented as part of the directive verification engine. +//! See the orchestration module for the new evaluation system. +//! +//! The new evaluation system will provide: +//! - Tiered verification (programmatic verifiers first, then LLM evaluation) +//! - Composite confidence scoring (weighted combination of results) +//! - Pluggable verifier interface (test runner, linter, build, type checker) +//! - Proper integration with the directive chain steps use serde::{Deserialize, Serialize}; use sqlx::PgPool; use uuid::Uuid; -use crate::db::{ - models::{ - ChainContract, ChainDirective, Contract, ContractEvaluation, CreateContractEvaluationRequest, - DirectiveAcceptanceCriterion, DirectiveRequirement, EvaluationCriterionResult, - }, - repository, -}; - -use super::claude::{ClaudeClient, ClaudeModel, Message, MessageContent}; -use super::tools::Tool; +// use crate::db::models::{Contract, DirectiveAcceptanceCriterion, DirectiveRequirement}; /// Result of contract evaluation #[derive(Debug, Clone, Serialize, Deserialize)] @@ -30,526 +24,74 @@ pub struct ContractEvaluationResult { /// Overall score from 0.0 to 1.0 pub overall_score: f64, /// Results for each acceptance criterion - pub criteria_results: Vec<EvaluationCriterionResult>, + pub criteria_results: Vec<EvaluationCriterionResultLegacy>, /// Summary feedback from the evaluator pub summary_feedback: String, /// Instructions for rework if failed pub rework_instructions: Option<String>, } -/// Context gathered for evaluation -#[derive(Debug, Clone)] -pub struct EvaluationContext { - /// The contract being evaluated - pub contract: Contract, - /// The chain contract record - pub chain_contract: ChainContract, - /// The directive document - pub directive: ChainDirective, - /// Files associated with the contract - pub files: Vec<FileContent>, - /// Task outputs from the contract - pub task_outputs: Vec<TaskOutput>, - /// Deliverables marked as complete - pub deliverables: Vec<DeliverableInfo>, - /// Acceptance criteria specific to this contract - pub acceptance_criteria: Vec<DirectiveAcceptanceCriterion>, - /// Requirements mapped to this contract - pub requirements: Vec<DirectiveRequirement>, +/// Per-criterion evaluation result (legacy - kept for compatibility) +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct EvaluationCriterionResultLegacy { + pub criterion_id: String, + pub criterion_text: String, + pub passed: bool, + /// Score (0.0-1.0) + pub score: f64, + pub feedback: String, + /// Evidence supporting the evaluation + pub evidence: Vec<String>, } -/// File content for evaluation -#[derive(Debug, Clone, Serialize)] +/// File content for evaluation context +#[derive(Debug, Clone)] pub struct FileContent { pub path: String, - pub description: Option<String>, pub content: String, - pub is_deliverable: bool, } -/// Task output for evaluation -#[derive(Debug, Clone, Serialize)] -pub struct TaskOutput { - pub task_name: String, - pub output_summary: String, - pub exit_code: Option<i32>, -} - -/// Deliverable info for evaluation -#[derive(Debug, Clone, Serialize)] -pub struct DeliverableInfo { - pub name: String, - pub status: String, - pub file_path: Option<String>, -} - -/// Error types for evaluation -#[derive(Debug, thiserror::Error)] -pub enum EvaluationError { - #[error("Database error: {0}")] - Database(#[from] sqlx::Error), - - #[error("Contract not found: {0}")] - ContractNotFound(Uuid), - - #[error("Chain contract not found for contract: {0}")] - ChainContractNotFound(Uuid), - - #[error("Directive not found for chain: {0}")] - DirectiveNotFound(Uuid), - - #[error("LLM evaluation failed: {0}")] - LlmError(String), - - #[error("Failed to parse evaluation response: {0}")] - ParseError(String), -} - -/// Contract evaluator for directive-driven evaluation +/// Contract evaluator for LLM-based assessment. +/// +/// NOTE: This is a stub implementation. The full evaluation system will be +/// implemented as part of the orchestration/verifier module. pub struct ContractEvaluator { - pool: PgPool, - claude_client: ClaudeClient, - model: ClaudeModel, - /// Minimum score required to pass (default 0.8) - pass_threshold: f64, + _pool: PgPool, } impl ContractEvaluator { - /// Create a new evaluator - pub fn new(pool: PgPool, claude_client: ClaudeClient) -> Self { - Self { - pool, - claude_client, - model: ClaudeModel::Sonnet, - pass_threshold: 0.8, - } - } - - /// Set the LLM model to use for evaluation - pub fn with_model(mut self, model: ClaudeModel) -> Self { - self.model = model; - self - } - - /// Set the pass threshold - pub fn with_pass_threshold(mut self, threshold: f64) -> Self { - self.pass_threshold = threshold; - self + /// Create a new contract evaluator. + pub fn new(pool: PgPool) -> Self { + Self { _pool: pool } } - /// Evaluate a completed contract against the directive + /// Evaluate a contract - stub implementation. + /// + /// This will be reimplemented in the orchestration module with: + /// - Programmatic verification (tests, lint, build) + /// - LLM evaluation + /// - Composite scoring pub async fn evaluate_contract( &self, - contract_id: Uuid, - owner_id: Uuid, - ) -> Result<ContractEvaluationResult, EvaluationError> { - // Gather evaluation context - let context = self.gather_context(contract_id, owner_id).await?; - - // Build evaluation prompt - let prompt = self.build_evaluation_prompt(&context); - - // Call LLM for evaluation - let response = self.call_llm_for_evaluation(&prompt).await?; - - // Parse the response - let result = self.parse_evaluation_response(&response, &context)?; - - Ok(result) - } - - /// Gather all context needed for evaluation - async fn gather_context( - &self, - contract_id: Uuid, - owner_id: Uuid, - ) -> Result<EvaluationContext, EvaluationError> { - // Get contract - let contract = repository::get_contract_for_owner(&self.pool, contract_id, owner_id) - .await? - .ok_or(EvaluationError::ContractNotFound(contract_id))?; - - // Get chain contract - let chain_contract = repository::get_chain_contract_by_contract_id(&self.pool, contract_id) - .await? - .ok_or(EvaluationError::ChainContractNotFound(contract_id))?; - - // Get directive - let directive = repository::get_chain_directive(&self.pool, chain_contract.chain_id) - .await? - .ok_or(EvaluationError::DirectiveNotFound(chain_contract.chain_id))?; - - // Get files directly from repository - let contract_files = repository::list_files_in_contract(&self.pool, contract_id, owner_id) - .await - .unwrap_or_default(); - - // Get tasks directly from repository - let contract_tasks = repository::list_tasks_in_contract(&self.pool, contract_id, owner_id) - .await - .unwrap_or_default(); - - // Build file contents from FileSummary - // Note: FileSummary doesn't have content, so we use name and description - let files: Vec<FileContent> = contract_files.iter().map(|f| { - FileContent { - path: f.repo_file_path.clone().unwrap_or_else(|| f.name.clone()), - description: f.description.clone(), - content: format!("[File: {} - content not loaded in summary view]", f.name), - is_deliverable: false, // FileSummary doesn't track deliverable status - } - }).collect(); - - // Build task outputs from TaskSummary - let task_outputs: Vec<TaskOutput> = contract_tasks.iter().map(|t| { - TaskOutput { - task_name: t.name.clone(), - output_summary: t.progress_summary.clone().unwrap_or_else(|| format!("Status: {}", t.status)), - exit_code: None, - } - }).collect(); - - // Build deliverables info from files marked as deliverables - // Since FileSummary doesn't have deliverable info, we treat all files as potential deliverables - let deliverables: Vec<DeliverableInfo> = contract_files.iter() - .map(|f| DeliverableInfo { - name: f.name.clone(), - status: "complete".to_string(), - file_path: f.repo_file_path.clone(), - }) - .collect(); - - // Parse requirements and acceptance criteria from directive - let requirements: Vec<DirectiveRequirement> = - serde_json::from_value(directive.requirements.clone()).unwrap_or_default(); - - let all_criteria: Vec<DirectiveAcceptanceCriterion> = - serde_json::from_value(directive.acceptance_criteria.clone()).unwrap_or_default(); - - // Get contract definition to find mapped requirements - // For now, use all acceptance criteria - let acceptance_criteria = all_criteria; - - Ok(EvaluationContext { - contract, - chain_contract, - directive, - files, - task_outputs, - deliverables, - acceptance_criteria, - requirements, - }) + _contract_id: Uuid, + ) -> Result<ContractEvaluationResult, ContractEvaluatorError> { + // TODO: Implement using the new directive evaluation system + Err(ContractEvaluatorError::NotImplemented( + "Contract evaluator will be reimplemented with directive system".to_string(), + )) } - - /// Build the evaluation prompt - fn build_evaluation_prompt(&self, context: &EvaluationContext) -> String { - let mut prompt = String::new(); - - prompt.push_str("# Contract Completion Evaluation\n\n"); - prompt.push_str("You are evaluating whether a contract has been completed successfully against its requirements.\n\n"); - - // Contract info - prompt.push_str("## Contract Information\n\n"); - prompt.push_str(&format!("**Name:** {}\n", context.contract.name)); - if let Some(ref desc) = context.contract.description { - prompt.push_str(&format!("**Description:** {}\n", desc)); - } - prompt.push_str(&format!("**Type:** {}\n", context.contract.contract_type)); - prompt.push_str(&format!("**Phase:** {}\n", context.contract.phase)); - prompt.push_str("\n"); - - // Requirements - if !context.requirements.is_empty() { - prompt.push_str("## Requirements\n\n"); - for req in &context.requirements { - prompt.push_str(&format!("- **{}** ({}): {}\n", req.id, req.priority, req.title)); - if !req.description.is_empty() { - prompt.push_str(&format!(" {}\n", req.description)); - } - } - prompt.push_str("\n"); - } - - // Acceptance criteria - if !context.acceptance_criteria.is_empty() { - prompt.push_str("## Acceptance Criteria\n\n"); - for (i, criterion) in context.acceptance_criteria.iter().enumerate() { - prompt.push_str(&format!("{}. **{}**\n", i + 1, criterion.description)); - prompt.push_str(&format!(" - Testable: {}\n", criterion.testable)); - if !criterion.requirement_ids.is_empty() { - prompt.push_str(&format!(" - Covers: {}\n", criterion.requirement_ids.join(", "))); - } - } - prompt.push_str("\n"); - } - - // Deliverables - if !context.deliverables.is_empty() { - prompt.push_str("## Deliverables\n\n"); - for d in &context.deliverables { - prompt.push_str(&format!("- {} ({})\n", d.name, d.status)); - } - prompt.push_str("\n"); - } - - // Files - if !context.files.is_empty() { - prompt.push_str("## Files Created/Modified\n\n"); - for file in &context.files { - prompt.push_str(&format!("### {}", file.path)); - if file.is_deliverable { - prompt.push_str(" [DELIVERABLE]"); - } - prompt.push_str("\n"); - if let Some(ref desc) = file.description { - prompt.push_str(&format!("*{}*\n", desc)); - } - // Truncate content if too long - let content = if file.content.len() > 5000 { - format!("{}...\n[Content truncated - {} chars total]", - &file.content[..5000], file.content.len()) - } else { - file.content.clone() - }; - prompt.push_str("```\n"); - prompt.push_str(&content); - prompt.push_str("\n```\n\n"); - } - } - - // Task outputs - if !context.task_outputs.is_empty() { - prompt.push_str("## Task Outputs\n\n"); - for task in &context.task_outputs { - prompt.push_str(&format!("### {}\n", task.task_name)); - prompt.push_str(&format!("{}\n\n", task.output_summary)); - } - } - - // Evaluation instructions - prompt.push_str("## Evaluation Instructions\n\n"); - prompt.push_str("Please evaluate the completed work against the requirements and acceptance criteria.\n\n"); - prompt.push_str("For each acceptance criterion, determine if it has been met and provide a brief explanation.\n\n"); - prompt.push_str("Respond with a JSON object in the following format:\n\n"); - prompt.push_str("```json\n"); - prompt.push_str(r#"{ - "passed": true/false, - "overallScore": 0.0-1.0, - "criteriaResults": [ - { - "criterionId": "criterion identifier or index", - "met": true/false, - "score": 0.0-1.0, - "feedback": "explanation of why criterion was/wasn't met" - } - ], - "summaryFeedback": "overall summary of the evaluation", - "reworkInstructions": "if failed, specific instructions for what needs to be fixed (null if passed)" -} -"#); - prompt.push_str("```\n\n"); - prompt.push_str(&format!("The pass threshold is {}. ", self.pass_threshold)); - prompt.push_str("A contract passes if the overall score is >= the threshold AND all critical criteria are met.\n"); - - prompt - } - - /// Call LLM for evaluation - async fn call_llm_for_evaluation(&self, prompt: &str) -> Result<String, EvaluationError> { - let messages = vec![Message { - role: "user".to_string(), - content: MessageContent::Text(prompt.to_string()), - }]; - - // Use chat_with_tools with empty tools array for simple chat - let empty_tools: Vec<Tool> = vec![]; - let result = self - .claude_client - .chat_with_tools(messages, &empty_tools) - .await - .map_err(|e| EvaluationError::LlmError(e.to_string()))?; - - // ChatResult.content is already an Option<String> - let text = result.content.unwrap_or_default(); - - Ok(text) - } - - /// Parse the LLM response into an evaluation result - fn parse_evaluation_response( - &self, - response: &str, - context: &EvaluationContext, - ) -> Result<ContractEvaluationResult, EvaluationError> { - // Extract JSON from response (may be wrapped in markdown code blocks) - let json_str = extract_json_from_response(response)?; - - // Parse the JSON - let parsed: EvaluationResponseJson = serde_json::from_str(&json_str) - .map_err(|e| EvaluationError::ParseError(format!("JSON parse error: {}", e)))?; - - // Convert to our result type - let criteria_results: Vec<EvaluationCriterionResult> = parsed - .criteria_results - .into_iter() - .map(|cr| EvaluationCriterionResult { - criterion_id: cr.criterion_id.clone(), - criterion_text: cr.criterion_id, // Use ID as text if not provided - passed: cr.passed, - score: cr.score, - feedback: cr.feedback, - evidence: vec![], - }) - .collect(); - - // Determine pass/fail based on threshold and results - let passed = parsed.passed && parsed.overall_score >= self.pass_threshold; - - Ok(ContractEvaluationResult { - passed, - overall_score: parsed.overall_score, - criteria_results, - summary_feedback: parsed.summary_feedback, - rework_instructions: if passed { None } else { parsed.rework_instructions }, - }) - } - - /// Save evaluation result to database - pub async fn save_evaluation( - &self, - contract_id: Uuid, - chain_id: Uuid, - chain_contract_id: Uuid, - result: &ContractEvaluationResult, - ) -> Result<ContractEvaluation, EvaluationError> { - let req = CreateContractEvaluationRequest { - contract_id, - chain_id: Some(chain_id), - chain_contract_id: Some(chain_contract_id), - evaluator_model: Some(format!("{:?}", self.model)), - passed: result.passed, - overall_score: Some(result.overall_score), - criteria_results: result.criteria_results.clone(), - summary_feedback: result.summary_feedback.clone(), - rework_instructions: result.rework_instructions.clone(), - }; - - let evaluation = repository::create_contract_evaluation(&self.pool, req).await?; - - // Update chain contract status - let status = if result.passed { "passed" } else { "failed" }; - repository::update_chain_contract_evaluation_status( - &self.pool, - chain_contract_id, - status, - Some(evaluation.id), - result.rework_instructions.as_deref(), - ) - .await?; - - Ok(evaluation) - } -} - -/// JSON structure for parsing LLM response -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -struct EvaluationResponseJson { - passed: bool, - overall_score: f64, - criteria_results: Vec<CriterionResultJson>, - summary_feedback: String, - rework_instructions: Option<String>, -} - -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -struct CriterionResultJson { - criterion_id: String, - #[serde(alias = "met")] - passed: bool, - #[serde(default)] - score: f64, - feedback: String, -} - -/// Extract JSON from a response that may contain markdown code blocks -fn extract_json_from_response(response: &str) -> Result<String, EvaluationError> { - // Try to find JSON in code blocks first - if let Some(start) = response.find("```json") { - let json_start = start + 7; - if let Some(end) = response[json_start..].find("```") { - return Ok(response[json_start..json_start + end].trim().to_string()); - } - } - - // Try plain code blocks - if let Some(start) = response.find("```") { - let json_start = start + 3; - // Skip any language identifier on the same line - let actual_start = response[json_start..] - .find('\n') - .map(|i| json_start + i + 1) - .unwrap_or(json_start); - if let Some(end) = response[actual_start..].find("```") { - return Ok(response[actual_start..actual_start + end].trim().to_string()); - } - } - - // Try to find raw JSON (starts with {) - if let Some(start) = response.find('{') { - // Find matching closing brace - let mut depth = 0; - let mut end = start; - for (i, c) in response[start..].char_indices() { - match c { - '{' => depth += 1, - '}' => { - depth -= 1; - if depth == 0 { - end = start + i + 1; - break; - } - } - _ => {} - } - } - if end > start { - return Ok(response[start..end].to_string()); - } - } - - Err(EvaluationError::ParseError( - "Could not find JSON in response".to_string(), - )) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_extract_json_from_code_block() { - let response = r#"Here is the evaluation: - -```json -{ - "passed": true, - "overallScore": 0.85 } -``` -Done."#; +/// Error types for contract evaluation. +#[derive(Debug, thiserror::Error)] +pub enum ContractEvaluatorError { + #[error("Database error: {0}")] + Database(#[from] sqlx::Error), - let json = extract_json_from_response(response).unwrap(); - assert!(json.contains("\"passed\": true")); - } + #[error("LLM error: {0}")] + Llm(String), - #[test] - fn test_extract_json_raw() { - let response = r#"The result is {"passed": false, "overallScore": 0.5}"#; - let json = extract_json_from_response(response).unwrap(); - assert!(json.contains("\"passed\": false")); - } + #[error("Not implemented: {0}")] + NotImplemented(String), } diff --git a/makima/src/llm/mod.rs b/makima/src/llm/mod.rs index 702e1fd..6c9965c 100644 --- a/makima/src/llm/mod.rs +++ b/makima/src/llm/mod.rs @@ -46,7 +46,7 @@ pub use transcript_analyzer::{ calculate_speaker_stats, build_analysis_prompt, parse_analysis_response, }; pub use contract_evaluator::{ - ContractEvaluator, ContractEvaluationResult, EvaluationContext, EvaluationError, + ContractEvaluator, ContractEvaluationResult, ContractEvaluatorError, }; /// Available LLM providers and models diff --git a/makima/src/llm/task_output.rs b/makima/src/llm/task_output.rs index c5d709e..c7f6990 100644 --- a/makima/src/llm/task_output.rs +++ b/makima/src/llm/task_output.rs @@ -126,7 +126,7 @@ pub fn parse_tasks_from_breakdown(content: &str) -> TaskParseResult { let heading_pattern = Regex::new(r"^##\s+(?:Phase\s*\d*:?\s*)?(.+)$").unwrap(); // Patterns for dependencies (inline) - let depends_pattern = Regex::new(r"(?i)(?:depends on|after|requires):?\s*(.+)").unwrap(); + let depends_pattern = Regex::new(r"(?i)\(?\s*(?:depends on|after|requires):?\s*([^)]+)\)?").unwrap(); for line in content.lines() { let trimmed = line.trim(); @@ -226,7 +226,7 @@ pub fn parse_tasks_from_breakdown(content: &str) -> TaskParseResult { } } -/// Check if text looks like a task (has action verbs) +/// Check if text looks like a task (has action verbs at word boundaries) fn looks_like_task(text: &str) -> bool { let lower = text.to_lowercase(); let action_verbs = [ @@ -237,7 +237,27 @@ fn looks_like_task(text: &str) -> bool { "disable", "install", "initialize", "define", "extend", "extract", ]; - action_verbs.iter().any(|verb| lower.starts_with(verb) || lower.contains(&format!(" {}", verb))) + // Check if text starts with an action verb (followed by space or end) + for verb in &action_verbs { + if lower.starts_with(verb) { + // Check for word boundary after verb + let after = &lower[verb.len()..]; + if after.is_empty() || after.starts_with(' ') || after.starts_with('_') { + return true; + } + } + // Check if verb appears after space with word boundary + let pattern = format!(" {} ", verb); + let pattern_end = format!(" {}", verb); + if lower.contains(&pattern) { + return true; + } + // Check if verb is at the end of string after a space + if lower.ends_with(&pattern_end) && lower.len() > pattern_end.len() { + return true; + } + } + false } /// Analyze a completed task's output to suggest next actions |
