path: root/makima/src/llm/contract_evaluator.rs



//! Contract Evaluator - LLM-based evaluation of completed contracts against directive.
//!
//! This module provides functionality for:
//! - Gathering deliverables, files, and task outputs from completed contracts
//! - Building evaluation prompts using directive and acceptance criteria
//! - Calling LLM to evaluate work against requirements
//! - Parsing evaluation responses

use serde::{Deserialize, Serialize};
use sqlx::PgPool;
use uuid::Uuid;

use crate::db::{
    models::{
        ChainContract, ChainDirective, Contract, ContractEvaluation, CreateContractEvaluationRequest,
        DirectiveAcceptanceCriterion, DirectiveRequirement, EvaluationCriterionResult,
    },
    repository,
};

use super::claude::{ClaudeClient, ClaudeModel, Message, MessageContent};
use super::tools::Tool;

/// Result of evaluating a completed contract.
///
/// Serialized and deserialized with camelCase field names.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ContractEvaluationResult {
    /// Whether the contract passed evaluation. Set only when the LLM reported a
    /// pass and the overall score reached the evaluator's pass threshold.
    pub passed: bool,
    /// Overall score, nominally in the range 0.0 to 1.0
    pub overall_score: f64,
    /// Results for each acceptance criterion reported by the LLM
    pub criteria_results: Vec<EvaluationCriterionResult>,
    /// Summary feedback from the evaluator
    pub summary_feedback: String,
    /// Instructions for rework if failed; always `None` when `passed` is true
    pub rework_instructions: Option<String>,
}

/// Context gathered for evaluation.
///
/// Everything in here is rendered into the evaluation prompt or made available
/// while parsing the LLM response.
#[derive(Debug, Clone)]
pub struct EvaluationContext {
    /// The contract being evaluated
    pub contract: Contract,
    /// The chain contract record linking the contract to its chain
    pub chain_contract: ChainContract,
    /// The directive document of the chain the contract belongs to
    pub directive: ChainDirective,
    /// Files associated with the contract
    pub files: Vec<FileContent>,
    /// Task outputs from the contract
    pub task_outputs: Vec<TaskOutput>,
    /// Deliverables listed in the prompt (currently one entry per contract file)
    pub deliverables: Vec<DeliverableInfo>,
    /// Acceptance criteria used for this evaluation (currently all criteria of
    /// the directive, not a per-contract subset)
    pub acceptance_criteria: Vec<DirectiveAcceptanceCriterion>,
    /// Requirements parsed from the directive
    pub requirements: Vec<DirectiveRequirement>,
}

/// File content for evaluation, as rendered into the "Files Created/Modified"
/// section of the prompt.
#[derive(Debug, Clone, Serialize)]
pub struct FileContent {
    /// Repository path of the file; falls back to the file name when no
    /// repository path is recorded.
    pub path: String,
    /// Optional human-readable description, rendered in italics under the path
    pub description: Option<String>,
    /// Text placed inside the fenced block for this file. Context gathering
    /// currently fills this with a placeholder because file summaries carry no
    /// content.
    pub content: String,
    /// When true, the file heading is tagged `[DELIVERABLE]` in the prompt
    pub is_deliverable: bool,
}

/// Task output for evaluation, as rendered into the "Task Outputs" section of
/// the prompt.
#[derive(Debug, Clone, Serialize)]
pub struct TaskOutput {
    /// Name of the task; used as the section heading
    pub task_name: String,
    /// The task's progress summary, or `Status: <status>` when none is recorded
    pub output_summary: String,
    /// Exit code of the task, if known (context gathering currently always sets
    /// `None`; the prompt does not include it)
    pub exit_code: Option<i32>,
}

/// Deliverable info for evaluation, as rendered into the "Deliverables" section
/// of the prompt.
#[derive(Debug, Clone, Serialize)]
pub struct DeliverableInfo {
    /// Display name of the deliverable (the file name during context gathering)
    pub name: String,
    /// Status label shown in parentheses after the name (context gathering
    /// currently always uses "complete")
    pub status: String,
    /// Repository path of the backing file, if one is recorded
    pub file_path: Option<String>,
}

/// Error types for evaluation
#[derive(Debug, thiserror::Error)]
pub enum EvaluationError {
    /// A database query failed; converted automatically from `sqlx::Error`.
    #[error("Database error: {0}")]
    Database(#[from] sqlx::Error),

    /// No contract with the given ID was found for the requesting owner.
    #[error("Contract not found: {0}")]
    ContractNotFound(Uuid),

    /// The contract (identified by its contract ID) has no chain contract record.
    #[error("Chain contract not found for contract: {0}")]
    ChainContractNotFound(Uuid),

    /// The chain (identified by its chain ID) has no directive.
    #[error("Directive not found for chain: {0}")]
    DirectiveNotFound(Uuid),

    /// The call to the LLM client failed; carries the client's error message.
    #[error("LLM evaluation failed: {0}")]
    LlmError(String),

    /// The LLM response contained no JSON or JSON that did not match the
    /// expected evaluation schema.
    #[error("Failed to parse evaluation response: {0}")]
    ParseError(String),
}

/// Contract evaluator for directive-driven evaluation.
///
/// Construct with [`ContractEvaluator::new`] and adjust with the `with_*`
/// builder methods.
pub struct ContractEvaluator {
    /// Database pool used to load evaluation context and store results
    pool: PgPool,
    /// Client used to send the evaluation prompt to the LLM
    claude_client: ClaudeClient,
    /// Model recorded as the evaluator model when an evaluation is saved
    model: ClaudeModel,
    /// Minimum score required to pass (default 0.8)
    pass_threshold: f64,
}

impl ContractEvaluator {
    /// Build an evaluator with the default configuration: the Sonnet model and
    /// a pass threshold of 0.8.
    pub fn new(pool: PgPool, claude_client: ClaudeClient) -> Self {
        const DEFAULT_PASS_THRESHOLD: f64 = 0.8;

        Self {
            model: ClaudeModel::Sonnet,
            pass_threshold: DEFAULT_PASS_THRESHOLD,
            pool,
            claude_client,
        }
    }

    /// Set the LLM model to use for evaluation
    pub fn with_model(mut self, model: ClaudeModel) -> Self {
        self.model = model;
        self
    }

    /// Set the pass threshold
    pub fn with_pass_threshold(mut self, threshold: f64) -> Self {
        self.pass_threshold = threshold;
        self
    }

    /// Run a full evaluation of a completed contract against its directive.
    ///
    /// The pipeline is: gather context from the database, render the prompt,
    /// send it to the LLM, and parse the reply. Nothing is persisted here; use
    /// `save_evaluation` to store the outcome.
    ///
    /// # Errors
    ///
    /// Returns the first [`EvaluationError`] raised by any stage.
    pub async fn evaluate_contract(
        &self,
        contract_id: Uuid,
        owner_id: Uuid,
    ) -> Result<ContractEvaluationResult, EvaluationError> {
        let context = self.gather_context(contract_id, owner_id).await?;
        let prompt = self.build_evaluation_prompt(&context);
        let raw_reply = self.call_llm_for_evaluation(&prompt).await?;

        self.parse_evaluation_response(&raw_reply, &context)
    }

    /// Gather all context needed for evaluation.
    ///
    /// Loads the contract (scoped to `owner_id`), its chain contract record and
    /// the chain's directive, then lists the contract's files and tasks and
    /// converts them into the prompt-facing [`FileContent`], [`TaskOutput`] and
    /// [`DeliverableInfo`] values.
    ///
    /// # Errors
    ///
    /// - [`EvaluationError::ContractNotFound`], [`EvaluationError::ChainContractNotFound`]
    ///   or [`EvaluationError::DirectiveNotFound`] when the respective record is missing.
    /// - [`EvaluationError::Database`] when any query fails, including the file
    ///   and task listings. Those are propagated rather than replaced by empty
    ///   lists, because evaluating against missing evidence would produce a
    ///   misleading verdict.
    async fn gather_context(
        &self,
        contract_id: Uuid,
        owner_id: Uuid,
    ) -> Result<EvaluationContext, EvaluationError> {
        // Get contract
        let contract = repository::get_contract_for_owner(&self.pool, contract_id, owner_id)
            .await?
            .ok_or(EvaluationError::ContractNotFound(contract_id))?;

        // Get chain contract
        let chain_contract = repository::get_chain_contract_by_contract_id(&self.pool, contract_id)
            .await?
            .ok_or(EvaluationError::ChainContractNotFound(contract_id))?;

        // Get directive
        let directive = repository::get_chain_directive(&self.pool, chain_contract.chain_id)
            .await?
            .ok_or(EvaluationError::DirectiveNotFound(chain_contract.chain_id))?;

        // Get files and tasks directly from the repository. Failures surface as
        // errors instead of silently becoming "no files" / "no tasks".
        // NOTE(review): assumes these listings return `sqlx::Error` like the
        // other repository calls above - confirm.
        let contract_files =
            repository::list_files_in_contract(&self.pool, contract_id, owner_id).await?;
        let contract_tasks =
            repository::list_tasks_in_contract(&self.pool, contract_id, owner_id).await?;

        // Build file contents from FileSummary
        // Note: FileSummary doesn't have content, so we use name and description
        let files: Vec<FileContent> = contract_files
            .iter()
            .map(|f| FileContent {
                path: f.repo_file_path.clone().unwrap_or_else(|| f.name.clone()),
                description: f.description.clone(),
                content: format!("[File: {} - content not loaded in summary view]", f.name),
                is_deliverable: false, // FileSummary doesn't track deliverable status
            })
            .collect();

        // Build task outputs from TaskSummary
        let task_outputs: Vec<TaskOutput> = contract_tasks
            .iter()
            .map(|t| TaskOutput {
                task_name: t.name.clone(),
                output_summary: t
                    .progress_summary
                    .clone()
                    .unwrap_or_else(|| format!("Status: {}", t.status)),
                exit_code: None,
            })
            .collect();

        // FileSummary doesn't carry deliverable info, so every file is listed as
        // a (completed) deliverable.
        let deliverables: Vec<DeliverableInfo> = contract_files
            .iter()
            .map(|f| DeliverableInfo {
                name: f.name.clone(),
                status: "complete".to_string(),
                file_path: f.repo_file_path.clone(),
            })
            .collect();

        // Parse requirements and acceptance criteria from directive.
        // A value that does not deserialize into the expected list is treated as
        // an empty list.
        let requirements: Vec<DirectiveRequirement> =
            serde_json::from_value(directive.requirements.clone()).unwrap_or_default();

        let all_criteria: Vec<DirectiveAcceptanceCriterion> =
            serde_json::from_value(directive.acceptance_criteria.clone()).unwrap_or_default();

        // Get contract definition to find mapped requirements
        // For now, use all acceptance criteria
        let acceptance_criteria = all_criteria;

        Ok(EvaluationContext {
            contract,
            chain_contract,
            directive,
            files,
            task_outputs,
            deliverables,
            acceptance_criteria,
            requirements,
        })
    }

    /// Build the evaluation prompt.
    ///
    /// Renders the gathered context as Markdown sections (contract info,
    /// requirements, acceptance criteria, deliverables, files, task outputs)
    /// followed by instructions asking the LLM to answer with a JSON object.
    /// Empty sections are omitted. Acceptance criteria are numbered from 1 in
    /// the order they appear in `context.acceptance_criteria`.
    ///
    /// File content longer than 5000 bytes is truncated. The cut is moved back
    /// to the nearest UTF-8 character boundary so that multi-byte text can
    /// never cause a slicing panic.
    fn build_evaluation_prompt(&self, context: &EvaluationContext) -> String {
        // Upper bound, in bytes, on how much of each file's content is embedded.
        const MAX_FILE_CONTENT_BYTES: usize = 5000;

        let mut prompt = String::new();

        prompt.push_str("# Contract Completion Evaluation\n\n");
        prompt.push_str("You are evaluating whether a contract has been completed successfully against its requirements.\n\n");

        // Contract info
        prompt.push_str("## Contract Information\n\n");
        prompt.push_str(&format!("**Name:** {}\n", context.contract.name));
        if let Some(ref desc) = context.contract.description {
            prompt.push_str(&format!("**Description:** {}\n", desc));
        }
        prompt.push_str(&format!("**Type:** {}\n", context.contract.contract_type));
        prompt.push_str(&format!("**Phase:** {}\n", context.contract.phase));
        prompt.push('\n');

        // Requirements
        if !context.requirements.is_empty() {
            prompt.push_str("## Requirements\n\n");
            for req in &context.requirements {
                prompt.push_str(&format!("- **{}** ({}): {}\n", req.id, req.priority, req.title));
                if !req.description.is_empty() {
                    prompt.push_str(&format!("  {}\n", req.description));
                }
            }
            prompt.push('\n');
        }

        // Acceptance criteria
        if !context.acceptance_criteria.is_empty() {
            prompt.push_str("## Acceptance Criteria\n\n");
            for (i, criterion) in context.acceptance_criteria.iter().enumerate() {
                prompt.push_str(&format!("{}. **{}**\n", i + 1, criterion.description));
                prompt.push_str(&format!("   - Testable: {}\n", criterion.testable));
                if !criterion.requirement_ids.is_empty() {
                    prompt.push_str(&format!("   - Covers: {}\n", criterion.requirement_ids.join(", ")));
                }
            }
            prompt.push('\n');
        }

        // Deliverables
        if !context.deliverables.is_empty() {
            prompt.push_str("## Deliverables\n\n");
            for d in &context.deliverables {
                prompt.push_str(&format!("- {} ({})\n", d.name, d.status));
            }
            prompt.push('\n');
        }

        // Files
        if !context.files.is_empty() {
            prompt.push_str("## Files Created/Modified\n\n");
            for file in &context.files {
                prompt.push_str(&format!("### {}", file.path));
                if file.is_deliverable {
                    prompt.push_str(" [DELIVERABLE]");
                }
                prompt.push('\n');
                if let Some(ref desc) = file.description {
                    prompt.push_str(&format!("*{}*\n", desc));
                }
                prompt.push_str("```\n");
                if file.content.len() > MAX_FILE_CONTENT_BYTES {
                    // Slicing a `str` in the middle of a multi-byte character
                    // panics, so back off to the closest boundary at or below
                    // the limit. Index 0 is always a boundary, so this ends.
                    let mut cut = MAX_FILE_CONTENT_BYTES;
                    while !file.content.is_char_boundary(cut) {
                        cut -= 1;
                    }
                    prompt.push_str(&file.content[..cut]);
                    // The reported total is the content's byte length.
                    prompt.push_str(&format!(
                        "...\n[Content truncated - {} chars total]",
                        file.content.len()
                    ));
                } else {
                    prompt.push_str(&file.content);
                }
                prompt.push_str("\n```\n\n");
            }
        }

        // Task outputs
        if !context.task_outputs.is_empty() {
            prompt.push_str("## Task Outputs\n\n");
            for task in &context.task_outputs {
                prompt.push_str(&format!("### {}\n", task.task_name));
                prompt.push_str(&format!("{}\n\n", task.output_summary));
            }
        }

        // Evaluation instructions
        prompt.push_str("## Evaluation Instructions\n\n");
        prompt.push_str("Please evaluate the completed work against the requirements and acceptance criteria.\n\n");
        prompt.push_str("For each acceptance criterion, determine if it has been met and provide a brief explanation.\n\n");
        prompt.push_str("Respond with a JSON object in the following format:\n\n");
        prompt.push_str("```json\n");
        prompt.push_str(r#"{
  "passed": true/false,
  "overallScore": 0.0-1.0,
  "criteriaResults": [
    {
      "criterionId": "criterion identifier or index",
      "met": true/false,
      "score": 0.0-1.0,
      "feedback": "explanation of why criterion was/wasn't met"
    }
  ],
  "summaryFeedback": "overall summary of the evaluation",
  "reworkInstructions": "if failed, specific instructions for what needs to be fixed (null if passed)"
}
"#);
        prompt.push_str("```\n\n");
        prompt.push_str(&format!("The pass threshold is {}. ", self.pass_threshold));
        prompt.push_str("A contract passes if the overall score is >= the threshold AND all critical criteria are met.\n");

        prompt
    }

    /// Send the evaluation prompt to the LLM as a single user message and
    /// return the text of its reply.
    ///
    /// The request goes through `chat_with_tools` with an empty tool list,
    /// which serves as a plain chat call. A reply without text content yields
    /// an empty string.
    ///
    /// NOTE(review): `self.model` is not passed to the client in this call;
    /// confirm the client is configured with the intended model elsewhere.
    ///
    /// # Errors
    ///
    /// Returns [`EvaluationError::LlmError`] carrying the client's error
    /// message when the request fails.
    async fn call_llm_for_evaluation(&self, prompt: &str) -> Result<String, EvaluationError> {
        let no_tools: Vec<Tool> = Vec::new();
        let conversation = vec![Message {
            role: "user".to_string(),
            content: MessageContent::Text(prompt.to_string()),
        }];

        match self
            .claude_client
            .chat_with_tools(conversation, &no_tools)
            .await
        {
            // The reply's content is optional; treat "no text" as empty.
            Ok(reply) => Ok(reply.content.unwrap_or_default()),
            Err(failure) => Err(EvaluationError::LlmError(failure.to_string())),
        }
    }

    /// Parse the LLM response into an evaluation result.
    ///
    /// The JSON payload is extracted from `response` (it may be wrapped in
    /// Markdown code fences) and deserialized. Because the payload comes from
    /// an LLM, values are normalized rather than trusted:
    ///
    /// - The overall score and each criterion score are clamped to `0.0..=1.0`,
    ///   the range the result type documents.
    /// - `passed` is true only when the LLM reported a pass *and* the clamped
    ///   overall score reaches the configured pass threshold.
    /// - Rework instructions are dropped when the contract passed.
    /// - When a criterion ID is the 1-based number the criterion had in the
    ///   prompt, `criterion_text` is filled with that criterion's description
    ///   from `context`; otherwise the ID itself is used as the text.
    ///
    /// # Errors
    ///
    /// Returns [`EvaluationError::ParseError`] when no JSON can be found or it
    /// does not match the expected schema.
    fn parse_evaluation_response(
        &self,
        response: &str,
        context: &EvaluationContext,
    ) -> Result<ContractEvaluationResult, EvaluationError> {
        // Extract JSON from response (may be wrapped in markdown code blocks)
        let json_str = extract_json_from_response(response)?;

        // Parse the JSON
        let parsed: EvaluationResponseJson = serde_json::from_str(&json_str)
            .map_err(|e| EvaluationError::ParseError(format!("JSON parse error: {}", e)))?;

        // Keep scores inside the documented range regardless of what the LLM sent.
        let overall_score = parsed.overall_score.clamp(0.0, 1.0);

        // Convert to our result type
        let criteria_results: Vec<EvaluationCriterionResult> = parsed
            .criteria_results
            .into_iter()
            .map(|cr| {
                // The prompt numbers criteria from 1, so a numeric ID is read as
                // that position. Anything else (or an out-of-range number)
                // falls back to using the ID as the text.
                let criterion_text = cr
                    .criterion_id
                    .trim()
                    .parse::<usize>()
                    .ok()
                    .and_then(|number| number.checked_sub(1))
                    .and_then(|index| context.acceptance_criteria.get(index))
                    .map(|criterion| criterion.description.to_string())
                    .unwrap_or_else(|| cr.criterion_id.clone());

                EvaluationCriterionResult {
                    criterion_id: cr.criterion_id,
                    criterion_text,
                    passed: cr.passed,
                    score: cr.score.clamp(0.0, 1.0),
                    feedback: cr.feedback,
                    evidence: vec![],
                }
            })
            .collect();

        // Determine pass/fail based on threshold and results
        let passed = parsed.passed && overall_score >= self.pass_threshold;

        Ok(ContractEvaluationResult {
            passed,
            overall_score,
            criteria_results,
            summary_feedback: parsed.summary_feedback,
            rework_instructions: if passed { None } else { parsed.rework_instructions },
        })
    }

    /// Persist an evaluation result and record its outcome on the chain contract.
    ///
    /// First an evaluation record is created from `result` (the evaluator
    /// model is stored as the `Debug` rendering of the configured model). Then
    /// the chain contract's evaluation status is set to `"passed"` or
    /// `"failed"`, together with the new evaluation's ID and any rework
    /// instructions.
    ///
    /// NOTE(review): the two writes are separate repository calls on the pool;
    /// if the second one fails, the evaluation record created by the first
    /// remains.
    ///
    /// # Errors
    ///
    /// Returns [`EvaluationError::Database`] when either write fails.
    pub async fn save_evaluation(
        &self,
        contract_id: Uuid,
        chain_id: Uuid,
        chain_contract_id: Uuid,
        result: &ContractEvaluationResult,
    ) -> Result<ContractEvaluation, EvaluationError> {
        let evaluator_model = format!("{:?}", self.model);

        let stored = repository::create_contract_evaluation(
            &self.pool,
            CreateContractEvaluationRequest {
                contract_id,
                chain_id: Some(chain_id),
                chain_contract_id: Some(chain_contract_id),
                evaluator_model: Some(evaluator_model),
                passed: result.passed,
                overall_score: Some(result.overall_score),
                criteria_results: result.criteria_results.clone(),
                summary_feedback: result.summary_feedback.clone(),
                rework_instructions: result.rework_instructions.clone(),
            },
        )
        .await?;

        // Mirror the verdict onto the chain contract.
        let outcome = if result.passed { "passed" } else { "failed" };
        let rework = result.rework_instructions.as_deref();
        repository::update_chain_contract_evaluation_status(
            &self.pool,
            chain_contract_id,
            outcome,
            Some(stored.id),
            rework,
        )
        .await?;

        Ok(stored)
    }
}

/// JSON structure for parsing LLM response.
///
/// Mirrors the object the evaluation prompt asks for; keys are camelCase
/// (`overallScore`, `criteriaResults`, `summaryFeedback`, `reworkInstructions`).
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct EvaluationResponseJson {
    /// The LLM's own pass/fail verdict
    passed: bool,
    /// Overall score reported by the LLM
    overall_score: f64,
    /// One entry per criterion the LLM reported on
    criteria_results: Vec<CriterionResultJson>,
    /// Overall summary of the evaluation
    summary_feedback: String,
    /// Rework instructions; absent or `null` deserializes to `None`
    rework_instructions: Option<String>,
}

/// One per-criterion entry of the LLM response (keys are camelCase).
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct CriterionResultJson {
    /// Identifier or index of the criterion, as chosen by the LLM
    criterion_id: String,
    /// Whether the criterion was met; accepted under the key `passed` or `met`
    /// (the prompt's example uses `met`)
    #[serde(alias = "met")]
    passed: bool,
    /// Score for this criterion; 0.0 when the key is omitted
    #[serde(default)]
    score: f64,
    /// Explanation of why the criterion was or was not met
    feedback: String,
}

/// Extract JSON from a response that may contain markdown code blocks.
///
/// Strategies are tried in order and the first match wins:
///
/// 1. The trimmed content of the first fenced block opened with "```json".
/// 2. The trimmed content of the first fenced block of any kind (the rest of
///    the opening fence line is skipped as a language tag).
/// 3. The first balanced `{ ... }` object found in the raw text. Braces inside
///    JSON string literals, including strings with escaped quotes, are ignored
///    while matching, so text such as `"missing } in file"` does not cut the
///    object short.
///
/// The returned text is not validated as JSON; that is left to the caller.
///
/// # Errors
///
/// Returns [`EvaluationError::ParseError`] when none of the strategies finds a
/// candidate.
fn extract_json_from_response(response: &str) -> Result<String, EvaluationError> {
    const FENCE: &str = "```";
    const JSON_FENCE: &str = "```json";

    // Try to find JSON in code blocks first
    if let Some(start) = response.find(JSON_FENCE) {
        let json_start = start + JSON_FENCE.len();
        if let Some(end) = response[json_start..].find(FENCE) {
            return Ok(response[json_start..json_start + end].trim().to_string());
        }
    }

    // Try plain code blocks
    if let Some(start) = response.find(FENCE) {
        let json_start = start + FENCE.len();
        // Skip any language identifier on the same line
        let actual_start = response[json_start..]
            .find('\n')
            .map(|i| json_start + i + 1)
            .unwrap_or(json_start);
        if let Some(end) = response[actual_start..].find(FENCE) {
            return Ok(response[actual_start..actual_start + end].trim().to_string());
        }
    }

    // Try to find raw JSON (starts with {)
    if let Some(object) = find_balanced_json_object(response) {
        return Ok(object.to_string());
    }

    Err(EvaluationError::ParseError(
        "Could not find JSON in response".to_string(),
    ))
}

/// Return the first `{ ... }` span in `text` whose braces balance, treating
/// braces inside double-quoted string literals as ordinary characters.
fn find_balanced_json_object(text: &str) -> Option<&str> {
    let start = text.find('{')?;

    let mut depth = 0usize;
    let mut in_string = false;
    let mut escaped = false;

    for (offset, ch) in text[start..].char_indices() {
        if in_string {
            // Inside a string only an unescaped quote is significant.
            if escaped {
                escaped = false;
            } else if ch == '\\' {
                escaped = true;
            } else if ch == '"' {
                in_string = false;
            }
            continue;
        }

        match ch {
            '"' => in_string = true,
            '{' => depth += 1,
            '}' => {
                // The scan starts on '{', so depth is at least 1 here.
                depth -= 1;
                if depth == 0 {
                    return Some(&text[start..start + offset + ch.len_utf8()]);
                }
            }
            _ => {}
        }
    }

    // Ran out of input before the opening brace was closed.
    None
}

#[cfg(test)]
mod tests {
    use super::*;

    // A fenced json block embedded in surrounding prose is extracted.
    #[test]
    fn test_extract_json_from_code_block() {
        let llm_reply = r#"Here is the evaluation:

```json
{
  "passed": true,
  "overallScore": 0.85
}
```

Done."#;

        let extracted = extract_json_from_response(llm_reply).unwrap();
        let expected_fragment = "\"passed\": true";
        assert!(extracted.contains(expected_fragment));
    }

    // An unfenced JSON object following prose is extracted.
    #[test]
    fn test_extract_json_raw() {
        let llm_reply = r#"The result is {"passed": false, "overallScore": 0.5}"#;

        let extracted = extract_json_from_response(llm_reply).unwrap();
        let expected_fragment = "\"passed\": false";
        assert!(extracted.contains(expected_fragment));
    }
}