summary refs log tree commit diff
path: root/makima/src/llm/contract_evaluator.rs
diff options
context:
space:
mode:
author soryu <soryu@soryu.co> 2026-02-05 23:42:48 +0000
committer soryu <soryu@soryu.co> 2026-02-05 23:42:48 +0000
commit 88a4f15ce1310f8ee8693835be14aa5280233f17 (patch)
tree 5c1a0417e02071d2198d13478ffa85533b19f891 /makima/src/llm/contract_evaluator.rs
parent f1a50b80f3969d150bd1c31edde0aff05369157e (diff)
download soryu-88a4f15ce1310f8ee8693835be14aa5280233f17.tar.gz
soryu-88a4f15ce1310f8ee8693835be14aa5280233f17.zip
Add directive-first chain system redesign
Redesigns the chain system with a directive-first architecture where Directive is the top-level entity (the "why/what") and Chains are generated execution plans (the "how") that can be dynamically modified.

Backend:
- Add database migration for directive system tables
- Add Directive, DirectiveChain, ChainStep, DirectiveEvent models
- Add DirectiveVerifier and DirectiveApproval models
- Add orchestration module with engine, planner, and verifier
- Add comprehensive API handlers for directives
- Add daemon CLI commands for directive management
- Add directive skill documentation
- Integrate contract completion with directive engine
- Add SSE endpoint for real-time directive events

Frontend:
- Add directives route with split-view layout
- Add 6-tab detail view (Overview, Chain, Events, Evaluations, Approvals, Verifiers)
- Add React Flow DAG visualization for chain steps
- Add SSE subscription hook for real-time event updates
- Add useDirectives and useDirectiveEventSubscription hooks
- Add directive types and API functions

Fixes:
- Fix test failures in ws/protocol, task_output, completion_gate, patch
- Fix word boundary matching in looks_like_task()
- Fix parse_last() to find actual last completion gate
- Fix create_export_patch when merge-base equals HEAD
- Clean up clippy warnings in new code

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Diffstat (limited to 'makima/src/llm/contract_evaluator.rs')
-rw-r--r-- makima/src/llm/contract_evaluator.rs 564
1 files changed, 53 insertions, 511 deletions
diff --git a/makima/src/llm/contract_evaluator.rs b/makima/src/llm/contract_evaluator.rs
index fcc4826..e63bbfa 100644
--- a/makima/src/llm/contract_evaluator.rs
+++ b/makima/src/llm/contract_evaluator.rs
@@ -1,25 +1,19 @@
//! Contract Evaluator - LLM-based evaluation of completed contracts against directive.
//!
-//! This module provides functionality for:
-//! - Gathering deliverables, files, and task outputs from completed contracts
-//! - Building evaluation prompts using directive and acceptance criteria
-//! - Calling LLM to evaluate work against requirements
-//! - Parsing evaluation responses
+//! This module will be reimplemented as part of the directive verification engine.
+//! See the orchestration module for the new evaluation system.
+//!
+//! The new evaluation system will provide:
+//! - Tiered verification (programmatic verifiers first, then LLM evaluation)
+//! - Composite confidence scoring (weighted combination of results)
+//! - Pluggable verifier interface (test runner, linter, build, type checker)
+//! - Proper integration with the directive chain steps
use serde::{Deserialize, Serialize};
use sqlx::PgPool;
use uuid::Uuid;
-use crate::db::{
- models::{
- ChainContract, ChainDirective, Contract, ContractEvaluation, CreateContractEvaluationRequest,
- DirectiveAcceptanceCriterion, DirectiveRequirement, EvaluationCriterionResult,
- },
- repository,
-};
-
-use super::claude::{ClaudeClient, ClaudeModel, Message, MessageContent};
-use super::tools::Tool;
+// use crate::db::models::{Contract, DirectiveAcceptanceCriterion, DirectiveRequirement};
/// Result of contract evaluation
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -30,526 +24,74 @@ pub struct ContractEvaluationResult {
/// Overall score from 0.0 to 1.0
pub overall_score: f64,
/// Results for each acceptance criterion
- pub criteria_results: Vec<EvaluationCriterionResult>,
+ pub criteria_results: Vec<EvaluationCriterionResultLegacy>,
/// Summary feedback from the evaluator
pub summary_feedback: String,
/// Instructions for rework if failed
pub rework_instructions: Option<String>,
}
-/// Context gathered for evaluation
-#[derive(Debug, Clone)]
-pub struct EvaluationContext {
- /// The contract being evaluated
- pub contract: Contract,
- /// The chain contract record
- pub chain_contract: ChainContract,
- /// The directive document
- pub directive: ChainDirective,
- /// Files associated with the contract
- pub files: Vec<FileContent>,
- /// Task outputs from the contract
- pub task_outputs: Vec<TaskOutput>,
- /// Deliverables marked as complete
- pub deliverables: Vec<DeliverableInfo>,
- /// Acceptance criteria specific to this contract
- pub acceptance_criteria: Vec<DirectiveAcceptanceCriterion>,
- /// Requirements mapped to this contract
- pub requirements: Vec<DirectiveRequirement>,
+/// Per-criterion evaluation result (legacy - kept for compatibility)
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct EvaluationCriterionResultLegacy {
+ pub criterion_id: String,
+ pub criterion_text: String,
+ pub passed: bool,
+ /// Score (0.0-1.0)
+ pub score: f64,
+ pub feedback: String,
+ /// Evidence supporting the evaluation
+ pub evidence: Vec<String>,
}
-/// File content for evaluation
-#[derive(Debug, Clone, Serialize)]
+/// File content for evaluation context
+#[derive(Debug, Clone)]
pub struct FileContent {
pub path: String,
- pub description: Option<String>,
pub content: String,
- pub is_deliverable: bool,
}
-/// Task output for evaluation
-#[derive(Debug, Clone, Serialize)]
-pub struct TaskOutput {
- pub task_name: String,
- pub output_summary: String,
- pub exit_code: Option<i32>,
-}
-
-/// Deliverable info for evaluation
-#[derive(Debug, Clone, Serialize)]
-pub struct DeliverableInfo {
- pub name: String,
- pub status: String,
- pub file_path: Option<String>,
-}
-
-/// Error types for evaluation
-#[derive(Debug, thiserror::Error)]
-pub enum EvaluationError {
- #[error("Database error: {0}")]
- Database(#[from] sqlx::Error),
-
- #[error("Contract not found: {0}")]
- ContractNotFound(Uuid),
-
- #[error("Chain contract not found for contract: {0}")]
- ChainContractNotFound(Uuid),
-
- #[error("Directive not found for chain: {0}")]
- DirectiveNotFound(Uuid),
-
- #[error("LLM evaluation failed: {0}")]
- LlmError(String),
-
- #[error("Failed to parse evaluation response: {0}")]
- ParseError(String),
-}
-
-/// Contract evaluator for directive-driven evaluation
+/// Contract evaluator for LLM-based assessment.
+///
+/// NOTE: This is a stub implementation. The full evaluation system will be
+/// implemented as part of the orchestration/verifier module.
pub struct ContractEvaluator {
- pool: PgPool,
- claude_client: ClaudeClient,
- model: ClaudeModel,
- /// Minimum score required to pass (default 0.8)
- pass_threshold: f64,
+ _pool: PgPool,
}
impl ContractEvaluator {
- /// Create a new evaluator
- pub fn new(pool: PgPool, claude_client: ClaudeClient) -> Self {
- Self {
- pool,
- claude_client,
- model: ClaudeModel::Sonnet,
- pass_threshold: 0.8,
- }
- }
-
- /// Set the LLM model to use for evaluation
- pub fn with_model(mut self, model: ClaudeModel) -> Self {
- self.model = model;
- self
- }
-
- /// Set the pass threshold
- pub fn with_pass_threshold(mut self, threshold: f64) -> Self {
- self.pass_threshold = threshold;
- self
+ /// Create a new contract evaluator.
+ pub fn new(pool: PgPool) -> Self {
+ Self { _pool: pool }
}
- /// Evaluate a completed contract against the directive
+ /// Evaluate a contract - stub implementation.
+ ///
+ /// This will be reimplemented in the orchestration module with:
+ /// - Programmatic verification (tests, lint, build)
+ /// - LLM evaluation
+ /// - Composite scoring
pub async fn evaluate_contract(
&self,
- contract_id: Uuid,
- owner_id: Uuid,
- ) -> Result<ContractEvaluationResult, EvaluationError> {
- // Gather evaluation context
- let context = self.gather_context(contract_id, owner_id).await?;
-
- // Build evaluation prompt
- let prompt = self.build_evaluation_prompt(&context);
-
- // Call LLM for evaluation
- let response = self.call_llm_for_evaluation(&prompt).await?;
-
- // Parse the response
- let result = self.parse_evaluation_response(&response, &context)?;
-
- Ok(result)
- }
-
- /// Gather all context needed for evaluation
- async fn gather_context(
- &self,
- contract_id: Uuid,
- owner_id: Uuid,
- ) -> Result<EvaluationContext, EvaluationError> {
- // Get contract
- let contract = repository::get_contract_for_owner(&self.pool, contract_id, owner_id)
- .await?
- .ok_or(EvaluationError::ContractNotFound(contract_id))?;
-
- // Get chain contract
- let chain_contract = repository::get_chain_contract_by_contract_id(&self.pool, contract_id)
- .await?
- .ok_or(EvaluationError::ChainContractNotFound(contract_id))?;
-
- // Get directive
- let directive = repository::get_chain_directive(&self.pool, chain_contract.chain_id)
- .await?
- .ok_or(EvaluationError::DirectiveNotFound(chain_contract.chain_id))?;
-
- // Get files directly from repository
- let contract_files = repository::list_files_in_contract(&self.pool, contract_id, owner_id)
- .await
- .unwrap_or_default();
-
- // Get tasks directly from repository
- let contract_tasks = repository::list_tasks_in_contract(&self.pool, contract_id, owner_id)
- .await
- .unwrap_or_default();
-
- // Build file contents from FileSummary
- // Note: FileSummary doesn't have content, so we use name and description
- let files: Vec<FileContent> = contract_files.iter().map(|f| {
- FileContent {
- path: f.repo_file_path.clone().unwrap_or_else(|| f.name.clone()),
- description: f.description.clone(),
- content: format!("[File: {} - content not loaded in summary view]", f.name),
- is_deliverable: false, // FileSummary doesn't track deliverable status
- }
- }).collect();
-
- // Build task outputs from TaskSummary
- let task_outputs: Vec<TaskOutput> = contract_tasks.iter().map(|t| {
- TaskOutput {
- task_name: t.name.clone(),
- output_summary: t.progress_summary.clone().unwrap_or_else(|| format!("Status: {}", t.status)),
- exit_code: None,
- }
- }).collect();
-
- // Build deliverables info from files marked as deliverables
- // Since FileSummary doesn't have deliverable info, we treat all files as potential deliverables
- let deliverables: Vec<DeliverableInfo> = contract_files.iter()
- .map(|f| DeliverableInfo {
- name: f.name.clone(),
- status: "complete".to_string(),
- file_path: f.repo_file_path.clone(),
- })
- .collect();
-
- // Parse requirements and acceptance criteria from directive
- let requirements: Vec<DirectiveRequirement> =
- serde_json::from_value(directive.requirements.clone()).unwrap_or_default();
-
- let all_criteria: Vec<DirectiveAcceptanceCriterion> =
- serde_json::from_value(directive.acceptance_criteria.clone()).unwrap_or_default();
-
- // Get contract definition to find mapped requirements
- // For now, use all acceptance criteria
- let acceptance_criteria = all_criteria;
-
- Ok(EvaluationContext {
- contract,
- chain_contract,
- directive,
- files,
- task_outputs,
- deliverables,
- acceptance_criteria,
- requirements,
- })
+ _contract_id: Uuid,
+ ) -> Result<ContractEvaluationResult, ContractEvaluatorError> {
+ // TODO: Implement using the new directive evaluation system
+ Err(ContractEvaluatorError::NotImplemented(
+ "Contract evaluator will be reimplemented with directive system".to_string(),
+ ))
}
-
- /// Build the evaluation prompt
- fn build_evaluation_prompt(&self, context: &EvaluationContext) -> String {
- let mut prompt = String::new();
-
- prompt.push_str("# Contract Completion Evaluation\n\n");
- prompt.push_str("You are evaluating whether a contract has been completed successfully against its requirements.\n\n");
-
- // Contract info
- prompt.push_str("## Contract Information\n\n");
- prompt.push_str(&format!("**Name:** {}\n", context.contract.name));
- if let Some(ref desc) = context.contract.description {
- prompt.push_str(&format!("**Description:** {}\n", desc));
- }
- prompt.push_str(&format!("**Type:** {}\n", context.contract.contract_type));
- prompt.push_str(&format!("**Phase:** {}\n", context.contract.phase));
- prompt.push_str("\n");
-
- // Requirements
- if !context.requirements.is_empty() {
- prompt.push_str("## Requirements\n\n");
- for req in &context.requirements {
- prompt.push_str(&format!("- **{}** ({}): {}\n", req.id, req.priority, req.title));
- if !req.description.is_empty() {
- prompt.push_str(&format!(" {}\n", req.description));
- }
- }
- prompt.push_str("\n");
- }
-
- // Acceptance criteria
- if !context.acceptance_criteria.is_empty() {
- prompt.push_str("## Acceptance Criteria\n\n");
- for (i, criterion) in context.acceptance_criteria.iter().enumerate() {
- prompt.push_str(&format!("{}. **{}**\n", i + 1, criterion.description));
- prompt.push_str(&format!(" - Testable: {}\n", criterion.testable));
- if !criterion.requirement_ids.is_empty() {
- prompt.push_str(&format!(" - Covers: {}\n", criterion.requirement_ids.join(", ")));
- }
- }
- prompt.push_str("\n");
- }
-
- // Deliverables
- if !context.deliverables.is_empty() {
- prompt.push_str("## Deliverables\n\n");
- for d in &context.deliverables {
- prompt.push_str(&format!("- {} ({})\n", d.name, d.status));
- }
- prompt.push_str("\n");
- }
-
- // Files
- if !context.files.is_empty() {
- prompt.push_str("## Files Created/Modified\n\n");
- for file in &context.files {
- prompt.push_str(&format!("### {}", file.path));
- if file.is_deliverable {
- prompt.push_str(" [DELIVERABLE]");
- }
- prompt.push_str("\n");
- if let Some(ref desc) = file.description {
- prompt.push_str(&format!("*{}*\n", desc));
- }
- // Truncate content if too long
- let content = if file.content.len() > 5000 {
- format!("{}...\n[Content truncated - {} chars total]",
- &file.content[..5000], file.content.len())
- } else {
- file.content.clone()
- };
- prompt.push_str("```\n");
- prompt.push_str(&content);
- prompt.push_str("\n```\n\n");
- }
- }
-
- // Task outputs
- if !context.task_outputs.is_empty() {
- prompt.push_str("## Task Outputs\n\n");
- for task in &context.task_outputs {
- prompt.push_str(&format!("### {}\n", task.task_name));
- prompt.push_str(&format!("{}\n\n", task.output_summary));
- }
- }
-
- // Evaluation instructions
- prompt.push_str("## Evaluation Instructions\n\n");
- prompt.push_str("Please evaluate the completed work against the requirements and acceptance criteria.\n\n");
- prompt.push_str("For each acceptance criterion, determine if it has been met and provide a brief explanation.\n\n");
- prompt.push_str("Respond with a JSON object in the following format:\n\n");
- prompt.push_str("```json\n");
- prompt.push_str(r#"{
- "passed": true/false,
- "overallScore": 0.0-1.0,
- "criteriaResults": [
- {
- "criterionId": "criterion identifier or index",
- "met": true/false,
- "score": 0.0-1.0,
- "feedback": "explanation of why criterion was/wasn't met"
- }
- ],
- "summaryFeedback": "overall summary of the evaluation",
- "reworkInstructions": "if failed, specific instructions for what needs to be fixed (null if passed)"
-}
-"#);
- prompt.push_str("```\n\n");
- prompt.push_str(&format!("The pass threshold is {}. ", self.pass_threshold));
- prompt.push_str("A contract passes if the overall score is >= the threshold AND all critical criteria are met.\n");
-
- prompt
- }
-
- /// Call LLM for evaluation
- async fn call_llm_for_evaluation(&self, prompt: &str) -> Result<String, EvaluationError> {
- let messages = vec![Message {
- role: "user".to_string(),
- content: MessageContent::Text(prompt.to_string()),
- }];
-
- // Use chat_with_tools with empty tools array for simple chat
- let empty_tools: Vec<Tool> = vec![];
- let result = self
- .claude_client
- .chat_with_tools(messages, &empty_tools)
- .await
- .map_err(|e| EvaluationError::LlmError(e.to_string()))?;
-
- // ChatResult.content is already an Option<String>
- let text = result.content.unwrap_or_default();
-
- Ok(text)
- }
-
- /// Parse the LLM response into an evaluation result
- fn parse_evaluation_response(
- &self,
- response: &str,
- context: &EvaluationContext,
- ) -> Result<ContractEvaluationResult, EvaluationError> {
- // Extract JSON from response (may be wrapped in markdown code blocks)
- let json_str = extract_json_from_response(response)?;
-
- // Parse the JSON
- let parsed: EvaluationResponseJson = serde_json::from_str(&json_str)
- .map_err(|e| EvaluationError::ParseError(format!("JSON parse error: {}", e)))?;
-
- // Convert to our result type
- let criteria_results: Vec<EvaluationCriterionResult> = parsed
- .criteria_results
- .into_iter()
- .map(|cr| EvaluationCriterionResult {
- criterion_id: cr.criterion_id.clone(),
- criterion_text: cr.criterion_id, // Use ID as text if not provided
- passed: cr.passed,
- score: cr.score,
- feedback: cr.feedback,
- evidence: vec![],
- })
- .collect();
-
- // Determine pass/fail based on threshold and results
- let passed = parsed.passed && parsed.overall_score >= self.pass_threshold;
-
- Ok(ContractEvaluationResult {
- passed,
- overall_score: parsed.overall_score,
- criteria_results,
- summary_feedback: parsed.summary_feedback,
- rework_instructions: if passed { None } else { parsed.rework_instructions },
- })
- }
-
- /// Save evaluation result to database
- pub async fn save_evaluation(
- &self,
- contract_id: Uuid,
- chain_id: Uuid,
- chain_contract_id: Uuid,
- result: &ContractEvaluationResult,
- ) -> Result<ContractEvaluation, EvaluationError> {
- let req = CreateContractEvaluationRequest {
- contract_id,
- chain_id: Some(chain_id),
- chain_contract_id: Some(chain_contract_id),
- evaluator_model: Some(format!("{:?}", self.model)),
- passed: result.passed,
- overall_score: Some(result.overall_score),
- criteria_results: result.criteria_results.clone(),
- summary_feedback: result.summary_feedback.clone(),
- rework_instructions: result.rework_instructions.clone(),
- };
-
- let evaluation = repository::create_contract_evaluation(&self.pool, req).await?;
-
- // Update chain contract status
- let status = if result.passed { "passed" } else { "failed" };
- repository::update_chain_contract_evaluation_status(
- &self.pool,
- chain_contract_id,
- status,
- Some(evaluation.id),
- result.rework_instructions.as_deref(),
- )
- .await?;
-
- Ok(evaluation)
- }
-}
-
-/// JSON structure for parsing LLM response
-#[derive(Debug, Deserialize)]
-#[serde(rename_all = "camelCase")]
-struct EvaluationResponseJson {
- passed: bool,
- overall_score: f64,
- criteria_results: Vec<CriterionResultJson>,
- summary_feedback: String,
- rework_instructions: Option<String>,
-}
-
-#[derive(Debug, Deserialize)]
-#[serde(rename_all = "camelCase")]
-struct CriterionResultJson {
- criterion_id: String,
- #[serde(alias = "met")]
- passed: bool,
- #[serde(default)]
- score: f64,
- feedback: String,
-}
-
-/// Extract JSON from a response that may contain markdown code blocks
-fn extract_json_from_response(response: &str) -> Result<String, EvaluationError> {
- // Try to find JSON in code blocks first
- if let Some(start) = response.find("```json") {
- let json_start = start + 7;
- if let Some(end) = response[json_start..].find("```") {
- return Ok(response[json_start..json_start + end].trim().to_string());
- }
- }
-
- // Try plain code blocks
- if let Some(start) = response.find("```") {
- let json_start = start + 3;
- // Skip any language identifier on the same line
- let actual_start = response[json_start..]
- .find('\n')
- .map(|i| json_start + i + 1)
- .unwrap_or(json_start);
- if let Some(end) = response[actual_start..].find("```") {
- return Ok(response[actual_start..actual_start + end].trim().to_string());
- }
- }
-
- // Try to find raw JSON (starts with {)
- if let Some(start) = response.find('{') {
- // Find matching closing brace
- let mut depth = 0;
- let mut end = start;
- for (i, c) in response[start..].char_indices() {
- match c {
- '{' => depth += 1,
- '}' => {
- depth -= 1;
- if depth == 0 {
- end = start + i + 1;
- break;
- }
- }
- _ => {}
- }
- }
- if end > start {
- return Ok(response[start..end].to_string());
- }
- }
-
- Err(EvaluationError::ParseError(
- "Could not find JSON in response".to_string(),
- ))
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- #[test]
- fn test_extract_json_from_code_block() {
- let response = r#"Here is the evaluation:
-
-```json
-{
- "passed": true,
- "overallScore": 0.85
}
-```
-Done."#;
+/// Error types for contract evaluation.
+#[derive(Debug, thiserror::Error)]
+pub enum ContractEvaluatorError {
+ #[error("Database error: {0}")]
+ Database(#[from] sqlx::Error),
- let json = extract_json_from_response(response).unwrap();
- assert!(json.contains("\"passed\": true"));
- }
+ #[error("LLM error: {0}")]
+ Llm(String),
- #[test]
- fn test_extract_json_raw() {
- let response = r#"The result is {"passed": false, "overallScore": 0.5}"#;
- let json = extract_json_from_response(response).unwrap();
- assert!(json.contains("\"passed\": false"));
- }
+ #[error("Not implemented: {0}")]
+ NotImplemented(String),
}