1 files changed, 292 insertions, 15 deletions
diff --git a/makima/src/server/handlers/chat.rs b/makima/src/server/handlers/chat.rs
index 396c973..306093a 100644
--- a/makima/src/server/handlers/chat.rs
+++ b/makima/src/server/handlers/chat.rs
@@ -20,7 +20,18 @@ use crate::llm::{
 use crate::server::state::{FileUpdateNotification, SharedState};
 
 /// Maximum number of tool-calling rounds to prevent infinite loops
-const MAX_TOOL_ROUNDS: usize = 10;
+const MAX_TOOL_ROUNDS: usize = 20;
+
+/// Context window limits, in tokens, for the supported model families.
+/// Claude models have 200K context; Groq models vary (see the note on the Groq limit).
+const CLAUDE_CONTEXT_LIMIT: usize = 200_000;
+const GROQ_CONTEXT_LIMIT: usize = 32_000; // NOTE(review): confirm against the deployed Groq model
+
+/// Fraction of the model's context limit at which compaction is triggered (0.90 = 90%).
+const CONTEXT_COMPACTION_THRESHOLD: f32 = 0.90;
+
+/// Approximate characters per token (rough estimate for English text).
+const CHARS_PER_TOKEN: usize = 4;
 
 #[derive(Debug, Deserialize, ToSchema)]
 #[serde(rename_all = "camelCase")]
@@ -206,20 +217,62 @@ pub async fn chat_handler(
     // Build context about the file
     let file_context = build_file_context(&file);
 
+    // Build agentic system prompt
+    let system_prompt = format!(
+        r#"You are an intelligent document editing agent. You help users view, analyze, and modify document files.
+
+## Your Capabilities
+You have access to tools for:
+- **Viewing content**: view_body (see all elements), read_element (inspect specific element), view_transcript (read full transcript)
+- **Adding content**: add_heading, add_paragraph, add_chart
+- **Modifying content**: update_element, remove_element, reorder_elements, clear_body
+- **Document metadata**: set_summary
+- **Data processing**: parse_csv (convert CSV to JSON), jq (transform JSON data)
+- **Version history**: list_versions, read_version, restore_version
+
+## Agentic Behavior Guidelines
+
+### 1. Analyze Before Acting
+- For complex requests, first gather information using view_body, view_transcript, or read_element
+- Understand the current state of the document before making changes
+- For simple, direct requests (e.g., "add a heading called X"), you can act immediately without prior inspection
+
+### 2. Plan Multi-Step Operations
+- Break complex tasks into logical steps
+- For data visualization: parse_csv → (optionally jq to transform) → add_chart
+- For restructuring: view_body → understand structure → make targeted changes
+
+### 3. Handle Errors Gracefully
+- If a tool call fails, analyze the error message
+- Try an alternative approach or different parameters
+- Don't repeat the exact same failing call
+
+### 4. Know When to Stop
+- Stop when you've completed the user's request
+- Stop when you've provided the requested information
+- Provide a clear summary of what you did in your final response
+
+### 5. Be Efficient
+- Don't over-analyze simple requests
+- Use the minimum number of tool calls needed
+- Combine operations when possible
+
+## Current Document Context
+{file_context}
+
+## Important Notes
+- Body element indices are 0-based
+- When updating elements, provide ALL required fields for that element type
+- The transcript is read-only (you cannot modify it, only read it)
+- Changes are saved automatically after tool execution"#,
+        file_context = file_context
+    );
+
     // Build initial messages (Groq/OpenAI format - will be converted for Claude)
     let mut messages = vec![
         Message {
             role: "system".to_string(),
-            content: Some(format!(
-                "You are a helpful assistant that helps users edit and analyze document files. \
-                You have access to tools to add headings, paragraphs, charts, and set summaries. \
-                When the user asks you to modify the file, use the appropriate tools.\n\n\
-                IMPORTANT: You can call multiple tools in sequence. For example, if the user provides CSV data \
-                and asks for a chart, first call parse_csv to convert the data to JSON, then use that JSON \
-                to call add_chart.\n\n\
-                Current file context:\n{}",
-                file_context
-            )),
+            content: Some(system_prompt),
             tool_calls: None,
             tool_call_id: None,
         },
@@ -240,10 +293,48 @@ pub async fn chat_handler(
     let mut version_restored = false;
     // Track if there were modifications after a restore
     let mut has_changes_after_restore = false;
+    // Track consecutive failures for agentic retry logic
+    let mut consecutive_failures = 0;
+    const MAX_CONSECUTIVE_FAILURES: usize = 3;
 
-    // Multi-turn tool calling loop
+    // Multi-turn agentic tool calling loop
     for round in 0..MAX_TOOL_ROUNDS {
-        tracing::debug!(round = round, "LLM tool calling round");
+        tracing::info!(
+            round = round,
+            body_elements = current_body.len(),
+            total_tool_calls = all_tool_call_infos.len(),
+            "Agentic loop iteration"
+        );
+
+        // Check if we've hit too many consecutive failures
+        if consecutive_failures >= MAX_CONSECUTIVE_FAILURES {
+            tracing::warn!("Breaking loop due to {} consecutive failures", consecutive_failures);
+            final_response = Some(format!(
+                "I encountered multiple consecutive errors and stopped to avoid an infinite loop. \
+                Please try rephrasing your request or check if the document state is as expected."
+            ));
+            break;
+        }
+
+        // Check context usage and compact if nearing limit
+        if is_context_near_limit(&messages, &model) {
+            let estimated_tokens = estimate_total_tokens(&messages);
+            tracing::warn!(
+                estimated_tokens = estimated_tokens,
+                round = round,
+                "Context nearing limit, compacting conversation"
+            );
+            compact_conversation(&mut messages, &all_tool_call_infos);
+
+            // Log the new token count
+            let new_tokens = estimate_total_tokens(&messages);
+            tracing::info!(
+                tokens_before = estimated_tokens,
+                tokens_after = new_tokens,
+                tokens_saved = estimated_tokens - new_tokens,
+                "Conversation compacted"
+            );
+        }
 
         // Call the appropriate LLM API
         let result = match &llm_client {
@@ -324,8 +415,14 @@ pub async fn chat_handler(
 
         // Execute each tool call and add results to conversation
         for (i, tool_call) in result.tool_calls.iter().enumerate() {
+            tracing::info!(
+                tool = %tool_call.name,
+                round = round,
+                "Executing tool call"
+            );
+
             let mut execution_result =
-                execute_tool_call(tool_call, &current_body, current_summary.as_deref());
+                execute_tool_call(tool_call, &current_body, current_summary.as_deref(), &file.transcript);
 
             // Handle version tool requests that need async database access
             if let Some(version_request) = &execution_result.version_request {
@@ -369,7 +466,19 @@ pub async fn chat_handler(
                 }
             }
 
-            // Build tool result message content
+            // Track consecutive failures for agentic behavior
+            if execution_result.result.success {
+                consecutive_failures = 0;
+            } else {
+                consecutive_failures += 1;
+                tracing::warn!(
+                    tool = %tool_call.name,
+                    consecutive_failures = consecutive_failures,
+                    "Tool call failed"
+                );
+            }
+
+            // Build tool result message content with enhanced context for agentic reasoning
             let result_content = if let Some(parsed_data) = &execution_result.parsed_data {
                 // Include parsed data in the result for the LLM to use
                 serde_json::json!({
@@ -378,6 +487,19 @@ pub async fn chat_handler(
                     "data": parsed_data
                 })
                 .to_string()
+            } else if !execution_result.result.success {
+                // On failure, include hints for the LLM
+                let hint = if consecutive_failures >= MAX_CONSECUTIVE_FAILURES {
+                    " [HINT: Multiple consecutive failures detected. Consider a different approach or verify your parameters.]"
+                } else {
+                    ""
+                };
+                serde_json::json!({
+                    "success": false,
+                    "message": format!("{}{}", execution_result.result.message, hint),
+                    "currentBodyElementCount": current_body.len()
+                })
+                .to_string()
             } else {
                 serde_json::json!({
                     "success": execution_result.result.success,
@@ -742,3 +864,158 @@ async fn handle_version_request(
         }
     }
 }
+
+/// Roughly estimates how many tokens a single message occupies.
+///
+/// Adds the lengths of the content, each tool call's function name and arguments,
+/// the tool call ID and the role, plus a flat 20 for message structure, then
+/// divides the total by `CHARS_PER_TOKEN` using integer division.
+/// Lengths are taken with `len()`, so this is only a rough, size-based heuristic.
+fn estimate_message_tokens(message: &Message) -> usize {
+    // Absent content contributes nothing.
+    let content_len = message.content.as_ref().map_or(0, |text| text.len());
+
+    // Every tool call counts as its function name plus its argument string.
+    let tool_calls_len: usize = message
+        .tool_calls
+        .iter()
+        .flatten()
+        .map(|call| call.function.name.len() + call.function.arguments.len())
+        .sum();
+
+    // Tool call ID, when one is set.
+    let call_id_len = message.tool_call_id.as_ref().map_or(0, |id| id.len());
+
+    // Role text plus a fixed allowance for the surrounding structure.
+    let overhead = message.role.len() + 20;
+
+    // Turn the character total into an approximate token count.
+    let total_chars = content_len + tool_calls_len + call_id_len + overhead;
+    total_chars / CHARS_PER_TOKEN
+}
+
+/// Adds up the per-message token estimates for an entire conversation
+fn estimate_total_tokens(messages: &[Message]) -> usize {
+    messages.iter().fold(0, |total, msg| total + estimate_message_tokens(msg))
+}
+
+/// Reports whether the estimated size of `messages` has reached the compaction
+/// threshold (`CONTEXT_COMPACTION_THRESHOLD` of the context limit for `model`).
+fn is_context_near_limit(messages: &[Message], model: &LlmModel) -> bool {
+    let context_limit = match model {
+        LlmModel::GroqKimi => GROQ_CONTEXT_LIMIT,
+        LlmModel::ClaudeSonnet | LlmModel::ClaudeOpus => CLAUDE_CONTEXT_LIMIT,
+    };
+    // The cast back to usize truncates any fractional part of the scaled limit.
+    let trigger_at = (context_limit as f32 * CONTEXT_COMPACTION_THRESHOLD) as usize;
+    estimate_total_tokens(messages) >= trigger_at
+}
+
+/// Compacts the conversation in place, replacing older messages with a short summary.
+///
+/// Resulting layout: system message, a synthetic user message carrying the summary, a
+/// synthetic assistant acknowledgement, then the most recent messages unchanged.
+/// `messages` must start with the system message; `tool_call_history` lists every tool
+/// call executed so far. Nothing changes when there is too little history to summarize.
+fn compact_conversation(messages: &mut Vec<Message>, tool_call_history: &[ToolCallInfo]) {
+    // Keep at least system message + 4 recent messages (2 exchanges)
+    const MIN_MESSAGES_TO_KEEP: usize = 5;
+
+    if messages.len() <= MIN_MESSAGES_TO_KEEP {
+        return;
+    }
+
+    // Extract system message (always first)
+    let system_message = messages.remove(0);
+
+    // Keep roughly the last third of the messages verbatim, and never fewer than 4.
+    let messages_to_keep = (messages.len() / 3).max(4);
+    let initial_split = messages.len() - messages_to_keep;
+
+    // A tool result (a message carrying `tool_call_id`) must stay behind the assistant
+    // message that requested it, so pull the cut back to the nearest message that is not a
+    // tool result. `initial_split` is in bounds: at least 4 messages are kept from it onward.
+    let messages_to_summarize = messages[..=initial_split]
+        .iter()
+        .rposition(|m| m.tool_call_id.is_none())
+        .unwrap_or(0);
+
+    if messages_to_summarize < 2 {
+        // Not enough to summarize, just put system message back
+        messages.insert(0, system_message);
+        return;
+    }
+
+    // Extract the messages to summarize and start collecting the summary lines
+    let old_messages: Vec<Message> = messages.drain(..messages_to_summarize).collect();
+    let mut summary_parts: Vec<String> = Vec::new();
+
+    // Summarize user requests
+    let user_requests: Vec<&str> = old_messages
+        .iter()
+        .filter(|m| m.role == "user")
+        .filter_map(|m| m.content.as_deref())
+        .collect();
+    if !user_requests.is_empty() {
+        summary_parts.push(format!("Previous user requests: {}", user_requests.join("; ")));
+    }
+
+    // Summarize tool calls executed so far
+    if !tool_call_history.is_empty() {
+        let tool_summary: Vec<String> = tool_call_history
+            .iter()
+            .map(|tc| {
+                if tc.result.success {
+                    format!("{}(ok)", tc.name)
+                } else {
+                    format!("{}(failed: {})", tc.name, tc.result.message)
+                }
+            })
+            .collect();
+        summary_parts.push(format!("Tools executed: {}", tool_summary.join(", ")));
+    }
+
+    // Count assistant responses that were summarized
+    let assistant_responses = old_messages
+        .iter()
+        .filter(|m| m.role == "assistant" && m.content.is_some())
+        .count();
+    if assistant_responses > 0 {
+        summary_parts.push(format!(
+            "({} previous assistant responses omitted for brevity)",
+            assistant_responses
+        ));
+    }
+
+    // Create compacted context message
+    let compacted_content = format!(
+        "[CONTEXT SUMMARY - Earlier conversation compacted to save tokens]\n{}",
+        summary_parts.join("\n")
+    );
+
+    // Rebuild messages: system + summary + remaining recent messages
+    let mut new_messages = vec![
+        system_message,
+        Message {
+            role: "user".to_string(),
+            content: Some(compacted_content),
+            tool_calls: None,
+            tool_call_id: None,
+        },
+        Message {
+            role: "assistant".to_string(),
+            content: Some("Understood. I have context from the previous conversation and will continue from here.".to_string()),
+            tool_calls: None,
+            tool_call_id: None,
+        },
+    ];
+
+    new_messages.append(messages);
+    *messages = new_messages;
+
+    tracing::info!(
+        summarized_messages = messages_to_summarize,
+        remaining_messages = messages.len(),
+        "Compacted conversation to save context"
+    );
+}