From c14192cc8b0e82369c93c1aee615fcc9cfad5911 Mon Sep 17 00:00:00 2001 From: soryu Date: Wed, 28 Jan 2026 03:45:36 +0000 Subject: Fix frontend build due to incorrect types --- makima/frontend/src/hooks/useSpeakWebSocket.ts | 2 +- makima/frontend/tsconfig.tsbuildinfo | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/makima/frontend/src/hooks/useSpeakWebSocket.ts b/makima/frontend/src/hooks/useSpeakWebSocket.ts index 3ef8851..d9fb826 100644 --- a/makima/frontend/src/hooks/useSpeakWebSocket.ts +++ b/makima/frontend/src/hooks/useSpeakWebSocket.ts @@ -22,7 +22,7 @@ export function useSpeakWebSocket() { const wsRef = useRef(null); const audioContextRef = useRef(null); - const audioQueueRef = useRef([]); + const audioQueueRef = useRef[]>([]); const isPlayingRef = useRef(false); const modelLoadingTimerRef = useRef | null>(null); const nextPlayTimeRef = useRef(0); diff --git a/makima/frontend/tsconfig.tsbuildinfo b/makima/frontend/tsconfig.tsbuildinfo index b02179d..804859b 100644 --- a/makima/frontend/tsconfig.tsbuildinfo +++ b/makima/frontend/tsconfig.tsbuildinfo @@ -1 +1 @@ 
-{"root":["./src/main.tsx","./src/vite-env.d.ts","./src/components/gridoverlay.tsx","./src/components/japanesehovertext.tsx","./src/components/logo.tsx","./src/components/masthead.tsx","./src/components/navstrip.tsx","./src/components/phaseconfirmationnotification.tsx","./src/components/protectedroute.tsx","./src/components/rewritelink.tsx","./src/components/simplemarkdown.tsx","./src/components/supervisorquestionnotification.tsx","./src/components/charts/chartrenderer.tsx","./src/components/contracts/autopilotpanel.tsx","./src/components/contracts/contractcliinput.tsx","./src/components/contracts/contractcontextmenu.tsx","./src/components/contracts/contractdetail.tsx","./src/components/contracts/contractlist.tsx","./src/components/contracts/phasebadge.tsx","./src/components/contracts/phaseconfirmationmodal.tsx","./src/components/contracts/phasedeliverablespanel.tsx","./src/components/contracts/phasehint.tsx","./src/components/contracts/phaseprogressbar.tsx","./src/components/contracts/quickactionbuttons.tsx","./src/components/contracts/repositorypanel.tsx","./src/components/contracts/taskderivationpreview.tsx","./src/components/files/bodyrenderer.tsx","./src/components/files/cliinput.tsx","./src/components/files/conflictnotification.tsx","./src/components/files/elementcontextmenu.tsx","./src/components/files/filedetail.tsx","./src/components/files/filelist.tsx","./src/components/files/reposyncindicator.tsx","./src/components/files/updatenotification.tsx","./src/components/files/versionhistorydropdown.tsx","./src/components/history/checkpointcard.tsx","./src/components/history/checkpointlist.tsx","./src/components/history/conversationmessage.tsx","./src/components/history/conversationview.tsx","./src/components/history/historyfilters.tsx","./src/components/history/resumecontrols.tsx","./src/components/history/timelineeventcard.tsx","./src/components/history/timelinelist.tsx","./src/components/history/index.ts","./src/components/listen/contractpickermodal.tsx","./src
/components/listen/controlpanel.tsx","./src/components/listen/speakerpanel.tsx","./src/components/listen/transcriptanalysispanel.tsx","./src/components/listen/transcriptpanel.tsx","./src/components/mesh/branchtaskmodal.tsx","./src/components/mesh/contractcompletequestion.tsx","./src/components/mesh/directoryinput.tsx","./src/components/mesh/gitactionspanel.tsx","./src/components/mesh/inlinesubtaskeditor.tsx","./src/components/mesh/mergeconflictresolver.tsx","./src/components/mesh/overlaydiffviewer.tsx","./src/components/mesh/prpreview.tsx","./src/components/mesh/patcheslistpanel.tsx","./src/components/mesh/subtasktree.tsx","./src/components/mesh/taskdetail.tsx","./src/components/mesh/tasklist.tsx","./src/components/mesh/taskoutput.tsx","./src/components/mesh/tasktree.tsx","./src/components/mesh/unifiedmeshchatinput.tsx","./src/components/mesh/worktreefilespanel.tsx","./src/components/templates/templateeditor.tsx","./src/components/workflow/phasecolumn.tsx","./src/components/workflow/workflowboard.tsx","./src/components/workflow/workflowcontractcard.tsx","./src/contexts/authcontext.tsx","./src/contexts/supervisorquestionscontext.tsx","./src/hooks/usecontracts.ts","./src/hooks/usefilesubscription.ts","./src/hooks/usefiles.ts","./src/hooks/usemeshchathistory.ts","./src/hooks/usemicrophone.ts","./src/hooks/usetasksubscription.ts","./src/hooks/usetasks.ts","./src/hooks/usetextscramble.ts","./src/hooks/useversionhistory.ts","./src/hooks/usewebsocket.ts","./src/lib/api.ts","./src/lib/listenapi.ts","./src/lib/markdown.ts","./src/lib/supabase.ts","./src/routes/_index.tsx","./src/routes/contract-file.tsx","./src/routes/contracts.tsx","./src/routes/files.tsx","./src/routes/history.tsx","./src/routes/listen.tsx","./src/routes/login.tsx","./src/routes/mesh.tsx","./src/routes/settings.tsx","./src/routes/templates.tsx","./src/routes/workflow.tsx","./src/types/messages.ts","./src/types/templates.ts"],"version":"5.9.3"} \ No newline at end of file 
+{"root":["./src/main.tsx","./src/vite-env.d.ts","./src/components/gridoverlay.tsx","./src/components/japanesehovertext.tsx","./src/components/logo.tsx","./src/components/masthead.tsx","./src/components/navstrip.tsx","./src/components/phaseconfirmationnotification.tsx","./src/components/protectedroute.tsx","./src/components/rewritelink.tsx","./src/components/simplemarkdown.tsx","./src/components/supervisorquestionnotification.tsx","./src/components/charts/chartrenderer.tsx","./src/components/contracts/autopilotpanel.tsx","./src/components/contracts/contractcliinput.tsx","./src/components/contracts/contractcontextmenu.tsx","./src/components/contracts/contractdetail.tsx","./src/components/contracts/contractlist.tsx","./src/components/contracts/phasebadge.tsx","./src/components/contracts/phaseconfirmationmodal.tsx","./src/components/contracts/phasedeliverablespanel.tsx","./src/components/contracts/phasehint.tsx","./src/components/contracts/phaseprogressbar.tsx","./src/components/contracts/quickactionbuttons.tsx","./src/components/contracts/repositorypanel.tsx","./src/components/contracts/taskderivationpreview.tsx","./src/components/files/bodyrenderer.tsx","./src/components/files/cliinput.tsx","./src/components/files/conflictnotification.tsx","./src/components/files/elementcontextmenu.tsx","./src/components/files/filedetail.tsx","./src/components/files/filelist.tsx","./src/components/files/reposyncindicator.tsx","./src/components/files/updatenotification.tsx","./src/components/files/versionhistorydropdown.tsx","./src/components/history/checkpointcard.tsx","./src/components/history/checkpointlist.tsx","./src/components/history/conversationmessage.tsx","./src/components/history/conversationview.tsx","./src/components/history/historyfilters.tsx","./src/components/history/resumecontrols.tsx","./src/components/history/timelineeventcard.tsx","./src/components/history/timelinelist.tsx","./src/components/history/index.ts","./src/components/listen/contractpickermodal.tsx","./src
/components/listen/controlpanel.tsx","./src/components/listen/speakerpanel.tsx","./src/components/listen/transcriptanalysispanel.tsx","./src/components/listen/transcriptpanel.tsx","./src/components/mesh/branchtaskmodal.tsx","./src/components/mesh/contractcompletequestion.tsx","./src/components/mesh/directoryinput.tsx","./src/components/mesh/gitactionspanel.tsx","./src/components/mesh/inlinesubtaskeditor.tsx","./src/components/mesh/mergeconflictresolver.tsx","./src/components/mesh/overlaydiffviewer.tsx","./src/components/mesh/prpreview.tsx","./src/components/mesh/patcheslistpanel.tsx","./src/components/mesh/subtasktree.tsx","./src/components/mesh/taskdetail.tsx","./src/components/mesh/tasklist.tsx","./src/components/mesh/taskoutput.tsx","./src/components/mesh/tasktree.tsx","./src/components/mesh/unifiedmeshchatinput.tsx","./src/components/mesh/worktreefilespanel.tsx","./src/components/templates/templateeditor.tsx","./src/components/workflow/phasecolumn.tsx","./src/components/workflow/workflowboard.tsx","./src/components/workflow/workflowcontractcard.tsx","./src/contexts/authcontext.tsx","./src/contexts/supervisorquestionscontext.tsx","./src/hooks/usecontracts.ts","./src/hooks/usefilesubscription.ts","./src/hooks/usefiles.ts","./src/hooks/usemeshchathistory.ts","./src/hooks/usemicrophone.ts","./src/hooks/usespeakwebsocket.ts","./src/hooks/usetasksubscription.ts","./src/hooks/usetasks.ts","./src/hooks/usetextscramble.ts","./src/hooks/useversionhistory.ts","./src/hooks/usewebsocket.ts","./src/lib/api.ts","./src/lib/listenapi.ts","./src/lib/markdown.ts","./src/lib/supabase.ts","./src/routes/_index.tsx","./src/routes/contract-file.tsx","./src/routes/contracts.tsx","./src/routes/files.tsx","./src/routes/history.tsx","./src/routes/listen.tsx","./src/routes/login.tsx","./src/routes/mesh.tsx","./src/routes/settings.tsx","./src/routes/speak.tsx","./src/routes/templates.tsx","./src/routes/workflow.tsx","./src/types/messages.ts","./src/types/templates.ts"],"version":"5.9.3"} \ 
No newline at end of file -- cgit v1.2.3 From 9b53f6c6b01da85ef73bd5960b32ec319df0b947 Mon Sep 17 00:00:00 2001 From: soryu Date: Wed, 28 Jan 2026 03:50:45 +0000 Subject: Replace TTS endpoint with Rust-native Qwen3-TTS (#41) * chore: fix unused import warnings in qwen3-tts module - Remove unused import 'IndexOp' in model.rs - Remove unused import 'DType' in speech_tokenizer.rs - Add #[allow(dead_code)] to codebook_dim field in RvqCodebook Co-Authored-By: Claude Opus 4.5 * feat: add voice loading and selection for TTS cloning Add voice reference audio loading so the TTS speak handler can perform voice cloning using reference WAV files from the voices/ directory. - Add voice.rs module: loads manifest.json and reference.wav for a given voice_id, decodes via symphonia, resamples to 24kHz for the TTS engine - Update speak.rs: resolve voice_id from the speak request (default "makima"), load reference audio, pass it to engine.generate() - Add voices/makima/README.md with instructions for obtaining reference audio (extraction from YouTube, recording, ffmpeg conversion) - Graceful fallback: if reference audio is missing, TTS proceeds without voice cloning using the model's default voice Co-Authored-By: Claude Opus 4.5 * [WIP] Heartbeat checkpoint - 2026-01-28 03:49:13 UTC --------- Co-authored-by: Claude Opus 4.5 --- makima/src/server/handlers/mod.rs | 1 + makima/src/server/handlers/speak.rs | 77 +++++++++- makima/src/server/handlers/voice.rs | 252 +++++++++++++++++++++++++++++++ makima/src/tts/chatterbox.rs | 4 +- makima/src/tts/mod.rs | 8 + makima/src/tts/qwen3/generate.rs | 32 +++- makima/src/tts/qwen3/mod.rs | 6 +- makima/src/tts/qwen3/model.rs | 2 +- makima/src/tts/qwen3/speech_tokenizer.rs | 3 +- voices/makima/README.md | 105 +++++++++++++ 10 files changed, 478 insertions(+), 12 deletions(-) create mode 100644 makima/src/server/handlers/voice.rs create mode 100644 voices/makima/README.md diff --git a/makima/src/server/handlers/mod.rs 
b/makima/src/server/handlers/mod.rs index 8207399..8af2a37 100644 --- a/makima/src/server/handlers/mod.rs +++ b/makima/src/server/handlers/mod.rs @@ -19,6 +19,7 @@ pub mod mesh_ws; pub mod repository_history; pub mod speak; pub mod templates; +pub mod voice; pub mod transcript_analysis; pub mod users; pub mod versions; diff --git a/makima/src/server/handlers/speak.rs b/makima/src/server/handlers/speak.rs index 75e7780..3ed2620 100644 --- a/makima/src/server/handlers/speak.rs +++ b/makima/src/server/handlers/speak.rs @@ -15,6 +15,9 @@ //! See `makima/src/tts/` for the TTS engine implementation. //! See `docs/specs/qwen3-tts-spec.md` for the full protocol specification. +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; + use axum::{ extract::{ws::Message, ws::WebSocket, State, WebSocketUpgrade}, response::Response, @@ -32,9 +35,9 @@ enum ClientMessage { /// Request speech synthesis for the given text. Speak { text: String, - /// Optional voice ID (e.g., "makima"). Not yet used — reserved for future voice selection. + /// Optional voice ID (e.g., "makima"). Used to load reference audio for voice cloning. + /// Defaults to "makima" if not specified. #[serde(default)] - #[allow(dead_code)] voice: Option, }, /// Cancel any in-progress synthesis. @@ -76,6 +79,10 @@ async fn handle_speak_socket(socket: WebSocket, state: SharedState) { let (mut sender, mut receiver) = socket.split(); + // Cancellation flag shared between the message loop and inference. + // Each new Speak request resets it to false; Cancel sets it to true. + let cancel_flag: Arc = Arc::new(AtomicBool::new(false)); + // Process incoming messages while let Some(msg) = receiver.next().await { let msg = match msg { @@ -102,13 +109,41 @@ async fn handle_speak_socket(socket: WebSocket, state: SharedState) { }; match client_msg { - ClientMessage::Speak { text, .. 
} => { + ClientMessage::Speak { text, voice } => { + let voice_id = voice + .as_deref() + .unwrap_or(super::voice::DEFAULT_VOICE_ID); + tracing::info!( session_id = %session_id, text_len = text.len(), + voice_id = %voice_id, "TTS speak request" ); + // Load voice reference audio for cloning + let voice_ref = match super::voice::load_reference_audio(voice_id) { + Ok(v) => { + tracing::debug!( + session_id = %session_id, + voice_id = %voice_id, + voice_name = %v.manifest.name, + samples = v.samples.len(), + "Voice reference loaded" + ); + Some(v) + } + Err(e) => { + tracing::warn!( + session_id = %session_id, + voice_id = %voice_id, + error = %e, + "Failed to load voice reference, proceeding without cloning" + ); + None + } + }; + // Get or lazily load the TTS engine let engine = match state.get_tts_engine().await { Ok(e) => e, @@ -138,9 +173,21 @@ async fn handle_speak_socket(socket: WebSocket, state: SharedState) { continue; } - // Run TTS inference (no voice reference for now — uses default) - match engine.generate(&text, None, None).await { + // Reset the cancel flag for this new generation request + cancel_flag.store(false, Ordering::Relaxed); + + // Run TTS inference with optional voice reference for cloning + // and the cancel flag so it can be stopped early. 
+ let (ref_audio, ref_rate) = match &voice_ref { + Some(v) => (Some(v.samples.as_slice()), Some(v.sample_rate)), + None => (None, None), + }; + let flag = cancel_flag.clone(); + match engine.generate(&text, ref_audio, ref_rate, Some(flag)).await { Ok(chunks) => { + // Check if generation was cancelled + let was_cancelled = cancel_flag.load(Ordering::Relaxed); + for chunk in &chunks { // Send binary PCM audio data let pcm_bytes = chunk.to_pcm16_bytes(); @@ -157,12 +204,13 @@ async fn handle_speak_socket(socket: WebSocket, state: SharedState) { } } - // Signal end of audio + // Signal end of audio (include cancelled status) let end_msg = serde_json::json!({ "type": "audio_end", "sample_rate": engine.sample_rate(), "format": "pcm_s16le", "channels": 1, + "cancelled": was_cancelled, }); let _ = sender .send(Message::Text(end_msg.to_string().into())) @@ -185,16 +233,18 @@ async fn handle_speak_socket(socket: WebSocket, state: SharedState) { } ClientMessage::Cancel => { tracing::info!(session_id = %session_id, "TTS cancel requested"); - // TODO: support cancellation of in-progress inference + cancel_flag.store(true, Ordering::Relaxed); } ClientMessage::Stop => { tracing::info!(session_id = %session_id, "TTS stop requested, closing"); + cancel_flag.store(true, Ordering::Relaxed); break; } } } Message::Close(_) => { tracing::info!(session_id = %session_id, "TTS WebSocket closed by client"); + cancel_flag.store(true, Ordering::Relaxed); break; } _ => { @@ -271,4 +321,17 @@ mod tests { let msg: ClientMessage = serde_json::from_str(json).unwrap(); assert!(matches!(msg, ClientMessage::Stop)); } + + #[test] + fn test_client_message_parse_speak_with_voice() { + let json = r#"{"type": "speak", "text": "Hello", "voice": "makima"}"#; + let msg: ClientMessage = serde_json::from_str(json).unwrap(); + match msg { + ClientMessage::Speak { text, voice } => { + assert_eq!(text, "Hello"); + assert_eq!(voice.as_deref(), Some("makima")); + } + _ => panic!("Expected Speak message"), + } + } 
} diff --git a/makima/src/server/handlers/voice.rs b/makima/src/server/handlers/voice.rs new file mode 100644 index 0000000..91b650d --- /dev/null +++ b/makima/src/server/handlers/voice.rs @@ -0,0 +1,252 @@ +//! Voice loading utilities for TTS voice cloning. +//! +//! Loads voice manifests and reference audio from the `voices/` directory. +//! Each voice is a directory containing: +//! - `manifest.json` — voice metadata (name, sample rate, backend, etc.) +//! - `reference.wav` — reference audio clip for voice cloning (5-15s, 24kHz mono) + +use serde::Deserialize; +use std::path::{Path, PathBuf}; + +use crate::tts::{resample_to_24k, SAMPLE_RATE}; + +/// Default voice ID used when no voice is specified. +pub const DEFAULT_VOICE_ID: &str = "makima"; + +/// Voice manifest loaded from `voices/{voice_id}/manifest.json`. +#[derive(Debug, Clone, Deserialize)] +pub struct VoiceManifest { + pub name: String, + pub id: String, + #[serde(default)] + pub description: Option, + #[serde(default = "default_language")] + pub language: String, + #[serde(default)] + pub accent: Option, + #[serde(default = "default_sample_rate")] + pub sample_rate: u32, + #[serde(default)] + pub format: Option, + #[serde(default)] + pub model_backend: Option, + #[serde(default = "default_reference_audio")] + pub reference_audio: String, + #[serde(default)] + pub notes: Option, +} + +fn default_language() -> String { + "en".to_string() +} + +fn default_sample_rate() -> u32 { + 24_000 +} + +fn default_reference_audio() -> String { + "reference.wav".to_string() +} + +/// Loaded voice reference: manifest + decoded PCM samples at 24kHz. +#[derive(Debug, Clone)] +pub struct VoiceReference { + pub manifest: VoiceManifest, + /// PCM f32 samples resampled to 24kHz mono. + pub samples: Vec, + /// Always 24000 after resampling. + pub sample_rate: u32, +} + +/// Resolve the base directory for voice data. 
+/// +/// Looks for the `voices/` directory relative to the current working directory, +/// or falls back to the executable's directory. +fn voices_base_dir() -> PathBuf { + // Try current working directory first + let cwd = std::env::current_dir().unwrap_or_default(); + let cwd_voices = cwd.join("voices"); + if cwd_voices.is_dir() { + return cwd_voices; + } + + // Try relative to executable + if let Ok(exe) = std::env::current_exe() { + if let Some(exe_dir) = exe.parent() { + let exe_voices = exe_dir.join("voices"); + if exe_voices.is_dir() { + return exe_voices; + } + // Try one level up (common in target/debug layout) + if let Some(parent) = exe_dir.parent() { + let parent_voices = parent.join("voices"); + if parent_voices.is_dir() { + return parent_voices; + } + // Two levels up (target/debug -> project root) + if let Some(grandparent) = parent.parent() { + let gp_voices = grandparent.join("voices"); + if gp_voices.is_dir() { + return gp_voices; + } + } + } + } + } + + // Default: assume cwd/voices + cwd_voices +} + +/// Load a voice manifest from `voices/{voice_id}/manifest.json`. +pub fn load_manifest(voice_id: &str) -> Result { + let base = voices_base_dir(); + let manifest_path = base.join(voice_id).join("manifest.json"); + + if !manifest_path.exists() { + return Err(VoiceLoadError::NotFound(voice_id.to_string())); + } + + let data = std::fs::read_to_string(&manifest_path).map_err(|e| { + VoiceLoadError::Io(format!( + "failed to read manifest at {}: {e}", + manifest_path.display() + )) + })?; + + let manifest: VoiceManifest = serde_json::from_str(&data).map_err(|e| { + VoiceLoadError::InvalidManifest(format!("failed to parse manifest: {e}")) + })?; + + Ok(manifest) +} + +/// Load a voice's reference audio as f32 PCM samples resampled to 24kHz. +/// +/// Uses symphonia (via `crate::audio`) to decode the WAV file, then +/// resamples to 24kHz using `tts::resample_to_24k`. 
+pub fn load_reference_audio(voice_id: &str) -> Result { + let manifest = load_manifest(voice_id)?; + + let base = voices_base_dir(); + let audio_path = base.join(voice_id).join(&manifest.reference_audio); + + if !audio_path.exists() { + return Err(VoiceLoadError::MissingAudio(format!( + "reference audio not found at {}. See voices/{}/README.md for instructions.", + audio_path.display(), + voice_id, + ))); + } + + load_reference_audio_from_path(&audio_path, manifest) +} + +/// Load reference audio from a specific file path with a pre-loaded manifest. +fn load_reference_audio_from_path( + audio_path: &Path, + manifest: VoiceManifest, +) -> Result { + // Use symphonia-based decoder from crate::audio to decode the WAV + let pcm = crate::audio::to_16k_mono_from_path(audio_path).map_err(|e| { + VoiceLoadError::AudioDecode(format!("failed to decode {}: {e}", audio_path.display())) + })?; + + // The audio module decodes to 16kHz mono; we need 24kHz for TTS. + // Resample from 16kHz to 24kHz. + let samples = if pcm.sample_rate == SAMPLE_RATE { + pcm.samples + } else { + resample_to_24k(&pcm.samples, pcm.sample_rate) + }; + + tracing::info!( + voice_id = %manifest.id, + voice_name = %manifest.name, + samples_len = samples.len(), + duration_secs = samples.len() as f32 / SAMPLE_RATE as f32, + "Loaded voice reference audio" + ); + + Ok(VoiceReference { + manifest, + samples, + sample_rate: SAMPLE_RATE, + }) +} + +/// Errors that can occur when loading a voice. +#[derive(Debug)] +pub enum VoiceLoadError { + /// Voice directory not found. + NotFound(String), + /// IO error reading files. + Io(String), + /// Manifest JSON is invalid. + InvalidManifest(String), + /// Reference audio file is missing. + MissingAudio(String), + /// Failed to decode audio. 
+ AudioDecode(String), +} + +impl std::fmt::Display for VoiceLoadError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + VoiceLoadError::NotFound(id) => { + write!(f, "voice '{id}' not found (no voices/{id}/manifest.json)") + } + VoiceLoadError::Io(msg) => write!(f, "voice IO error: {msg}"), + VoiceLoadError::InvalidManifest(msg) => write!(f, "invalid voice manifest: {msg}"), + VoiceLoadError::MissingAudio(msg) => write!(f, "missing reference audio: {msg}"), + VoiceLoadError::AudioDecode(msg) => write!(f, "audio decode error: {msg}"), + } + } +} + +impl std::error::Error for VoiceLoadError {} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_default_voice_id() { + assert_eq!(DEFAULT_VOICE_ID, "makima"); + } + + #[test] + fn test_manifest_deserialize() { + let json = r#"{ + "name": "Test Voice", + "id": "test", + "sample_rate": 24000, + "reference_audio": "reference.wav" + }"#; + let manifest: VoiceManifest = serde_json::from_str(json).unwrap(); + assert_eq!(manifest.name, "Test Voice"); + assert_eq!(manifest.id, "test"); + assert_eq!(manifest.sample_rate, 24000); + assert_eq!(manifest.reference_audio, "reference.wav"); + assert_eq!(manifest.language, "en"); + } + + #[test] + fn test_manifest_deserialize_defaults() { + let json = r#"{"name": "Minimal", "id": "min"}"#; + let manifest: VoiceManifest = serde_json::from_str(json).unwrap(); + assert_eq!(manifest.language, "en"); + assert_eq!(manifest.sample_rate, 24000); + assert_eq!(manifest.reference_audio, "reference.wav"); + } + + #[test] + fn test_load_nonexistent_voice() { + let result = load_manifest("nonexistent_voice_xyz"); + assert!(result.is_err()); + match result.unwrap_err() { + VoiceLoadError::NotFound(id) => assert_eq!(id, "nonexistent_voice_xyz"), + other => panic!("Expected NotFound, got: {other}"), + } + } +} diff --git a/makima/src/tts/chatterbox.rs b/makima/src/tts/chatterbox.rs index e26bc06..712910f 100644 --- a/makima/src/tts/chatterbox.rs 
+++ b/makima/src/tts/chatterbox.rs @@ -6,7 +6,8 @@ use std::borrow::Cow; use std::fs; use std::path::{Path, PathBuf}; -use std::sync::Mutex; +use std::sync::atomic::AtomicBool; +use std::sync::{Arc, Mutex}; use hf_hub::api::sync::Api; use ndarray::{Array2, Array3, Array4, ArrayD, IxDyn}; @@ -427,6 +428,7 @@ impl TtsEngine for ChatterboxTTS { text: &str, reference_audio: Option<&[f32]>, reference_sample_rate: Option, + _cancel_flag: Option>, ) -> Result, TtsError> { let samples = match reference_audio { Some(audio) => { diff --git a/makima/src/tts/mod.rs b/makima/src/tts/mod.rs index 2cd0412..b66f4a5 100644 --- a/makima/src/tts/mod.rs +++ b/makima/src/tts/mod.rs @@ -5,6 +5,8 @@ //! - **Qwen3**: Pure Rust candle-based Qwen3-TTS-12Hz-0.6B use std::path::Path; +use std::sync::atomic::AtomicBool; +use std::sync::Arc; pub mod chatterbox; pub mod qwen3; @@ -109,11 +111,17 @@ pub enum TtsBackend { #[async_trait::async_trait] pub trait TtsEngine: Send + Sync { /// Generate complete audio from text with a voice reference. + /// + /// The optional `cancel_flag` can be set to `true` by another thread/task + /// to request early termination of the generation loop. Engines that + /// support cancellation will check this flag periodically and return + /// whatever audio has been produced so far. async fn generate( &self, text: &str, reference_audio: Option<&[f32]>, reference_sample_rate: Option, + cancel_flag: Option>, ) -> Result, TtsError>; /// Check if the engine is loaded and ready. diff --git a/makima/src/tts/qwen3/generate.rs b/makima/src/tts/qwen3/generate.rs index 02161e6..30d165b 100644 --- a/makima/src/tts/qwen3/generate.rs +++ b/makima/src/tts/qwen3/generate.rs @@ -7,6 +7,9 @@ //! 4. Code predictor → remaining 15 codebook tokens per frame //! 5. 
Speech tokenizer decoder → waveform audio +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; + use candle_core::{DType, Device, IndexOp, Result, Tensor}; use tokenizers::Tokenizer; @@ -60,6 +63,9 @@ pub struct GenerationContext<'a> { tokenizer: &'a Tokenizer, device: &'a Device, config: GenerationConfig, + /// Optional cancellation flag. When set to `true`, the generation loop + /// will break early and return whatever audio has been produced so far. + cancel_flag: Option>, } impl<'a> GenerationContext<'a> { @@ -70,6 +76,7 @@ impl<'a> GenerationContext<'a> { tokenizer: &'a Tokenizer, device: &'a Device, config: GenerationConfig, + cancel_flag: Option>, ) -> Self { Self { model, @@ -78,9 +85,17 @@ impl<'a> GenerationContext<'a> { tokenizer, device, config, + cancel_flag, } } + /// Check whether cancellation has been requested. + fn is_cancelled(&self) -> bool { + self.cancel_flag + .as_ref() + .map_or(false, |f| f.load(Ordering::Relaxed)) + } + /// Generate audio from text, optionally with a voice reference. /// /// Returns a list of audio chunks. 
If `streaming` is false, returns @@ -194,6 +209,12 @@ impl<'a> GenerationContext<'a> { // === Subsequent iterations: one token at a time === for _step in 1..self.config.max_new_tokens { + // Check for cancellation each iteration + if self.is_cancelled() { + tracing::info!("TTS generation cancelled after {} frames", generated_frames.len()); + break; + } + let past_len = kv_caches[0].seq_len(); // Input: just the last generated zeroth codebook token @@ -340,13 +361,22 @@ impl<'a> GenerationContext<'a> { &self, frames: &[Vec], ) -> std::result::Result, TtsError> { - let mut chunks = Vec::new(); + let mut chunks: Vec = Vec::new(); // Decode in groups of frames for efficiency let chunk_size = 10; // ~800ms per chunk at 12.5Hz let num_codebooks = self.speech_tokenizer.num_codebooks(); for (chunk_idx, frame_chunk) in frames.chunks(chunk_size).enumerate() { + // Check for cancellation between streaming chunks + if self.is_cancelled() { + tracing::info!("TTS streaming decode cancelled after {} chunks", chunks.len()); + if let Some(last) = chunks.last_mut() { + last.is_final = true; + } + return Ok(chunks); + } + let is_last = (chunk_idx + 1) * chunk_size >= frames.len(); // Transpose chunk frames diff --git a/makima/src/tts/qwen3/mod.rs b/makima/src/tts/qwen3/mod.rs index c55c118..9bac794 100644 --- a/makima/src/tts/qwen3/mod.rs +++ b/makima/src/tts/qwen3/mod.rs @@ -30,6 +30,7 @@ pub mod speech_tokenizer; use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; use candle_core::{DType, Device}; use candle_nn::VarBuilder; @@ -168,6 +169,7 @@ impl Qwen3Tts { text: &str, reference_audio: Option<&[f32]>, gen_config: Option, + cancel_flag: Option>, ) -> Result, TtsError> { let config = gen_config.unwrap_or_default(); @@ -178,6 +180,7 @@ impl Qwen3Tts { &self.tokenizer, &self.device, config, + cancel_flag, ); ctx.generate(text, reference_audio) @@ -250,11 +253,12 @@ impl TtsEngine for Qwen3Tts { text: &str, reference_audio: 
Option<&[f32]>, _reference_sample_rate: Option, + cancel_flag: Option>, ) -> Result, TtsError> { // Note: reference audio should already be resampled to 24kHz // by the caller. If a different sample rate is provided, // the caller should resample using `resample_to_24k()`. - self.generate_speech(text, reference_audio, None) + self.generate_speech(text, reference_audio, None, cancel_flag) } fn is_ready(&self) -> bool { diff --git a/makima/src/tts/qwen3/model.rs b/makima/src/tts/qwen3/model.rs index 551893b..8a1e986 100644 --- a/makima/src/tts/qwen3/model.rs +++ b/makima/src/tts/qwen3/model.rs @@ -10,7 +10,7 @@ //! Based on the candle-transformers Qwen2 model architecture, //! extended for Qwen3-TTS. -use candle_core::{DType, Device, IndexOp, Module, Result, Tensor, D}; +use candle_core::{DType, Device, Module, Result, Tensor, D}; use candle_nn::{embedding, linear_no_bias, rms_norm, Embedding, Linear, RmsNorm, VarBuilder}; use super::config::Qwen3LmConfig; diff --git a/makima/src/tts/qwen3/speech_tokenizer.rs b/makima/src/tts/qwen3/speech_tokenizer.rs index 752050a..86e00f2 100644 --- a/makima/src/tts/qwen3/speech_tokenizer.rs +++ b/makima/src/tts/qwen3/speech_tokenizer.rs @@ -11,7 +11,7 @@ //! The speech tokenizer is a separate model (~682MB) loaded from //! `Qwen/Qwen3-TTS-Tokenizer-12Hz`. 
-use candle_core::{DType, Device, Module, Result, Tensor, D}; +use candle_core::{Device, Module, Result, Tensor, D}; use candle_nn::{ conv1d, embedding, linear_no_bias, Conv1d, Conv1dConfig, Embedding, Linear, VarBuilder, }; @@ -259,6 +259,7 @@ impl DecoderBlock { pub struct RvqCodebook { codebooks: Vec, num_codebooks: usize, + #[allow(dead_code)] codebook_dim: usize, } diff --git a/voices/makima/README.md b/voices/makima/README.md new file mode 100644 index 0000000..8553daf --- /dev/null +++ b/voices/makima/README.md @@ -0,0 +1,105 @@ +# Makima Voice Reference Audio + +This directory contains the voice profile for **Makima** — the default TTS voice used by the makima system for voice cloning. + +## What You Need + +A **reference audio clip** (`reference.wav`) of Makima's Japanese voice actress (Tomori Kusunoki) speaking English. + +### Requirements + +| Property | Value | +|----------------|------------------------------------------| +| **Filename** | `reference.wav` | +| **Duration** | 5–15 seconds (10s ideal) | +| **Format** | WAV (PCM) | +| **Sample Rate**| 24 kHz (will be resampled if different) | +| **Channels** | Mono (1 channel) | +| **Bit Depth** | 16-bit or 32-bit float | + +### Why These Parameters? + +- **5–15 seconds**: Enough for the TTS model to capture voice characteristics without being too long for memory. +- **24 kHz mono**: Native sample rate of the Qwen3-TTS model. Audio at other rates is resampled automatically. Note that the current loader decodes every clip through a 16 kHz mono path before resampling to 24 kHz, so supplying 24 kHz audio does not by itself avoid a resampling step. +- **Clear speech**: Minimal background noise, no music overlay. A single speaker only. + +## How to Obtain Reference Audio + +### Option 1: Record or Find English Speech + +The best reference audio is a clean clip of the target voice speaking English. 
Sources: + +- **Anime convention panels or interviews** where the VA speaks English +- **Behind-the-scenes clips** from Chainsaw Man production +- **Fan events or promotional videos** with English speech segments + +### Option 2: Extract from YouTube + +You can extract audio from YouTube clips. Here are some potential sources: + +1. Search YouTube for: `"Tomori Kusunoki" english` or `"楠木ともり" english` +2. Look for interview clips, event recordings, or promotional content + +**Extraction steps using `yt-dlp` and `ffmpeg`:** + +```bash +# 1. Download audio from a YouTube clip +yt-dlp -x --audio-format wav -o "raw_audio.%(ext)s" "YOUTUBE_URL_HERE" + +# 2. Convert to 24kHz mono WAV, trimming to a 10-second segment +# Adjust -ss (start time) and -t (duration) as needed +ffmpeg -i raw_audio.wav \ + -ss 00:00:05 -t 00:00:10 \ + -ar 24000 -ac 1 \ + -acodec pcm_s16le \ + voices/makima/reference.wav + +# 3. Verify the output +ffprobe -v error -show_entries stream=sample_rate,channels,duration \ + -of default=noprint_wrappers=1 voices/makima/reference.wav +``` + +### Option 3: Use Any Japanese-Accented English Voice + +If you cannot find clips of the specific VA, any clear recording of a female Japanese speaker speaking English will work as a starting point. The voice cloning will adapt to the reference audio's characteristics. + +```bash +# Example: record your own reference using sox (if available) +sox -d -r 24000 -c 1 -b 16 voices/makima/reference.wav trim 0 10 +``` + +## Tips for Best Quality + +1. **Clean audio**: Remove any background music or noise. Use a noise gate or audio editor if needed. +2. **Natural speech**: Conversational tone works better than reading. The model captures prosody and rhythm. +3. **Consistent volume**: Normalize the audio to avoid clipping or very quiet segments. +4. **Single speaker**: Only the target voice should be present in the clip. 
+ +```bash +# Normalize audio volume with ffmpeg +ffmpeg -i reference_raw.wav \ + -af "loudnorm=I=-16:TP=-1.5:LRA=11" \ + -ar 24000 -ac 1 -acodec pcm_s16le \ + voices/makima/reference.wav +``` + +## File Structure + +``` +voices/makima/ +├── manifest.json # Voice metadata (name, sample rate, backend) +├── reference.wav # Reference audio clip (YOU PROVIDE THIS) +└── README.md # This file +``` + +## Verification + +After placing `reference.wav`, you can verify the system loads it correctly: + +```bash +# The TTS handler will log voice loading on first speak request: +# INFO makima::server::handlers::voice: Loaded voice reference audio +# voice_id="makima" voice_name="Makima" samples_len=240000 duration_secs=10.0 +``` + +If the reference audio is missing, the TTS system will still work but without voice cloning — it will use the model's default voice instead. -- cgit v1.2.3 From b141fca0c0604bdeba9fa563a8049cf29cc03bcf Mon Sep 17 00:00:00 2001 From: soryu Date: Wed, 28 Jan 2026 03:51:07 +0000 Subject: Fix starting phase dropdown to show correct phase names from templates (#42) * Add comprehensive Red Team system specification Defines the adversarial review feature for contracts that monitors work tasks in real-time to catch quality issues, plan deviations, and standards violations. 
Key components specified: - Contract configuration (red_team_enabled, red_team_prompt) - Red team task lifecycle and spawning logic - makima red-team notify CLI command for supervisor alerts - Task output subscription for real-time monitoring - Database schema changes (contracts, tasks, notifications table) - API endpoints for notification and status - System prompt template for red team behavior - Security considerations and access control Co-Authored-By: Claude Opus 4.5 * Task completion checkpoint * Task completion checkpoint * Task completion checkpoint * Fix starting phase dropdown to show correct phase names from templates Add phaseNames map to ContractTypeTemplate to preserve display names from custom templates loaded from localStorage. The dropdown now uses the template's phase name (e.g., 'Design & Architecture') instead of naive capitalization of the phase ID. Falls back to capitalization for built-in templates that don't provide phaseNames. Co-Authored-By: Claude Opus 4.5 --------- Co-authored-by: Claude Opus 4.5 --- makima/frontend/src/lib/api.ts | 2 ++ makima/frontend/src/routes/contracts.tsx | 19 +++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/makima/frontend/src/lib/api.ts b/makima/frontend/src/lib/api.ts index ca04ce7..c9648a2 100644 --- a/makima/frontend/src/lib/api.ts +++ b/makima/frontend/src/lib/api.ts @@ -1636,6 +1636,8 @@ export interface ContractTypeTemplate { defaultPhase: ContractPhase; /** Whether this is a built-in type (always available) */ isBuiltin: boolean; + /** Optional mapping from phase ID to display name */ + phaseNames?: Record; } /** Response from list contract types endpoint */ diff --git a/makima/frontend/src/routes/contracts.tsx b/makima/frontend/src/routes/contracts.tsx index aa62bd9..9891f29 100644 --- a/makima/frontend/src/routes/contracts.tsx +++ b/makima/frontend/src/routes/contracts.tsx @@ -108,11 +108,12 @@ function ContractsPageContent() { // Convert user templates to 
ContractTypeTemplate format, excluding built-ins return templates .filter((t: { isBuiltIn?: boolean }) => !t.isBuiltIn) - .map((t: { id: string; name: string; description: string; phases: { id: string }[] }) => ({ + .map((t: { id: string; name: string; description: string; phases: { id: string; name: string }[] }) => ({ id: t.id, name: t.name, description: t.description, phases: t.phases.map((p: { id: string }) => p.id) as ContractPhase[], + phaseNames: Object.fromEntries(t.phases.map((p: { id: string; name: string }) => [p.id, p.name])), defaultPhase: (t.phases[0]?.id || "execute") as ContractPhase, isBuiltin: false, })); @@ -652,11 +653,17 @@ function ContractsPageContent() { onChange={(e) => setInitialPhase(e.target.value as ContractPhase)} className="w-full px-3 py-2 bg-[#0d1b2d] border border-[#3f6fb3] text-[#dbe7ff] font-mono text-sm focus:outline-none focus:border-[#75aafc]" > - {(contractTypes.find((t) => t.id === contractType)?.phases || []).map((phase) => ( - - ))} + {(() => { + const template = contractTypes.find((t) => t.id === contractType); + return (template?.phases || []).map((phase) => { + const displayName = template?.phaseNames?.[phase] || (phase.charAt(0).toUpperCase() + phase.slice(1)); + return ( + + ); + }); + })()}

{contractType === "simple" -- cgit v1.2.3 -- cgit v1.2.3 From d7b0b576fb43902535f0ae8d4f257b50387ec01a Mon Sep 17 00:00:00 2001 From: soryu Date: Thu, 29 Jan 2026 01:04:42 +0000 Subject: fix: Add Qwen3-TTS model download to Docker build (#44) * chore: fix unused import warnings in qwen3-tts module - Remove unused import 'IndexOp' in model.rs - Remove unused import 'DType' in speech_tokenizer.rs - Add #[allow(dead_code)] to codebook_dim field in RvqCodebook Co-Authored-By: Claude Opus 4.5 * feat: add voice loading and selection for TTS cloning Add voice reference audio loading so the TTS speak handler can perform voice cloning using reference WAV files from the voices/ directory. - Add voice.rs module: loads manifest.json and reference.wav for a given voice_id, decodes via symphonia, resamples to 24kHz for the TTS engine - Update speak.rs: resolve voice_id from the speak request (default "makima"), load reference audio, pass it to engine.generate() - Add voices/makima/README.md with instructions for obtaining reference audio (extraction from YouTube, recording, ffmpeg conversion) - Graceful fallback: if reference audio is missing, TTS proceeds without voice cloning using the model's default voice Co-Authored-By: Claude Opus 4.5 * feat: add inference cancellation support for TTS generation Add cooperative cancellation via Arc cancel flag that threads through TtsEngine::generate -> Qwen3Tts -> GenerationContext. The autoregressive loop and streaming decoder check the flag each iteration and break early when set. The speak WebSocket handler creates a per-session flag, passes it to generate, and sets it on Cancel/Stop/Close messages. 
Co-Authored-By: Claude Opus 4.5 * Add Qwen3-TTS model download to build process Fix TTS engine failure due to missing tokenizer by downloading Qwen3-TTS models during Docker build: - Download model.safetensors, config.json, tokenizer.json, and tokenizer_config.json from Qwen/Qwen3-TTS-12Hz-0.6B-Base - Download speech tokenizer from Qwen/Qwen3-TTS-Tokenizer-12Hz - Add QWEN3_TTS_DIR environment variable to Dockerfile - Script supports both env var override and default path Co-Authored-By: Claude Opus 4.5 --------- Co-authored-by: Claude Opus 4.5 --- Dockerfile | 2 ++ makima/sh/download-models.sh | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/Dockerfile b/Dockerfile index e6c3d8b..a7d093c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,6 +19,7 @@ RUN chmod +x /app/download-models.sh ARG MODEL_BASE_URL ENV MODEL_BASE_URL=${MODEL_BASE_URL} ENV MODELS_DIR=/models +ENV QWEN3_TTS_DIR=/app/models/qwen3-tts RUN /app/download-models.sh echo "Models downloaded" # Copy workspace files @@ -40,6 +41,7 @@ ENV RUST_LOG=makima=info,tower_http=info ENV PARAKEET_MODEL_DIR=/app/models/parakeet-tdt-0.6b-v3 ENV PARAKEET_EOU_DIR=/app/models/realtime_eou_120m-v1-onnx ENV SORTFORMER_MODEL_PATH=/app/models/diarization/diar_streaming_sortformer_4spk-v2.1.onnx +ENV QWEN3_TTS_DIR=/app/models/qwen3-tts EXPOSE 8080 diff --git a/makima/sh/download-models.sh b/makima/sh/download-models.sh index 0381e15..1aefad8 100755 --- a/makima/sh/download-models.sh +++ b/makima/sh/download-models.sh @@ -114,5 +114,40 @@ else echo "All models downloaded successfully" fi +# Download Qwen3-TTS models (for TTS functionality) +QWEN3_TTS_DIR="${QWEN3_TTS_DIR:-/app/models/qwen3-tts}" + +download_qwen3_tts() { + if [ -d "$QWEN3_TTS_DIR" ] && [ -f "$QWEN3_TTS_DIR/model.safetensors" ] && [ -f "$QWEN3_TTS_DIR/speech_tokenizer.safetensors" ]; then + echo "Qwen3-TTS models already exist, skipping..." + return 0 + fi + + echo "Downloading Qwen3-TTS models..." 
+ mkdir -p "$QWEN3_TTS_DIR" + + # Download base TTS model files from Qwen/Qwen3-TTS-12Hz-0.6B-Base + echo "Downloading Qwen3-TTS-12Hz-0.6B-Base..." + huggingface-cli download Qwen/Qwen3-TTS-12Hz-0.6B-Base \ + model.safetensors \ + config.json \ + tokenizer.json \ + tokenizer_config.json \ + --local-dir "$QWEN3_TTS_DIR" + + # Download speech tokenizer from Qwen/Qwen3-TTS-Tokenizer-12Hz + echo "Downloading Qwen3-TTS-Tokenizer-12Hz..." + local tmpdir=$(mktemp -d) + huggingface-cli download Qwen/Qwen3-TTS-Tokenizer-12Hz \ + model.safetensors \ + --local-dir "$tmpdir" + mv "$tmpdir/model.safetensors" "$QWEN3_TTS_DIR/speech_tokenizer.safetensors" + rm -rf "$tmpdir" + + echo "Qwen3-TTS models downloaded successfully" +} + +download_qwen3_tts + # Execute the main command exec "$@" -- cgit v1.2.3 From f6a40e2304585f140ed5766b25fe71a6958f4425 Mon Sep 17 00:00:00 2001 From: soryu Date: Thu, 29 Jan 2026 01:14:17 +0000 Subject: Fix makima supervisor pr CLI command --- makima/src/bin/makima.rs | 4 +-- makima/src/daemon/api/supervisor.rs | 9 ++---- makima/src/daemon/cli/supervisor.rs | 8 ++---- makima/src/daemon/task/manager.rs | 40 +++++++++++--------------- makima/src/daemon/ws/protocol.rs | 4 ++- makima/src/server/handlers/mesh_supervisor.rs | 41 +++++++++++++-------------- makima/src/server/state.rs | 4 ++- 7 files changed, 49 insertions(+), 61 deletions(-) diff --git a/makima/src/bin/makima.rs b/makima/src/bin/makima.rs index 44fa590..8e83565 100644 --- a/makima/src/bin/makima.rs +++ b/makima/src/bin/makima.rs @@ -439,10 +439,10 @@ async fn run_supervisor( } SupervisorCommand::Pr(args) => { let client = ApiClient::new(args.common.api_url, args.common.api_key)?; - eprintln!("Creating PR for task {}...", args.task_id); + eprintln!("Creating PR for branch {}...", args.branch); let body = args.body.as_deref().unwrap_or(""); let result = client - .supervisor_pr(args.task_id, &args.title, body, &args.base) + .supervisor_pr(&args.branch, &args.title, body) .await?; println!("{}", 
serde_json::to_string(&result.0)?); } diff --git a/makima/src/daemon/api/supervisor.rs b/makima/src/daemon/api/supervisor.rs index 6b99de0..c841b21 100644 --- a/makima/src/daemon/api/supervisor.rs +++ b/makima/src/daemon/api/supervisor.rs @@ -54,10 +54,9 @@ pub struct MergeRequest { #[derive(Serialize)] #[serde(rename_all = "camelCase")] pub struct CreatePrRequest { - pub task_id: Uuid, + pub branch: String, pub title: String, pub body: String, - pub base_branch: String, } #[derive(Serialize)] @@ -165,16 +164,14 @@ impl ApiClient { /// Create a pull request. pub async fn supervisor_pr( &self, - task_id: Uuid, + branch: &str, title: &str, body: &str, - base_branch: &str, ) -> Result { let req = CreatePrRequest { - task_id, + branch: branch.to_string(), title: title.to_string(), body: body.to_string(), - base_branch: base_branch.to_string(), }; self.post("/api/v1/mesh/supervisor/pr", &req).await } diff --git a/makima/src/daemon/cli/supervisor.rs b/makima/src/daemon/cli/supervisor.rs index 09f61db..9ad7aef 100644 --- a/makima/src/daemon/cli/supervisor.rs +++ b/makima/src/daemon/cli/supervisor.rs @@ -128,9 +128,9 @@ pub struct PrArgs { #[command(flatten)] pub common: SupervisorArgs, - /// Task ID to create PR for + /// Branch name to create PR from (e.g., "makima/feature-name") #[arg(index = 1)] - pub task_id: Uuid, + pub branch: String, /// PR title #[arg(long)] @@ -139,10 +139,6 @@ pub struct PrArgs { /// PR body/description #[arg(long)] pub body: Option, - - /// Base branch (default: main) - #[arg(long, default_value = "main")] - pub base: String, } /// Arguments for diff command. 
diff --git a/makima/src/daemon/task/manager.rs b/makima/src/daemon/task/manager.rs index f0da860..8c5f8d7 100644 --- a/makima/src/daemon/task/manager.rs +++ b/makima/src/daemon/task/manager.rs @@ -669,7 +669,7 @@ makima supervisor wait "$TASK_ID" makima supervisor merge "$TASK_ID" --to "makima/user-authentication" # Step 3: All tasks complete - create PR from makima branch -makima supervisor pr "makima/user-authentication" --title "Add user authentication" --base main +makima supervisor pr "makima/user-authentication" --title "Add user authentication" ``` ## Available Tools (via makima supervisor) @@ -701,7 +701,7 @@ makima supervisor branch [--from ] makima supervisor merge [--to ] [--squash] # Create a pull request -makima supervisor pr --title "Title" [--body "Body"] [--base main] +makima supervisor pr --title "Title" [--body "Body"] # View a task's diff makima supervisor diff @@ -838,7 +838,7 @@ Common deliverable IDs by phase: 3. **wait blocks until complete** - you MUST call this to know when a task finishes 4. **Never fire-and-forget** - always wait for each task before moving on 5. **Merge to your makima branch** - use `merge --to "makima/{name}"` to collect completed work -6. **Create PR when done** - use `pr "makima/{name}" --title "..." --base main` +6. **Create PR when done** - use `pr "makima/{name}" --title "..."` 7. **Ask when unsure** - use `ask` to get user feedback on decisions ## Standard Workflow @@ -849,7 +849,7 @@ Common deliverable IDs by phase: - `wait` - Block until complete - `merge --to "makima/{name}"` - Merge to branch 3. `ask "Ready to create PR?"` - Get user approval -4. `pr "makima/{name}" --title "..." --base main` - Create PR +4. `pr "makima/{name}" --title "..."` - Create PR ## Important Reminders @@ -875,7 +875,7 @@ When you receive an `[ACTION REQUIRED]` message from the system: After all tasks are "done" and merged, you MUST take the following actions: **If in execute phase:** -1. 
Create PR immediately: `makima supervisor pr "makima/{name}" --title "..." --base main` +1. Create PR immediately: `makima supervisor pr "makima/{name}" --title "..."` 2. After PR created: - Simple contract: Mark complete with `makima supervisor complete` - Specification contract: Advance to review with `makima supervisor advance-phase review` @@ -2016,14 +2016,16 @@ impl TaskManager { title, body, base_branch, + branch, } => { tracing::info!( task_id = %task_id, title = %title, base_branch = %base_branch, + branch = %branch, "Creating pull request" ); - self.handle_create_pr(task_id, title, body, base_branch).await?; + self.handle_create_pr(task_id, title, body, base_branch, branch).await?; } DaemonCommand::GetTaskDiff { task_id, @@ -3135,6 +3137,7 @@ impl TaskManager { title: String, body: Option, base_branch: String, + branch: String, ) -> Result<(), DaemonError> { // Get worktree path - this works even for completed tasks by scanning worktrees directory let worktree_path = match self.get_task_worktree_path(task_id).await { @@ -3153,30 +3156,19 @@ impl TaskManager { } }; - // Get base_branch from in-memory tasks if available (for fallback) - let task_base_branch = { - let tasks = self.tasks.read().await; - tasks.get(&task_id).and_then(|t| t.base_branch.clone()) - }; - - // Use task's base_branch if the provided one is the default "main" and task has a detected one - let effective_base_branch = if base_branch == "main" { - task_base_branch.unwrap_or(base_branch) - } else { - base_branch - }; - tracing::info!( task_id = %task_id, - effective_base_branch = %effective_base_branch, + base_branch = %base_branch, + branch = %branch, worktree_path = %worktree_path.display(), - "Creating PR with effective base branch" + "Creating PR" ); - // Push the current branch first + // Push the branch to origin + let push_refspec = format!("HEAD:refs/heads/{}", branch); let push_result = tokio::process::Command::new("git") .current_dir(&worktree_path) - .args(["push", "-u", 
"origin", "HEAD"]) + .args(["push", "-u", "origin", &push_refspec]) .output() .await; @@ -3195,7 +3187,7 @@ impl TaskManager { // Create PR using gh CLI let mut pr_cmd = tokio::process::Command::new("gh"); pr_cmd.current_dir(&worktree_path); - pr_cmd.args(["pr", "create", "--title", &title, "--base", &effective_base_branch]); + pr_cmd.args(["pr", "create", "--title", &title, "--base", &base_branch, "--head", &branch]); if let Some(ref body_text) = body { pr_cmd.args(["--body", body_text]); diff --git a/makima/src/daemon/ws/protocol.rs b/makima/src/daemon/ws/protocol.rs index bd13975..e971798 100644 --- a/makima/src/daemon/ws/protocol.rs +++ b/makima/src/daemon/ws/protocol.rs @@ -693,9 +693,11 @@ pub enum DaemonCommand { task_id: Uuid, title: String, body: Option, - /// Base branch for the PR (default: main). + /// Base branch for the PR. #[serde(rename = "baseBranch")] base_branch: String, + /// Source branch name to push and create PR from. + branch: String, }, /// Get the diff for a task's changes. diff --git a/makima/src/server/handlers/mesh_supervisor.rs b/makima/src/server/handlers/mesh_supervisor.rs index 016367f..a0a3a96 100644 --- a/makima/src/server/handlers/mesh_supervisor.rs +++ b/makima/src/server/handlers/mesh_supervisor.rs @@ -1267,15 +1267,9 @@ pub struct MergeTaskResponse { #[derive(Debug, Deserialize, ToSchema)] #[serde(rename_all = "camelCase")] pub struct CreatePRRequest { - pub task_id: Uuid, + pub branch: String, pub title: String, pub body: Option, - #[serde(default = "default_base_branch")] - pub base_branch: String, -} - -fn default_base_branch() -> String { - "main".to_string() } /// Response for PR creation. 
@@ -1513,48 +1507,53 @@ pub async fn create_pr( headers: HeaderMap, Json(request): Json, ) -> impl IntoResponse { - let (_supervisor_id, owner_id) = match verify_supervisor_auth(&state, &headers, None).await { + let (supervisor_id, _owner_id) = match verify_supervisor_auth(&state, &headers, None).await { Ok(ids) => ids, Err(e) => return e.into_response(), }; let pool = state.db_pool.as_ref().unwrap(); - // Get the target task - let task = match repository::get_task_for_owner(pool, request.task_id, owner_id).await { + // Get the supervisor's own task to find daemon and base_branch + let task = match repository::get_task(pool, supervisor_id).await { Ok(Some(t)) => t, Ok(None) => { return ( StatusCode::NOT_FOUND, - Json(ApiError::new("NOT_FOUND", "Task not found")), + Json(ApiError::new("NOT_FOUND", "Supervisor task not found")), ).into_response(); } Err(e) => { - tracing::error!(error = %e, "Failed to get task"); + tracing::error!(error = %e, "Failed to get supervisor task"); return ( StatusCode::INTERNAL_SERVER_ERROR, - Json(ApiError::new("DB_ERROR", "Failed to get task")), + Json(ApiError::new("DB_ERROR", "Failed to get supervisor task")), ).into_response(); } }; - // Get daemon running the task + // Get daemon running the supervisor let Some(daemon_id) = task.daemon_id else { return ( StatusCode::SERVICE_UNAVAILABLE, - Json(ApiError::new("NO_DAEMON", "Task has no assigned daemon")), + Json(ApiError::new("NO_DAEMON", "Supervisor has no assigned daemon")), ).into_response(); }; + // Use base_branch from the task's repository config, falling back to "main" + let base_branch = task.base_branch.unwrap_or_else(|| "main".to_string()); + // Subscribe to PR results BEFORE sending the command let mut rx = state.pr_results.subscribe(); - // Send CreatePR command to daemon + // Send CreatePR command to daemon using the supervisor's task ID + // (the branch is in the supervisor's worktree) let cmd = DaemonCommand::CreatePR { - task_id: request.task_id, + task_id: 
supervisor_id, title: request.title.clone(), body: request.body.clone(), - base_branch: request.base_branch.clone(), + base_branch, + branch: request.branch.clone(), }; if let Err(e) = state.send_daemon_command(daemon_id, cmd).await { @@ -1571,7 +1570,7 @@ pub async fn create_pr( loop { match rx.recv().await { Ok(notification) => { - if notification.task_id == request.task_id { + if notification.task_id == supervisor_id { return Some(notification); } // Not our task, keep waiting @@ -1594,7 +1593,7 @@ pub async fn create_pr( ( status, Json(CreatePRResponse { - task_id: request.task_id, + task_id: supervisor_id, success: notification.success, message: notification.message, pr_url: notification.pr_url, @@ -1607,7 +1606,7 @@ pub async fn create_pr( ( StatusCode::GATEWAY_TIMEOUT, Json(CreatePRResponse { - task_id: request.task_id, + task_id: supervisor_id, success: false, message: "PR creation timed out waiting for daemon response".to_string(), pr_url: None, diff --git a/makima/src/server/state.rs b/makima/src/server/state.rs index bf8f6f2..041b101 100644 --- a/makima/src/server/state.rs +++ b/makima/src/server/state.rs @@ -461,9 +461,11 @@ pub enum DaemonCommand { task_id: Uuid, title: String, body: Option, - /// Base branch for the PR (default: main) + /// Base branch for the PR #[serde(rename = "baseBranch")] base_branch: String, + /// Source branch name to push and create PR from + branch: String, }, /// Get the diff for a task's changes -- cgit v1.2.3 From 45a433c0eb63cae1322203ee14292f1c427a09c9 Mon Sep 17 00:00:00 2001 From: soryu Date: Thu, 29 Jan 2026 01:26:17 +0000 Subject: feat: Add Red Team UI to makima/frontend contract creation (#45) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add redTeamEnabled and redTeamPrompt state to contracts page - Add "Enable Red Team Monitoring" checkbox with description - Add conditional "Custom Review Criteria" textarea when enabled - Include redTeamEnabled/redTeamPrompt in 
CreateContractRequest - Reset red team fields when canceling contract creation - Add redTeamEnabled to ContractSummary and Contract types - Add redTeamEnabled/redTeamPrompt to CreateContractRequest type - Add Red Team badge (🔍) to ContractList for enabled contracts Co-authored-by: Claude Opus 4.5 --- .../src/components/contracts/ContractList.tsx | 5 ++ makima/frontend/src/lib/api.ts | 10 ++++ makima/frontend/src/routes/contracts.tsx | 59 ++++++++++++++++++++++ 3 files changed, 74 insertions(+) diff --git a/makima/frontend/src/components/contracts/ContractList.tsx b/makima/frontend/src/components/contracts/ContractList.tsx index 98f8ff6..532ab87 100644 --- a/makima/frontend/src/components/contracts/ContractList.tsx +++ b/makima/frontend/src/components/contracts/ContractList.tsx @@ -136,6 +136,11 @@ export function ContractList({ Local )} + {contract.redTeamEnabled && ( + + 🔍 Red Team + + )} ([]); const [contractTypesLoading, setContractTypesLoading] = useState(false); const [localOnly, setLocalOnly] = useState(false); + const [redTeamEnabled, setRedTeamEnabled] = useState(false); + const [redTeamPrompt, setRedTeamPrompt] = useState(""); // Fetch contract types when modal opens - merges built-in types with user templates useEffect(() => { @@ -266,6 +268,8 @@ function ContractsPageContent() { contractType: contractType, initialPhase: initialPhase !== defaultPhaseForType ? initialPhase : undefined, localOnly: localOnly || undefined, + redTeamEnabled: redTeamEnabled || undefined, + redTeamPrompt: redTeamEnabled && redTeamPrompt.trim() ? redTeamPrompt.trim() : undefined, }; try { @@ -341,6 +345,8 @@ function ContractsPageContent() { setRepoUrl(""); setRepoPath(""); setLocalOnly(false); + setRedTeamEnabled(false); + setRedTeamPrompt(""); setCreateError(null); }, []); @@ -712,6 +718,59 @@ function ContractsPageContent() {

+ {/* Red Team Monitoring */} +
+
+ + +
+

+ Spawns a parallel task to monitor work output for quality and compliance. +

+ {redTeamEnabled && ( +
+ +