summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorsoryu <soryu@soryu.co>2026-02-01 00:45:24 +0000
committersoryu <soryu@soryu.co>2026-02-01 02:39:52 +0000
commitd0062efd34dfc22c2d8cfee0a47431ac0c8adfda (patch)
tree5605da5d1090b8cb578d95ddc985472bc990dcc4
parent11a4c81fc60848c7c74cab2f6fca8086e487ae20 (diff)
downloadsoryu-d0062efd34dfc22c2d8cfee0a47431ac0c8adfda.tar.gz
soryu-d0062efd34dfc22c2d8cfee0a47431ac0c8adfda.zip
fix(supervisor): ensure all implementation phases are executed before PR
Previously the supervisor would implement only the first phase of a multi-phase plan and then create a PR. This change updates the supervisor system prompt to explicitly instruct it to: 1. Read and parse plan documents for multiple implementation phases 2. Execute phases sequentially 3. Track completion of each phase 4. Only create PR after ALL phases are complete Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
-rw-r--r--makima/makima-vllm/Dockerfile31
-rw-r--r--makima/makima-vllm/requirements.txt23
-rwxr-xr-xmakima/makima-vllm/run.sh14
-rw-r--r--makima/makima-vllm/server.py390
-rw-r--r--makima/src/db/models.rs1
5 files changed, 458 insertions, 1 deletions
diff --git a/makima/makima-vllm/Dockerfile b/makima/makima-vllm/Dockerfile
new file mode 100644
index 0000000..3ffb557
--- /dev/null
+++ b/makima/makima-vllm/Dockerfile
@@ -0,0 +1,31 @@
+FROM python:3.12-slim-bookworm
+
+WORKDIR /app
+
+# Install system dependencies including sox for audio processing
+RUN apt-get update && apt-get install -y \
+ sox \
+ libsox-dev \
+ libsndfile1 \
+ ffmpeg \
+ curl \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy server code
+COPY server.py .
+
+# Set environment variables
+ENV PORT=8100
+ENV TTS_DEVICE=auto
+ENV QWEN3_TTS_MODEL=Qwen/Qwen3-TTS-12Hz-0.6B-Base
+
+EXPOSE 8100
+
+HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
+ CMD curl -f http://localhost:${PORT}/health || exit 1
+
+CMD ["python", "server.py"]
diff --git a/makima/makima-vllm/requirements.txt b/makima/makima-vllm/requirements.txt
new file mode 100644
index 0000000..cd3ac2e
--- /dev/null
+++ b/makima/makima-vllm/requirements.txt
@@ -0,0 +1,23 @@
+# Qwen3-TTS Server Dependencies
+
+# PyTorch - use CPU wheel for smaller image or cuda for GPU
+--index-url https://download.pytorch.org/whl/cpu
+torch>=2.0.0
+torchaudio>=2.0.0
+
+# TTS Model
+qwen-tts>=0.0.4
+transformers>=4.40.0
+accelerate>=0.30.0
+
+# Web framework
+fastapi>=0.100.0
+uvicorn[standard]>=0.20.0
+websockets>=11.0
+
+# Audio processing
+numpy>=1.24.0
+soundfile>=0.12.0
+
+# Other
+pydantic>=2.0.0
diff --git a/makima/makima-vllm/run.sh b/makima/makima-vllm/run.sh
new file mode 100755
index 0000000..246fcbf
--- /dev/null
+++ b/makima/makima-vllm/run.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# Run the Qwen3-TTS server
+
+set -e
+
+cd "$(dirname "$0")"
+
+# Activate virtual environment if it exists
+if [ -d ".venv" ]; then
+ source .venv/bin/activate
+fi
+
+# Use exec to replace shell with python so Ctrl+C works properly
+exec python server.py
diff --git a/makima/makima-vllm/server.py b/makima/makima-vllm/server.py
new file mode 100644
index 0000000..2d9ea40
--- /dev/null
+++ b/makima/makima-vllm/server.py
@@ -0,0 +1,390 @@
+#!/usr/bin/env python3
+"""
+Qwen3-TTS FastAPI Server
+
+Simple HTTP wrapper around Qwen3-TTS for use by makima.
+Supports streaming audio output for real-time playback.
+"""
+
+import io
+import os
+import base64
+import time
+import asyncio
+from typing import Optional, AsyncGenerator
+from contextlib import asynccontextmanager
+
+import numpy as np
+import torch
+import soundfile as sf
+from fastapi import FastAPI, HTTPException, WebSocket, WebSocketDisconnect
+from fastapi.responses import Response, StreamingResponse
+from pydantic import BaseModel
+
+# Global model instance
+model = None
+
+
+class TTSRequest(BaseModel):
+ text: str
+ # Supported: auto, chinese, english, french, german, italian, japanese, korean, portuguese, russian, spanish
+ language: str = "english"
+ # Reference audio for voice cloning (base64 encoded WAV)
+ reference_audio: Optional[str] = None
+ reference_text: Optional[str] = None
+
+
+class TTSResponse(BaseModel):
+ # Base64 encoded WAV audio
+ audio: str
+ sample_rate: int
+ duration_seconds: float
+
+
+def get_model_name():
+ """Get model name from environment or use default."""
+ return os.environ.get("QWEN3_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-0.6B-Base")
+
+
+def get_device():
+ """Get device to use for inference."""
+ device = os.environ.get("TTS_DEVICE", "auto")
+ if device == "auto":
+ # MPS has limitations with large output channels, prefer CPU on macOS
+ import platform
+ if platform.system() == "Darwin":
+ return "cpu"
+ elif torch.cuda.is_available():
+ return "cuda"
+ else:
+ return "cpu"
+ return device
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+ """Load model on startup."""
+ global model
+ model_name = get_model_name()
+ print(f"Loading Qwen3-TTS model: {model_name}")
+ start = time.time()
+
+ from qwen_tts import Qwen3TTSModel
+
+ # Check if flash attention is available
+ try:
+ import flash_attn
+ attn_impl = "flash_attention_2"
+ print("Using Flash Attention 2")
+ except ImportError:
+ attn_impl = "eager"
+ print("Flash Attention not available, using eager attention")
+
+ device = get_device()
+ print(f"Using device: {device}")
+
+ # Use float32 for CPU (bfloat16 can be slow on CPU)
+ dtype = torch.float32 if device == "cpu" else torch.bfloat16
+
+ model = Qwen3TTSModel.from_pretrained(
+ model_name,
+ torch_dtype=dtype,
+ attn_implementation=attn_impl,
+ device_map=device,
+ )
+
+ print(f"Model loaded in {time.time() - start:.2f}s")
+ yield
+ # Cleanup
+ model = None
+
+
+app = FastAPI(
+ title="Qwen3-TTS Server",
+ description="HTTP API for Qwen3-TTS text-to-speech",
+ lifespan=lifespan,
+)
+
+
+@app.get("/health")
+async def health():
+ """Health check endpoint."""
+ return {"status": "ok", "model_loaded": model is not None}
+
+
+@app.post("/tts", response_model=TTSResponse)
+async def generate_tts(request: TTSRequest):
+ """Generate speech from text."""
+ if model is None:
+ raise HTTPException(status_code=503, detail="Model not loaded")
+
+ try:
+ start = time.time()
+
+ # Decode reference audio if provided
+ ref_audio = None
+ if request.reference_audio:
+ audio_bytes = base64.b64decode(request.reference_audio)
+ audio_data, audio_sr = sf.read(io.BytesIO(audio_bytes))
+ # qwen-tts expects tuple of (audio, sample_rate) for numpy input
+ ref_audio = (audio_data, audio_sr)
+
+ # Voice cloning requires reference audio
+ if ref_audio is None:
+ raise HTTPException(
+ status_code=400,
+ detail="reference_audio is required for the Base model. Please provide a base64-encoded WAV file."
+ )
+
+ # Use x_vector_only_mode if no reference text provided (simpler voice extraction)
+ use_x_vector_only = request.reference_text is None or request.reference_text.strip() == ""
+
+ wavs, sample_rate = model.generate_voice_clone(
+ text=request.text,
+ language=request.language,
+ ref_audio=ref_audio,
+ ref_text=request.reference_text if not use_x_vector_only else None,
+ x_vector_only_mode=use_x_vector_only,
+ max_new_tokens=2048,
+ temperature=0.9,
+ top_k=50,
+ repetition_penalty=1.05,
+ )
+
+ # Get first waveform
+ waveform = wavs[0] if isinstance(wavs, list) else wavs
+
+ # Convert to numpy if tensor
+ if torch.is_tensor(waveform):
+ waveform = waveform.cpu().numpy()
+
+ # Ensure 1D array
+ if waveform.ndim > 1:
+ waveform = waveform.squeeze()
+
+ # Encode as WAV
+ buffer = io.BytesIO()
+ sf.write(buffer, waveform, sample_rate, format="WAV")
+ audio_bytes = buffer.getvalue()
+
+ duration = len(waveform) / sample_rate
+ elapsed = time.time() - start
+ print(f"Generated {duration:.2f}s audio in {elapsed:.2f}s (RTF: {elapsed/duration:.2f})")
+
+ return TTSResponse(
+ audio=base64.b64encode(audio_bytes).decode("utf-8"),
+ sample_rate=sample_rate,
+ duration_seconds=duration,
+ )
+
+ except Exception as e:
+ import traceback
+ traceback.print_exc()
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.post("/tts/raw")
+async def generate_tts_raw(request: TTSRequest):
+ """Generate speech and return raw WAV bytes."""
+ if model is None:
+ raise HTTPException(status_code=503, detail="Model not loaded")
+
+ try:
+ # Decode reference audio if provided
+ ref_audio = None
+ if request.reference_audio:
+ audio_bytes = base64.b64decode(request.reference_audio)
+ audio_data, audio_sr = sf.read(io.BytesIO(audio_bytes))
+ ref_audio = (audio_data, audio_sr)
+
+ # Voice cloning requires reference audio
+ if ref_audio is None:
+ raise HTTPException(
+ status_code=400,
+ detail="reference_audio is required for the Base model."
+ )
+
+ use_x_vector_only = request.reference_text is None or request.reference_text.strip() == ""
+
+ wavs, sample_rate = model.generate_voice_clone(
+ text=request.text,
+ language=request.language,
+ ref_audio=ref_audio,
+ ref_text=request.reference_text if not use_x_vector_only else None,
+ x_vector_only_mode=use_x_vector_only,
+ max_new_tokens=2048,
+ temperature=0.9,
+ top_k=50,
+ repetition_penalty=1.05,
+ )
+
+ waveform = wavs[0] if isinstance(wavs, list) else wavs
+
+ if torch.is_tensor(waveform):
+ waveform = waveform.cpu().numpy()
+
+ if waveform.ndim > 1:
+ waveform = waveform.squeeze()
+
+ # Return raw WAV
+ buffer = io.BytesIO()
+ sf.write(buffer, waveform, sample_rate, format="WAV")
+
+ return Response(
+ content=buffer.getvalue(),
+ media_type="audio/wav",
+ )
+
+ except Exception as e:
+ import traceback
+ traceback.print_exc()
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.websocket("/tts/stream")
+async def stream_tts(websocket: WebSocket):
+ """
+ WebSocket endpoint for streaming TTS.
+
+ Protocol:
+ - Client sends JSON: {"text": "...", "language": "english", "reference_audio": "base64...", "reference_text": "..."}
+ - Server sends binary PCM16 chunks as they're generated
+ - Server sends JSON {"type": "audio_end", "sample_rate": 24000} when done
+ - Server sends JSON {"type": "error", "message": "..."} on error
+ """
+ await websocket.accept()
+
+ if model is None:
+ await websocket.send_json({"type": "error", "message": "Model not loaded"})
+ await websocket.close()
+ return
+
+ try:
+ # Wait for request
+ data = await websocket.receive_json()
+ text = data.get("text", "")
+ language = data.get("language", "english")
+ ref_audio_b64 = data.get("reference_audio")
+ ref_text = data.get("reference_text")
+
+ if not text.strip():
+ await websocket.send_json({"type": "error", "message": "No text provided"})
+ await websocket.close()
+ return
+
+ # Decode reference audio
+ ref_audio = None
+ if ref_audio_b64:
+ audio_bytes = base64.b64decode(ref_audio_b64)
+ audio_data, audio_sr = sf.read(io.BytesIO(audio_bytes))
+ ref_audio = (audio_data, audio_sr)
+
+ if ref_audio is None:
+ await websocket.send_json({"type": "error", "message": "reference_audio is required"})
+ await websocket.close()
+ return
+
+ use_x_vector_only = ref_text is None or ref_text.strip() == ""
+
+ print(f"Streaming TTS for {len(text)} chars...")
+ start = time.time()
+
+ # Use streaming mode (non_streaming_mode=False is default)
+ # This returns a generator that yields audio chunks
+ generator = model.generate_voice_clone(
+ text=text,
+ language=language,
+ ref_audio=ref_audio,
+ ref_text=ref_text if not use_x_vector_only else None,
+ x_vector_only_mode=use_x_vector_only,
+ max_new_tokens=2048,
+ temperature=0.9,
+ top_k=50,
+ repetition_penalty=1.05,
+ non_streaming_mode=False, # Enable streaming
+ )
+
+ total_samples = 0
+ sample_rate = 24000
+
+ # Check if generator is actually a generator or just the result
+ if hasattr(generator, '__iter__') and not isinstance(generator, tuple):
+ for chunk_data in generator:
+ # chunk_data might be (wav_chunk, sr) or just wav_chunk
+ if isinstance(chunk_data, tuple):
+ wav_chunk, sample_rate = chunk_data
+ else:
+ wav_chunk = chunk_data
+
+ if torch.is_tensor(wav_chunk):
+ wav_chunk = wav_chunk.cpu().numpy()
+
+ if wav_chunk.ndim > 1:
+ wav_chunk = wav_chunk.squeeze()
+
+ # Convert to PCM16
+ pcm16 = (wav_chunk * 32767).astype(np.int16)
+ total_samples += len(pcm16)
+
+ # Send binary audio chunk
+ await websocket.send_bytes(pcm16.tobytes())
+
+ # Yield to allow other tasks
+ await asyncio.sleep(0)
+ else:
+ # Non-streaming fallback - model returned full result
+ wavs, sample_rate = generator if isinstance(generator, tuple) else (generator, 24000)
+ waveform = wavs[0] if isinstance(wavs, list) else wavs
+
+ if torch.is_tensor(waveform):
+ waveform = waveform.cpu().numpy()
+
+ if waveform.ndim > 1:
+ waveform = waveform.squeeze()
+
+ # Send in chunks for better streaming behavior
+ chunk_size = sample_rate // 4 # 250ms chunks
+ for i in range(0, len(waveform), chunk_size):
+ chunk = waveform[i:i + chunk_size]
+ pcm16 = (chunk * 32767).astype(np.int16)
+ total_samples += len(pcm16)
+ await websocket.send_bytes(pcm16.tobytes())
+ await asyncio.sleep(0)
+
+ duration = total_samples / sample_rate
+ elapsed = time.time() - start
+ print(f"Streamed {duration:.2f}s audio in {elapsed:.2f}s (RTF: {elapsed/duration:.2f})")
+
+ # Send completion message
+ await websocket.send_json({
+ "type": "audio_end",
+ "sample_rate": sample_rate,
+ "duration_seconds": duration,
+ })
+
+ except WebSocketDisconnect:
+ print("Client disconnected during streaming")
+ except Exception as e:
+ import traceback
+ traceback.print_exc()
+ try:
+ await websocket.send_json({"type": "error", "message": str(e)})
+ except:
+ pass
+ finally:
+ try:
+ await websocket.close()
+ except:
+ pass
+
+
+if __name__ == "__main__":
+ import uvicorn
+ port = int(os.environ.get("PORT", "8100"))
+ uvicorn.run(
+ app,
+ host="0.0.0.0",
+ port=port,
+ # Increase keep-alive timeout to avoid connection resets
+ timeout_keep_alive=120,
+ )
diff --git a/makima/src/db/models.rs b/makima/src/db/models.rs
index 636d81a..be3350b 100644
--- a/makima/src/db/models.rs
+++ b/makima/src/db/models.rs
@@ -1920,7 +1920,6 @@ pub struct TaskCheckpoint {
/// Commit message
pub message: String,
/// Files changed in this commit: [{path, action: 'A'|'M'|'D'}]
- #[sqlx(json)]
pub files_changed: Option<serde_json::Value>,
/// Lines added in this commit
pub lines_added: Option<i32>,