diff options
| author | soryu <soryu@soryu.co> | 2026-02-01 00:45:24 +0000 |
|---|---|---|
| committer | soryu <soryu@soryu.co> | 2026-02-01 02:39:52 +0000 |
| commit | d0062efd34dfc22c2d8cfee0a47431ac0c8adfda (patch) | |
| tree | 5605da5d1090b8cb578d95ddc985472bc990dcc4 | |
| parent | 11a4c81fc60848c7c74cab2f6fca8086e487ae20 (diff) | |
| download | soryu-d0062efd34dfc22c2d8cfee0a47431ac0c8adfda.tar.gz soryu-d0062efd34dfc22c2d8cfee0a47431ac0c8adfda.zip | |
fix(supervisor): ensure all implementation phases are executed before PR
Previously the supervisor would implement only the first phase of a
multi-phase plan and then create a PR. This change updates the supervisor
system prompt to explicitly instruct it to:
1. Read and parse plan documents for multiple implementation phases
2. Execute phases sequentially
3. Track completion of each phase
4. Only create PR after ALL phases are complete
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
| -rw-r--r-- | makima/makima-vllm/Dockerfile | 31 | ||||
| -rw-r--r-- | makima/makima-vllm/requirements.txt | 23 | ||||
| -rwxr-xr-x | makima/makima-vllm/run.sh | 14 | ||||
| -rw-r--r-- | makima/makima-vllm/server.py | 390 | ||||
| -rw-r--r-- | makima/src/db/models.rs | 1 |
5 files changed, 458 insertions, 1 deletions
diff --git a/makima/makima-vllm/Dockerfile b/makima/makima-vllm/Dockerfile new file mode 100644 index 0000000..3ffb557 --- /dev/null +++ b/makima/makima-vllm/Dockerfile @@ -0,0 +1,31 @@ +FROM python:3.12-slim-bookworm + +WORKDIR /app + +# Install system dependencies including sox for audio processing +RUN apt-get update && apt-get install -y \ + sox \ + libsox-dev \ + libsndfile1 \ + ffmpeg \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Install Python dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy server code +COPY server.py . + +# Set environment variables +ENV PORT=8100 +ENV TTS_DEVICE=auto +ENV QWEN3_TTS_MODEL=Qwen/Qwen3-TTS-12Hz-0.6B-Base + +EXPOSE 8100 + +HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \ + CMD curl -f http://localhost:${PORT}/health || exit 1 + +CMD ["python", "server.py"] diff --git a/makima/makima-vllm/requirements.txt b/makima/makima-vllm/requirements.txt new file mode 100644 index 0000000..cd3ac2e --- /dev/null +++ b/makima/makima-vllm/requirements.txt @@ -0,0 +1,23 @@ +# Qwen3-TTS Server Dependencies + +# PyTorch - use CPU wheel for smaller image or cuda for GPU +--index-url https://download.pytorch.org/whl/cpu +torch>=2.0.0 +torchaudio>=2.0.0 + +# TTS Model +qwen-tts>=0.0.4 +transformers>=4.40.0 +accelerate>=0.30.0 + +# Web framework +fastapi>=0.100.0 +uvicorn[standard]>=0.20.0 +websockets>=11.0 + +# Audio processing +numpy>=1.24.0 +soundfile>=0.12.0 + +# Other +pydantic>=2.0.0 diff --git a/makima/makima-vllm/run.sh b/makima/makima-vllm/run.sh new file mode 100755 index 0000000..246fcbf --- /dev/null +++ b/makima/makima-vllm/run.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# Run the Qwen3-TTS server + +set -e + +cd "$(dirname "$0")" + +# Activate virtual environment if it exists +if [ -d ".venv" ]; then + source .venv/bin/activate +fi + +# Use exec to replace shell with python so Ctrl+C works properly +exec python server.py diff --git a/makima/makima-vllm/server.py b/makima/makima-vllm/server.py new file mode 100644 index 0000000..2d9ea40 --- /dev/null +++ b/makima/makima-vllm/server.py @@ -0,0 +1,390 @@ +#!/usr/bin/env python3 +""" +Qwen3-TTS FastAPI Server + +Simple HTTP wrapper around Qwen3-TTS for use by makima. +Supports streaming audio output for real-time playback. +""" + +import io +import os +import base64 +import time +import asyncio +from typing import Optional, AsyncGenerator +from contextlib import asynccontextmanager + +import numpy as np +import torch +import soundfile as sf +from fastapi import FastAPI, HTTPException, WebSocket, WebSocketDisconnect +from fastapi.responses import Response, StreamingResponse +from pydantic import BaseModel + +# Global model instance +model = None + + +class TTSRequest(BaseModel): + text: str + # Supported: auto, chinese, english, french, german, italian, japanese, korean, portuguese, russian, spanish + language: str = "english" + # Reference audio for voice cloning (base64 encoded WAV) + reference_audio: Optional[str] = None + reference_text: Optional[str] = None + + +class TTSResponse(BaseModel): + # Base64 encoded WAV audio + audio: str + sample_rate: int + duration_seconds: float + + +def get_model_name(): + """Get model name from environment or use default.""" + return os.environ.get("QWEN3_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-0.6B-Base") + + +def get_device(): + """Get device to use for inference.""" + device = os.environ.get("TTS_DEVICE", "auto") + if device == "auto": + # MPS has limitations with large output channels, prefer CPU on macOS + import platform + if platform.system() == "Darwin": + return "cpu" + elif torch.cuda.is_available(): + return "cuda" + else: + return "cpu" + return device + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Load model on startup.""" + global model + model_name = get_model_name() + print(f"Loading Qwen3-TTS model: {model_name}") + start = time.time() + + from qwen_tts import Qwen3TTSModel + + # Check if flash attention is available + try: + import flash_attn + attn_impl = "flash_attention_2" + print("Using Flash Attention 2") + except ImportError: + attn_impl = "eager" + print("Flash Attention not available, using eager attention") + + device = get_device() + print(f"Using device: {device}") + + # Use float32 for CPU (bfloat16 can be slow on CPU) + dtype = torch.float32 if device == "cpu" else torch.bfloat16 + + model = Qwen3TTSModel.from_pretrained( + model_name, + torch_dtype=dtype, + attn_implementation=attn_impl, + device_map=device, + ) + + print(f"Model loaded in {time.time() - start:.2f}s") + yield + # Cleanup + model = None + + +app = FastAPI( + title="Qwen3-TTS Server", + description="HTTP API for Qwen3-TTS text-to-speech", + lifespan=lifespan, +) + + +@app.get("/health") +async def health(): + """Health check endpoint.""" + return {"status": "ok", "model_loaded": model is not None} + + +@app.post("/tts", response_model=TTSResponse) +async def generate_tts(request: TTSRequest): + """Generate speech from text.""" + if model is None: + raise HTTPException(status_code=503, detail="Model not loaded") + + try: + start = time.time() + + # Decode reference audio if provided + ref_audio = None + if request.reference_audio: + audio_bytes = base64.b64decode(request.reference_audio) + audio_data, audio_sr = sf.read(io.BytesIO(audio_bytes)) + # qwen-tts expects tuple of (audio, sample_rate) for numpy input + ref_audio = (audio_data, audio_sr) + + # Voice cloning requires reference audio + if ref_audio is None: + raise HTTPException( + status_code=400, + detail="reference_audio is required for the Base model. Please provide a base64-encoded WAV file." + ) + + # Use x_vector_only_mode if no reference text provided (simpler voice extraction) + use_x_vector_only = request.reference_text is None or request.reference_text.strip() == "" + + wavs, sample_rate = model.generate_voice_clone( + text=request.text, + language=request.language, + ref_audio=ref_audio, + ref_text=request.reference_text if not use_x_vector_only else None, + x_vector_only_mode=use_x_vector_only, + max_new_tokens=2048, + temperature=0.9, + top_k=50, + repetition_penalty=1.05, + ) + + # Get first waveform + waveform = wavs[0] if isinstance(wavs, list) else wavs + + # Convert to numpy if tensor + if torch.is_tensor(waveform): + waveform = waveform.cpu().numpy() + + # Ensure 1D array + if waveform.ndim > 1: + waveform = waveform.squeeze() + + # Encode as WAV + buffer = io.BytesIO() + sf.write(buffer, waveform, sample_rate, format="WAV") + audio_bytes = buffer.getvalue() + + duration = len(waveform) / sample_rate + elapsed = time.time() - start + print(f"Generated {duration:.2f}s audio in {elapsed:.2f}s (RTF: {elapsed/duration:.2f})") + + return TTSResponse( + audio=base64.b64encode(audio_bytes).decode("utf-8"), + sample_rate=sample_rate, + duration_seconds=duration, + ) + + except Exception as e: + import traceback + traceback.print_exc() + raise HTTPException(status_code=500, detail=str(e)) + + +@app.post("/tts/raw") +async def generate_tts_raw(request: TTSRequest): + """Generate speech and return raw WAV bytes.""" + if model is None: + raise HTTPException(status_code=503, detail="Model not loaded") + + try: + # Decode reference audio if provided + ref_audio = None + if request.reference_audio: + audio_bytes = base64.b64decode(request.reference_audio) + audio_data, audio_sr = sf.read(io.BytesIO(audio_bytes)) + ref_audio = (audio_data, audio_sr) + + # Voice cloning requires reference audio + if ref_audio is None: + raise HTTPException( + status_code=400, + detail="reference_audio is required for the Base model." + ) + + use_x_vector_only = request.reference_text is None or request.reference_text.strip() == "" + + wavs, sample_rate = model.generate_voice_clone( + text=request.text, + language=request.language, + ref_audio=ref_audio, + ref_text=request.reference_text if not use_x_vector_only else None, + x_vector_only_mode=use_x_vector_only, + max_new_tokens=2048, + temperature=0.9, + top_k=50, + repetition_penalty=1.05, + ) + + waveform = wavs[0] if isinstance(wavs, list) else wavs + + if torch.is_tensor(waveform): + waveform = waveform.cpu().numpy() + + if waveform.ndim > 1: + waveform = waveform.squeeze() + + # Return raw WAV + buffer = io.BytesIO() + sf.write(buffer, waveform, sample_rate, format="WAV") + + return Response( + content=buffer.getvalue(), + media_type="audio/wav", + ) + + except Exception as e: + import traceback + traceback.print_exc() + raise HTTPException(status_code=500, detail=str(e)) + + +@app.websocket("/tts/stream") +async def stream_tts(websocket: WebSocket): + """ + WebSocket endpoint for streaming TTS. + + Protocol: + - Client sends JSON: {"text": "...", "language": "english", "reference_audio": "base64...", "reference_text": "..."} + - Server sends binary PCM16 chunks as they're generated + - Server sends JSON {"type": "audio_end", "sample_rate": 24000} when done + - Server sends JSON {"type": "error", "message": "..."} on error + """ + await websocket.accept() + + if model is None: + await websocket.send_json({"type": "error", "message": "Model not loaded"}) + await websocket.close() + return + + try: + # Wait for request + data = await websocket.receive_json() + text = data.get("text", "") + language = data.get("language", "english") + ref_audio_b64 = data.get("reference_audio") + ref_text = data.get("reference_text") + + if not text.strip(): + await websocket.send_json({"type": "error", "message": "No text provided"}) + await websocket.close() + return + + # Decode reference audio + ref_audio = None + if ref_audio_b64: + audio_bytes = base64.b64decode(ref_audio_b64) + audio_data, audio_sr = sf.read(io.BytesIO(audio_bytes)) + ref_audio = (audio_data, audio_sr) + + if ref_audio is None: + await websocket.send_json({"type": "error", "message": "reference_audio is required"}) + await websocket.close() + return + + use_x_vector_only = ref_text is None or ref_text.strip() == "" + + print(f"Streaming TTS for {len(text)} chars...") + start = time.time() + + # Use streaming mode (non_streaming_mode=False is default) + # This returns a generator that yields audio chunks + generator = model.generate_voice_clone( + text=text, + language=language, + ref_audio=ref_audio, + ref_text=ref_text if not use_x_vector_only else None, + x_vector_only_mode=use_x_vector_only, + max_new_tokens=2048, + temperature=0.9, + top_k=50, + repetition_penalty=1.05, + non_streaming_mode=False, # Enable streaming + ) + + total_samples = 0 + sample_rate = 24000 + + # Check if generator is actually a generator or just the result + if hasattr(generator, '__iter__') and not isinstance(generator, tuple): + for chunk_data in generator: + # chunk_data might be (wav_chunk, sr) or just wav_chunk + if isinstance(chunk_data, tuple): + wav_chunk, sample_rate = chunk_data + else: + wav_chunk = chunk_data + + if torch.is_tensor(wav_chunk): + wav_chunk = wav_chunk.cpu().numpy() + + if wav_chunk.ndim > 1: + wav_chunk = wav_chunk.squeeze() + + # Convert to PCM16 + pcm16 = (wav_chunk * 32767).astype(np.int16) + total_samples += len(pcm16) + + # Send binary audio chunk + await websocket.send_bytes(pcm16.tobytes()) + + # Yield to allow other tasks + await asyncio.sleep(0) + else: + # Non-streaming fallback - model returned full result + wavs, sample_rate = generator if isinstance(generator, tuple) else (generator, 24000) + waveform = wavs[0] if isinstance(wavs, list) else wavs + + if torch.is_tensor(waveform): + waveform = waveform.cpu().numpy() + + if waveform.ndim > 1: + waveform = waveform.squeeze() + + # Send in chunks for better streaming behavior + chunk_size = sample_rate // 4 # 250ms chunks + for i in range(0, len(waveform), chunk_size): + chunk = waveform[i:i + chunk_size] + pcm16 = (chunk * 32767).astype(np.int16) + total_samples += len(pcm16) + await websocket.send_bytes(pcm16.tobytes()) + await asyncio.sleep(0) + + duration = total_samples / sample_rate + elapsed = time.time() - start + print(f"Streamed {duration:.2f}s audio in {elapsed:.2f}s (RTF: {elapsed/duration:.2f})") + + # Send completion message + await websocket.send_json({ + "type": "audio_end", + "sample_rate": sample_rate, + "duration_seconds": duration, + }) + + except WebSocketDisconnect: + print("Client disconnected during streaming") + except Exception as e: + import traceback + traceback.print_exc() + try: + await websocket.send_json({"type": "error", "message": str(e)}) + except: + pass + finally: + try: + await websocket.close() + except: + pass + + +if __name__ == "__main__": + import uvicorn + port = int(os.environ.get("PORT", "8100")) + uvicorn.run( + app, + host="0.0.0.0", + port=port, + # Increase keep-alive timeout to avoid connection resets + timeout_keep_alive=120, + ) diff --git a/makima/src/db/models.rs b/makima/src/db/models.rs index 636d81a..be3350b 100644 --- a/makima/src/db/models.rs +++ b/makima/src/db/models.rs @@ -1920,7 +1920,6 @@ pub struct TaskCheckpoint { /// Commit message pub message: String, /// Files changed in this commit: [{path, action: 'A'|'M'|'D'}] - #[sqlx(json)] pub files_changed: Option<serde_json::Value>, /// Lines added in this commit pub lines_added: Option<i32>, |
