diff options
Diffstat (limited to 'makima/src/server/messages.rs')
| -rw-r--r-- | makima/src/server/messages.rs | 161 |
1 files changed, 161 insertions, 0 deletions
diff --git a/makima/src/server/messages.rs b/makima/src/server/messages.rs index 9c50334..cecb622 100644 --- a/makima/src/server/messages.rs +++ b/makima/src/server/messages.rs @@ -103,3 +103,164 @@ impl ApiError { } } } + +// ============================================================================= +// TTS (Text-to-Speech) Message Types +// ============================================================================= + +/// TTS audio encoding format for WebSocket streaming. +#[derive(Debug, Clone, Copy, Deserialize, Serialize, ToSchema, PartialEq, Default)] +#[serde(rename_all = "lowercase")] +pub enum TtsAudioEncoding { + /// 16-bit signed integer PCM samples + #[default] + Pcm16, + /// 32-bit floating point PCM samples + Pcm32f, +} + +/// TTS synthesis priority level. +#[derive(Debug, Clone, Copy, Deserialize, Serialize, ToSchema, PartialEq, Default)] +#[serde(rename_all = "lowercase")] +pub enum TtsPriority { + /// Low priority - may be queued + Low, + /// Normal priority (default) + #[default] + Normal, + /// High priority - processed immediately + High, +} + +/// TTS session start message from client. +#[derive(Debug, Clone, Deserialize, Serialize, ToSchema)] +#[serde(rename_all = "camelCase")] +pub struct TtsStartMessage { + /// Audio sample rate in Hz (default: 24000) + #[serde(default = "default_tts_sample_rate")] + pub sample_rate: u32, + /// Audio encoding format + #[serde(default)] + pub encoding: TtsAudioEncoding, + /// Voice identifier (default: "makima") + #[serde(default = "default_tts_voice")] + pub voice: String, + /// Language for synthesis (default: "English") + #[serde(default = "default_tts_language")] + pub language: String, +} + +fn default_tts_sample_rate() -> u32 { + 24000 +} + +fn default_tts_voice() -> String { + "makima".to_string() +} + +fn default_tts_language() -> String { + "English".to_string() +} + +/// TTS speak request message from client. +#[derive(Debug, Clone, Deserialize, Serialize, ToSchema)] +#[serde(rename_all = "camelCase")] +pub struct TtsSpeakMessage { + /// Text to synthesize (max 1000 characters) + pub text: String, + /// Synthesis priority + #[serde(default)] + pub priority: TtsPriority, +} + +/// TTS stop request message from client. +#[derive(Debug, Clone, Deserialize, Serialize, ToSchema)] +#[serde(rename_all = "camelCase")] +pub struct TtsStopMessage { + /// Optional reason for stopping + pub reason: Option<String>, +} + +/// Wrapper for all TTS WebSocket messages from client to server. +#[derive(Debug, Clone, Deserialize)] +#[serde(tag = "type", rename_all = "camelCase")] +pub enum TtsClientMessage { + /// Start a new TTS session + Start(TtsStartMessage), + /// Request speech synthesis + Speak(TtsSpeakMessage), + /// Stop the current session + Stop(TtsStopMessage), +} + +/// TTS session ready message sent from server to client. +#[derive(Debug, Clone, Serialize, ToSchema)] +#[serde(rename_all = "camelCase")] +pub struct TtsReadyMessage { + /// Unique session identifier + pub session_id: String, + /// Confirmed sample rate + pub sample_rate: u32, + /// Confirmed encoding format + pub encoding: TtsAudioEncoding, + /// Confirmed voice + pub voice: String, +} + +/// TTS audio chunk message sent from server to client. +#[derive(Debug, Clone, Serialize, ToSchema)] +#[serde(rename_all = "camelCase")] +pub struct TtsAudioChunkMessage { + /// Base64-encoded audio data + pub data: String, + /// Whether this is the final chunk + pub is_final: bool, + /// Timestamp in seconds from start of audio + pub timestamp: f64, +} + +/// TTS synthesis complete message sent from server to client. +#[derive(Debug, Clone, Serialize, ToSchema)] +#[serde(rename_all = "camelCase")] +pub struct TtsCompleteMessage { + /// Total synthesis duration in milliseconds + pub duration_ms: u64, + /// Total number of chunks sent + pub total_chunks: u32, + /// Length of input text + pub text_length: u32, +} + +/// TTS error message sent from server to client. +#[derive(Debug, Clone, Serialize, ToSchema)] +#[serde(rename_all = "camelCase")] +pub struct TtsErrorMessage { + /// Error code for programmatic handling + pub code: String, + /// Human-readable error message + pub message: String, +} + +/// TTS session stopped message sent from server to client. +#[derive(Debug, Clone, Serialize, ToSchema)] +#[serde(rename_all = "camelCase")] +pub struct TtsStoppedMessage { + /// Reason for stopping + pub reason: String, +} + +/// Wrapper for all TTS WebSocket messages from server to client. +#[derive(Debug, Clone, Serialize)] +#[serde(tag = "type", rename_all = "camelCase")] +pub enum TtsServerMessage { + /// Session is ready for synthesis requests + Ready(TtsReadyMessage), + /// Audio chunk (streamed during synthesis) + AudioChunk(TtsAudioChunkMessage), + /// Synthesis completed + Complete(TtsCompleteMessage), + /// Error occurred + Error(TtsErrorMessage), + /// Session has been stopped + Stopped(TtsStoppedMessage), +} |
