Split text into sentences to stream audio chunk-by-chunk

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-08 19:41:19 -04:00
parent 067a853d3b
commit 83aa2ec08c
1 changed file with 19 additions and 10 deletions
--- a/server.py
+++ b/server.py
@@ -4,6 +4,7 @@
 import argparse
 import asyncio
 import logging
+import re
 import sys
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
@@ -22,6 +23,8 @@ from wyoming.tts import Synthesize
 _LOGGER = logging.getLogger(__name__)

 KOKORO_SAMPLE_RATE = 24000
+
+_SENTENCE_RE = re.compile(r'(?<=[.!?])\s+')
 SAMPLE_WIDTH = 2   # 16-bit PCM
 CHANNELS = 1

@@ -125,7 +128,13 @@ class KokoroEventHandler(AsyncEventHandler):
                chunk_count = 0
                try:
                    _LOGGER.debug("Pipeline thread started")
-                    for _, _, audio in self.pipeline(text, voice=voice_name, speed=speed):
+                    sentences = [s for s in _SENTENCE_RE.split(text.strip()) if s]
+                    if not sentences:
+                        sentences = [text]
+                    _LOGGER.debug("Split into %d sentence(s)", len(sentences))
+                    for sentence in sentences:
+                        _LOGGER.debug("Synthesizing: %r", sentence[:60])
+                        for _, _, audio in self.pipeline(sentence, voice=voice_name, speed=speed):
                            if audio is None:
                                continue
                            # float32 [-1, 1] → int16