Files
kokoro/server.py
scott 5e8e7ad6d4
Some checks failed
Build and Push Docker Image / build (push) Failing after 47s
Replace upstream library with ROCm/Wyoming deployment project
Remove original Kokoro library source, demo, examples, tests, JS port,
and GitHub config. Add Dockerfile (ROCm 6.1 / PyTorch 2.5.1), Wyoming
TCP server, docker-compose with GPU passthrough, config, entrypoint,
and Gitea Actions build workflow.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-08 13:30:54 -04:00

216 lines
6.5 KiB
Python

#!/usr/bin/env python3
"""Kokoro TTS Wyoming protocol server for Home Assistant integration."""
import argparse
import asyncio
import logging
import sys
import threading
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from pathlib import Path

import numpy as np
import yaml
from wyoming.audio import AudioChunk, AudioStart, AudioStop
from wyoming.event import Event
from wyoming.handler import AsyncEventHandler
from wyoming.info import Attribution, Describe, Info, TtsProgram, TtsVoice
from wyoming.server import AsyncServer
from wyoming.tts import Synthesize
_LOGGER = logging.getLogger(__name__)
KOKORO_SAMPLE_RATE = 24000
SAMPLE_WIDTH = 2 # 16-bit PCM
CHANNELS = 1
def load_config(path: str) -> dict:
    """Read the YAML configuration file at *path* and return it as a dict.

    The file is decoded as UTF-8 regardless of the process locale.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the document's top level is not a mapping (for
            example an empty file, for which ``yaml.safe_load`` yields None).
    """
    with open(path, encoding="utf-8") as f:
        config = yaml.safe_load(f)
    # Fail here with a clear message instead of a later
    # "'NoneType' object is not subscriptable" when sections are looked up.
    if not isinstance(config, dict):
        raise ValueError(
            f"Config file {path!r} must contain a YAML mapping at the top "
            f"level, got {type(config).__name__}"
        )
    return config
def build_wyoming_info(config: dict) -> Info:
    """Assemble the Wyoming ``Info`` describing this TTS service.

    One ``TtsVoice`` is created for every entry of
    ``config["tts"]["voices"]``. Each entry must have a ``"name"`` key;
    ``"description"`` falls back to the name and ``"language"`` falls back
    to ``"en-us"``. All voices are attached to a single ``TtsProgram``
    named ``"kokoro"``.
    """
    model_url = "https://huggingface.co/hexgrad/Kokoro-82M"

    voices = []
    for entry in config["tts"]["voices"]:
        voice_id = entry["name"]
        voice = TtsVoice(
            name=voice_id,
            description=entry.get("description", voice_id),
            attribution=Attribution(name="Kokoro-82M", url=model_url),
            installed=True,
            version="1.0",
            languages=[entry.get("language", "en-us")],
        )
        voices.append(voice)

    program = TtsProgram(
        name="kokoro",
        description="Kokoro 82M TTS via ROCm",
        attribution=Attribution(name="hexgrad/Kokoro-82M", url=model_url),
        installed=True,
        version="0.9.4",
        voices=voices,
    )
    return Info(tts=[program])
class KokoroEventHandler(AsyncEventHandler):
    """Wyoming event handler that serves Kokoro text-to-speech requests.

    ``Describe`` events are answered with the stored ``Info``.
    ``Synthesize`` events are answered with an ``AudioStart``, zero or more
    ``AudioChunk`` events and a closing ``AudioStop``, all declared as
    ``KOKORO_SAMPLE_RATE`` Hz, ``SAMPLE_WIDTH`` bytes per sample and
    ``CHANNELS`` channel(s).
    """

    def __init__(
        self,
        wyoming_info: Info,
        pipeline,
        config: dict,
        executor: ThreadPoolExecutor,
        *args,
        **kwargs,
    ):
        """Store the resources needed to answer requests.

        Args:
            wyoming_info: Info sent back in response to ``Describe``.
            pipeline: Callable invoked as
                ``pipeline(text, voice=..., speed=...)`` that yields
                3-tuples whose last item is the audio (or None).
            config: Parsed configuration; its ``"tts"`` section is read for
                ``default_voice`` and ``default_speed``.
            executor: Thread pool on which the blocking pipeline is run.
            *args: Forwarded unchanged to ``AsyncEventHandler``.
            **kwargs: Forwarded unchanged to ``AsyncEventHandler``.
        """
        super().__init__(*args, **kwargs)
        self.wyoming_info = wyoming_info
        self.pipeline = pipeline
        self.config = config
        self.executor = executor

    async def handle_event(self, event: Event) -> bool:
        """Dispatch one incoming event; unrecognised types are ignored.

        Always returns True (presumably this tells the Wyoming server to
        keep the connection open -- confirm against ``AsyncEventHandler``).
        """
        if Describe.is_type(event.type):
            await self.write_event(self.wyoming_info.event())
        elif Synthesize.is_type(event.type):
            await self._handle_synthesize(Synthesize.from_event(event))
        return True

    @staticmethod
    def _audio_start_event() -> Event:
        """Build the AudioStart event describing the PCM format we emit."""
        return AudioStart(
            rate=KOKORO_SAMPLE_RATE, width=SAMPLE_WIDTH, channels=CHANNELS
        ).event()

    @staticmethod
    def _to_pcm16(audio) -> bytes:
        """Convert float samples in [-1, 1] to little 16-bit PCM bytes.

        Accepts numpy arrays and array-likes. Objects exposing ``detach()``
        are treated as torch tensors and moved to a CPU numpy array first.
        NOTE(review): whether Kokoro yields tensors or arrays depends on the
        installed version and is not visible from this file -- confirm.
        """
        if hasattr(audio, "detach"):
            audio = audio.detach().cpu().numpy()
        samples = np.asarray(audio)
        return (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16).tobytes()

    async def _handle_synthesize(self, synth: Synthesize) -> None:
        """Synthesize ``synth.text`` and stream it back as audio events.

        The blocking pipeline runs on ``self.executor``; finished chunks are
        handed to the event loop through a queue and written out as they
        arrive. Errors are logged and the stream is still closed with
        ``AudioStop`` when the connection allows it.
        """
        text = (synth.text or "").strip()
        if not text:
            # Nothing to say: answer with an empty but well-formed stream.
            await self.write_event(self._audio_start_event())
            await self.write_event(AudioStop().event())
            return

        # Resolve voice: prefer client request, fall back to config default
        tts_config = self.config["tts"]
        requested_voice = synth.voice.name if synth.voice else None
        voice_name = requested_voice or tts_config.get("default_voice", "af_heart")
        speed = tts_config.get("default_speed", 1.0)

        _LOGGER.info("Synthesize: voice=%s text=%r", voice_name, text[:80])
        await self.write_event(self._audio_start_event())

        loop = asyncio.get_running_loop()
        chunk_queue: asyncio.Queue = asyncio.Queue()
        # Set by the consumer when it stops reading, so the worker does not
        # keep synthesizing (and occupying the executor) for nobody.
        stop_requested = threading.Event()

        def _post(item) -> None:
            # asyncio.Queue is not thread-safe; enqueue on the loop thread.
            loop.call_soon_threadsafe(chunk_queue.put_nowait, item)

        def _generate() -> None:
            try:
                for _, _, audio in self.pipeline(
                    text, voice=voice_name, speed=speed
                ):
                    if stop_requested.is_set():
                        break
                    if audio is None:
                        continue
                    _post(self._to_pcm16(audio))
            except Exception as exc:
                # Hand the failure to the consumer so it is logged there.
                _post(exc)
            finally:
                # None is the end-of-stream marker.
                _post(None)

        try:
            self.executor.submit(_generate)
            while True:
                item = await chunk_queue.get()
                if item is None:
                    break
                if isinstance(item, Exception):
                    raise item
                await self.write_event(
                    AudioChunk(
                        rate=KOKORO_SAMPLE_RATE,
                        width=SAMPLE_WIDTH,
                        channels=CHANNELS,
                        audio=item,
                    ).event()
                )
        except Exception:
            _LOGGER.exception("Error during synthesis")
        finally:
            stop_requested.set()
            try:
                await self.write_event(AudioStop().event())
            except OSError:
                # The socket is already gone; there is no one left to notify.
                _LOGGER.warning("Could not send AudioStop; connection closed")
async def main() -> None:
    """Parse arguments, load the Kokoro pipeline and run the Wyoming server.

    Command-line options:
        --config: Path to the YAML config (default ``/app/config.yaml``).
        --debug: Enable debug logging.

    The config must provide ``server.uri`` and a ``tts`` section; ``tts``
    may set ``device`` (default ``"cuda"``) and ``language`` (default
    ``"a"``). Runs until the server stops or the task is cancelled.
    """
    parser = argparse.ArgumentParser(description="Kokoro TTS Wyoming server")
    parser.add_argument(
        "--config", default="/app/config.yaml", help="Path to config.yaml"
    )
    parser.add_argument("--debug", action="store_true", help="Enable debug logging")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )

    config = load_config(args.config)
    uri = config["server"]["uri"]
    tts_config = config["tts"]
    device = tts_config.get("device", "cuda")
    lang = tts_config.get("language", "a")
    _LOGGER.info("Loading Kokoro pipeline (device=%s, lang=%s)...", device, lang)

    # Import here so startup logging appears first
    import torch
    from kokoro import KPipeline

    cuda_available = torch.cuda.is_available()
    # Prefix match so indexed devices such as "cuda:0" also fall back to CPU.
    if isinstance(device, str) and device.startswith("cuda") and not cuda_available:
        _LOGGER.warning("CUDA/ROCm not available, falling back to CPU")
        device = "cpu"
    _LOGGER.info("GPU available: %s", cuda_available)
    if cuda_available:
        _LOGGER.info("Device name: %s", torch.cuda.get_device_name(0))

    pipeline = KPipeline(lang_code=lang, device=device)
    wyoming_info = build_wyoming_info(config)

    # A single worker serializes synthesis requests on the one pipeline.
    executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="kokoro")
    try:
        _LOGGER.info("Starting Wyoming server at %s", uri)
        server = AsyncServer.from_uri(uri)
        await server.run(
            partial(
                KokoroEventHandler,
                wyoming_info,
                pipeline,
                config,
                executor,
            )
        )
    finally:
        # Do not block the event loop waiting for the worker; drop any
        # synthesis jobs that have not started yet (Python 3.9+).
        executor.shutdown(wait=False, cancel_futures=True)
if __name__ == "__main__":
    # Script entry point: run the async server until it exits.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Ctrl-C is the expected way to stop the server; exit without a
        # traceback.
        pass