Add a CLI interface (#102)

* Add a CLI interface and update packaging configuration
* Support multiple lines in stdin

---------

Co-authored-by: Eric Trotta <eric.oliveira@magva.com.br>
2025-02-18 02:07:21 -03:00
parent 5229a254b7
commit cd7afb5c12
5 changed files with 1990 additions and 35 deletions
--- a/kokoro/__main__.py
+++ b/kokoro/__main__.py
@@ -0,0 +1,148 @@
+"""Kokoro TTS CLI
+Example usage:
+python3 -m kokoro --text "The sky above the port was the color of television, tuned to a dead channel." -o file.wav --debug
+
+echo "Bom dia mundo, como vão vocês" > text.txt
+python3 -m kokoro -i text.txt -l p --voice pm_alex -o audio.wav
+
+Common issues:
+pip not installed: `uv pip install pip`
+(Temporary workaround while https://github.com/explosion/spaCy/issues/13747 is not fixed)
+
+espeak not installed: `apt-get install espeak-ng`
+"""
+
+import argparse
+import wave
+from pathlib import Path
+from typing import Generator, TYPE_CHECKING
+
+import numpy as np
+from loguru import logger
+
+languages = [  # single-letter language codes offered as the choices of --language
+    "a",  # American English
+    "b",  # British English
+    "h",  # Hindi
+    "e",  # Spanish
+    "f",  # French
+    "i",  # Italian
+    "p",  # Brazilian Portuguese
+    "j",  # Japanese
+    "z",  # Mandarin Chinese
+]
+
+if TYPE_CHECKING:  # annotation-only import; generate_audio imports KPipeline itself when it runs
+    from kokoro import KPipeline
+
+
+def generate_audio(
+    text: str, kokoro_language: str, voice: str, speed=1
+) -> Generator["KPipeline.Result", None, None]:
+    """Yield KPipeline results for `text`, logging a warning if `voice` does not start with the language code."""
+    from kokoro import KPipeline  # runtime import, executed once the generator is first advanced
+    if not voice.startswith(kokoro_language):
+        logger.warning(f"Voice {voice} is not made for language {kokoro_language}")
+    tts = KPipeline(lang_code=kokoro_language)
+    yield from tts(text, voice=voice, speed=speed, split_pattern=r"\n+")
+
+
+def generate_and_save_audio(
+    output_file: Path, text: str, kokoro_language: str, voice: str, speed=1
+) -> None:
+    """Synthesize `text` with generate_audio and write the chunks to `output_file` as a mono 16-bit WAV."""
+    with wave.open(str(output_file.resolve()), "wb") as wav_file:
+        wav_file.setnchannels(1)  # Mono audio
+        wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit audio)
+        wav_file.setframerate(24000)  # Sample rate
+        for result in generate_audio(
+            text, kokoro_language=kokoro_language, voice=voice, speed=speed
+        ):
+            logger.debug(result.phonemes)
+            if result.audio is None:  # this result carries no audio, so there is nothing to write
+                continue
+            samples = np.clip(result.audio.numpy(), -1.0, 1.0)  # keep the scaled values inside int16
+            wav_file.writeframes((samples * 32767).astype(np.int16).tobytes())
+
+
+def main() -> None:
+    """Parse the CLI options, read the text (--text, --input-file or stdin) and write the speech to a WAV file."""
+    import sys  # function-scope import: used for the log sink and for reading stdin
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-m",
+        "--voice",
+        default="af_heart",
+        help="Voice to use",
+    )
+    parser.add_argument(
+        "-l",
+        "--language",
+        help="Language to use (defaults to the one corresponding to the voice)",
+        choices=languages,
+    )
+    parser.add_argument(
+        "-o",
+        "--output-file",
+        "--output_file",
+        type=Path,
+        help="Path to output WAV file",
+        required=True,
+    )
+    parser.add_argument(
+        "-i",
+        "--input-file",
+        "--input_file",
+        type=Path,
+        help="Path to input text file (default: stdin)",
+    )
+    parser.add_argument(
+        "-t",
+        "--text",
+        help="Text to use instead of reading from stdin",
+    )
+    parser.add_argument(
+        "-s",
+        "--speed",
+        type=float,
+        default=1.0,
+        help="Speech speed",
+    )
+    parser.add_argument(
+        "--debug",
+        action="store_true",
+        help="Print DEBUG messages to console",
+    )
+    args = parser.parse_args()
+    logger.remove()  # logger.level() only looks a level up, so swap the sink to apply --debug
+    logger.add(sys.stderr, level="DEBUG" if args.debug else "INFO")
+    logger.debug(args)
+
+    lang = args.language or args.voice[0]  # default: first letter of the voice name
+
+    if args.text is not None and args.input_file is not None:
+        raise Exception("You cannot specify both 'text' and 'input_file'")
+    elif args.text:
+        text = args.text
+    elif args.input_file:
+        text = args.input_file.read_text(encoding="utf-8")  # do not depend on the locale encoding
+    else:
+        print("Press Ctrl+D to stop reading input and start generating", flush=True)
+        text = sys.stdin.read()  # lines keep their own "\n"; joining them again doubled every newline
+
+    logger.debug(f"Input text: {text!r}")
+
+    out_file: Path = args.output_file
+    if out_file.suffix != ".wav":
+        logger.warning("The output file name should end with .wav")
+    generate_and_save_audio(
+        output_file=out_file,
+        text=text,
+        kokoro_language=lang,
+        voice=args.voice,
+        speed=args.speed,
+    )
+
+
+if __name__ == "__main__":  # run the CLI when the module is executed, e.g. `python3 -m kokoro`
+    main()
--- a/kokoro/pipeline.py
+++ b/kokoro/pipeline.py
@@ -3,7 +3,6 @@ from dataclasses import dataclass
 from huggingface_hub import hf_hub_download
 from loguru import logger
 from misaki import en, espeak
-from numbers import Number
 from typing import Generator, List, Optional, Tuple, Union
 import re
 import torch
@@ -219,7 +218,7 @@ class KPipeline:
        model: KModel,
        ps: str,
        pack: torch.FloatTensor,
-        speed: Number = 1
+        speed: float = 1
    ) -> KModel.Output:
        return model(ps, pack[len(ps)-1], speed, return_output=True)

@@ -227,7 +226,7 @@ class KPipeline:
        self,
        tokens: Union[str, List[en.MToken]],
        voice: str,
-        speed: Number = 1,
+        speed: float = 1,
        model: Optional[KModel] = None
    ) -> Generator['KPipeline.Result', None, None]:
        """Generate audio from either raw phonemes or pre-processed tokens.
@@ -343,7 +342,7 @@ class KPipeline:
        self,
        text: Union[str, List[str]],
        voice: Optional[str] = None,
-        speed: Number = 1,
+        speed: float = 1,
        split_pattern: Optional[str] = r'\n+',
        model: Optional[KModel] = None
    ) -> Generator['KPipeline.Result', None, None]:
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,39 @@
 [build-system]
-requires = ["setuptools", "wheel"]
-build-backend = "setuptools.build_meta"
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "kokoro"
+version = "0.7.16"
+description = "TTS"
+readme = "README.md"
+authors = [
+    { name="hexgrad", email="hello@hexgrad.com" }
+]
+license = { file = "LICENSE" }
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: OS Independent"
+]
+requires-python = ">=3.10, <3.13"
+dependencies = [
+    "huggingface_hub",
+    "loguru",
+    "misaki[en]>=0.7.16",
+    "numpy==1.26.4",
+    "scipy",
+    "torch",
+    "transformers"
+]
+
+[project.scripts]
+kokoro = "kokoro.__main__:main"
+
+[tool.hatch.build.targets.wheel]
+only-include = ["kokoro"]
+only-packages = true
+
+[project.urls]
+Homepage = "https://github.com/hexgrad/kokoro"
+Repository = "https://github.com/hexgrad/kokoro"
--- a/setup.py
+++ b/setup.py
@@ -1,29 +0,0 @@
-from setuptools import setup, find_packages
-
-setup(
-    name='kokoro',
-    version='0.7.16',
-    packages=find_packages(),
-    install_requires=[
-        'huggingface_hub',
-        'loguru',
-        'misaki[en]>=0.7.16',
-        'numpy==1.26.4',
-        'scipy',
-        'torch',
-        'transformers',
-    ],
-    python_requires='>=3.7',
-    author='hexgrad',
-    author_email='hello@hexgrad.com',
-    description='TTS',
-    long_description=open('README.md').read(),
-    long_description_content_type='text/markdown',
-    url='https://github.com/hexgrad/kokoro',
-    license='Apache 2.0',
-    classifiers=[
-        'Programming Language :: Python :: 3',
-        'License :: OSI Approved :: Apache Software License',
-        'Operating System :: OS Independent',
-    ],
-)
--- a/uv.lock
+++ b/uv.lock