Add a CLI interface (#102)

* Add a CLI interface and update packaging configuration * Support multiple lines in stdin --------- Co-authored-by: Eric Trotta <eric.oliveira@magva.com.br>
2025-02-18 02:07:21 -03:00
parent 5229a254b7
commit cd7afb5c12
5 changed files with 1990 additions and 35 deletions
--- a/kokoro/main.py
+++ b/kokoro/main.py
@@ -0,0 +1,148 @@
 """Kokoro TTS CLI
 Example usage:
 python3 -m kokoro --text "The sky above the port was the color of television, tuned to a dead channel." -o file.wav --debug
 echo "Bom dia mundo, como vão vocês" > text.txt
 python3 -m kokoro -i text.txt -l p --voice pm_alex -o audio.wav
 Common issues:
 pip not installed: `uv pip install pip`
 (Temporary workaround while https://github.com/explosion/spaCy/issues/13747 is not fixed)
 espeak not installed: `apt-get install espeak-ng`
 """
 import argparse
 import wave
 from pathlib import Path
 from typing import Generator, TYPE_CHECKING
 import numpy as np
 from loguru import logger
 # Single-letter language codes offered as the valid choices for --language.
 # The CLI also compares a voice name's leading character against these codes:
 # it is the default language and triggers a warning on mismatch.
 languages = [
    "a",  # American English
    "b",  # British English
    "h",  # Hindi
    "e",  # Spanish
    "f",  # French
    "i",  # Italian
    "p",  # Brazilian Portuguese
    "j",  # Japanese
    "z",  # Mandarin Chinese
 ]
 if TYPE_CHECKING:
    from kokoro import KPipeline
 def generate_audio(
    text: str, kokoro_language: str, voice: str, speed=1
 ) -> Generator["KPipeline.Result", None, None]:
    """Lazily run the Kokoro pipeline over ``text`` and yield its results.

    This is a generator function: nothing below executes (not even the
    ``kokoro`` import) until the caller starts iterating.

    Args:
        text: Text handed to the pipeline.
        kokoro_language: Language code used as the pipeline's ``lang_code``.
        voice: Voice name forwarded to the pipeline. A warning is logged when
            it does not start with ``kokoro_language``.
        speed: Speed value forwarded unchanged to the pipeline.

    Yields:
        Each result produced by the pipeline call, in order.
    """
    # Deferred import: at module level the name is only imported for type
    # checking, so the runtime import happens here on first iteration.
    from kokoro import KPipeline

    voice_matches_language = voice.startswith(kokoro_language)
    if not voice_matches_language:
        logger.warning(f"Voice {voice} is not made for language {kokoro_language}")

    tts = KPipeline(lang_code=kokoro_language)
    results = tts(text, voice=voice, speed=speed, split_pattern=r"\n+")
    yield from results
 def generate_and_save_audio(
    output_file: Path, text: str, kokoro_language: str, voice: str, speed=1
 ) -> None:
    """Synthesize ``text`` and stream the audio into a 16-bit mono WAV file.

    Audio is written chunk by chunk as ``generate_audio`` yields results, so
    the whole utterance is never held in memory at once.

    Args:
        output_file: Destination path; it is resolved and opened for writing,
            replacing any existing file.
        text: Text to synthesize.
        kokoro_language: Language code forwarded to ``generate_audio``.
        voice: Voice name forwarded to ``generate_audio``.
        speed: Speed value forwarded to ``generate_audio``.
    """
    with wave.open(str(output_file.resolve()), "wb") as wav_file:
        wav_file.setnchannels(1)  # Mono audio
        wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit audio)
        # Hard-coded rate; presumably the model's output rate - TODO confirm.
        wav_file.setframerate(24000)  # Sample rate
        for result in generate_audio(
            text, kokoro_language=kokoro_language, voice=voice, speed=speed
        ):
            logger.debug(result.phonemes)
            if result.audio is None:
                continue
            samples = result.audio.numpy()
            # Clip before scaling: a float sample outside [-1, 1] would
            # otherwise wrap around when cast to int16 and produce loud clicks.
            scaled = np.clip(samples, -1.0, 1.0) * 32767
            # WAV PCM data is little-endian regardless of the host byte order.
            pcm = scaled.astype(np.dtype("<i2"))
            wav_file.writeframes(pcm.tobytes())
 def main() -> None:
    """Command-line entry point: parse arguments and write the WAV file.

    The text to speak comes from ``--text`` if given, otherwise from
    ``--input-file``, otherwise from stdin. The language defaults to the
    first character of the voice name. Invalid argument combinations are
    reported through ``parser.error`` (usage message, exit status 2), like
    any other argparse error.
    """
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m",
        "--voice",
        default="af_heart",
        help="Voice to use",
    )
    parser.add_argument(
        "-l",
        "--language",
        help="Language to use (defaults to the one corresponding to the voice)",
        choices=languages,
    )
    parser.add_argument(
        "-o",
        "--output-file",
        "--output_file",
        type=Path,
        help="Path to output WAV file",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input-file",
        "--input_file",
        type=Path,
        help="Path to input text file (default: stdin)",
    )
    parser.add_argument(
        "-t",
        "--text",
        help="Text to use instead of reading from stdin",
    )
    parser.add_argument(
        "-s",
        "--speed",
        type=float,
        default=1.0,
        help="Speech speed",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Print DEBUG messages to console",
    )
    args = parser.parse_args()

    # loguru's ``logger.level(name)`` only looks a level up; it does not change
    # what gets printed. Re-register the stderr sink with a threshold that
    # actually follows the --debug flag.
    logger.remove()
    logger.add(sys.stderr, level="DEBUG" if args.debug else "INFO")
    logger.debug(args)

    # ``[:1]`` instead of ``[0]`` so an empty --voice yields a usage error
    # rather than an IndexError.
    lang = args.language or args.voice[:1]
    if not lang:
        parser.error("Cannot infer a language: pass --language or a non-empty --voice")

    if args.text is not None and args.input_file is not None:
        parser.error("You cannot specify both 'text' and 'input_file'")

    if args.text:
        text = args.text
    elif args.input_file is not None:
        # Explicit UTF-8 so non-ASCII text does not depend on the locale
        # encoding of the machine running the CLI.
        text = args.input_file.read_text(encoding="utf-8")
    else:
        print("Press Ctrl+D to stop reading input and start generating", flush=True)
        # Lines read from stdin already end with a newline; reading the
        # stream whole avoids doubling every line break.
        text = sys.stdin.read()
    logger.debug(f"Input text: {text!r}")

    out_file: Path = args.output_file
    if out_file.suffix.lower() != ".wav":
        logger.warning("The output file name should end with .wav")

    generate_and_save_audio(
        output_file=out_file,
        text=text,
        kokoro_language=lang,
        voice=args.voice,
        speed=args.speed,
    )
 # Run the CLI only when this file is executed directly, not when imported.
 if __name__ == "__main__":
    main()
--- a/kokoro/pipeline.py
+++ b/kokoro/pipeline.py
@@ -3,7 +3,6 @@ from dataclasses import dataclass
 from huggingface_hub import hf_hub_download
 from loguru import logger
 from misaki import en, espeak
 from numbers import Number
 from typing import Generator, List, Optional, Tuple, Union
 import re
 import torch
@@ -219,7 +218,7 @@ class KPipeline:
        model: KModel,
        ps: str,
        pack: torch.FloatTensor,
-        speed: Number = 1
+        speed: float = 1
    ) -> KModel.Output:
        return model(ps, pack[len(ps)-1], speed, return_output=True)
@@ -227,7 +226,7 @@ class KPipeline:
        self,
        tokens: Union[str, List[en.MToken]],
        voice: str,
-        speed: Number = 1,
+        speed: float = 1,
        model: Optional[KModel] = None
    ) -> Generator['KPipeline.Result', None, None]:
        """Generate audio from either raw phonemes or pre-processed tokens.
@@ -343,7 +342,7 @@ class KPipeline:
        self,
        text: Union[str, List[str]],
        voice: Optional[str] = None,
-        speed: Number = 1,
+        speed: float = 1,
        split_pattern: Optional[str] = r'\n+',
        model: Optional[KModel] = None
    ) -> Generator['KPipeline.Result', None, None]:
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,39 @@
 [build-system]
-requires = ["setuptools", "wheel"]
+requires = ["hatchling"]
-build-backend = "setuptools.build_meta"
+build-backend = "hatchling.build"
 [project]
 name = "kokoro"
 version = "0.7.16"
 description = "TTS"
 readme = "README.md"
 authors = [
    { name="hexgrad", email="hello@hexgrad.com" }
 ]
 license = { file = "LICENSE" }
 classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: OS Independent"
 ]
 requires-python = ">=3.10, <3.13"
 dependencies = [
    "huggingface_hub",
    "loguru",
    "misaki[en]>=0.7.16",
    "numpy==1.26.4",
    "scipy",
    "torch",
    "transformers"
 ]
 [project.scripts]
 kokoro = "kokoro.__main__:main"
 [tool.hatch.build.targets.wheel]
 only-include = ["kokoro"]
 only-packages = true
 [project.urls]
 Homepage = "https://github.com/hexgrad/kokoro"
 Repository = "https://github.com/hexgrad/kokoro"
--- a/setup.py
+++ b/setup.py
@@ -1,29 +0,0 @@
 from setuptools import setup, find_packages
 setup(
    name='kokoro',
    version='0.7.16',
    packages=find_packages(),
    install_requires=[
        'huggingface_hub',
        'loguru',
        'misaki[en]>=0.7.16',
        'numpy==1.26.4',
        'scipy',
        'torch',
        'transformers',
    ],
    python_requires='>=3.7',
    author='hexgrad',
    author_email='hello@hexgrad.com',
    description='TTS',
    long_description=open('README.md').read(),
    long_description_content_type='text/markdown',
    url='https://github.com/hexgrad/kokoro',
    license='Apache 2.0',
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: Apache Software License',
        'Operating System :: OS Independent',
    ],
 )
--- a/uv.lock
+++ b/uv.lock