Add a CLI interface (#102)

* Add a CLI interface and update packaging configuration

* Support multiple lines in stdin

---------

Co-authored-by: Eric Trotta <eric.oliveira@magva.com.br>
This commit is contained in:
etrotta
2025-02-18 02:07:21 -03:00
committed by GitHub
parent 5229a254b7
commit cd7afb5c12
5 changed files with 1990 additions and 35 deletions

148
kokoro/__main__.py Normal file
View File

@@ -0,0 +1,148 @@
"""Kokoro TTS CLI
Example usage:
python3 -m kokoro --text "The sky above the port was the color of television, tuned to a dead channel." -o file.wav --debug
echo "Bom dia mundo, como vão vocês" > text.txt
python3 -m kokoro -i text.txt -l p --voice pm_alex -o audio.wav
Common issues:
pip not installed: `uv pip install pip`
(Temporary workaround while https://github.com/explosion/spaCy/issues/13747 is not fixed)
espeak not installed: `apt-get install espeak-ng`
"""
import argparse
import wave
from pathlib import Path
from typing import Generator, TYPE_CHECKING
import numpy as np
from loguru import logger
# Single-letter language codes accepted by the CLI. The list is used as the
# allowed `choices` of the --language option, and a voice name is expected to
# start with the code of the language it belongs to (e.g. "af_heart" -> "a").
languages = [
"a", # American English
"b", # British English
"h", # Hindi
"e", # Spanish
"f", # French
"i", # Italian
"p", # Brazilian Portuguese
"j", # Japanese
"z", # Mandarin Chinese
]
if TYPE_CHECKING:
from kokoro import KPipeline
def generate_audio(
    text: str, kokoro_language: str, voice: str, speed=1
) -> Generator["KPipeline.Result", None, None]:
    """Lazily yield the pipeline results produced for ``text``.

    This is a generator function: the import, the voice/language check and
    the pipeline construction below only run once the caller starts
    iterating.

    Args:
        text: Text to synthesize; it is handed to the pipeline together with
            the split pattern ``\\n+``.
        kokoro_language: Language code passed to ``KPipeline`` as ``lang_code``.
        voice: Voice name forwarded to the pipeline. A warning is logged when
            it does not start with ``kokoro_language``.
        speed: Speed value forwarded to the pipeline unchanged.

    Yields:
        Every result the pipeline call produces, in order.
    """
    # Imported here rather than at module top (presumably so that argument
    # parsing and --help do not pay for loading the model stack).
    from kokoro import KPipeline

    voice_matches_language = voice.startswith(kokoro_language)
    if not voice_matches_language:
        logger.warning(f"Voice {voice} is not made for language {kokoro_language}")

    tts = KPipeline(lang_code=kokoro_language)
    results = tts(text, voice=voice, speed=speed, split_pattern=r"\n+")
    yield from results
def generate_and_save_audio(
    output_file: Path,
    text: str,
    kokoro_language: str,
    voice: str,
    speed: float = 1,
    sample_rate: int = 24000,
) -> None:
    """Synthesize ``text`` and write it to ``output_file`` as a mono 16-bit WAV.

    Results are written to the file as they are produced, so the complete
    audio is never held in memory at once.

    Args:
        output_file: Destination path; it is resolved and opened for writing
            (an existing file is overwritten).
        text: Text to synthesize.
        kokoro_language: Language code forwarded to ``generate_audio``.
        voice: Voice name forwarded to ``generate_audio``.
        speed: Speech speed forwarded to ``generate_audio``.
        sample_rate: Frame rate written to the WAV header. Defaults to 24000,
            the value this function previously hard-coded (presumably the
            pipeline's output rate -- confirm before overriding).
    """
    with wave.open(str(output_file.resolve()), "wb") as wav_file:
        wav_file.setnchannels(1)  # Mono audio
        wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit audio)
        wav_file.setframerate(sample_rate)

        for result in generate_audio(
            text, kokoro_language=kokoro_language, voice=voice, speed=speed
        ):
            logger.debug(result.phonemes)
            if result.audio is None:
                # Nothing was synthesized for this chunk.
                continue
            # Assumes float samples nominally in [-1, 1]. Clip before scaling:
            # a sample even slightly outside that range would otherwise wrap
            # around when cast to int16 and come out as a loud click.
            samples = np.clip(result.audio.numpy(), -1.0, 1.0)
            audio_bytes = (samples * 32767).astype(np.int16).tobytes()
            wav_file.writeframes(audio_bytes)
def main() -> None:
    """Command-line entry point: parse arguments, read the text, write the WAV.

    The text comes from exactly one source: ``--text``, ``--input-file``, or
    standard input when neither is given. The language defaults to the first
    character of the voice name when ``--language`` is not passed.

    Usage errors (both text sources given, or a language that is not in
    ``languages``) are reported through ``parser.error``, which prints the
    usage message and exits with status 2.
    """
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m",
        "--voice",
        default="af_heart",
        help="Voice to use",
    )
    parser.add_argument(
        "-l",
        "--language",
        help="Language to use (defaults to the one corresponding to the voice)",
        choices=languages,
    )
    parser.add_argument(
        "-o",
        "--output-file",
        "--output_file",
        type=Path,
        help="Path to output WAV file",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input-file",
        "--input_file",
        type=Path,
        help="Path to input text file (default: stdin)",
    )
    parser.add_argument(
        "-t",
        "--text",
        help="Text to use instead of reading from stdin",
    )
    parser.add_argument(
        "-s",
        "--speed",
        type=float,
        default=1.0,
        help="Speech speed",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Print DEBUG messages to console",
    )
    args = parser.parse_args()

    # In loguru, logger.level("DEBUG") only looks the level up; the threshold
    # belongs to the sink. Re-register the console sink so that --debug
    # really decides whether DEBUG messages are shown.
    logger.remove()
    logger.add(sys.stderr, level="DEBUG" if args.debug else "INFO")
    logger.debug(args)

    # Slice instead of indexing so an empty --voice gives a usage error
    # rather than an IndexError.
    lang = args.language or args.voice[:1]
    if lang not in languages:
        parser.error(
            f"Cannot infer a supported language from voice {args.voice!r}; "
            "pass one explicitly with --language"
        )

    if args.text is not None and args.input_file is not None:
        parser.error("You cannot specify both 'text' and 'input_file'")

    if args.text is not None:
        # `is not None` (not truthiness) so an explicit empty --text is
        # honored instead of silently falling through to stdin.
        text = args.text
    elif args.input_file is not None:
        file: Path = args.input_file
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        text = file.read_text(encoding="utf-8")
    else:
        print("Press Ctrl+D to stop reading input and start generating", flush=True)
        # Lines read from stdin already end with a newline, so read the
        # stream as-is instead of joining the lines with a second one.
        text = sys.stdin.read()
    logger.debug(f"Input text: {text!r}")

    out_file: Path = args.output_file
    if out_file.suffix.lower() != ".wav":
        logger.warning("The output file name should end with .wav")

    generate_and_save_audio(
        output_file=out_file,
        text=text,
        kokoro_language=lang,
        voice=args.voice,
        speed=args.speed,
    )


if __name__ == "__main__":
    main()

View File

@@ -3,7 +3,6 @@ from dataclasses import dataclass
from huggingface_hub import hf_hub_download
from loguru import logger
from misaki import en, espeak
from numbers import Number
from typing import Generator, List, Optional, Tuple, Union
import re
import torch
@@ -219,7 +218,7 @@ class KPipeline:
model: KModel,
ps: str,
pack: torch.FloatTensor,
speed: Number = 1
speed: float = 1
) -> KModel.Output:
return model(ps, pack[len(ps)-1], speed, return_output=True)
@@ -227,7 +226,7 @@ class KPipeline:
self,
tokens: Union[str, List[en.MToken]],
voice: str,
speed: Number = 1,
speed: float = 1,
model: Optional[KModel] = None
) -> Generator['KPipeline.Result', None, None]:
"""Generate audio from either raw phonemes or pre-processed tokens.
@@ -343,7 +342,7 @@ class KPipeline:
self,
text: Union[str, List[str]],
voice: Optional[str] = None,
speed: Number = 1,
speed: float = 1,
split_pattern: Optional[str] = r'\n+',
model: Optional[KModel] = None
) -> Generator['KPipeline.Result', None, None]:

View File

@@ -1,3 +1,39 @@
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"
requires = ["hatchling"]
build-backend = "hatchling.build"
[project]
name = "kokoro"
version = "0.7.16"
description = "TTS"
readme = "README.md"
authors = [
{ name="hexgrad", email="hello@hexgrad.com" }
]
license = { file = "LICENSE" }
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent"
]
requires-python = ">=3.10, <3.13"
dependencies = [
"huggingface_hub",
"loguru",
"misaki[en]>=0.7.16",
"numpy==1.26.4",
"scipy",
"torch",
"transformers"
]
[project.scripts]
kokoro = "kokoro.__main__:main"
[tool.hatch.build.targets.wheel]
only-include = ["kokoro"]
only-packages = true
[project.urls]
Homepage = "https://github.com/hexgrad/kokoro"
Repository = "https://github.com/hexgrad/kokoro"

View File

@@ -1,29 +0,0 @@
"""Setuptools packaging script for the ``kokoro`` distribution."""
from pathlib import Path

from setuptools import setup, find_packages

# Read the README through pathlib so the file handle is closed right away,
# and with an explicit encoding so the result does not depend on the
# platform's default locale.
long_description = Path('README.md').read_text(encoding='utf-8')

setup(
    name='kokoro',
    version='0.7.16',
    packages=find_packages(),
    install_requires=[
        'huggingface_hub',
        'loguru',
        'misaki[en]>=0.7.16',
        'numpy==1.26.4',
        'scipy',
        'torch',
        'transformers',
    ],
    python_requires='>=3.7',
    author='hexgrad',
    author_email='hello@hexgrad.com',
    description='TTS',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/hexgrad/kokoro',
    license='Apache 2.0',
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: Apache Software License',
        'Operating System :: OS Independent',
    ],
)

1801
uv.lock generated Normal file

File diff suppressed because it is too large Load Diff