Add a CLI interface (#102)

* Add a CLI interface and update packaging configuration

* Support multiple lines in stdin

---------

Co-authored-by: Eric Trotta <eric.oliveira@magva.com.br>
This commit is contained in:
etrotta
2025-02-18 02:07:21 -03:00
committed by GitHub
parent 5229a254b7
commit cd7afb5c12
5 changed files with 1990 additions and 35 deletions

148
kokoro/__main__.py Normal file
View File

@@ -0,0 +1,148 @@
"""Kokoro TTS CLI
Example usage:
python3 -m kokoro --text "The sky above the port was the color of television, tuned to a dead channel." -o file.wav --debug
echo "Bom dia mundo, como vão vocês" > text.txt
python3 -m kokoro -i text.txt -l p --voice pm_alex -o audio.wav
Common issues:
pip not installed: `uv pip install pip`
(Temporary workaround while https://github.com/explosion/spaCy/issues/13747 is not fixed)
espeak not installed: `apt-get install espeak-ng`
"""
import argparse
import wave
from pathlib import Path
from typing import Generator, TYPE_CHECKING
import numpy as np
from loguru import logger
# Single-letter language codes offered as the valid choices for the
# -l/--language command-line option.
languages = [
    "a",  # American English
    "b",  # British English
    "h",  # Hindi
    "e",  # Spanish
    "f",  # French
    "i",  # Italian
    "p",  # Brazilian Portuguese
    "j",  # Japanese
    "z",  # Mandarin Chinese
]
if TYPE_CHECKING:
from kokoro import KPipeline
def generate_audio(
    text: str, kokoro_language: str, voice: str, speed=1
) -> Generator["KPipeline.Result", None, None]:
    """Lazily synthesize ``text`` and yield the pipeline results one by one.

    A ``KPipeline`` is created for ``kokoro_language`` and invoked with the
    text, the voice and the speed, splitting the input on runs of newlines.
    When the voice name does not start with the language code a warning is
    logged, but generation still goes ahead.

    Because this is a generator, nothing below (including the ``kokoro``
    import) runs until the caller starts iterating.
    """
    # Imported here rather than at module level, so the package is only
    # loaded once audio is actually requested.
    from kokoro import KPipeline

    voice_matches_language = voice.startswith(kokoro_language)
    if not voice_matches_language:
        logger.warning(f"Voice {voice} is not made for language {kokoro_language}")

    call_options = {"voice": voice, "speed": speed, "split_pattern": r"\n+"}
    tts = KPipeline(lang_code=kokoro_language)
    yield from tts(text, **call_options)
def generate_and_save_audio(
    output_file: Path, text: str, kokoro_language: str, voice: str, speed: float = 1
) -> None:
    """Synthesize ``text`` and stream it into a mono 16-bit 24 kHz WAV file.

    Args:
        output_file: Destination path; it is resolved to an absolute path and
            opened for writing, replacing any existing file.
        text: Text forwarded to ``generate_audio``.
        kokoro_language: Language code forwarded to ``generate_audio``.
        voice: Voice name forwarded to ``generate_audio``.
        speed: Speech speed forwarded to ``generate_audio``.

    Each result's phonemes are logged at DEBUG level. Results whose ``audio``
    is ``None`` are skipped; every other chunk is appended to the file as soon
    as it is produced, so the whole recording is never held in memory.
    """
    with wave.open(str(output_file.resolve()), "wb") as wav_file:
        wav_file.setnchannels(1)  # Mono audio
        wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit audio)
        wav_file.setframerate(24000)  # Sample rate
        for result in generate_audio(
            text, kokoro_language=kokoro_language, voice=voice, speed=speed
        ):
            logger.debug(result.phonemes)
            if result.audio is None:
                continue
            # NOTE(review): assumes result.audio holds float samples nominally
            # in [-1, 1] -- confirm against KPipeline.Result.
            # Clip before scaling: a sample even slightly outside that range
            # would not fit in int16 after the multiplication, and the cast
            # of an out-of-range float is not well defined.
            samples = np.clip(result.audio.numpy(), -1.0, 1.0)
            audio_bytes = (samples * 32767).astype(np.int16).tobytes()
            wav_file.writeframes(audio_bytes)
def main() -> None:
    """Parse the command line and synthesize the requested text to a WAV file.

    The text comes from exactly one source: ``--text``, ``--input-file``
    (read as UTF-8), or standard input when neither option is given. The
    language is ``--language`` when provided, otherwise the first character
    of the voice name.

    The process exits through ``parser.error`` (usage message on stderr,
    status 2) when both ``--text`` and ``--input-file`` are given, or when no
    language can be derived because the voice name is empty.
    """
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m",
        "--voice",
        default="af_heart",
        help="Voice to use",
    )
    parser.add_argument(
        "-l",
        "--language",
        help="Language to use (defaults to the one corresponding to the voice)",
        choices=languages,
    )
    parser.add_argument(
        "-o",
        "--output-file",
        "--output_file",
        type=Path,
        help="Path to output WAV file",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input-file",
        "--input_file",
        type=Path,
        help="Path to input text file (default: stdin)",
    )
    parser.add_argument(
        "-t",
        "--text",
        help="Text to use instead of reading from stdin",
    )
    parser.add_argument(
        "-s",
        "--speed",
        type=float,
        default=1.0,
        help="Speech speed",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Print DEBUG messages to console",
    )
    args = parser.parse_args()

    # logger.level() only looks up (or defines) a level; it does not change
    # which messages are emitted. Reinstall the sink with an explicit
    # threshold so that --debug really controls DEBUG output.
    logger.remove()
    logger.add(sys.stderr, level="DEBUG" if args.debug else "INFO")
    logger.debug(args)

    if args.text is not None and args.input_file is not None:
        parser.error("You cannot specify both 'text' and 'input_file'")

    # Slicing instead of indexing: an empty voice name becomes a usage error
    # rather than an IndexError traceback.
    lang = args.language or args.voice[:1]
    if not lang:
        parser.error("Cannot infer the language from an empty voice name; use --language")

    if args.text is not None:
        text = args.text
    elif args.input_file is not None:
        # Explicit encoding: the platform default is not UTF-8 everywhere,
        # which would garble non-ASCII input text.
        text = args.input_file.read_text(encoding="utf-8")
    else:
        # The hint is only useful to someone typing at a terminal; skip it
        # when the input is piped or redirected.
        if sys.stdin.isatty():
            print("Press Ctrl+D to stop reading input and start generating", flush=True)
        # Lines read from stdin keep their own newline, so read the stream
        # as-is instead of joining the lines with an extra "\n".
        text = sys.stdin.read()
    logger.debug(f"Input text: {text!r}")

    out_file: Path = args.output_file
    if out_file.suffix != ".wav":
        logger.warning("The output file name should end with .wav")

    generate_and_save_audio(
        output_file=out_file,
        text=text,
        kokoro_language=lang,
        voice=args.voice,
        speed=args.speed,
    )
# Run the CLI when this module is executed directly (e.g. `python3 -m kokoro`).
if __name__ == "__main__":
    main()

View File

@@ -3,7 +3,6 @@ from dataclasses import dataclass
from huggingface_hub import hf_hub_download from huggingface_hub import hf_hub_download
from loguru import logger from loguru import logger
from misaki import en, espeak from misaki import en, espeak
from numbers import Number
from typing import Generator, List, Optional, Tuple, Union from typing import Generator, List, Optional, Tuple, Union
import re import re
import torch import torch
@@ -219,7 +218,7 @@ class KPipeline:
model: KModel, model: KModel,
ps: str, ps: str,
pack: torch.FloatTensor, pack: torch.FloatTensor,
speed: Number = 1 speed: float = 1
) -> KModel.Output: ) -> KModel.Output:
return model(ps, pack[len(ps)-1], speed, return_output=True) return model(ps, pack[len(ps)-1], speed, return_output=True)
@@ -227,7 +226,7 @@ class KPipeline:
self, self,
tokens: Union[str, List[en.MToken]], tokens: Union[str, List[en.MToken]],
voice: str, voice: str,
speed: Number = 1, speed: float = 1,
model: Optional[KModel] = None model: Optional[KModel] = None
) -> Generator['KPipeline.Result', None, None]: ) -> Generator['KPipeline.Result', None, None]:
"""Generate audio from either raw phonemes or pre-processed tokens. """Generate audio from either raw phonemes or pre-processed tokens.
@@ -343,7 +342,7 @@ class KPipeline:
self, self,
text: Union[str, List[str]], text: Union[str, List[str]],
voice: Optional[str] = None, voice: Optional[str] = None,
speed: Number = 1, speed: float = 1,
split_pattern: Optional[str] = r'\n+', split_pattern: Optional[str] = r'\n+',
model: Optional[KModel] = None model: Optional[KModel] = None
) -> Generator['KPipeline.Result', None, None]: ) -> Generator['KPipeline.Result', None, None]:

View File

@@ -1,3 +1,39 @@
[build-system] [build-system]
requires = ["setuptools", "wheel"] requires = ["hatchling"]
build-backend = "setuptools.build_meta" build-backend = "hatchling.build"
[project]
name = "kokoro"
version = "0.7.16"
description = "TTS"
readme = "README.md"
authors = [
{ name="hexgrad", email="hello@hexgrad.com" }
]
license = { file = "LICENSE" }
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent"
]
requires-python = ">=3.10, <3.13"
dependencies = [
"huggingface_hub",
"loguru",
"misaki[en]>=0.7.16",
"numpy==1.26.4",
"scipy",
"torch",
"transformers"
]
[project.scripts]
kokoro = "kokoro.__main__:main"
[tool.hatch.build.targets.wheel]
only-include = ["kokoro"]
only-packages = true
[project.urls]
Homepage = "https://github.com/hexgrad/kokoro"
Repository = "https://github.com/hexgrad/kokoro"

View File

@@ -1,29 +0,0 @@
# Packaging script: declares the distribution metadata and dependencies.
from pathlib import Path

from setuptools import setup, find_packages

# Read the README through pathlib so the file handle is closed right away,
# and decode it as UTF-8 instead of relying on the platform default encoding.
long_description = Path('README.md').read_text(encoding='utf-8')

setup(
    name='kokoro',
    version='0.7.16',
    packages=find_packages(),
    install_requires=[
        'huggingface_hub',
        'loguru',
        'misaki[en]>=0.7.16',
        'numpy==1.26.4',
        'scipy',
        'torch',
        'transformers',
    ],
    python_requires='>=3.7',
    author='hexgrad',
    author_email='hello@hexgrad.com',
    description='TTS',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/hexgrad/kokoro',
    license='Apache 2.0',
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: Apache Software License',
        'Operating System :: OS Independent',
    ],
)

1801
uv.lock generated Normal file

File diff suppressed because it is too large Load Diff