Add a CLI interface (#102)
* Add a CLI interface and update packaging configuration * Support multiple lines in stdin --------- Co-authored-by: Eric Trotta <eric.oliveira@magva.com.br>
This commit is contained in:
148
kokoro/__main__.py
Normal file
148
kokoro/__main__.py
Normal file
@@ -0,0 +1,148 @@
|
||||
"""Kokoro TTS CLI
|
||||
Example usage:
|
||||
python3 -m kokoro --text "The sky above the port was the color of television, tuned to a dead channel." -o file.wav --debug
|
||||
|
||||
echo "Bom dia mundo, como vão vocês" > text.txt
|
||||
python3 -m kokoro -i text.txt -l p --voice pm_alex -o audio.wav
|
||||
|
||||
Common issues:
|
||||
pip not installed: `uv pip install pip`
|
||||
(Temporary workaround while https://github.com/explosion/spaCy/issues/13747 is not fixed)
|
||||
|
||||
espeak not installed: `apt-get install espeak-ng`
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import wave
|
||||
from pathlib import Path
|
||||
from typing import Generator, TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
from loguru import logger
|
||||
|
||||
# Single-letter language codes offered as `choices` for the CLI's
# -l/--language option. Each code is annotated with the language it selects.
languages = [
    "a",  # American English
    "b",  # British English
    "h",  # Hindi
    "e",  # Spanish
    "f",  # French
    "i",  # Italian
    "p",  # Brazilian Portuguese
    "j",  # Japanese
    "z",  # Mandarin Chinese
]
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from kokoro import KPipeline
|
||||
|
||||
|
||||
def generate_audio(
    text: str, kokoro_language: str, voice: str, speed=1
) -> Generator["KPipeline.Result", None, None]:
    """Lazily run the Kokoro pipeline over ``text`` and yield its results.

    Args:
        text: Text to synthesize. It is handed to the pipeline with the
            split pattern ``\\n+``.
        kokoro_language: Language code passed to ``KPipeline`` as ``lang_code``.
        voice: Voice name forwarded to the pipeline.
        speed: Speed value forwarded to the pipeline unchanged.

    Yields:
        Every item produced by the pipeline call, in order.
    """
    # Imported inside the function, so the package is only loaded once the
    # generator is actually advanced for the first time.
    from kokoro import KPipeline

    # A mismatch between voice prefix and language is reported but not fatal.
    voice_matches_language = voice.startswith(kokoro_language)
    if not voice_matches_language:
        logger.warning(f"Voice {voice} is not made for language {kokoro_language}")

    tts = KPipeline(lang_code=kokoro_language)
    yield from tts(text, voice=voice, speed=speed, split_pattern=r"\n+")
|
||||
|
||||
|
||||
def generate_and_save_audio(
    output_file: Path, text: str, kokoro_language: str, voice: str, speed=1
) -> None:
    """Synthesize ``text`` and stream the audio into a mono 16-bit WAV file.

    The file is opened first and each generated chunk is appended as soon as
    it is produced, so the whole utterance is never held in memory at once.

    Args:
        output_file: Destination path; it is resolved to an absolute path
            before being opened for writing.
        text: Text to synthesize.
        kokoro_language: Language code forwarded to ``generate_audio``.
        voice: Voice name forwarded to ``generate_audio``.
        speed: Speed value forwarded to ``generate_audio``.
    """
    with wave.open(str(output_file.resolve()), "wb") as wav_file:
        wav_file.setnchannels(1)  # Mono audio
        wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit audio)
        wav_file.setframerate(24000)  # Sample rate

        for result in generate_audio(
            text, kokoro_language=kokoro_language, voice=voice, speed=speed
        ):
            logger.debug(result.phonemes)
            if result.audio is None:
                # Nothing to write for this chunk.
                continue
            # Clamp to [-1, 1] before scaling: a sample even slightly outside
            # that range would otherwise overflow int16 and wrap around,
            # which is audible as a loud click.
            samples = np.clip(result.audio.numpy(), -1.0, 1.0)
            # WAV PCM data is little-endian; "<i2" makes that explicit
            # instead of relying on the host's native byte order.
            audio_bytes = (samples * 32767).astype("<i2").tobytes()
            wav_file.writeframes(audio_bytes)
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: parse arguments and write the WAV file.

    The text to synthesize comes from exactly one source, checked in this
    order: ``--text``, then ``--input-file``, then standard input. When no
    language is given, the first character of the voice name is used as the
    language code.

    Usage errors (including passing both ``--text`` and ``--input-file``) are
    reported through ``argparse``, which prints the message and exits with
    status 2.
    """
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m",
        "--voice",
        default="af_heart",
        help="Voice to use",
    )
    parser.add_argument(
        "-l",
        "--language",
        help="Language to use (defaults to the one corresponding to the voice)",
        choices=languages,
    )
    parser.add_argument(
        "-o",
        "--output-file",
        "--output_file",
        type=Path,
        help="Path to output WAV file",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input-file",
        "--input_file",
        type=Path,
        help="Path to input text file (default: stdin)",
    )
    parser.add_argument(
        "-t",
        "--text",
        help="Text to use instead of reading from stdin",
    )
    parser.add_argument(
        "-s",
        "--speed",
        type=float,
        default=1.0,
        help="Speech speed",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Print DEBUG messages to console",
    )
    args = parser.parse_args()

    # logger.level("DEBUG") only looks a level up; it does not change what is
    # emitted. Replace the existing handler(s) with one whose threshold
    # follows the --debug flag so the flag actually controls verbosity.
    logger.remove()
    logger.add(sys.stderr, level="DEBUG" if args.debug else "INFO")
    logger.debug(args)

    lang = args.language or args.voice[0]

    if args.text is not None and args.input_file is not None:
        parser.error("You cannot specify both 'text' and 'input_file'")

    if args.text is not None:
        # `is not None` so that an explicit empty --text is honoured rather
        # than silently falling through to reading stdin.
        text = args.text
    elif args.input_file is not None:
        file: Path = args.input_file
        # Explicit encoding: do not depend on the platform's locale default.
        text = file.read_text(encoding="utf-8")
    else:
        if sys.stdin.isatty():
            # Only prompt a human, and keep the hint off stdout.
            print(
                "Press Ctrl+D to stop reading input and start generating",
                file=sys.stderr,
                flush=True,
            )
        # Lines read from stdin already end in a newline, so reading the
        # stream whole keeps the text as typed instead of doubling each one.
        text = sys.stdin.read()

    logger.debug(f"Input text: {text!r}")

    out_file: Path = args.output_file
    if out_file.suffix != ".wav":
        logger.warning("The output file name should end with .wav")
    generate_and_save_audio(
        output_file=out_file,
        text=text,
        kokoro_language=lang,
        voice=args.voice,
        speed=args.speed,
    )


if __name__ == "__main__":
    main()
|
||||
@@ -3,7 +3,6 @@ from dataclasses import dataclass
|
||||
from huggingface_hub import hf_hub_download
|
||||
from loguru import logger
|
||||
from misaki import en, espeak
|
||||
from numbers import Number
|
||||
from typing import Generator, List, Optional, Tuple, Union
|
||||
import re
|
||||
import torch
|
||||
@@ -219,7 +218,7 @@ class KPipeline:
|
||||
model: KModel,
|
||||
ps: str,
|
||||
pack: torch.FloatTensor,
|
||||
speed: Number = 1
|
||||
speed: float = 1
|
||||
) -> KModel.Output:
|
||||
return model(ps, pack[len(ps)-1], speed, return_output=True)
|
||||
|
||||
@@ -227,7 +226,7 @@ class KPipeline:
|
||||
self,
|
||||
tokens: Union[str, List[en.MToken]],
|
||||
voice: str,
|
||||
speed: Number = 1,
|
||||
speed: float = 1,
|
||||
model: Optional[KModel] = None
|
||||
) -> Generator['KPipeline.Result', None, None]:
|
||||
"""Generate audio from either raw phonemes or pre-processed tokens.
|
||||
@@ -343,7 +342,7 @@ class KPipeline:
|
||||
self,
|
||||
text: Union[str, List[str]],
|
||||
voice: Optional[str] = None,
|
||||
speed: Number = 1,
|
||||
speed: float = 1,
|
||||
split_pattern: Optional[str] = r'\n+',
|
||||
model: Optional[KModel] = None
|
||||
) -> Generator['KPipeline.Result', None, None]:
|
||||
|
||||
@@ -1,3 +1,39 @@
|
||||
[build-system]
|
||||
requires = ["setuptools", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "kokoro"
|
||||
version = "0.7.16"
|
||||
description = "TTS"
|
||||
readme = "README.md"
|
||||
authors = [
|
||||
{ name="hexgrad", email="hello@hexgrad.com" }
|
||||
]
|
||||
license = { file = "LICENSE" }
|
||||
classifiers = [
|
||||
"Programming Language :: Python :: 3",
|
||||
"License :: OSI Approved :: Apache Software License",
|
||||
"Operating System :: OS Independent"
|
||||
]
|
||||
requires-python = ">=3.10, <3.13"
|
||||
dependencies = [
|
||||
"huggingface_hub",
|
||||
"loguru",
|
||||
"misaki[en]>=0.7.16",
|
||||
"numpy==1.26.4",
|
||||
"scipy",
|
||||
"torch",
|
||||
"transformers"
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
kokoro = "kokoro.__main__:main"
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
only-include = ["kokoro"]
|
||||
only-packages = true
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://github.com/hexgrad/kokoro"
|
||||
Repository = "https://github.com/hexgrad/kokoro"
|
||||
|
||||
29
setup.py
29
setup.py
@@ -1,29 +0,0 @@
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
setup(
|
||||
name='kokoro',
|
||||
version='0.7.16',
|
||||
packages=find_packages(),
|
||||
install_requires=[
|
||||
'huggingface_hub',
|
||||
'loguru',
|
||||
'misaki[en]>=0.7.16',
|
||||
'numpy==1.26.4',
|
||||
'scipy',
|
||||
'torch',
|
||||
'transformers',
|
||||
],
|
||||
python_requires='>=3.7',
|
||||
author='hexgrad',
|
||||
author_email='hello@hexgrad.com',
|
||||
description='TTS',
|
||||
long_description=open('README.md').read(),
|
||||
long_description_content_type='text/markdown',
|
||||
url='https://github.com/hexgrad/kokoro',
|
||||
license='Apache 2.0',
|
||||
classifiers=[
|
||||
'Programming Language :: Python :: 3',
|
||||
'License :: OSI Approved :: Apache Software License',
|
||||
'Operating System :: OS Independent',
|
||||
],
|
||||
)
|
||||
Reference in New Issue
Block a user