Add a CLI interface (#102)
* Add a CLI interface and update packaging configuration * Support multiple lines in stdin --------- Co-authored-by: Eric Trotta <eric.oliveira@magva.com.br>
This commit is contained in:
148
kokoro/__main__.py
Normal file
148
kokoro/__main__.py
Normal file
@@ -0,0 +1,148 @@
|
|||||||
|
"""Kokoro TTS CLI
|
||||||
|
Example usage:
|
||||||
|
python3 -m kokoro --text "The sky above the port was the color of television, tuned to a dead channel." -o file.wav --debug
|
||||||
|
|
||||||
|
echo "Bom dia mundo, como vão vocês" > text.txt
|
||||||
|
python3 -m kokoro -i text.txt -l p --voice pm_alex -o audio.wav
|
||||||
|
|
||||||
|
Common issues:
|
||||||
|
pip not installed: `uv pip install pip`
|
||||||
|
(Temporary workaround while https://github.com/explosion/spaCy/issues/13747 is not fixed)
|
||||||
|
|
||||||
|
espeak not installed: `apt-get install espeak-ng`
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import wave
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Generator, TYPE_CHECKING
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
# Single-letter language codes mapped to the language each one stands for.
_LANGUAGE_NAMES = {
    "a": "American English",
    "b": "British English",
    "h": "Hindi",
    "e": "Spanish",
    "f": "French",
    "i": "Italian",
    "p": "Brazilian Portuguese",
    "j": "Japanese",
    "z": "Mandarin Chinese",
}

# Ordered list of the language codes (dicts preserve insertion order).
languages = list(_LANGUAGE_NAMES)
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kokoro import KPipeline
|
||||||
|
|
||||||
|
|
||||||
|
def generate_audio(
    text: str, kokoro_language: str, voice: str, speed=1
) -> Generator["KPipeline.Result", None, None]:
    """Yield the pipeline results produced for ``text``.

    A ``KPipeline`` is built for ``kokoro_language`` and called with the
    given ``voice`` and ``speed``; the text is split with the pattern
    ``r"\\n+"`` (runs of newlines). Everything the pipeline yields is passed
    through unchanged.

    A warning is logged when ``voice`` does not start with
    ``kokoro_language``; generation still proceeds in that case.
    """
    # Imported inside the function, so the kokoro package is only loaded
    # once the generator actually starts running.
    from kokoro import KPipeline

    voice_matches_language = voice.startswith(kokoro_language)
    if not voice_matches_language:
        logger.warning(f"Voice {voice} is not made for language {kokoro_language}")

    tts = KPipeline(lang_code=kokoro_language)
    results = tts(text, voice=voice, speed=speed, split_pattern=r"\n+")
    yield from results
|
||||||
|
|
||||||
|
|
||||||
|
def generate_and_save_audio(
    output_file: Path, text: str, kokoro_language: str, voice: str, speed: float = 1
) -> None:
    """Synthesize ``text`` and write it to ``output_file`` as a WAV file.

    The file is written as mono, 16-bit PCM at 24000 Hz. Results are consumed
    from ``generate_audio`` one at a time and appended to the file as they
    arrive; results whose ``audio`` is ``None`` are skipped.

    Args:
        output_file: Destination path; it is resolved before being opened.
        text: Text to synthesize.
        kokoro_language: Language code forwarded to ``generate_audio``.
        voice: Voice name forwarded to ``generate_audio``.
        speed: Speech speed forwarded to ``generate_audio``.
    """
    with wave.open(str(output_file.resolve()), "wb") as wav_file:
        wav_file.setnchannels(1)  # Mono audio
        wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit audio)
        wav_file.setframerate(24000)  # Sample rate

        for result in generate_audio(
            text, kokoro_language=kokoro_language, voice=voice, speed=speed
        ):
            logger.debug(result.phonemes)
            if result.audio is None:
                continue
            # NOTE(review): assumes result.audio is a float tensor nominally
            # in [-1, 1] -- confirm against KPipeline.Result.
            samples = result.audio.numpy()
            # Clip before scaling: a value outside [-1, 1] would otherwise
            # overflow int16 and wrap around, producing loud clicks.
            pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
            wav_file.writeframes(pcm.tobytes())
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point: read text and synthesize it to a WAV file.

    The text comes from exactly one source: ``--text``, ``--input-file`` or,
    when neither is given, standard input. The language defaults to the first
    character of the voice name when ``--language`` is not supplied.

    Raises:
        SystemExit: On invalid arguments (raised by argparse), including when
            both ``--text`` and ``--input-file`` are given.
    """
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m",
        "--voice",
        default="af_heart",
        help="Voice to use",
    )
    parser.add_argument(
        "-l",
        "--language",
        help="Language to use (defaults to the one corresponding to the voice)",
        choices=languages,
    )
    parser.add_argument(
        "-o",
        "--output-file",
        "--output_file",
        type=Path,
        help="Path to output WAV file",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input-file",
        "--input_file",
        type=Path,
        help="Path to input text file (default: stdin)",
    )
    parser.add_argument(
        "-t",
        "--text",
        help="Text to use instead of reading from stdin",
    )
    parser.add_argument(
        "-s",
        "--speed",
        type=float,
        default=1.0,
        help="Speech speed",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Print DEBUG messages to console",
    )
    args = parser.parse_args()

    # logger.level("DEBUG") only looks up a level definition; it does not
    # change what gets emitted. Replace the sink so --debug really controls
    # whether DEBUG messages reach the console.
    logger.remove()
    logger.add(sys.stderr, level="DEBUG" if args.debug else "INFO")
    logger.debug(args)

    lang = args.language or args.voice[0]

    if args.text is not None and args.input_file is not None:
        # Report through argparse (usage message + exit code 2) instead of a
        # bare Exception with a traceback.
        parser.error("You cannot specify both 'text' and 'input_file'")
    elif args.text is not None:
        # "is not None" so that an explicit empty --text is honoured rather
        # than silently falling back to reading stdin.
        text = args.text
    elif args.input_file is not None:
        file: Path = args.input_file
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        text = file.read_text(encoding="utf-8")
    else:
        if sys.stdin.isatty():
            # Only prompt interactive users, and keep the hint off stdout.
            print(
                "Press Ctrl+D to stop reading input and start generating",
                file=sys.stderr,
                flush=True,
            )
        # Lines read from stdin already end with a newline, so read the
        # stream as-is instead of joining the lines with extra newlines.
        text = sys.stdin.read()

    logger.debug(f"Input text: {text!r}")

    out_file: Path = args.output_file
    if out_file.suffix != ".wav":
        logger.warning("The output file name should end with .wav")
    generate_and_save_audio(
        output_file=out_file,
        text=text,
        kokoro_language=lang,
        voice=args.voice,
        speed=args.speed,
    )


if __name__ == "__main__":
    main()
|
||||||
@@ -3,7 +3,6 @@ from dataclasses import dataclass
|
|||||||
from huggingface_hub import hf_hub_download
|
from huggingface_hub import hf_hub_download
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from misaki import en, espeak
|
from misaki import en, espeak
|
||||||
from numbers import Number
|
|
||||||
from typing import Generator, List, Optional, Tuple, Union
|
from typing import Generator, List, Optional, Tuple, Union
|
||||||
import re
|
import re
|
||||||
import torch
|
import torch
|
||||||
@@ -219,7 +218,7 @@ class KPipeline:
|
|||||||
model: KModel,
|
model: KModel,
|
||||||
ps: str,
|
ps: str,
|
||||||
pack: torch.FloatTensor,
|
pack: torch.FloatTensor,
|
||||||
speed: Number = 1
|
speed: float = 1
|
||||||
) -> KModel.Output:
|
) -> KModel.Output:
|
||||||
return model(ps, pack[len(ps)-1], speed, return_output=True)
|
return model(ps, pack[len(ps)-1], speed, return_output=True)
|
||||||
|
|
||||||
@@ -227,7 +226,7 @@ class KPipeline:
|
|||||||
self,
|
self,
|
||||||
tokens: Union[str, List[en.MToken]],
|
tokens: Union[str, List[en.MToken]],
|
||||||
voice: str,
|
voice: str,
|
||||||
speed: Number = 1,
|
speed: float = 1,
|
||||||
model: Optional[KModel] = None
|
model: Optional[KModel] = None
|
||||||
) -> Generator['KPipeline.Result', None, None]:
|
) -> Generator['KPipeline.Result', None, None]:
|
||||||
"""Generate audio from either raw phonemes or pre-processed tokens.
|
"""Generate audio from either raw phonemes or pre-processed tokens.
|
||||||
@@ -343,7 +342,7 @@ class KPipeline:
|
|||||||
self,
|
self,
|
||||||
text: Union[str, List[str]],
|
text: Union[str, List[str]],
|
||||||
voice: Optional[str] = None,
|
voice: Optional[str] = None,
|
||||||
speed: Number = 1,
|
speed: float = 1,
|
||||||
split_pattern: Optional[str] = r'\n+',
|
split_pattern: Optional[str] = r'\n+',
|
||||||
model: Optional[KModel] = None
|
model: Optional[KModel] = None
|
||||||
) -> Generator['KPipeline.Result', None, None]:
|
) -> Generator['KPipeline.Result', None, None]:
|
||||||
|
|||||||
@@ -1,3 +1,39 @@
|
|||||||
[build-system]
|
[build-system]
|
||||||
requires = ["setuptools", "wheel"]
|
requires = ["hatchling"]
|
||||||
build-backend = "setuptools.build_meta"
|
build-backend = "hatchling.build"
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "kokoro"
|
||||||
|
version = "0.7.16"
|
||||||
|
description = "TTS"
|
||||||
|
readme = "README.md"
|
||||||
|
authors = [
|
||||||
|
{ name="hexgrad", email="hello@hexgrad.com" }
|
||||||
|
]
|
||||||
|
license = { file = "LICENSE" }
|
||||||
|
classifiers = [
|
||||||
|
"Programming Language :: Python :: 3",
|
||||||
|
"License :: OSI Approved :: Apache Software License",
|
||||||
|
"Operating System :: OS Independent"
|
||||||
|
]
|
||||||
|
requires-python = ">=3.10, <3.13"
|
||||||
|
dependencies = [
|
||||||
|
"huggingface_hub",
|
||||||
|
"loguru",
|
||||||
|
"misaki[en]>=0.7.16",
|
||||||
|
"numpy==1.26.4",
|
||||||
|
"scipy",
|
||||||
|
"torch",
|
||||||
|
"transformers"
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
kokoro = "kokoro.__main__:main"
|
||||||
|
|
||||||
|
[tool.hatch.build.targets.wheel]
|
||||||
|
only-include = ["kokoro"]
|
||||||
|
only-packages = true
|
||||||
|
|
||||||
|
[project.urls]
|
||||||
|
Homepage = "https://github.com/hexgrad/kokoro"
|
||||||
|
Repository = "https://github.com/hexgrad/kokoro"
|
||||||
|
|||||||
29
setup.py
29
setup.py
@@ -1,29 +0,0 @@
|
|||||||
from setuptools import setup, find_packages
|
|
||||||
|
|
||||||
setup(
|
|
||||||
name='kokoro',
|
|
||||||
version='0.7.16',
|
|
||||||
packages=find_packages(),
|
|
||||||
install_requires=[
|
|
||||||
'huggingface_hub',
|
|
||||||
'loguru',
|
|
||||||
'misaki[en]>=0.7.16',
|
|
||||||
'numpy==1.26.4',
|
|
||||||
'scipy',
|
|
||||||
'torch',
|
|
||||||
'transformers',
|
|
||||||
],
|
|
||||||
python_requires='>=3.7',
|
|
||||||
author='hexgrad',
|
|
||||||
author_email='hello@hexgrad.com',
|
|
||||||
description='TTS',
|
|
||||||
long_description=open('README.md').read(),
|
|
||||||
long_description_content_type='text/markdown',
|
|
||||||
url='https://github.com/hexgrad/kokoro',
|
|
||||||
license='Apache 2.0',
|
|
||||||
classifiers=[
|
|
||||||
'Programming Language :: Python :: 3',
|
|
||||||
'License :: OSI Approved :: Apache Software License',
|
|
||||||
'Operating System :: OS Independent',
|
|
||||||
],
|
|
||||||
)
|
|
||||||
Reference in New Issue
Block a user