From e43d62643e8b314018193eda9c0e49ba3c3d23f3 Mon Sep 17 00:00:00 2001 From: hexgrad <166769057+hexgrad@users.noreply.github.com> Date: Tue, 18 Mar 2025 11:16:34 -0700 Subject: [PATCH] Remove scipy (#139) * Remove scipy * No longer need to replace T * Update README.md * Remove numpy version lock * Update README.md * Update uv.lock --- README.md | 40 ++++++++++++++++++++++---------- kokoro/__init__.py | 2 +- kokoro/custom_stft.py | 7 +++--- kokoro/istftnet.py | 22 ++++++++---------- kokoro/pipeline.py | 2 +- pyproject.toml | 7 +++--- uv.lock | 53 +++++++++++++------------------------------ 7 files changed, 62 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index 162fc53..b8cf15b 100644 --- a/README.md +++ b/README.md @@ -5,10 +5,31 @@ An inference library for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) > **Kokoro** is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, Kokoro can be deployed anywhere from production environments to personal projects. ### Usage -You can run this cell on [Google Colab](https://colab.research.google.com/). [Listen to samples](https://huggingface.co/hexgrad/Kokoro-82M/blob/main/SAMPLES.md). +You can run this basic cell on [Google Colab](https://colab.research.google.com/). [Listen to samples](https://huggingface.co/hexgrad/Kokoro-82M/blob/main/SAMPLES.md). +```py +!pip install -q kokoro>=0.9.2 soundfile +!apt-get -qq -y install espeak-ng > /dev/null 2>&1 +from kokoro import KPipeline +from IPython.display import display, Audio +import soundfile as sf +import torch +pipeline = KPipeline(lang_code='a') +text = ''' +[Kokoro](/kˈOkəɹO/) is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, [Kokoro](/kˈOkəɹO/) can be deployed anywhere from production environments to personal projects. +''' +generator = pipeline(text, voice='af_heart') +for i, (gs, ps, audio) in enumerate(generator): + print(i, gs, ps) + display(Audio(data=audio, rate=24000, autoplay=i==0)) + sf.write(f'{i}.wav', audio, 24000) +``` +Under the hood, `kokoro` uses [`misaki`](https://pypi.org/project/misaki/), a G2P library at https://github.com/hexgrad/misaki + +### Advanced Usage +You can run this advanced cell on [Google Colab](https://colab.research.google.com/). ```py # 1️⃣ Install kokoro -!pip install -q kokoro>=0.8.4 soundfile +!pip install -q kokoro>=0.9.2 soundfile # 2️⃣ Install espeak, used for English OOD fallback and some non-English languages !apt-get -qq -y install espeak-ng > /dev/null 2>&1 # 🇪🇸 'e' => Spanish es @@ -50,13 +71,12 @@ generator = pipeline( text, voice='af_heart', # <= change voice here speed=1, split_pattern=r'\n+' ) - # Alternatively, load voice tensor directly: -voice_tensor = torch.load('path/to/voice.pt', weights_only=True) -generator = pipeline( - text, voice=voice_tensor, - speed=1, split_pattern=r'\n+' -) +# voice_tensor = torch.load('path/to/voice.pt', weights_only=True) +# generator = pipeline( +# text, voice=voice_tensor, +# speed=1, split_pattern=r'\n+' +# ) for i, (gs, ps, audio) in enumerate(generator): print(i) # i => index @@ -66,10 +86,7 @@ for i, (gs, ps, audio) in enumerate(generator): sf.write(f'{i}.wav', audio, 24000) # save each audio file ``` -Under the hood, `kokoro` uses [`misaki`](https://pypi.org/project/misaki/), a G2P library at https://github.com/hexgrad/misaki - ### Conda Environment - Use the following conda `environment.yml` if you're facing any dependency issues. ```yaml name: kokoro @@ -85,7 +102,6 @@ dependencies: ``` ### Acknowledgements - - 🛠️ [@yl4579](https://huggingface.co/yl4579) for architecting StyleTTS 2. - 🏆 [@Pendrokar](https://huggingface.co/Pendrokar) for adding Kokoro as a contender in the TTS Spaces Arena. - 📊 Thank you to everyone who contributed synthetic training data. diff --git a/kokoro/__init__.py b/kokoro/__init__.py index 9669a38..3dfefcb 100644 --- a/kokoro/__init__.py +++ b/kokoro/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.8.4' +__version__ = '0.9.2' from loguru import logger import sys diff --git a/kokoro/custom_stft.py b/kokoro/custom_stft.py index 15f3378..c9cf0d2 100644 --- a/kokoro/custom_stft.py +++ b/kokoro/custom_stft.py @@ -1,9 +1,8 @@ from attr import attr +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F -import numpy as np -from scipy.signal import get_window class CustomSTFT(nn.Module): """ @@ -37,8 +36,8 @@ class CustomSTFT(nn.Module): self.freq_bins = self.n_fft // 2 + 1 # Build window - win_np = get_window(window, self.win_length, fftbins=True).astype(np.float32) - window_tensor = torch.from_numpy(win_np) + assert window == 'hann', window + window_tensor = torch.hann_window(win_length, periodic=True, dtype=torch.float32) if self.win_length < self.n_fft: # Zero-pad up to n_fft extra = self.n_fft - self.win_length diff --git a/kokoro/istftnet.py b/kokoro/istftnet.py index 5e4143e..cb22279 100644 --- a/kokoro/istftnet.py +++ b/kokoro/istftnet.py @@ -1,14 +1,11 @@ -# https://github.com/yl4579/StyleTTS2/blob/main/Modules/istftnet.py -import math -from scipy.signal import get_window +# ADAPTED from https://github.com/yl4579/StyleTTS2/blob/main/Modules/istftnet.py +from kokoro.custom_stft import CustomSTFT from torch.nn.utils import weight_norm -import numpy as np +import math import torch import torch.nn as nn import torch.nn.functional as F -from kokoro.custom_stft import CustomSTFT - # https://github.com/yl4579/StyleTTS2/blob/main/Modules/utils.py def init_weights(m, mean=0.0, std=0.01): @@ -86,7 +83,8 @@ class TorchSTFT(nn.Module): self.filter_length = filter_length self.hop_length = hop_length self.win_length = win_length - self.window = torch.from_numpy(get_window(window, win_length, fftbins=True).astype(np.float32)) + assert window == 'hann', window + self.window = torch.hann_window(win_length, periodic=True, dtype=torch.float32) def transform(self, input_data): forward_transform = torch.stft( @@ -120,7 +118,7 @@ class SineGen(nn.Module): voiced_thoreshold: F0 threshold for U/V classification (default 0) flag_for_pulse: this SinGen is used inside PulseGen (default False) Note: when flag_for_pulse is True, the first time step of a voiced - segment is always sin(np.pi) or cos(0) + segment is always sin(torch.pi) or cos(0) """ def __init__(self, samp_rate, upsample_scale, harmonic_num=0, sine_amp=0.1, noise_std=0.003, @@ -146,7 +144,7 @@ class SineGen(nn.Module): where dim indicates fundamental tone and overtones """ # convert to F0 in rad. The interger part n can be ignored - # because 2 * np.pi * n doesn't affect phase + # because 2 * torch.pi * n doesn't affect phase rad_values = (f0_values / self.sampling_rate) % 1 # initial phase noise (no noise for fundamental component) rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device) @@ -155,7 +153,7 @@ class SineGen(nn.Module): # instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad) if not self.flag_for_pulse: rad_values = F.interpolate(rad_values.transpose(1, 2), scale_factor=1/self.upsample_scale, mode="linear").transpose(1, 2) - phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi + phase = torch.cumsum(rad_values, dim=1) * 2 * torch.pi phase = F.interpolate(phase.transpose(1, 2) * self.upsample_scale, scale_factor=self.upsample_scale, mode="linear").transpose(1, 2) sines = torch.sin(phase) else: @@ -181,7 +179,7 @@ class SineGen(nn.Module): # within the previous voiced segment. i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1) # get the sines - sines = torch.cos(i_phase * 2 * np.pi) + sines = torch.cos(i_phase * 2 * torch.pi) return sines def forward(self, f0): @@ -379,7 +377,7 @@ class AdainResBlk1d(nn.Module): def forward(self, x, s): out = self._residual(x, s) - out = (out + self._shortcut(x)) / np.sqrt(2) + out = (out + self._shortcut(x)) * torch.rsqrt(torch.tensor(2)) return out diff --git a/kokoro/pipeline.py b/kokoro/pipeline.py index 6afa9a4..f23b13e 100644 --- a/kokoro/pipeline.py +++ b/kokoro/pipeline.py @@ -200,7 +200,7 @@ class KPipeline: pcount = 0 for t in tokens: # American English: ɾ => T - t.phonemes = '' if t.phonemes is None else t.phonemes.replace('ɾ', 'T') + t.phonemes = '' if t.phonemes is None else t.phonemes#.replace('ɾ', 'T') next_ps = t.phonemes + (' ' if t.whitespace else '') next_pcount = pcount + len(next_ps.rstrip()) if next_pcount > 510: diff --git a/pyproject.toml b/pyproject.toml index ea7d147..0220ec1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "kokoro" -version = "0.8.4" +version = "0.9.2" description = "TTS" readme = "README.md" authors = [ @@ -20,9 +20,8 @@ requires-python = ">=3.10, <3.13" dependencies = [ "huggingface_hub", "loguru", - "misaki[en]>=0.8.4", - "numpy==1.26.4", - "scipy", + "misaki[en]>=0.9.2", + "numpy", "torch", "transformers" ] diff --git a/uv.lock b/uv.lock index 028b5b0..ab369dc 100644 --- a/uv.lock +++ b/uv.lock @@ -5,6 +5,15 @@ resolution-markers = [ "python_full_version < '3.12'", ] +[[package]] +name = "addict" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/85/ef/fd7649da8af11d93979831e8f1f8097e85e82d5bfeabc8c68b39175d8e75/addict-2.4.0.tar.gz", hash = "sha256:b3b2210e0e067a281f5646c8c5db92e99b7231ea8b0eb5f74dbdf9e259d4e494", size = 9186 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6a/00/b08f23b7d7e1e14ce01419a467b583edbb93c6cdb8654e54a9cc579cd61f/addict-2.4.0-py3-none-any.whl", hash = "sha256:249bb56bbfd3cdc2a004ea0ff4c2b6ddc84d53bc2194761636eb314d5cfa5dfc", size = 3832 }, +] + [[package]] name = "annotated-types" version = "0.7.0" @@ -428,14 +437,13 @@ wheels = [ [[package]] name = "kokoro" -version = "0.7.16" +version = "0.9.2" source = { editable = "." } dependencies = [ { name = "huggingface-hub" }, { name = "loguru" }, { name = "misaki", extra = ["en"] }, { name = "numpy" }, - { name = "scipy" }, { name = "torch" }, { name = "transformers" }, ] @@ -444,9 +452,8 @@ dependencies = [ requires-dist = [ { name = "huggingface-hub" }, { name = "loguru" }, - { name = "misaki", extras = ["en"], specifier = ">=0.7.16" }, - { name = "numpy", specifier = "==1.26.4" }, - { name = "scipy" }, + { name = "misaki", extras = ["en"], specifier = ">=0.9.2" }, + { name = "numpy" }, { name = "torch" }, { name = "transformers" }, ] @@ -676,14 +683,15 @@ wheels = [ [[package]] name = "misaki" -version = "0.7.16" +version = "0.9.2" source = { registry = "https://pypi.org/simple" } dependencies = [ + { name = "addict" }, { name = "regex" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/6d/ea/704d49d375d475ac1dee144012fc9f4c4a7e408feaa4e15d5999226a147a/misaki-0.7.16.tar.gz", hash = "sha256:ca5a46456391934dc72edc31f20e1db89ddc30897866202cae67152e21bfafa6", size = 3663801 } +sdist = { url = "https://files.pythonhosted.org/packages/e1/3c/81960011deaeeb4a4a74ea418e3a321d615c484e6d9ad99388c13e082f31/misaki-0.9.2.tar.gz", hash = "sha256:cccd6b724c98efc5209ae7e26f8ea5d36ef02619b7a0ea434a6b960388ace8f3", size = 3708863 } wheels = [ - { url = "https://files.pythonhosted.org/packages/f0/96/979a05fb666983478efc40eb42436e39419e37c26e6d6b0fd4e55cae81db/misaki-0.7.16-py3-none-any.whl", hash = "sha256:a847cbeb69a1082c8c6f2b12ee94fd4e2546db13e534f1e6e0edd2da6892f30d", size = 3517386 }, + { url = "https://files.pythonhosted.org/packages/7c/8a/25315cf16af32560b4a47797f47b023da97fb83f7c45410683d6d30dc7bf/misaki-0.9.2-py3-none-any.whl", hash = "sha256:559232c0f5d8d9f9c2e006eebd0a3d1d714ec18e3d207bde110dc299547fe050", size = 3556820 }, ] [package.optional-dependencies] @@ -1317,35 +1325,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/86/ca/aa489392ec6fb59223ffce825461e1f811a3affd417121a2088be7a5758b/safetensors-0.5.2-cp38-abi3-win_amd64.whl", hash = "sha256:78abdddd03a406646107f973c7843276e7b64e5e32623529dc17f3d94a20f589", size = 303756 }, ] -[[package]] -name = "scipy" -version = "1.13.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/ae/00/48c2f661e2816ccf2ecd77982f6605b2950afe60f60a52b4cbbc2504aa8f/scipy-1.13.1.tar.gz", hash = "sha256:095a87a0312b08dfd6a6155cbbd310a8c51800fc931b8c0b84003014b874ed3c", size = 57210720 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/33/59/41b2529908c002ade869623b87eecff3e11e3ce62e996d0bdcb536984187/scipy-1.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:20335853b85e9a49ff7572ab453794298bcf0354d8068c5f6775a0eabf350aca", size = 39328076 }, - { url = "https://files.pythonhosted.org/packages/d5/33/f1307601f492f764062ce7dd471a14750f3360e33cd0f8c614dae208492c/scipy-1.13.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:d605e9c23906d1994f55ace80e0125c587f96c020037ea6aa98d01b4bd2e222f", size = 30306232 }, - { url = "https://files.pythonhosted.org/packages/c0/66/9cd4f501dd5ea03e4a4572ecd874936d0da296bd04d1c45ae1a4a75d9c3a/scipy-1.13.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cfa31f1def5c819b19ecc3a8b52d28ffdcc7ed52bb20c9a7589669dd3c250989", size = 33743202 }, - { url = "https://files.pythonhosted.org/packages/a3/ba/7255e5dc82a65adbe83771c72f384d99c43063648456796436c9a5585ec3/scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26264b282b9da0952a024ae34710c2aff7d27480ee91a2e82b7b7073c24722f", size = 38577335 }, - { url = "https://files.pythonhosted.org/packages/49/a5/bb9ded8326e9f0cdfdc412eeda1054b914dfea952bda2097d174f8832cc0/scipy-1.13.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:eccfa1906eacc02de42d70ef4aecea45415f5be17e72b61bafcfd329bdc52e94", size = 38820728 }, - { url = "https://files.pythonhosted.org/packages/12/30/df7a8fcc08f9b4a83f5f27cfaaa7d43f9a2d2ad0b6562cced433e5b04e31/scipy-1.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:2831f0dc9c5ea9edd6e51e6e769b655f08ec6db6e2e10f86ef39bd32eb11da54", size = 46210588 }, - { url = "https://files.pythonhosted.org/packages/b4/15/4a4bb1b15bbd2cd2786c4f46e76b871b28799b67891f23f455323a0cdcfb/scipy-1.13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:27e52b09c0d3a1d5b63e1105f24177e544a222b43611aaf5bc44d4a0979e32f9", size = 39333805 }, - { url = "https://files.pythonhosted.org/packages/ba/92/42476de1af309c27710004f5cdebc27bec62c204db42e05b23a302cb0c9a/scipy-1.13.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:54f430b00f0133e2224c3ba42b805bfd0086fe488835effa33fa291561932326", size = 30317687 }, - { url = "https://files.pythonhosted.org/packages/80/ba/8be64fe225360a4beb6840f3cbee494c107c0887f33350d0a47d55400b01/scipy-1.13.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e89369d27f9e7b0884ae559a3a956e77c02114cc60a6058b4e5011572eea9299", size = 33694638 }, - { url = "https://files.pythonhosted.org/packages/36/07/035d22ff9795129c5a847c64cb43c1fa9188826b59344fee28a3ab02e283/scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a78b4b3345f1b6f68a763c6e25c0c9a23a9fd0f39f5f3d200efe8feda560a5fa", size = 38569931 }, - { url = "https://files.pythonhosted.org/packages/d9/10/f9b43de37e5ed91facc0cfff31d45ed0104f359e4f9a68416cbf4e790241/scipy-1.13.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:45484bee6d65633752c490404513b9ef02475b4284c4cfab0ef946def50b3f59", size = 38838145 }, - { url = "https://files.pythonhosted.org/packages/4a/48/4513a1a5623a23e95f94abd675ed91cfb19989c58e9f6f7d03990f6caf3d/scipy-1.13.1-cp311-cp311-win_amd64.whl", hash = "sha256:5713f62f781eebd8d597eb3f88b8bf9274e79eeabf63afb4a737abc6c84ad37b", size = 46196227 }, - { url = "https://files.pythonhosted.org/packages/f2/7b/fb6b46fbee30fc7051913068758414f2721003a89dd9a707ad49174e3843/scipy-1.13.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5d72782f39716b2b3509cd7c33cdc08c96f2f4d2b06d51e52fb45a19ca0c86a1", size = 39357301 }, - { url = "https://files.pythonhosted.org/packages/dc/5a/2043a3bde1443d94014aaa41e0b50c39d046dda8360abd3b2a1d3f79907d/scipy-1.13.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:017367484ce5498445aade74b1d5ab377acdc65e27095155e448c88497755a5d", size = 30363348 }, - { url = "https://files.pythonhosted.org/packages/e7/cb/26e4a47364bbfdb3b7fb3363be6d8a1c543bcd70a7753ab397350f5f189a/scipy-1.13.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:949ae67db5fa78a86e8fa644b9a6b07252f449dcf74247108c50e1d20d2b4627", size = 33406062 }, - { url = "https://files.pythonhosted.org/packages/88/ab/6ecdc526d509d33814835447bbbeedbebdec7cca46ef495a61b00a35b4bf/scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de3ade0e53bc1f21358aa74ff4830235d716211d7d077e340c7349bc3542e884", size = 38218311 }, - { url = "https://files.pythonhosted.org/packages/0b/00/9f54554f0f8318100a71515122d8f4f503b1a2c4b4cfab3b4b68c0eb08fa/scipy-1.13.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2ac65fb503dad64218c228e2dc2d0a0193f7904747db43014645ae139c8fad16", size = 38442493 }, - { url = "https://files.pythonhosted.org/packages/3e/df/963384e90733e08eac978cd103c34df181d1fec424de383cdc443f418dd4/scipy-1.13.1-cp312-cp312-win_amd64.whl", hash = "sha256:cdd7dacfb95fea358916410ec61bbc20440f7860333aee6d882bb8046264e949", size = 45910955 }, -] - [[package]] name = "segments" version = "2.2.1"