Remove scipy (#139)
* Remove scipy
* No longer need to replace T
* Update README.md
* Remove numpy version lock
* Update README.md
* Update uv.lock
This commit is contained in:
40
README.md
40
README.md
@@ -5,10 +5,31 @@ An inference library for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)
|
||||
> **Kokoro** is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, Kokoro can be deployed anywhere from production environments to personal projects.
|
||||
|
||||
### Usage
|
||||
You can run this cell on [Google Colab](https://colab.research.google.com/). [Listen to samples](https://huggingface.co/hexgrad/Kokoro-82M/blob/main/SAMPLES.md).
|
||||
You can run this basic cell on [Google Colab](https://colab.research.google.com/). [Listen to samples](https://huggingface.co/hexgrad/Kokoro-82M/blob/main/SAMPLES.md).
|
||||
```py
|
||||
!pip install -q kokoro>=0.9.2 soundfile
|
||||
!apt-get -qq -y install espeak-ng > /dev/null 2>&1
|
||||
from kokoro import KPipeline
|
||||
from IPython.display import display, Audio
|
||||
import soundfile as sf
|
||||
import torch
|
||||
pipeline = KPipeline(lang_code='a')
|
||||
text = '''
|
||||
[Kokoro](/kˈOkəɹO/) is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, [Kokoro](/kˈOkəɹO/) can be deployed anywhere from production environments to personal projects.
|
||||
'''
|
||||
generator = pipeline(text, voice='af_heart')
|
||||
for i, (gs, ps, audio) in enumerate(generator):
|
||||
print(i, gs, ps)
|
||||
display(Audio(data=audio, rate=24000, autoplay=i==0))
|
||||
sf.write(f'{i}.wav', audio, 24000)
|
||||
```
|
||||
Under the hood, `kokoro` uses [`misaki`](https://pypi.org/project/misaki/), a G2P library at https://github.com/hexgrad/misaki
|
||||
|
||||
### Advanced Usage
|
||||
You can run this advanced cell on [Google Colab](https://colab.research.google.com/).
|
||||
```py
|
||||
# 1️⃣ Install kokoro
|
||||
!pip install -q kokoro>=0.8.4 soundfile
|
||||
!pip install -q kokoro>=0.9.2 soundfile
|
||||
# 2️⃣ Install espeak, used for English OOD fallback and some non-English languages
|
||||
!apt-get -qq -y install espeak-ng > /dev/null 2>&1
|
||||
# 🇪🇸 'e' => Spanish es
|
||||
@@ -50,13 +71,12 @@ generator = pipeline(
|
||||
text, voice='af_heart', # <= change voice here
|
||||
speed=1, split_pattern=r'\n+'
|
||||
)
|
||||
|
||||
# Alternatively, load voice tensor directly:
|
||||
voice_tensor = torch.load('path/to/voice.pt', weights_only=True)
|
||||
generator = pipeline(
|
||||
text, voice=voice_tensor,
|
||||
speed=1, split_pattern=r'\n+'
|
||||
)
|
||||
# voice_tensor = torch.load('path/to/voice.pt', weights_only=True)
|
||||
# generator = pipeline(
|
||||
# text, voice=voice_tensor,
|
||||
# speed=1, split_pattern=r'\n+'
|
||||
# )
|
||||
|
||||
for i, (gs, ps, audio) in enumerate(generator):
|
||||
print(i) # i => index
|
||||
@@ -66,10 +86,7 @@ for i, (gs, ps, audio) in enumerate(generator):
|
||||
sf.write(f'{i}.wav', audio, 24000) # save each audio file
|
||||
```
|
||||
|
||||
Under the hood, `kokoro` uses [`misaki`](https://pypi.org/project/misaki/), a G2P library at https://github.com/hexgrad/misaki
|
||||
|
||||
### Conda Environment
|
||||
|
||||
Use the following conda `environment.yml` if you're facing any dependency issues.
|
||||
```yaml
|
||||
name: kokoro
|
||||
@@ -85,7 +102,6 @@ dependencies:
|
||||
```
|
||||
|
||||
### Acknowledgements
|
||||
|
||||
- 🛠️ [@yl4579](https://huggingface.co/yl4579) for architecting StyleTTS 2.
|
||||
- 🏆 [@Pendrokar](https://huggingface.co/Pendrokar) for adding Kokoro as a contender in the TTS Spaces Arena.
|
||||
- 📊 Thank you to everyone who contributed synthetic training data.
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__version__ = '0.8.4'
|
||||
__version__ = '0.9.2'
|
||||
|
||||
from loguru import logger
|
||||
import sys
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
from attr import attr
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import numpy as np
|
||||
from scipy.signal import get_window
|
||||
|
||||
class CustomSTFT(nn.Module):
|
||||
"""
|
||||
@@ -37,8 +36,8 @@ class CustomSTFT(nn.Module):
|
||||
self.freq_bins = self.n_fft // 2 + 1
|
||||
|
||||
# Build window
|
||||
win_np = get_window(window, self.win_length, fftbins=True).astype(np.float32)
|
||||
window_tensor = torch.from_numpy(win_np)
|
||||
assert window == 'hann', window
|
||||
window_tensor = torch.hann_window(win_length, periodic=True, dtype=torch.float32)
|
||||
if self.win_length < self.n_fft:
|
||||
# Zero-pad up to n_fft
|
||||
extra = self.n_fft - self.win_length
|
||||
|
||||
@@ -1,14 +1,11 @@
|
||||
# https://github.com/yl4579/StyleTTS2/blob/main/Modules/istftnet.py
|
||||
import math
|
||||
from scipy.signal import get_window
|
||||
# ADAPTED from https://github.com/yl4579/StyleTTS2/blob/main/Modules/istftnet.py
|
||||
from kokoro.custom_stft import CustomSTFT
|
||||
from torch.nn.utils import weight_norm
|
||||
import numpy as np
|
||||
import math
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from kokoro.custom_stft import CustomSTFT
|
||||
|
||||
|
||||
# https://github.com/yl4579/StyleTTS2/blob/main/Modules/utils.py
|
||||
def init_weights(m, mean=0.0, std=0.01):
|
||||
@@ -86,7 +83,8 @@ class TorchSTFT(nn.Module):
|
||||
self.filter_length = filter_length
|
||||
self.hop_length = hop_length
|
||||
self.win_length = win_length
|
||||
self.window = torch.from_numpy(get_window(window, win_length, fftbins=True).astype(np.float32))
|
||||
assert window == 'hann', window
|
||||
self.window = torch.hann_window(win_length, periodic=True, dtype=torch.float32)
|
||||
|
||||
def transform(self, input_data):
|
||||
forward_transform = torch.stft(
|
||||
@@ -120,7 +118,7 @@ class SineGen(nn.Module):
|
||||
voiced_threshold: F0 threshold for U/V classification (default 0)
|
||||
flag_for_pulse: this SineGen is used inside PulseGen (default False)
|
||||
Note: when flag_for_pulse is True, the first time step of a voiced
|
||||
segment is always sin(np.pi) or cos(0)
|
||||
segment is always sin(torch.pi) or cos(0)
|
||||
"""
|
||||
def __init__(self, samp_rate, upsample_scale, harmonic_num=0,
|
||||
sine_amp=0.1, noise_std=0.003,
|
||||
@@ -146,7 +144,7 @@ class SineGen(nn.Module):
|
||||
where dim indicates fundamental tone and overtones
|
||||
"""
|
||||
# convert to F0 in rad. The integer part n can be ignored
|
||||
# because 2 * np.pi * n doesn't affect phase
|
||||
# because 2 * torch.pi * n doesn't affect phase
|
||||
rad_values = (f0_values / self.sampling_rate) % 1
|
||||
# initial phase noise (no noise for fundamental component)
|
||||
rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device)
|
||||
@@ -155,7 +153,7 @@ class SineGen(nn.Module):
|
||||
# instantaneous phase: sine[t] = sin(2*pi * \sum_{i=1}^{t} rad[i])
|
||||
if not self.flag_for_pulse:
|
||||
rad_values = F.interpolate(rad_values.transpose(1, 2), scale_factor=1/self.upsample_scale, mode="linear").transpose(1, 2)
|
||||
phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
|
||||
phase = torch.cumsum(rad_values, dim=1) * 2 * torch.pi
|
||||
phase = F.interpolate(phase.transpose(1, 2) * self.upsample_scale, scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
|
||||
sines = torch.sin(phase)
|
||||
else:
|
||||
@@ -181,7 +179,7 @@ class SineGen(nn.Module):
|
||||
# within the previous voiced segment.
|
||||
i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
|
||||
# get the sines
|
||||
sines = torch.cos(i_phase * 2 * np.pi)
|
||||
sines = torch.cos(i_phase * 2 * torch.pi)
|
||||
return sines
|
||||
|
||||
def forward(self, f0):
|
||||
@@ -379,7 +377,7 @@ class AdainResBlk1d(nn.Module):
|
||||
|
||||
def forward(self, x, s):
|
||||
out = self._residual(x, s)
|
||||
out = (out + self._shortcut(x)) / np.sqrt(2)
|
||||
out = (out + self._shortcut(x)) * torch.rsqrt(torch.tensor(2))
|
||||
return out
|
||||
|
||||
|
||||
|
||||
@@ -200,7 +200,7 @@ class KPipeline:
|
||||
pcount = 0
|
||||
for t in tokens:
|
||||
# American English: ɾ => T
|
||||
t.phonemes = '' if t.phonemes is None else t.phonemes.replace('ɾ', 'T')
|
||||
t.phonemes = '' if t.phonemes is None else t.phonemes#.replace('ɾ', 'T')
|
||||
next_ps = t.phonemes + (' ' if t.whitespace else '')
|
||||
next_pcount = pcount + len(next_ps.rstrip())
|
||||
if next_pcount > 510:
|
||||
|
||||
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "kokoro"
|
||||
version = "0.8.4"
|
||||
version = "0.9.2"
|
||||
description = "TTS"
|
||||
readme = "README.md"
|
||||
authors = [
|
||||
@@ -20,9 +20,8 @@ requires-python = ">=3.10, <3.13"
|
||||
dependencies = [
|
||||
"huggingface_hub",
|
||||
"loguru",
|
||||
"misaki[en]>=0.8.4",
|
||||
"numpy==1.26.4",
|
||||
"scipy",
|
||||
"misaki[en]>=0.9.2",
|
||||
"numpy",
|
||||
"torch",
|
||||
"transformers"
|
||||
]
|
||||
|
||||
53
uv.lock
generated
53
uv.lock
generated
@@ -5,6 +5,15 @@ resolution-markers = [
|
||||
"python_full_version < '3.12'",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "addict"
|
||||
version = "2.4.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/85/ef/fd7649da8af11d93979831e8f1f8097e85e82d5bfeabc8c68b39175d8e75/addict-2.4.0.tar.gz", hash = "sha256:b3b2210e0e067a281f5646c8c5db92e99b7231ea8b0eb5f74dbdf9e259d4e494", size = 9186 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/6a/00/b08f23b7d7e1e14ce01419a467b583edbb93c6cdb8654e54a9cc579cd61f/addict-2.4.0-py3-none-any.whl", hash = "sha256:249bb56bbfd3cdc2a004ea0ff4c2b6ddc84d53bc2194761636eb314d5cfa5dfc", size = 3832 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "annotated-types"
|
||||
version = "0.7.0"
|
||||
@@ -428,14 +437,13 @@ wheels = [
|
||||
|
||||
[[package]]
|
||||
name = "kokoro"
|
||||
version = "0.7.16"
|
||||
version = "0.9.2"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "huggingface-hub" },
|
||||
{ name = "loguru" },
|
||||
{ name = "misaki", extra = ["en"] },
|
||||
{ name = "numpy" },
|
||||
{ name = "scipy" },
|
||||
{ name = "torch" },
|
||||
{ name = "transformers" },
|
||||
]
|
||||
@@ -444,9 +452,8 @@ dependencies = [
|
||||
requires-dist = [
|
||||
{ name = "huggingface-hub" },
|
||||
{ name = "loguru" },
|
||||
{ name = "misaki", extras = ["en"], specifier = ">=0.7.16" },
|
||||
{ name = "numpy", specifier = "==1.26.4" },
|
||||
{ name = "scipy" },
|
||||
{ name = "misaki", extras = ["en"], specifier = ">=0.9.2" },
|
||||
{ name = "numpy" },
|
||||
{ name = "torch" },
|
||||
{ name = "transformers" },
|
||||
]
|
||||
@@ -676,14 +683,15 @@ wheels = [
|
||||
|
||||
[[package]]
|
||||
name = "misaki"
|
||||
version = "0.7.16"
|
||||
version = "0.9.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "addict" },
|
||||
{ name = "regex" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/6d/ea/704d49d375d475ac1dee144012fc9f4c4a7e408feaa4e15d5999226a147a/misaki-0.7.16.tar.gz", hash = "sha256:ca5a46456391934dc72edc31f20e1db89ddc30897866202cae67152e21bfafa6", size = 3663801 }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/e1/3c/81960011deaeeb4a4a74ea418e3a321d615c484e6d9ad99388c13e082f31/misaki-0.9.2.tar.gz", hash = "sha256:cccd6b724c98efc5209ae7e26f8ea5d36ef02619b7a0ea434a6b960388ace8f3", size = 3708863 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/f0/96/979a05fb666983478efc40eb42436e39419e37c26e6d6b0fd4e55cae81db/misaki-0.7.16-py3-none-any.whl", hash = "sha256:a847cbeb69a1082c8c6f2b12ee94fd4e2546db13e534f1e6e0edd2da6892f30d", size = 3517386 },
|
||||
{ url = "https://files.pythonhosted.org/packages/7c/8a/25315cf16af32560b4a47797f47b023da97fb83f7c45410683d6d30dc7bf/misaki-0.9.2-py3-none-any.whl", hash = "sha256:559232c0f5d8d9f9c2e006eebd0a3d1d714ec18e3d207bde110dc299547fe050", size = 3556820 },
|
||||
]
|
||||
|
||||
[package.optional-dependencies]
|
||||
@@ -1317,35 +1325,6 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/86/ca/aa489392ec6fb59223ffce825461e1f811a3affd417121a2088be7a5758b/safetensors-0.5.2-cp38-abi3-win_amd64.whl", hash = "sha256:78abdddd03a406646107f973c7843276e7b64e5e32623529dc17f3d94a20f589", size = 303756 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "scipy"
|
||||
version = "1.13.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "numpy" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/ae/00/48c2f661e2816ccf2ecd77982f6605b2950afe60f60a52b4cbbc2504aa8f/scipy-1.13.1.tar.gz", hash = "sha256:095a87a0312b08dfd6a6155cbbd310a8c51800fc931b8c0b84003014b874ed3c", size = 57210720 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/33/59/41b2529908c002ade869623b87eecff3e11e3ce62e996d0bdcb536984187/scipy-1.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:20335853b85e9a49ff7572ab453794298bcf0354d8068c5f6775a0eabf350aca", size = 39328076 },
|
||||
{ url = "https://files.pythonhosted.org/packages/d5/33/f1307601f492f764062ce7dd471a14750f3360e33cd0f8c614dae208492c/scipy-1.13.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:d605e9c23906d1994f55ace80e0125c587f96c020037ea6aa98d01b4bd2e222f", size = 30306232 },
|
||||
{ url = "https://files.pythonhosted.org/packages/c0/66/9cd4f501dd5ea03e4a4572ecd874936d0da296bd04d1c45ae1a4a75d9c3a/scipy-1.13.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cfa31f1def5c819b19ecc3a8b52d28ffdcc7ed52bb20c9a7589669dd3c250989", size = 33743202 },
|
||||
{ url = "https://files.pythonhosted.org/packages/a3/ba/7255e5dc82a65adbe83771c72f384d99c43063648456796436c9a5585ec3/scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26264b282b9da0952a024ae34710c2aff7d27480ee91a2e82b7b7073c24722f", size = 38577335 },
|
||||
{ url = "https://files.pythonhosted.org/packages/49/a5/bb9ded8326e9f0cdfdc412eeda1054b914dfea952bda2097d174f8832cc0/scipy-1.13.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:eccfa1906eacc02de42d70ef4aecea45415f5be17e72b61bafcfd329bdc52e94", size = 38820728 },
|
||||
{ url = "https://files.pythonhosted.org/packages/12/30/df7a8fcc08f9b4a83f5f27cfaaa7d43f9a2d2ad0b6562cced433e5b04e31/scipy-1.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:2831f0dc9c5ea9edd6e51e6e769b655f08ec6db6e2e10f86ef39bd32eb11da54", size = 46210588 },
|
||||
{ url = "https://files.pythonhosted.org/packages/b4/15/4a4bb1b15bbd2cd2786c4f46e76b871b28799b67891f23f455323a0cdcfb/scipy-1.13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:27e52b09c0d3a1d5b63e1105f24177e544a222b43611aaf5bc44d4a0979e32f9", size = 39333805 },
|
||||
{ url = "https://files.pythonhosted.org/packages/ba/92/42476de1af309c27710004f5cdebc27bec62c204db42e05b23a302cb0c9a/scipy-1.13.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:54f430b00f0133e2224c3ba42b805bfd0086fe488835effa33fa291561932326", size = 30317687 },
|
||||
{ url = "https://files.pythonhosted.org/packages/80/ba/8be64fe225360a4beb6840f3cbee494c107c0887f33350d0a47d55400b01/scipy-1.13.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e89369d27f9e7b0884ae559a3a956e77c02114cc60a6058b4e5011572eea9299", size = 33694638 },
|
||||
{ url = "https://files.pythonhosted.org/packages/36/07/035d22ff9795129c5a847c64cb43c1fa9188826b59344fee28a3ab02e283/scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a78b4b3345f1b6f68a763c6e25c0c9a23a9fd0f39f5f3d200efe8feda560a5fa", size = 38569931 },
|
||||
{ url = "https://files.pythonhosted.org/packages/d9/10/f9b43de37e5ed91facc0cfff31d45ed0104f359e4f9a68416cbf4e790241/scipy-1.13.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:45484bee6d65633752c490404513b9ef02475b4284c4cfab0ef946def50b3f59", size = 38838145 },
|
||||
{ url = "https://files.pythonhosted.org/packages/4a/48/4513a1a5623a23e95f94abd675ed91cfb19989c58e9f6f7d03990f6caf3d/scipy-1.13.1-cp311-cp311-win_amd64.whl", hash = "sha256:5713f62f781eebd8d597eb3f88b8bf9274e79eeabf63afb4a737abc6c84ad37b", size = 46196227 },
|
||||
{ url = "https://files.pythonhosted.org/packages/f2/7b/fb6b46fbee30fc7051913068758414f2721003a89dd9a707ad49174e3843/scipy-1.13.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5d72782f39716b2b3509cd7c33cdc08c96f2f4d2b06d51e52fb45a19ca0c86a1", size = 39357301 },
|
||||
{ url = "https://files.pythonhosted.org/packages/dc/5a/2043a3bde1443d94014aaa41e0b50c39d046dda8360abd3b2a1d3f79907d/scipy-1.13.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:017367484ce5498445aade74b1d5ab377acdc65e27095155e448c88497755a5d", size = 30363348 },
|
||||
{ url = "https://files.pythonhosted.org/packages/e7/cb/26e4a47364bbfdb3b7fb3363be6d8a1c543bcd70a7753ab397350f5f189a/scipy-1.13.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:949ae67db5fa78a86e8fa644b9a6b07252f449dcf74247108c50e1d20d2b4627", size = 33406062 },
|
||||
{ url = "https://files.pythonhosted.org/packages/88/ab/6ecdc526d509d33814835447bbbeedbebdec7cca46ef495a61b00a35b4bf/scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de3ade0e53bc1f21358aa74ff4830235d716211d7d077e340c7349bc3542e884", size = 38218311 },
|
||||
{ url = "https://files.pythonhosted.org/packages/0b/00/9f54554f0f8318100a71515122d8f4f503b1a2c4b4cfab3b4b68c0eb08fa/scipy-1.13.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2ac65fb503dad64218c228e2dc2d0a0193f7904747db43014645ae139c8fad16", size = 38442493 },
|
||||
{ url = "https://files.pythonhosted.org/packages/3e/df/963384e90733e08eac978cd103c34df181d1fec424de383cdc443f418dd4/scipy-1.13.1-cp312-cp312-win_amd64.whl", hash = "sha256:cdd7dacfb95fea358916410ec61bbc20440f7860333aee6d882bb8046264e949", size = 45910955 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "segments"
|
||||
version = "2.2.1"
|
||||
|
||||
Reference in New Issue
Block a user