Remove scipy (#139)

* Remove scipy

* No longer need to replace T

* Update README.md

* Remove numpy version lock

* Update README.md

* Update uv.lock
This commit is contained in:
hexgrad
2025-03-18 11:16:34 -07:00
committed by GitHub
parent 3f9dd88d6f
commit e43d62643e
7 changed files with 62 additions and 71 deletions

View File

@@ -5,10 +5,31 @@ An inference library for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)
> **Kokoro** is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, Kokoro can be deployed anywhere from production environments to personal projects.
### Usage
You can run this cell on [Google Colab](https://colab.research.google.com/). [Listen to samples](https://huggingface.co/hexgrad/Kokoro-82M/blob/main/SAMPLES.md).
You can run this basic cell on [Google Colab](https://colab.research.google.com/). [Listen to samples](https://huggingface.co/hexgrad/Kokoro-82M/blob/main/SAMPLES.md).
```py
!pip install -q kokoro>=0.9.2 soundfile
!apt-get -qq -y install espeak-ng > /dev/null 2>&1
from kokoro import KPipeline
from IPython.display import display, Audio
import soundfile as sf
import torch
pipeline = KPipeline(lang_code='a')
text = '''
[Kokoro](/kˈOkəɹO/) is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, [Kokoro](/kˈOkəɹO/) can be deployed anywhere from production environments to personal projects.
'''
generator = pipeline(text, voice='af_heart')
for i, (gs, ps, audio) in enumerate(generator):
print(i, gs, ps)
display(Audio(data=audio, rate=24000, autoplay=i==0))
sf.write(f'{i}.wav', audio, 24000)
```
Under the hood, `kokoro` uses [`misaki`](https://pypi.org/project/misaki/), a G2P library at https://github.com/hexgrad/misaki
### Advanced Usage
You can run this advanced cell on [Google Colab](https://colab.research.google.com/).
```py
# 1️⃣ Install kokoro
!pip install -q kokoro>=0.8.4 soundfile
!pip install -q kokoro>=0.9.2 soundfile
# 2️⃣ Install espeak, used for English OOD fallback and some non-English languages
!apt-get -qq -y install espeak-ng > /dev/null 2>&1
# 🇪🇸 'e' => Spanish es
@@ -50,13 +71,12 @@ generator = pipeline(
text, voice='af_heart', # <= change voice here
speed=1, split_pattern=r'\n+'
)
# Alternatively, load voice tensor directly:
voice_tensor = torch.load('path/to/voice.pt', weights_only=True)
generator = pipeline(
text, voice=voice_tensor,
speed=1, split_pattern=r'\n+'
)
# voice_tensor = torch.load('path/to/voice.pt', weights_only=True)
# generator = pipeline(
# text, voice=voice_tensor,
# speed=1, split_pattern=r'\n+'
# )
for i, (gs, ps, audio) in enumerate(generator):
print(i) # i => index
@@ -66,10 +86,7 @@ for i, (gs, ps, audio) in enumerate(generator):
sf.write(f'{i}.wav', audio, 24000) # save each audio file
```
Under the hood, `kokoro` uses [`misaki`](https://pypi.org/project/misaki/), a G2P library at https://github.com/hexgrad/misaki
### Conda Environment
Use the following conda `environment.yml` if you're facing any dependency issues.
```yaml
name: kokoro
@@ -85,7 +102,6 @@ dependencies:
```
### Acknowledgements
- 🛠️ [@yl4579](https://huggingface.co/yl4579) for architecting StyleTTS 2.
- 🏆 [@Pendrokar](https://huggingface.co/Pendrokar) for adding Kokoro as a contender in the TTS Spaces Arena.
- 📊 Thank you to everyone who contributed synthetic training data.

View File

@@ -1,4 +1,4 @@
__version__ = '0.8.4'
__version__ = '0.9.2'
from loguru import logger
import sys

View File

@@ -1,9 +1,8 @@
from attr import attr
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from scipy.signal import get_window
class CustomSTFT(nn.Module):
"""
@@ -37,8 +36,8 @@ class CustomSTFT(nn.Module):
self.freq_bins = self.n_fft // 2 + 1
# Build window
win_np = get_window(window, self.win_length, fftbins=True).astype(np.float32)
window_tensor = torch.from_numpy(win_np)
assert window == 'hann', window
window_tensor = torch.hann_window(win_length, periodic=True, dtype=torch.float32)
if self.win_length < self.n_fft:
# Zero-pad up to n_fft
extra = self.n_fft - self.win_length

View File

@@ -1,14 +1,11 @@
# https://github.com/yl4579/StyleTTS2/blob/main/Modules/istftnet.py
import math
from scipy.signal import get_window
# ADAPTED from https://github.com/yl4579/StyleTTS2/blob/main/Modules/istftnet.py
from kokoro.custom_stft import CustomSTFT
from torch.nn.utils import weight_norm
import numpy as np
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from kokoro.custom_stft import CustomSTFT
# https://github.com/yl4579/StyleTTS2/blob/main/Modules/utils.py
def init_weights(m, mean=0.0, std=0.01):
@@ -86,7 +83,8 @@ class TorchSTFT(nn.Module):
self.filter_length = filter_length
self.hop_length = hop_length
self.win_length = win_length
self.window = torch.from_numpy(get_window(window, win_length, fftbins=True).astype(np.float32))
assert window == 'hann', window
self.window = torch.hann_window(win_length, periodic=True, dtype=torch.float32)
def transform(self, input_data):
forward_transform = torch.stft(
@@ -120,7 +118,7 @@ class SineGen(nn.Module):
voiced_threshold: F0 threshold for U/V classification (default 0)
flag_for_pulse: this SinGen is used inside PulseGen (default False)
Note: when flag_for_pulse is True, the first time step of a voiced
segment is always sin(np.pi) or cos(0)
segment is always sin(torch.pi) or cos(0)
"""
def __init__(self, samp_rate, upsample_scale, harmonic_num=0,
sine_amp=0.1, noise_std=0.003,
@@ -146,7 +144,7 @@ class SineGen(nn.Module):
where dim indicates fundamental tone and overtones
"""
# convert to F0 in rad. The integer part n can be ignored
# because 2 * np.pi * n doesn't affect phase
# because 2 * torch.pi * n doesn't affect phase
rad_values = (f0_values / self.sampling_rate) % 1
# initial phase noise (no noise for fundamental component)
rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device)
@@ -155,7 +153,7 @@ class SineGen(nn.Module):
# instantaneous phase: sine[t] = sin(2*pi * \sum_{i=1}^{t} rad[i])
if not self.flag_for_pulse:
rad_values = F.interpolate(rad_values.transpose(1, 2), scale_factor=1/self.upsample_scale, mode="linear").transpose(1, 2)
phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
phase = torch.cumsum(rad_values, dim=1) * 2 * torch.pi
phase = F.interpolate(phase.transpose(1, 2) * self.upsample_scale, scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
sines = torch.sin(phase)
else:
@@ -181,7 +179,7 @@ class SineGen(nn.Module):
# within the previous voiced segment.
i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
# get the sines
sines = torch.cos(i_phase * 2 * np.pi)
sines = torch.cos(i_phase * 2 * torch.pi)
return sines
def forward(self, f0):
@@ -379,7 +377,7 @@ class AdainResBlk1d(nn.Module):
def forward(self, x, s):
out = self._residual(x, s)
out = (out + self._shortcut(x)) / np.sqrt(2)
out = (out + self._shortcut(x)) * torch.rsqrt(torch.tensor(2))
return out

View File

@@ -200,7 +200,7 @@ class KPipeline:
pcount = 0
for t in tokens:
# American English: ɾ => T
t.phonemes = '' if t.phonemes is None else t.phonemes.replace('ɾ', 'T')
t.phonemes = '' if t.phonemes is None else t.phonemes#.replace('ɾ', 'T')
next_ps = t.phonemes + (' ' if t.whitespace else '')
next_pcount = pcount + len(next_ps.rstrip())
if next_pcount > 510:

View File

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
[project]
name = "kokoro"
version = "0.8.4"
version = "0.9.2"
description = "TTS"
readme = "README.md"
authors = [
@@ -20,9 +20,8 @@ requires-python = ">=3.10, <3.13"
dependencies = [
"huggingface_hub",
"loguru",
"misaki[en]>=0.8.4",
"numpy==1.26.4",
"scipy",
"misaki[en]>=0.9.2",
"numpy",
"torch",
"transformers"
]

53
uv.lock generated
View File

@@ -5,6 +5,15 @@ resolution-markers = [
"python_full_version < '3.12'",
]
[[package]]
name = "addict"
version = "2.4.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/85/ef/fd7649da8af11d93979831e8f1f8097e85e82d5bfeabc8c68b39175d8e75/addict-2.4.0.tar.gz", hash = "sha256:b3b2210e0e067a281f5646c8c5db92e99b7231ea8b0eb5f74dbdf9e259d4e494", size = 9186 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/6a/00/b08f23b7d7e1e14ce01419a467b583edbb93c6cdb8654e54a9cc579cd61f/addict-2.4.0-py3-none-any.whl", hash = "sha256:249bb56bbfd3cdc2a004ea0ff4c2b6ddc84d53bc2194761636eb314d5cfa5dfc", size = 3832 },
]
[[package]]
name = "annotated-types"
version = "0.7.0"
@@ -428,14 +437,13 @@ wheels = [
[[package]]
name = "kokoro"
version = "0.7.16"
version = "0.9.2"
source = { editable = "." }
dependencies = [
{ name = "huggingface-hub" },
{ name = "loguru" },
{ name = "misaki", extra = ["en"] },
{ name = "numpy" },
{ name = "scipy" },
{ name = "torch" },
{ name = "transformers" },
]
@@ -444,9 +452,8 @@ dependencies = [
requires-dist = [
{ name = "huggingface-hub" },
{ name = "loguru" },
{ name = "misaki", extras = ["en"], specifier = ">=0.7.16" },
{ name = "numpy", specifier = "==1.26.4" },
{ name = "scipy" },
{ name = "misaki", extras = ["en"], specifier = ">=0.9.2" },
{ name = "numpy" },
{ name = "torch" },
{ name = "transformers" },
]
@@ -676,14 +683,15 @@ wheels = [
[[package]]
name = "misaki"
version = "0.7.16"
version = "0.9.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "addict" },
{ name = "regex" },
]
sdist = { url = "https://files.pythonhosted.org/packages/6d/ea/704d49d375d475ac1dee144012fc9f4c4a7e408feaa4e15d5999226a147a/misaki-0.7.16.tar.gz", hash = "sha256:ca5a46456391934dc72edc31f20e1db89ddc30897866202cae67152e21bfafa6", size = 3663801 }
sdist = { url = "https://files.pythonhosted.org/packages/e1/3c/81960011deaeeb4a4a74ea418e3a321d615c484e6d9ad99388c13e082f31/misaki-0.9.2.tar.gz", hash = "sha256:cccd6b724c98efc5209ae7e26f8ea5d36ef02619b7a0ea434a6b960388ace8f3", size = 3708863 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/f0/96/979a05fb666983478efc40eb42436e39419e37c26e6d6b0fd4e55cae81db/misaki-0.7.16-py3-none-any.whl", hash = "sha256:a847cbeb69a1082c8c6f2b12ee94fd4e2546db13e534f1e6e0edd2da6892f30d", size = 3517386 },
{ url = "https://files.pythonhosted.org/packages/7c/8a/25315cf16af32560b4a47797f47b023da97fb83f7c45410683d6d30dc7bf/misaki-0.9.2-py3-none-any.whl", hash = "sha256:559232c0f5d8d9f9c2e006eebd0a3d1d714ec18e3d207bde110dc299547fe050", size = 3556820 },
]
[package.optional-dependencies]
@@ -1317,35 +1325,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/86/ca/aa489392ec6fb59223ffce825461e1f811a3affd417121a2088be7a5758b/safetensors-0.5.2-cp38-abi3-win_amd64.whl", hash = "sha256:78abdddd03a406646107f973c7843276e7b64e5e32623529dc17f3d94a20f589", size = 303756 },
]
[[package]]
name = "scipy"
version = "1.13.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "numpy" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ae/00/48c2f661e2816ccf2ecd77982f6605b2950afe60f60a52b4cbbc2504aa8f/scipy-1.13.1.tar.gz", hash = "sha256:095a87a0312b08dfd6a6155cbbd310a8c51800fc931b8c0b84003014b874ed3c", size = 57210720 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/33/59/41b2529908c002ade869623b87eecff3e11e3ce62e996d0bdcb536984187/scipy-1.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:20335853b85e9a49ff7572ab453794298bcf0354d8068c5f6775a0eabf350aca", size = 39328076 },
{ url = "https://files.pythonhosted.org/packages/d5/33/f1307601f492f764062ce7dd471a14750f3360e33cd0f8c614dae208492c/scipy-1.13.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:d605e9c23906d1994f55ace80e0125c587f96c020037ea6aa98d01b4bd2e222f", size = 30306232 },
{ url = "https://files.pythonhosted.org/packages/c0/66/9cd4f501dd5ea03e4a4572ecd874936d0da296bd04d1c45ae1a4a75d9c3a/scipy-1.13.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cfa31f1def5c819b19ecc3a8b52d28ffdcc7ed52bb20c9a7589669dd3c250989", size = 33743202 },
{ url = "https://files.pythonhosted.org/packages/a3/ba/7255e5dc82a65adbe83771c72f384d99c43063648456796436c9a5585ec3/scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26264b282b9da0952a024ae34710c2aff7d27480ee91a2e82b7b7073c24722f", size = 38577335 },
{ url = "https://files.pythonhosted.org/packages/49/a5/bb9ded8326e9f0cdfdc412eeda1054b914dfea952bda2097d174f8832cc0/scipy-1.13.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:eccfa1906eacc02de42d70ef4aecea45415f5be17e72b61bafcfd329bdc52e94", size = 38820728 },
{ url = "https://files.pythonhosted.org/packages/12/30/df7a8fcc08f9b4a83f5f27cfaaa7d43f9a2d2ad0b6562cced433e5b04e31/scipy-1.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:2831f0dc9c5ea9edd6e51e6e769b655f08ec6db6e2e10f86ef39bd32eb11da54", size = 46210588 },
{ url = "https://files.pythonhosted.org/packages/b4/15/4a4bb1b15bbd2cd2786c4f46e76b871b28799b67891f23f455323a0cdcfb/scipy-1.13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:27e52b09c0d3a1d5b63e1105f24177e544a222b43611aaf5bc44d4a0979e32f9", size = 39333805 },
{ url = "https://files.pythonhosted.org/packages/ba/92/42476de1af309c27710004f5cdebc27bec62c204db42e05b23a302cb0c9a/scipy-1.13.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:54f430b00f0133e2224c3ba42b805bfd0086fe488835effa33fa291561932326", size = 30317687 },
{ url = "https://files.pythonhosted.org/packages/80/ba/8be64fe225360a4beb6840f3cbee494c107c0887f33350d0a47d55400b01/scipy-1.13.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e89369d27f9e7b0884ae559a3a956e77c02114cc60a6058b4e5011572eea9299", size = 33694638 },
{ url = "https://files.pythonhosted.org/packages/36/07/035d22ff9795129c5a847c64cb43c1fa9188826b59344fee28a3ab02e283/scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a78b4b3345f1b6f68a763c6e25c0c9a23a9fd0f39f5f3d200efe8feda560a5fa", size = 38569931 },
{ url = "https://files.pythonhosted.org/packages/d9/10/f9b43de37e5ed91facc0cfff31d45ed0104f359e4f9a68416cbf4e790241/scipy-1.13.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:45484bee6d65633752c490404513b9ef02475b4284c4cfab0ef946def50b3f59", size = 38838145 },
{ url = "https://files.pythonhosted.org/packages/4a/48/4513a1a5623a23e95f94abd675ed91cfb19989c58e9f6f7d03990f6caf3d/scipy-1.13.1-cp311-cp311-win_amd64.whl", hash = "sha256:5713f62f781eebd8d597eb3f88b8bf9274e79eeabf63afb4a737abc6c84ad37b", size = 46196227 },
{ url = "https://files.pythonhosted.org/packages/f2/7b/fb6b46fbee30fc7051913068758414f2721003a89dd9a707ad49174e3843/scipy-1.13.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5d72782f39716b2b3509cd7c33cdc08c96f2f4d2b06d51e52fb45a19ca0c86a1", size = 39357301 },
{ url = "https://files.pythonhosted.org/packages/dc/5a/2043a3bde1443d94014aaa41e0b50c39d046dda8360abd3b2a1d3f79907d/scipy-1.13.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:017367484ce5498445aade74b1d5ab377acdc65e27095155e448c88497755a5d", size = 30363348 },
{ url = "https://files.pythonhosted.org/packages/e7/cb/26e4a47364bbfdb3b7fb3363be6d8a1c543bcd70a7753ab397350f5f189a/scipy-1.13.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:949ae67db5fa78a86e8fa644b9a6b07252f449dcf74247108c50e1d20d2b4627", size = 33406062 },
{ url = "https://files.pythonhosted.org/packages/88/ab/6ecdc526d509d33814835447bbbeedbebdec7cca46ef495a61b00a35b4bf/scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de3ade0e53bc1f21358aa74ff4830235d716211d7d077e340c7349bc3542e884", size = 38218311 },
{ url = "https://files.pythonhosted.org/packages/0b/00/9f54554f0f8318100a71515122d8f4f503b1a2c4b4cfab3b4b68c0eb08fa/scipy-1.13.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2ac65fb503dad64218c228e2dc2d0a0193f7904747db43014645ae139c8fad16", size = 38442493 },
{ url = "https://files.pythonhosted.org/packages/3e/df/963384e90733e08eac978cd103c34df181d1fec424de383cdc443f418dd4/scipy-1.13.1-cp312-cp312-win_amd64.whl", hash = "sha256:cdd7dacfb95fea358916410ec61bbc20440f7860333aee6d882bb8046264e949", size = 45910955 },
]
[[package]]
name = "segments"
version = "2.2.1"