Compare commits
25 Commits
e44c9b4add
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 6999bcdb57 | |||
| 1cda188c98 | |||
| 83aa2ec08c | |||
| 067a853d3b | |||
| 8272b6a8c9 | |||
| f2513c12a9 | |||
| 9e907b7573 | |||
| 3340e2e8a5 | |||
| 3f279e53ee | |||
| 985f30e3ae | |||
| e8a3844994 | |||
| ad58061b6f | |||
| 0614418dd4 | |||
| 38c1e5e096 | |||
| 5e8e7ad6d4 | |||
|
|
dfb907a02b | ||
|
|
d4ef0569c7 | ||
|
|
f1d129d835 | ||
|
|
d7654ba4e8 | ||
|
|
2760831139 | ||
|
|
4b647d371e | ||
|
|
2668b2e279 | ||
|
|
6d87f4ae7a | ||
|
|
1c7bdd971d | ||
|
|
4f5106e327 |
51
.gitea/workflows/docker-build.yml
Normal file
51
.gitea/workflows/docker-build.yml
Normal file
@@ -0,0 +1,51 @@
|
||||
# Builds the container image from the repository root and pushes it to the
# registry named in `env.REGISTRY`.
name: Build and Push Docker Image

# Runs on pushes to `main`, on `v*` tags, and on manual dispatch.
on:
  push:
    branches: [main]
    tags: ['v*']
  workflow_dispatch:

# Shared by the login, metadata and build steps below.
env:
  REGISTRY: git.sdgarren.com
  IMAGE: scott/kokoro

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # Credentials come from repository secrets; nothing is hard-coded here.
      - name: Log in to Gitea Container Registry
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ secrets.REGISTRY_USERNAME }}
          password: ${{ secrets.REGISTRY_TOKEN }}

      # Computes the image tags and labels consumed by the build step via
      # `steps.meta.outputs.*`.
      - name: Extract metadata
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE }}
          tags: |
            type=ref,event=branch
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
            type=sha,prefix=sha-

      # Layer cache is read from and written to the `buildcache` tag of the
      # same image repository.
      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE }}:buildcache
          cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE }}:buildcache,mode=max
|
||||
15
.github/FUNDING.yml
vendored
15
.github/FUNDING.yml
vendored
@@ -1,15 +0,0 @@
|
||||
# These are supported funding model platforms
|
||||
|
||||
github: hexgrad # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
|
||||
patreon: # Replace with a single Patreon username
|
||||
open_collective: # Replace with a single Open Collective username
|
||||
ko_fi: # Replace with a single Ko-fi username
|
||||
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
|
||||
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
|
||||
liberapay: # Replace with a single Liberapay username
|
||||
issuehunt: # Replace with a single IssueHunt username
|
||||
lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
|
||||
polar: # Replace with a single Polar username
|
||||
buy_me_a_coffee: # Replace with a single Buy Me a Coffee username
|
||||
thanks_dev: # Replace with a single thanks.dev username
|
||||
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -169,3 +169,6 @@ cython_debug/
|
||||
|
||||
# PyPI configuration file
|
||||
.pypirc
|
||||
|
||||
# HuggingFace model cache (host-mounted Docker volume)
|
||||
hf_cache/
|
||||
|
||||
41
Dockerfile
Normal file
41
Dockerfile
Normal file
@@ -0,0 +1,41 @@
|
||||
FROM rocm/dev-ubuntu-22.04:6.1.2

# Build-time only: stops apt from prompting during `docker build`. Declared as
# ARG (not ENV) so it is not baked into the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive

# PYTHONUNBUFFERED: flush Python output immediately so logs show up in `docker logs`.
# HF_HOME: location of the Hugging Face cache used by the model download below.
ENV PYTHONUNBUFFERED=1 \
    HF_HOME=/root/.cache/huggingface

# System dependencies (sorted alphabetically for easier diffs).
# --no-install-recommends keeps optional packages out of the image.
# NOTE(review): if a package in requirements.txt has to compile native code and
# the base image turns out not to ship a compiler, add `build-essential` here.
RUN apt-get update && apt-get install -y --no-install-recommends \
        espeak-ng \
        libespeak-ng1 \
        libsndfile1 \
        python3 \
        python3-dev \
        python3-pip \
    && rm -rf /var/lib/apt/lists/*

# PyTorch 2.5.1 + torchaudio for ROCm 6.1
RUN pip3 install --no-cache-dir \
        torch==2.5.1 \
        torchaudio==2.5.1 \
        --index-url https://download.pytorch.org/whl/rocm6.1

# Application dependencies. Copied on their own so this layer is rebuilt only
# when requirements.txt changes, not on every source edit.
COPY requirements.txt /app/requirements.txt
RUN pip3 install --no-cache-dir -r /app/requirements.txt

# Pre-download model weights at build time
# Uses CPU to avoid needing GPU during docker build
# NOTE(review): the README says voice files are cached in an `hf_cache` volume.
# If that volume is mounted over HF_HOME it will hide the weights baked in by
# this step - confirm against docker-compose.yml.
RUN python3 -c "\
from kokoro import KModel; \
print('Downloading Kokoro model weights...'); \
KModel(repo_id='hexgrad/Kokoro-82M'); \
print('Model download complete.')"

WORKDIR /app
COPY server.py config.yaml entrypoint.sh ./
RUN chmod +x entrypoint.sh

# Documentation only; the port still has to be published at `docker run` /
# compose level.
EXPOSE 10300

ENTRYPOINT ["./entrypoint.sh"]
|
||||
165
README.md
165
README.md
@@ -1,113 +1,74 @@
|
||||
# kokoro
|
||||
# kokoro-rocm-wyoming
|
||||
|
||||
An inference library for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M). You can [`pip install kokoro`](https://pypi.org/project/kokoro/).
|
||||
A Docker image running [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) TTS on AMD GPUs via ROCm, with a [Wyoming protocol](https://github.com/rhasspy/wyoming) server for Home Assistant integration.
|
||||
|
||||
> **Kokoro** is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, Kokoro can be deployed anywhere from production environments to personal projects.
|
||||
## Stack
|
||||
|
||||
### Usage
|
||||
You can run this basic cell on [Google Colab](https://colab.research.google.com/). [Listen to samples](https://huggingface.co/hexgrad/Kokoro-82M/blob/main/SAMPLES.md).
|
||||
```py
|
||||
!pip install -q kokoro>=0.9.2 soundfile
|
||||
!apt-get -qq -y install espeak-ng > /dev/null 2>&1
|
||||
from kokoro import KPipeline
|
||||
from IPython.display import display, Audio
|
||||
import soundfile as sf
|
||||
import torch
|
||||
pipeline = KPipeline(lang_code='a')
|
||||
text = '''
|
||||
[Kokoro](/kˈOkəɹO/) is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, [Kokoro](/kˈOkəɹO/) can be deployed anywhere from production environments to personal projects.
|
||||
'''
|
||||
generator = pipeline(text, voice='af_heart')
|
||||
for i, (gs, ps, audio) in enumerate(generator):
|
||||
print(i, gs, ps)
|
||||
display(Audio(data=audio, rate=24000, autoplay=i==0))
|
||||
sf.write(f'{i}.wav', audio, 24000)
|
||||
```
|
||||
Under the hood, `kokoro` uses [`misaki`](https://pypi.org/project/misaki/), a G2P library at https://github.com/hexgrad/misaki
|
||||
| Component | Version |
|
||||
|-----------|---------|
|
||||
| ROCm | 6.1.2 |
|
||||
| PyTorch | 2.5.1 |
|
||||
| Target GPU | AMD RX 6700 XT (gfx1031) |
|
||||
| Kokoro model | hexgrad/Kokoro-82M |
|
||||
| Protocol | Wyoming (TCP, port 10300) |
|
||||
|
||||
### Advanced Usage
|
||||
You can run this advanced cell on [Google Colab](https://colab.research.google.com/).
|
||||
```py
|
||||
# 1️⃣ Install kokoro
|
||||
!pip install -q kokoro>=0.9.2 soundfile
|
||||
## Quick start
|
||||
|
||||
# 2️⃣ Install espeak, used for English OOD fallback and some non-English languages
|
||||
!apt-get -qq -y install espeak-ng > /dev/null 2>&1
|
||||
|
||||
# 3️⃣ Initialize a pipeline
|
||||
from kokoro import KPipeline
|
||||
from IPython.display import display, Audio
|
||||
import soundfile as sf
|
||||
import torch
|
||||
# 🇺🇸 'a' => American English, 🇬🇧 'b' => British English
|
||||
# 🇪🇸 'e' => Spanish es
|
||||
# 🇫🇷 'f' => French fr-fr
|
||||
# 🇮🇳 'h' => Hindi hi
|
||||
# 🇮🇹 'i' => Italian it
|
||||
# 🇯🇵 'j' => Japanese: pip install misaki[ja]
|
||||
# 🇧🇷 'p' => Brazilian Portuguese pt-br
|
||||
# 🇨🇳 'z' => Mandarin Chinese: pip install misaki[zh]
|
||||
pipeline = KPipeline(lang_code='a') # <= make sure lang_code matches voice, reference above.
|
||||
|
||||
# This text is for demonstration purposes only, unseen during training
|
||||
text = '''
|
||||
The sky above the port was the color of television, tuned to a dead channel.
|
||||
"It's not like I'm using," Case heard someone say, as he shouldered his way through the crowd around the door of the Chat. "It's like my body's developed this massive drug deficiency."
|
||||
It was a Sprawl voice and a Sprawl joke. The Chatsubo was a bar for professional expatriates; you could drink there for a week and never hear two words in Japanese.
|
||||
|
||||
These were to have an enormous impact, not only because they were associated with Constantine, but also because, as in so many other areas, the decisions taken by Constantine (or in his name) were to have great significance for centuries to come. One of the main issues was the shape that Christian churches were to take, since there was not, apparently, a tradition of monumental church buildings when Constantine decided to help the Christian church build a series of truly spectacular structures. The main form that these churches took was that of the basilica, a multipurpose rectangular structure, based ultimately on the earlier Greek stoa, which could be found in most of the great cities of the empire. Christianity, unlike classical polytheism, needed a large interior space for the celebration of its religious services, and the basilica aptly filled that need. We naturally do not know the degree to which the emperor was involved in the design of new churches, but it is tempting to connect this with the secular basilica that Constantine completed in the Roman forum (the so-called Basilica of Maxentius) and the one he probably built in Trier, in connection with his residence in the city at a time when he was still caesar.
|
||||
|
||||
[Kokoro](/kˈOkəɹO/) is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, [Kokoro](/kˈOkəɹO/) can be deployed anywhere from production environments to personal projects.
|
||||
'''
|
||||
# text = '「もしおれがただ偶然、そしてこうしようというつもりでなくここに立っているのなら、ちょっとばかり絶望するところだな」と、そんなことが彼の頭に思い浮かんだ。'
|
||||
# text = '中國人民不信邪也不怕邪,不惹事也不怕事,任何外國不要指望我們會拿自己的核心利益做交易,不要指望我們會吞下損害我國主權、安全、發展利益的苦果!'
|
||||
# text = 'Los partidos políticos tradicionales compiten con los populismos y los movimientos asamblearios.'
|
||||
# text = 'Le dromadaire resplendissant déambulait tranquillement dans les méandres en mastiquant de petites feuilles vernissées.'
|
||||
# text = 'ट्रांसपोर्टरों की हड़ताल लगातार पांचवें दिन जारी, दिसंबर से इलेक्ट्रॉनिक टोल कलेक्शनल सिस्टम'
|
||||
# text = "Allora cominciava l'insonnia, o un dormiveglia peggiore dell'insonnia, che talvolta assumeva i caratteri dell'incubo."
|
||||
# text = 'Elabora relatórios de acompanhamento cronológico para as diferentes unidades do Departamento que propõem contratos.'
|
||||
|
||||
# 4️⃣ Generate, display, and save audio files in a loop.
|
||||
generator = pipeline(
|
||||
text, voice='af_heart', # <= change voice here
|
||||
speed=1, split_pattern=r'\n+'
|
||||
)
|
||||
# Alternatively, load voice tensor directly:
|
||||
# voice_tensor = torch.load('path/to/voice.pt', weights_only=True)
|
||||
# generator = pipeline(
|
||||
# text, voice=voice_tensor,
|
||||
# speed=1, split_pattern=r'\n+'
|
||||
# )
|
||||
|
||||
for i, (gs, ps, audio) in enumerate(generator):
|
||||
print(i) # i => index
|
||||
print(gs) # gs => graphemes/text
|
||||
print(ps) # ps => phonemes
|
||||
display(Audio(data=audio, rate=24000, autoplay=i==0))
|
||||
sf.write(f'{i}.wav', audio, 24000) # save each audio file
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
### Conda Environment
|
||||
Use the following conda `environment.yml` if you're facing any dependency issues.
|
||||
The Wyoming server will be available at `<host-ip>:10300`.
|
||||
|
||||
## Home Assistant setup
|
||||
|
||||
1. In Home Assistant, go to **Settings → Devices & Services → Add Integration**
|
||||
2. Search for **Wyoming Protocol**
|
||||
3. Enter your host IP and port `10300`
|
||||
4. Kokoro voices will appear in your voice assistant configuration
|
||||
|
||||
## Configuration
|
||||
|
||||
Edit `config.yaml` before building to change the default voice, language, speed, or the list of voices advertised to Home Assistant:
|
||||
|
||||
```yaml
|
||||
name: kokoro
|
||||
channels:
|
||||
- defaults
|
||||
dependencies:
|
||||
- python==3.9
|
||||
- libstdcxx~=12.4.0 # Needed to load espeak correctly. Try removing this if you're facing issues with Espeak fallback.
|
||||
- pip:
|
||||
- kokoro>=0.3.1
|
||||
- soundfile
|
||||
- misaki[en]
|
||||
tts:
|
||||
device: cuda # ROCm presents as 'cuda' to PyTorch via HIP
|
||||
language: a # a=American English, b=British English, etc.
|
||||
default_voice: af_heart
|
||||
default_speed: 1.0
|
||||
voices:
|
||||
- name: af_heart
|
||||
description: "Heart (Female, American English)"
|
||||
language: en-us
|
||||
# add more voices here
|
||||
```
|
||||
|
||||
### Acknowledgements
|
||||
- 🛠️ [@yl4579](https://huggingface.co/yl4579) for architecting StyleTTS 2.
|
||||
- 🏆 [@Pendrokar](https://huggingface.co/Pendrokar) for adding Kokoro as a contender in the TTS Spaces Arena.
|
||||
- 📊 Thank you to everyone who contributed synthetic training data.
|
||||
- ❤️ Special thanks to all compute sponsors.
|
||||
- 👾 Discord server: https://discord.gg/QuGxSWBfQy
|
||||
- 🪽 Kokoro is a Japanese word that translates to "heart" or "spirit". Kokoro is also a [character in the Terminator franchise](https://terminator.fandom.com/wiki/Kokoro) along with [Misaki](https://github.com/hexgrad/misaki?tab=readme-ov-file#acknowledgements).
|
||||
Available language codes: `a` (American English), `b` (British English), `e` (Spanish), `f` (French), `h` (Hindi), `i` (Italian), `j` (Japanese), `p` (Portuguese), `z` (Mandarin).
|
||||
|
||||
<img src="https://static0.gamerantimages.com/wordpress/wp-content/uploads/2024/08/terminator-zero-41-1.jpg" width="400" alt="kokoro" />
|
||||
## Building
|
||||
|
||||
The image is built automatically by Gitea Actions on every push to `main` and on `v*` tags. To build locally:
|
||||
|
||||
```bash
|
||||
docker build -t kokoro-rocm-wyoming .
|
||||
```
|
||||
|
||||
Model weights are downloaded from HuggingFace at build time. Voice files are fetched on first use and cached in the `hf_cache` Docker volume.
|
||||
|
||||
## GPU passthrough
|
||||
|
||||
The compose file passes through `/dev/kfd` and `/dev/dri` and adds the `video` and `render` groups. If ROCm does not detect the 6700 XT, uncomment the override in `docker-compose.yml`:
|
||||
|
||||
```yaml
|
||||
environment:
|
||||
- HSA_OVERRIDE_GFX_VERSION=10.3.0
|
||||
```
|
||||
|
||||
## Audio output
|
||||
|
||||
Kokoro outputs 24 kHz 16-bit mono PCM. The Wyoming server streams chunks to Home Assistant as they are generated — long utterances start playing before synthesis is complete.
|
||||
|
||||
## License
|
||||
|
||||
Model weights: [Apache 2.0](https://huggingface.co/hexgrad/Kokoro-82M)
|
||||
|
||||
45
config.yaml
Normal file
45
config.yaml
Normal file
@@ -0,0 +1,45 @@
|
||||
# Kokoro TTS Wyoming Server Configuration
|
||||
|
||||
server:
|
||||
uri: tcp://0.0.0.0:10300
|
||||
|
||||
tts:
|
||||
device: cuda # ROCm presents as 'cuda' to PyTorch via HIP
|
||||
language: a # a=American English, b=British English, e=Spanish,
|
||||
# f=French, h=Hindi, i=Italian, j=Japanese,
|
||||
# p=Portuguese, z=Mandarin
|
||||
|
||||
default_voice: af_heart
|
||||
default_speed: 1.0
|
||||
|
||||
# Voices to advertise to Home Assistant.
|
||||
# Add or remove entries as needed.
|
||||
# Voice files are downloaded from HuggingFace on first use.
|
||||
voices:
|
||||
- name: af_heart
|
||||
description: "Heart (Female, American English)"
|
||||
language: en-us
|
||||
|
||||
- name: af_bella
|
||||
description: "Bella (Female, American English)"
|
||||
language: en-us
|
||||
|
||||
- name: af_nova
|
||||
description: "Nova (Female, American English)"
|
||||
language: en-us
|
||||
|
||||
- name: am_michael
|
||||
description: "Michael (Male, American English)"
|
||||
language: en-us
|
||||
|
||||
- name: am_fenrir
|
||||
description: "Fenrir (Male, American English)"
|
||||
language: en-us
|
||||
|
||||
- name: bf_emma
|
||||
description: "Emma (Female, British English)"
|
||||
language: en-gb
|
||||
|
||||
- name: bm_george
|
||||
description: "George (Male, British English)"
|
||||
language: en-gb
|
||||
@@ -1,15 +0,0 @@
|
||||
---
|
||||
title: Kokoro TTS
|
||||
emoji: ❤️
|
||||
colorFrom: indigo
|
||||
colorTo: pink
|
||||
sdk: gradio
|
||||
sdk_version: 5.12.0
|
||||
app_file: app.py
|
||||
pinned: true
|
||||
license: apache-2.0
|
||||
short_description: Upgraded to v1.0!
|
||||
disable_embedding: true
|
||||
---
|
||||
|
||||
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
||||
182
demo/app.py
182
demo/app.py
@@ -1,182 +0,0 @@
|
||||
import spaces
|
||||
from kokoro import KModel, KPipeline
|
||||
import gradio as gr
|
||||
import os
|
||||
import random
|
||||
import torch
|
||||
|
||||
CUDA_AVAILABLE = torch.cuda.is_available()

# Models are keyed by a "use GPU" flag: the CPU model (False) always exists,
# the GPU model (True) is only created when CUDA is available.
models = {False: KModel().to('cpu').eval()}
if CUDA_AVAILABLE:
    models[True] = KModel().to('cuda').eval()

# One model-less pipeline per language code ('a' and 'b'); synthesis is done
# separately through `models`.
pipelines = {code: KPipeline(lang_code=code, model=False) for code in 'ab'}

# Fixed pronunciation of the word "kokoro" for each of the two pipelines.
pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
|
||||
|
||||
@spaces.GPU(duration=30)
def forward_gpu(ps, ref_s, speed):
    """Run the GPU model (`models[True]`) on `ps`, `ref_s` and `speed` and return its output."""
    gpu_model = models[True]
    return gpu_model(ps, ref_s, speed)
|
||||
|
||||
def generate_first(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE):
    """Synthesize only the first segment the pipeline produces for `text`.

    The pipeline is chosen by the first character of `voice`. The GPU path is
    used only when `use_gpu` is truthy and CUDA is available; if it raises a
    Gradio error the segment is retried on the CPU model.

    Returns:
        `((24000, audio.numpy()), ps)` for the first segment, where `ps` is
        the phoneme value yielded by the pipeline, or `(None, '')` when the
        pipeline yields nothing.

    Raises:
        gr.Error: if synthesis on the CPU path raises a Gradio error.
    """
    pipe = pipelines[voice[0]]
    voice_pack = pipe.load_voice(voice)
    gpu_requested = use_gpu and CUDA_AVAILABLE
    for _, phonemes, _ in pipe(text, voice, speed):
        # The voice pack is indexed by phoneme length minus one.
        style = voice_pack[len(phonemes) - 1]
        try:
            synth = forward_gpu if gpu_requested else models[False]
            audio = synth(phonemes, style, speed)
        except gr.exceptions.Error as err:
            if not gpu_requested:
                raise gr.Error(err)
            gr.Warning(str(err))
            gr.Info('Retrying with CPU. To avoid this error, change Hardware to CPU.')
            audio = models[False](phonemes, style, speed)
        # Only the first segment is wanted, so stop here.
        return (24000, audio.numpy()), phonemes
    return None, ''
|
||||
|
||||
# Arena API
def predict(text, voice='af_heart', speed=1):
    """Return only the audio part of `generate_first`, always on the CPU path."""
    audio_result, _tokens = generate_first(text, voice, speed, use_gpu=False)
    return audio_result
|
||||
|
||||
def tokenize_first(text, voice='af_heart'):
    """Return the phonemes of the first segment of `text`, or '' if there is none.

    The pipeline is chosen by the first character of `voice`.
    """
    segments = pipelines[voice[0]](text, voice)
    first_segment = next(iter(segments), None)
    if first_segment is None:
        return ''
    _, phonemes, _ = first_segment
    return phonemes
|
||||
|
||||
def generate_all(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE):
    """Yield `(24000, audio.numpy())` for every segment the pipeline produces.

    The pipeline is chosen by the first character of `voice`. The GPU path is
    used only when `use_gpu` is truthy and CUDA is available; a segment whose
    GPU synthesis raises a Gradio error is retried on the CPU model.
    Immediately after the first segment, one extra chunk holding a single zero
    sample is yielded.

    Raises:
        gr.Error: if synthesis on the CPU path raises a Gradio error.
    """
    pipe = pipelines[voice[0]]
    voice_pack = pipe.load_voice(voice)
    gpu_requested = use_gpu and CUDA_AVAILABLE
    for index, (_, phonemes, _) in enumerate(pipe(text, voice, speed)):
        # The voice pack is indexed by phoneme length minus one.
        style = voice_pack[len(phonemes) - 1]
        try:
            synth = forward_gpu if gpu_requested else models[False]
            audio = synth(phonemes, style, speed)
        except gr.exceptions.Error as err:
            if not gpu_requested:
                raise gr.Error(err)
            gr.Warning(str(err))
            gr.Info('Switching to CPU')
            audio = models[False](phonemes, style, speed)
        yield 24000, audio.numpy()
        if index == 0:
            # Extra one-sample chunk, emitted once right after the first segment.
            yield 24000, torch.zeros(1).numpy()
|
||||
|
||||
# Load the quote pool once at import time: one stripped quote per line of en.txt.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open('en.txt', 'r', encoding='utf-8') as r:
    random_quotes = [line.strip() for line in r]
|
||||
|
||||
def get_random_quote():
    """Return one randomly chosen entry of `random_quotes`."""
    quote = random.choice(random_quotes)
    return quote
|
||||
|
||||
def get_gatsby():
    """Return the contents of `gatsby5k.md` with surrounding whitespace stripped.

    The file is read as UTF-8 explicitly: the text contains non-ASCII
    punctuation (curly quotes, em dashes), so relying on the platform default
    encoding could fail or mis-decode it.
    """
    with open('gatsby5k.md', 'r', encoding='utf-8') as r:
        return r.read().strip()
|
||||
|
||||
def get_frankenstein():
    """Return the contents of `frankenstein5k.md` with surrounding whitespace stripped.

    The file is read as UTF-8 explicitly so that non-ASCII punctuation in the
    text decodes the same way regardless of the platform default encoding.
    """
    with open('frankenstein5k.md', 'r', encoding='utf-8') as r:
        return r.read().strip()
|
||||
|
||||
# Voice table: display label -> voice id. The items are passed to the voice
# dropdown as its choices. The first character of each voice id selects the
# pipeline ('a' or 'b') used for that voice.
CHOICES = {
'🇺🇸 🚺 Heart ❤️': 'af_heart',
'🇺🇸 🚺 Bella 🔥': 'af_bella',
'🇺🇸 🚺 Nicole 🎧': 'af_nicole',
'🇺🇸 🚺 Aoede': 'af_aoede',
'🇺🇸 🚺 Kore': 'af_kore',
'🇺🇸 🚺 Sarah': 'af_sarah',
'🇺🇸 🚺 Nova': 'af_nova',
'🇺🇸 🚺 Sky': 'af_sky',
'🇺🇸 🚺 Alloy': 'af_alloy',
'🇺🇸 🚺 Jessica': 'af_jessica',
'🇺🇸 🚺 River': 'af_river',
'🇺🇸 🚹 Michael': 'am_michael',
'🇺🇸 🚹 Fenrir': 'am_fenrir',
'🇺🇸 🚹 Puck': 'am_puck',
'🇺🇸 🚹 Echo': 'am_echo',
'🇺🇸 🚹 Eric': 'am_eric',
'🇺🇸 🚹 Liam': 'am_liam',
'🇺🇸 🚹 Onyx': 'am_onyx',
'🇺🇸 🚹 Santa': 'am_santa',
'🇺🇸 🚹 Adam': 'am_adam',
'🇬🇧 🚺 Emma': 'bf_emma',
'🇬🇧 🚺 Isabella': 'bf_isabella',
'🇬🇧 🚺 Alice': 'bf_alice',
'🇬🇧 🚺 Lily': 'bf_lily',
'🇬🇧 🚹 George': 'bm_george',
'🇬🇧 🚹 Fable': 'bm_fable',
'🇬🇧 🚹 Lewis': 'bm_lewis',
'🇬🇧 🚹 Daniel': 'bm_daniel',
}
# Load every listed voice into its pipeline at import time.
for v in CHOICES.values():
    pipelines[v[0]].load_voice(v)
|
||||
|
||||
# Markdown help text rendered inside the "Output Tokens" accordion.
TOKEN_NOTE = '''
💡 Customize pronunciation with Markdown link syntax and /slashes/ like `[Kokoro](/kˈOkəɹO/)`

💬 To adjust intonation, try punctuation `;:,.!?—…"()“”` or stress `ˈ` and `ˌ`

⬇️ Lower stress `[1 level](-1)` or `[2 levels](-2)`

⬆️ Raise stress 1 level `[or](+2)` 2 levels (only works on less stressed, usually short words)
'''

# "Generate" tab: audio output, Generate button, and an accordion with the
# token textbox and related buttons.
# NOTE(review): SOURCE lost its indentation; the nesting under the accordion is
# reconstructed - confirm against the original file.
with gr.Blocks() as generate_tab:
    out_audio = gr.Audio(label='Output Audio', interactive=False, streaming=False, autoplay=True)
    generate_btn = gr.Button('Generate', variant='primary')
    with gr.Accordion('Output Tokens', open=True):
        out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 context length.')
        tokenize_btn = gr.Button('Tokenize', variant='secondary')
        gr.Markdown(TOKEN_NOTE)
        # Created with visible=False; its click handler is wired to `predict`.
        predict_btn = gr.Button('Predict', variant='secondary', visible=False)
|
||||
|
||||
# Notes shown in the "Stream" tab; the list is joined into one Markdown string.
STREAM_NOTE = ['⚠️ There is an unknown Gradio bug that might yield no audio the first time you click `Stream`.']
STREAM_NOTE = '\n\n'.join(STREAM_NOTE)

# "Stream" tab: streaming audio output with Stream/Stop buttons and a note.
# NOTE(review): SOURCE lost its indentation; the nesting under the row and the
# accordion is reconstructed - confirm against the original file.
with gr.Blocks() as stream_tab:
    out_stream = gr.Audio(label='Output Audio Stream', interactive=False, streaming=True, autoplay=True)
    with gr.Row():
        stream_btn = gr.Button('Stream', variant='primary')
        stop_btn = gr.Button('Stop', variant='stop')
    with gr.Accordion('Note', open=True):
        gr.Markdown(STREAM_NOTE)
        gr.DuplicateButton()
|
||||
|
||||
# Passed to both `queue(api_open=...)` and `launch(show_api=...)` below.
API_OPEN = True
# Top-level UI: input controls in one column, the Generate/Stream tabs in the
# other, followed by the event wiring.
# NOTE(review): SOURCE lost its indentation; the row/column nesting is
# reconstructed - confirm against the original file.
with gr.Blocks() as app:
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(label='Input Text', info=f"Arbitrarily many characters supported")
            with gr.Row():
                voice = gr.Dropdown(list(CHOICES.items()), value='af_heart', label='Voice', info='Quality and availability vary by language')
                # Defaults to, and is only editable when, CUDA is available.
                use_gpu = gr.Dropdown(
                    [('ZeroGPU 🚀', True), ('CPU 🐌', False)],
                    value=CUDA_AVAILABLE,
                    label='Hardware',
                    info='GPU is usually faster, but has a usage quota',
                    interactive=CUDA_AVAILABLE
                )
            speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='Speed')
            random_btn = gr.Button('🎲 Random Quote 💬', variant='secondary')
            with gr.Row():
                gatsby_btn = gr.Button('🥂 Gatsby 📕', variant='secondary')
                frankenstein_btn = gr.Button('💀 Frankenstein 📗', variant='secondary')
        with gr.Column():
            gr.TabbedInterface([generate_tab, stream_tab], ['Generate', 'Stream'])
    # Sample-text buttons fill the input textbox.
    random_btn.click(fn=get_random_quote, inputs=[], outputs=[text])
    gatsby_btn.click(fn=get_gatsby, inputs=[], outputs=[text])
    frankenstein_btn.click(fn=get_frankenstein, inputs=[], outputs=[text])
    # Synthesis and tokenization callbacks.
    generate_btn.click(fn=generate_first, inputs=[text, voice, speed, use_gpu], outputs=[out_audio, out_ps])
    tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])
    # The stream event handle is kept so the Stop button can cancel it.
    stream_event = stream_btn.click(fn=generate_all, inputs=[text, voice, speed, use_gpu], outputs=[out_stream])
    stop_btn.click(fn=None, cancels=stream_event)
    predict_btn.click(fn=predict, inputs=[text, voice, speed], outputs=[out_audio])

# Script entry point: serve on all interfaces, port 40001.
if __name__ == '__main__':
    app.queue(api_open=API_OPEN).launch(server_name="0.0.0.0", server_port=40001, show_api=API_OPEN)
|
||||
2123
demo/en.txt
2123
demo/en.txt
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking.
|
||||
|
||||
I am already far north of London, and as I walk in the streets of Petersburgh, I feel a cold northern breeze play upon my cheeks, which braces my nerves and fills me with delight. Do you understand this feeling? This breeze, which has travelled from the regions towards which I am advancing, gives me a foretaste of those icy climes. Inspirited by this wind of promise, my daydreams become more fervent and vivid. I try in vain to be persuaded that the pole is the seat of frost and desolation; it ever presents itself to my imagination as the region of beauty and delight. There, Margaret, the sun is for ever visible, its broad disk just skirting the horizon and diffusing a perpetual splendour. There—for with your leave, my sister, I will put some trust in preceding navigators—there snow and frost are banished; and, sailing over a calm sea, we may be wafted to a land surpassing in wonders and in beauty every region hitherto discovered on the habitable globe. Its productions and features may be without example, as the phenomena of the heavenly bodies undoubtedly are in those undiscovered solitudes. What may not be expected in a country of eternal light? I may there discover the wondrous power which attracts the needle and may regulate a thousand celestial observations that require only this voyage to render their seeming eccentricities consistent for ever. I shall satiate my ardent curiosity with the sight of a part of the world never before visited, and may tread a land never before imprinted by the foot of man. These are my enticements, and they are sufficient to conquer all fear of danger or death and to induce me to commence this laborious voyage with the joy a child feels when he embarks in a little boat, with his holiday mates, on an expedition of discovery up his native river. 
But supposing all these conjectures to be false, you cannot contest the inestimable benefit which I shall confer on all mankind, to the last generation, by discovering a passage near the pole to those countries, to reach which at present so many months are requisite; or by ascertaining the secret of the magnet, which, if at all possible, can only be effected by an undertaking such as mine.
|
||||
|
||||
These reflections have dispelled the agitation with which I began my letter, and I feel my heart glow with an enthusiasm which elevates me to heaven, for nothing contributes so much to tranquillise the mind as a steady purpose—a point on which the soul may fix its intellectual eye. This expedition has been the favourite dream of my early years. I have read with ardour the accounts of the various voyages which have been made in the prospect of arriving at the North Pacific Ocean through the seas which surround the pole. You may remember that a history of all the voyages made for purposes of discovery composed the whole of our good Uncle Thomas’s library. My education was neglected, yet I was passionately fond of reading. These volumes were my study day and night, and my familiarity with them increased that regret which I had felt, as a child, on learning that my father’s dying injunction had forbidden my uncle to allow me to embark in a seafaring life.
|
||||
|
||||
These visions faded when I perused, for the first time, those poets whose effusions entranced my soul and lifted it to heaven. I also became a poet and for one year lived in a paradise of my own creation; I imagined that I also might obtain a niche in the temple where the names of Homer and Shakespeare are consecrated. You are well acquainted with my failure and how heavily I bore the disappointment. But just at that time I inherited the fortune of my cousin, and my thoughts were turned into the channel of their earlier bent.
|
||||
|
||||
Six years have passed since I resolved on my present undertaking. I can, even now, remember the hour from which I dedicated myself to this great enterprise. I commenced by inuring my body to hardship. I accompanied the whale-fishers on several expeditions to the North Sea; I voluntarily endured cold, famine, thirst, and want of sleep; I often worked harder than the common sailors during the day and devoted my nights to the study of mathematics, the theory of medicine, and those branches of physical science from which a naval adventurer might derive the greatest practical advantage. Twice I actually hired myself as an under-mate in a Greenland whaler, and acquitted myself to admiration. I must own I felt a little proud when my captain offered me the second dignity in the vessel and entreated me to remain with the greatest earnestness, so valuable did he consider my services.
|
||||
|
||||
And now, dear Margaret, do I not deserve to accomplish some great purpose?
|
||||
@@ -1,17 +0,0 @@
|
||||
In my younger and more vulnerable years my father gave me some advice that I’ve been turning over in my mind ever since.
|
||||
|
||||
“Whenever you feel like criticizing anyone,” he told me, “just remember that all the people in this world haven’t had the advantages that you’ve had.”
|
||||
|
||||
He didn’t say any more, but we’ve always been unusually communicative in a reserved way, and I understood that he meant a great deal more than that. In consequence, I’m inclined to reserve all judgements, a habit that has opened up many curious natures to me and also made me the victim of not a few veteran bores. The abnormal mind is quick to detect and attach itself to this quality when it appears in a normal person, and so it came about that in college I was unjustly accused of being a politician, because I was privy to the secret griefs of wild, unknown men. Most of the confidences were unsought—frequently I have feigned sleep, preoccupation, or a hostile levity when I realized by some unmistakable sign that an intimate revelation was quivering on the horizon; for the intimate revelations of young men, or at least the terms in which they express them, are usually plagiaristic and marred by obvious suppressions. Reserving judgements is a matter of infinite hope. I am still a little afraid of missing something if I forget that, as my father snobbishly suggested, and I snobbishly repeat, a sense of the fundamental decencies is parcelled out unequally at birth.
|
||||
|
||||
And, after boasting this way of my tolerance, I come to the admission that it has a limit. Conduct may be founded on the hard rock or the wet marshes, but after a certain point I don’t care what it’s founded on. When I came back from the East last autumn I felt that I wanted the world to be in uniform and at a sort of moral attention forever; I wanted no more riotous excursions with privileged glimpses into the human heart. Only Gatsby, the man who gives his name to this book, was exempt from my reaction—Gatsby, who represented everything for which I have an unaffected scorn. If personality is an unbroken series of successful gestures, then there was something gorgeous about him, some heightened sensitivity to the promises of life, as if he were related to one of those intricate machines that register earthquakes ten thousand miles away. This responsiveness had nothing to do with that flabby impressionability which is dignified under the name of the “creative temperament”—it was an extraordinary gift for hope, a romantic readiness such as I have never found in any other person and which it is not likely I shall ever find again. No—Gatsby turned out all right at the end; it is what preyed on Gatsby, what foul dust floated in the wake of his dreams that temporarily closed out my interest in the abortive sorrows and short-winded elations of men.
|
||||
|
||||
My family have been prominent, well-to-do people in this Middle Western city for three generations. The Carraways are something of a clan, and we have a tradition that we’re descended from the Dukes of Buccleuch, but the actual founder of my line was my grandfather’s brother, who came here in fifty-one, sent a substitute to the Civil War, and started the wholesale hardware business that my father carries on today.
|
||||
|
||||
I never saw this great-uncle, but I’m supposed to look like him—with special reference to the rather hard-boiled painting that hangs in father’s office. I graduated from New Haven in 1915, just a quarter of a century after my father, and a little later I participated in that delayed Teutonic migration known as the Great War. I enjoyed the counter-raid so thoroughly that I came back restless. Instead of being the warm centre of the world, the Middle West now seemed like the ragged edge of the universe—so I decided to go East and learn the bond business. Everybody I knew was in the bond business, so I supposed it could support one more single man. All my aunts and uncles talked it over as if they were choosing a prep school for me, and finally said, “Why—[ye-es](/jˈɛ ɛs/),” with very grave, hesitant faces. Father agreed to finance me for a year, and after various delays I came East, permanently, I thought, in the spring of twenty-two.
|
||||
|
||||
The practical thing was to find rooms in the city, but it was a warm season, and I had just left a country of wide lawns and friendly trees, so when a young man at the office suggested that we take a house together in a commuting town, it sounded like a great idea. He found the house, a weather-beaten cardboard bungalow at eighty a month, but at the last minute the firm ordered him to Washington, and I went out to the country alone. I had a dog—at least I had him for a few days until he ran away—and an old Dodge and a Finnish woman, who made my bed and cooked breakfast and muttered Finnish wisdom to herself over the electric stove.
|
||||
|
||||
It was lonely for a day or so until one morning some man, more recently arrived than I, stopped me on the road.
|
||||
|
||||
“How do you get to West Egg village?” he asked helplessly.
|
||||
@@ -1 +0,0 @@
|
||||
espeak-ng
|
||||
@@ -1,3 +0,0 @@
|
||||
kokoro>=0.7.13
|
||||
gradio
|
||||
pip
|
||||
44
docker-compose.yml
Normal file
44
docker-compose.yml
Normal file
@@ -0,0 +1,44 @@
|
||||
# Compose definition for the Kokoro TTS container (serves on port 10300).
services:
  kokoro-tts:
    image: git.sdgarren.com/scott/kokoro:main
    container_name: kokoro-tts
    restart: unless-stopped

    # ROCm GPU passthrough
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    group_add: [video, render]

    ports:
      - "10300:10300"

    volumes:
      # Persist HuggingFace model/voice cache so downloads survive container restarts
      - ./hf_cache:/root/.cache/huggingface
      # Mount config (read-only) so voices/settings can be changed without rebuilding
      - ./config.yaml:/app/config.yaml:ro

    environment:
      HIP_VISIBLE_DEVICES: "0"
      # gfx1031 (6700 XT) is not in the ROCm wheel's compiled arch list; override to gfx1030
      HSA_OVERRIDE_GFX_VERSION: "10.3.0"

    healthcheck:
      # Healthy when a TCP connection to the server port can be opened within 5 s
      test:
        - CMD
        - python3
        - -c
        - |
          import socket
          s = socket.socket()
          s.settimeout(5)
          s.connect(('localhost', 10300))
          s.close()
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
|
||||
|
||||
26
entrypoint.sh
Normal file
26
entrypoint.sh
Normal file
@@ -0,0 +1,26 @@
|
||||
#!/bin/bash
# Container entrypoint: print GPU diagnostics, then hand control to the server.
set -e

echo "=== Kokoro TTS Wyoming Server ==="

# List ROCm devices, but only when the rocm-smi tool is on PATH.
if command -v rocm-smi >/dev/null 2>&1; then
    echo "--- ROCm Devices ---"
    # Informational only: `|| true` keeps a failing rocm-smi from aborting under `set -e`.
    rocm-smi --showproductname 2>/dev/null || true
    echo "--------------------"
fi

# Report whether PyTorch can see a GPU (the heredoc is quoted, so the shell expands nothing).
python3 - <<'EOF'
import torch

has_gpu = torch.cuda.is_available()
print(f"ROCm/CUDA available: {has_gpu}")
if not has_gpu:
    print(" WARNING: No GPU detected — running on CPU (performance will be degraded)")
else:
    for idx in range(torch.cuda.device_count()):
        print(f" [{idx}] {torch.cuda.get_device_name(idx)}")
EOF

# Replace this shell with the server process, forwarding every argument unchanged.
exec python3 /app/server.py "$@"
|
||||
@@ -1,45 +0,0 @@
|
||||
"""
|
||||
Quick example to show how device selection can be controlled, and was checked
|
||||
"""
|
||||
import time
|
||||
from kokoro import KPipeline
|
||||
from loguru import logger
|
||||
|
||||
def generate_audio(pipeline, text):
    """Synthesize ``text`` with the 'af_bella' voice and return a sample count.

    ``pipeline`` is called as ``pipeline(text, voice='af_bella')`` and must
    yield 3-item results whose last item is the audio (or ``None``).

    Returns:
        ``audio.shape[0]`` of the first yielded result. If nothing is yielded
        the function falls off the end and returns ``None``.

    Raises:
        AssertionError: if the first result carries no audio or zero samples.
    """
    for _, _, waveform in pipeline(text, voice='af_bella'):
        if waveform is None:
            n_samples = 0
        else:
            n_samples = waveform.shape[0]
        assert n_samples > 0, "No audio generated"
        # Only the first chunk is measured.
        return n_samples
|
||||
|
||||
def time_synthesis(device=None):
    """Time pipeline construction plus one synthesis on ``device`` and log the outcome.

    Args:
        device: forwarded to ``KPipeline(device=...)``; ``None`` is logged as 'auto'.

    A ``RuntimeError`` is caught and logged rather than raised. When its
    message mentions 'CUDA' the line is reported as cuda / 'not available'.
    """
    label = device or 'auto'
    try:
        t0 = time.perf_counter()
        pipeline = KPipeline(lang_code='a', device=device)
        samples = generate_audio(pipeline, "The quick brown fox jumps over the lazy dog.")
        # The measured span covers both model loading and synthesis.
        elapsed_ms = (time.perf_counter() - t0) * 1000
        logger.info(f"✓ {label:<6} | {elapsed_ms:>5.1f}ms total | {samples:>6,d} samples")
    except RuntimeError as e:
        message = str(e)
        if 'CUDA' in message:
            who, what = 'cuda', 'not available'
        else:
            who, what = label, message
        logger.error(f"✗ {who:<6} | {what}")
|
||||
|
||||
def compare_shared_model():
    """Build two pipelines that share one model, synthesize with each, and log the total time.

    Any exception is caught and logged instead of propagating.
    """
    try:
        t0 = time.perf_counter()
        en_us = KPipeline(lang_code='a')
        # The second pipeline reuses the first one's model instead of loading its own.
        # NOTE(review): named en_uk but built with lang_code='a', same as en_us —
        # confirm whether a different lang_code was intended.
        en_uk = KPipeline(lang_code='a', model=en_us.model)

        generate_audio(en_us, "Testing model reuse.")
        generate_audio(en_uk, "Testing model reuse.")

        elapsed_ms = (time.perf_counter() - t0) * 1000
        logger.info(f"✓ reuse | {elapsed_ms:>5.1f}ms for both models")
    except Exception as e:
        logger.error(f"✗ reuse | {str(e)}")
|
||||
|
||||
if __name__ == '__main__':
    # Demo driver: time each device choice, then the shared-model case.
    rule = "-" * 40
    logger.info("Device Selection & Performance")
    logger.info(rule)
    # None is time_synthesis's default device (logged as 'auto').
    for device in (None, 'cuda', 'cpu'):
        time_synthesis(device)
    logger.info(rule)
    compare_shared_model()
|
||||
@@ -1,148 +0,0 @@
|
||||
import argparse
|
||||
import os
|
||||
import torch
|
||||
import onnx
|
||||
import onnxruntime as ort
|
||||
import sounddevice as sd
|
||||
|
||||
from kokoro import KModel, KPipeline
|
||||
from kokoro.model import KModelForONNX
|
||||
|
||||
def export_onnx(model, output):
    """Export ``model`` to ``<output>/kokoro.onnx`` and run the ONNX checker on the file.

    Args:
        model: module handed to ``torch.onnx.export``; it is called with
            ``(input_ids, style, speed)``.
        output: directory path (string) that receives ``kokoro.onnx``.
    """
    onnx_file = "/".join((output, "kokoro.onnx"))

    # Dummy tracing inputs: 48 random ids in [1, 100) wrapped in 0s, a random
    # (1, 256) style tensor and a random integer speed in [1, 10).
    random_ids = torch.randint(1, 100, (48,)).numpy()
    input_ids = torch.LongTensor([[0, *random_ids, 0]])
    style = torch.randn(1, 256)
    speed = torch.randint(1, 10, (1,)).int()

    export_options = dict(
        args=(input_ids, style, speed),
        f=onnx_file,
        export_params=True,
        verbose=True,
        input_names=['input_ids', 'style', 'speed'],
        output_names=['waveform', 'duration'],
        opset_version=17,
        # Axis 1 of input_ids and axis 0 of waveform are left variable-length.
        dynamic_axes={
            'input_ids': {1: 'input_ids_len'},
            'waveform': {0: 'num_samples'},
        },
        do_constant_folding=True,
    )
    torch.onnx.export(model, **export_options)
    print('export kokoro.onnx ok!')

    # Reload the written file and validate it.
    onnx.checker.check_model(onnx.load(onnx_file))
    print('onnx check ok!')
|
||||
|
||||
def load_input_ids(pipeline, text):
    """Convert ``text`` to phonemes and to a batch of model input ids.

    Args:
        pipeline: object providing ``lang_code``, ``g2p``, ``en_tokenize`` and
            ``model`` (with ``vocab`` and ``device``), as used below.
        text: the text to phonemize.

    Returns:
        ``(ps, input_ids)``: the phonemes (truncated to 510 symbols) and a
        LongTensor of shape ``(1, n + 2)`` holding the ``n`` vocab ids wrapped
        in a leading and trailing 0, moved to ``pipeline.model.device``.
    """
    # Exact match on the two codes; `in 'ab'` was a substring test that also
    # accepted '' and 'ab'.
    if pipeline.lang_code in ('a', 'b'):
        _, tokens = pipeline.g2p(text)
        # Keep the last non-empty phoneme chunk. The former `if not ps: continue`
        # guard skipped nothing, so an empty trailing chunk overwrote real
        # phonemes, and `ps` was left unbound when no chunk was yielded.
        ps = ''
        for _, chunk_ps, _ in pipeline.en_tokenize(tokens):
            if chunk_ps:
                ps = chunk_ps
    else:
        ps, _ = pipeline.g2p(text)

    # Cap at 510 symbols; the two wrapping 0s added below bring that to 512 ids.
    ps = ps[:510]

    # Symbols missing from the vocab are dropped rather than raising.
    vocab = pipeline.model.vocab
    input_ids = [i for i in map(vocab.get, ps) if i is not None]
    print(f"text: {text} -> phonemes: {ps} -> input_ids: {input_ids}")
    input_ids = torch.LongTensor([[0, *input_ids, 0]]).to(pipeline.model.device)
    return ps, input_ids
|
||||
|
||||
def load_voice(pipeline, voice, phonemes):
    """Load ``voice`` through the pipeline, move it to CPU, and pick one entry.

    Returns:
        The pack entry at index ``len(phonemes) - 1``.
    """
    cpu_pack = pipeline.load_voice(voice).to('cpu')
    # Empty `phonemes` gives index -1, i.e. the pack's last entry.
    entry = len(phonemes) - 1
    return cpu_pack[entry]
|
||||
|
||||
def load_sample(model, lang_code='z'):
    """Build one sample ``(input_ids, style, speed)`` input for ``model``.

    The earlier version constructed an English pipeline, text and voice and
    then overwrote all three with the Chinese sample, so the English setup was
    dead code that still paid for building a pipeline. Both samples now live in
    a table; the default ``'z'`` reproduces the previous result.

    Args:
        model: wrapper exposing ``kmodel``, which is handed to ``KPipeline``.
        lang_code: which built-in sample to use, ``'a'`` or ``'z'`` (default).

    Returns:
        ``(input_ids, style, speed)``: ids from ``load_input_ids``, the voice
        entry from ``load_voice``, and ``torch.IntTensor([1])`` as speed.

    Raises:
        ValueError: if ``lang_code`` has no built-in sample.
    """
    samples = {
        'a': (
            "\nThe sky above the port was the color of television, tuned to a dead channel.\n",
            'checkpoints/voices/af_heart.pt',
        ),
        'z': (
            "\n2月15日晚,猫眼专业版数据显示,截至发稿,《哪吒之魔童闹海》(或称《哪吒2》)今日票房已达7.8亿元,累计票房(含预售)超过114亿元。\n",
            'checkpoints/voices/zf_xiaoxiao.pt',
        ),
    }
    if lang_code not in samples:
        raise ValueError(f"no built-in sample for lang_code {lang_code!r}; choose from {sorted(samples)}")
    text, voice = samples[lang_code]

    # The pipeline runs on CPU and reuses the already-loaded model.
    pipeline = KPipeline(lang_code=lang_code, model=model.kmodel, device='cpu')

    phonemes, input_ids = load_input_ids(pipeline, text)
    style = load_voice(pipeline, voice, phonemes)
    speed = torch.IntTensor([1])

    return input_ids, style, speed
|
||||
|
||||
def inference_onnx(model, output):
    """Run ``<output>/kokoro.onnx`` on the built-in sample and play the result.

    Args:
        model: passed to ``load_sample`` to build the sample inputs.
        output: directory (string) containing ``kokoro.onnx``.
    """
    session = ort.InferenceSession("/".join((output, "kokoro.onnx")))

    input_ids, style, speed = load_sample(model)
    feeds = {
        'input_ids': input_ids.numpy(),
        'style': style.numpy(),
        'speed': speed.numpy(),
    }

    # None requests every model output; only the first one is used here.
    waveform = torch.from_numpy(session.run(None, feeds)[0])
    print(f'output: {waveform.shape}')
    print(waveform)

    # Play at 24000 Hz and block until playback finishes.
    sd.play(waveform.numpy(), 24000)
    sd.wait()
|
||||
|
||||
def check_model(model):
    """Run ``model`` directly on the built-in sample, print its outputs, and play the audio."""
    sample_inputs = load_sample(model)  # (input_ids, style, speed)
    waveform, duration = model(*sample_inputs)

    print(f'output: {waveform.shape}')
    print(f'duration: {duration.shape}')
    print(waveform)

    # Play at 24000 Hz and block until playback finishes.
    sd.play(waveform.numpy(), 24000)
    sd.wait()
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point. With no mode flag the model is exported;
    # --inference runs the exported ONNX file and --check runs the PyTorch
    # model. --inference wins when both flags are given.
    #
    # The text is passed as `description`: given positionally it is taken as
    # `prog` and shows up as the program name in the usage line.
    parser = argparse.ArgumentParser(description="Export kokoro Model to ONNX", add_help=True)
    parser.add_argument("--inference", "-t", help="test kokoro.onnx model", action="store_true")
    parser.add_argument("--check", "-m", help="check kokoro model", action="store_true")
    parser.add_argument(
        "--config_file", "-c", type=str, default="checkpoints/config.json", help="path to config file"
    )
    parser.add_argument(
        "--checkpoint_path", "-p", type=str, default="checkpoints/kokoro-v1_0.pth", help="path to checkpoint file"
    )
    parser.add_argument(
        "--output_dir", "-o", type=str, default="onnx", help="output directory"
    )

    args = parser.parse_args()

    config_file = args.config_file  # path of the model config file
    checkpoint_path = args.checkpoint_path  # path of the model checkpoint
    output_dir = args.output_dir

    # The output directory is created in every mode, not only when exporting.
    os.makedirs(output_dir, exist_ok=True)

    kmodel = KModel(config=config_file, model=checkpoint_path, disable_complex=True)
    model = KModelForONNX(kmodel).eval()

    if args.inference:
        inference_onnx(model, output_dir)
    elif args.check:
        check_model(model)
    else:
        export_onnx(model, output_dir)
|
||||
@@ -1,62 +0,0 @@
|
||||
from kokoro import KPipeline, KModel
|
||||
import torch
|
||||
from scipy.io import wavfile
|
||||
|
||||
def save_audio(audio: torch.Tensor, filename: str):
    """Write ``audio`` to ``filename`` as a 24 kHz WAV file.

    When ``audio`` is ``None`` nothing is written and a notice is printed.
    """
    if audio is None:
        print("No audio was generated")
        return

    # Move to CPU and convert to a NumPy array before writing.
    samples = audio.cpu().numpy()
    # 24000: Kokoro uses a 24 kHz sample rate.
    wavfile.write(filename, 24000, samples)
    print(f"Audio saved as '{filename}'")
|
||||
|
||||
def main():
    """Demonstrate ``generate_from_tokens`` with a raw phoneme string and with G2P tokens.

    Example 1 saves the first result to 'phoneme_output_new.wav'. Example 2
    saves every result to 'token_output_<index>.wav'. Any exception is caught
    and printed instead of propagating.
    """
    # Initialize pipeline with American English
    pipeline = KPipeline(lang_code='a')

    # The phoneme string for:
    # "How are you today? I am doing reasonably well, thank you for asking"
    phonemes = "hˌW ɑɹ ju tədˈA? ˌI ɐm dˈuɪŋ ɹˈizənəbli wˈɛl, θˈæŋk ju fɔɹ ˈæskɪŋ"

    try:
        print("\nExample 1: Using generate_from_tokens with raw phonemes")
        results = list(pipeline.generate_from_tokens(
            tokens=phonemes,
            voice="af_bella",
            speed=1.0
        ))
        if results:
            save_audio(results[0].audio, 'phoneme_output_new.wav')

        # Example 2: Using generate_from_tokens with pre-processed tokens
        print("\nExample 2: Using generate_from_tokens with pre-processed tokens")
        # get the tokens through G2P or any other method
        text = "How are you today? I am doing reasonably well, thank you for asking"
        _, tokens = pipeline.g2p(text)

        # Then generate from tokens
        for index, result in enumerate(pipeline.generate_from_tokens(
            tokens=tokens,
            voice="af_bella",
            speed=1.0
        )):
            # Each result may contain timestamps if available
            if result.tokens:
                for token in result.tokens:
                    if hasattr(token, 'start_ts') and hasattr(token, 'end_ts'):
                        print(f"Token: {token.text} ({token.start_ts:.2f}s - {token.end_ts:.2f}s)")
            # Name files by position. hash() of a str is salted per process, so
            # the old hash-based names changed on every run and could be negative.
            save_audio(result.audio, f'token_output_{index}.wav')

    except Exception as e:
        print(f"An error occurred: {str(e)}")


if __name__ == "__main__":
    main()
|
||||
4
kokoro.js/.gitignore
vendored
4
kokoro.js/.gitignore
vendored
@@ -1,4 +0,0 @@
|
||||
node_modules/
|
||||
dist
|
||||
types
|
||||
LICENSE
|
||||
@@ -1,2 +0,0 @@
|
||||
dist
|
||||
types
|
||||
@@ -1,119 +0,0 @@
|
||||
# Kokoro TTS
|
||||
|
||||
<p align="center">
|
||||
<a href="https://www.npmjs.com/package/kokoro-js"><img alt="NPM" src="https://img.shields.io/npm/v/kokoro-js"></a>
|
||||
<a href="https://www.npmjs.com/package/kokoro-js"><img alt="NPM Downloads" src="https://img.shields.io/npm/dw/kokoro-js"></a>
|
||||
<a href="https://www.jsdelivr.com/package/npm/kokoro-js"><img alt="jsDelivr Hits" src="https://img.shields.io/jsdelivr/npm/hw/kokoro-js"></a>
|
||||
<a href="https://github.com/hexgrad/kokoro/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/github/license/hexgrad/kokoro?color=blue"></a>
|
||||
<a href="https://huggingface.co/spaces/webml-community/kokoro-webgpu"><img alt="Demo" src="https://img.shields.io/badge/Hugging_Face-demo-green"></a>
|
||||
</p>
|
||||
|
||||
Kokoro is a frontier TTS model for its size of 82 million parameters (text in/audio out). This JavaScript library allows the model to be run 100% locally in the browser thanks to [🤗 Transformers.js](https://huggingface.co/docs/transformers.js). Try it out using our [online demo](https://huggingface.co/spaces/webml-community/kokoro-webgpu)!
|
||||
|
||||
## Usage
|
||||
|
||||
First, install the `kokoro-js` library from [NPM](https://npmjs.com/package/kokoro-js) using:
|
||||
|
||||
```bash
|
||||
npm i kokoro-js
|
||||
```
|
||||
|
||||
You can then generate speech as follows:
|
||||
|
||||
```js
|
||||
import { KokoroTTS } from "kokoro-js";
|
||||
|
||||
const model_id = "onnx-community/Kokoro-82M-v1.0-ONNX";
|
||||
const tts = await KokoroTTS.from_pretrained(model_id, {
|
||||
dtype: "q8", // Options: "fp32", "fp16", "q8", "q4", "q4f16"
|
||||
device: "wasm", // Options: "wasm", "webgpu" (web) or "cpu" (node). If using "webgpu", we recommend using dtype="fp32".
|
||||
});
|
||||
|
||||
const text = "Life is like a box of chocolates. You never know what you're gonna get.";
|
||||
const audio = await tts.generate(text, {
|
||||
// Use `tts.list_voices()` to list all available voices
|
||||
voice: "af_heart",
|
||||
});
|
||||
audio.save("audio.wav");
|
||||
```
|
||||
|
||||
Or if you'd prefer to stream the output, you can do that with:
|
||||
|
||||
```js
|
||||
import { KokoroTTS, TextSplitterStream } from "kokoro-js";
|
||||
|
||||
const model_id = "onnx-community/Kokoro-82M-v1.0-ONNX";
|
||||
const tts = await KokoroTTS.from_pretrained(model_id, {
|
||||
dtype: "fp32", // Options: "fp32", "fp16", "q8", "q4", "q4f16"
|
||||
// device: "webgpu", // Options: "wasm", "webgpu" (web) or "cpu" (node).
|
||||
});
|
||||
|
||||
// First, set up the stream
|
||||
const splitter = new TextSplitterStream();
|
||||
const stream = tts.stream(splitter);
|
||||
(async () => {
|
||||
let i = 0;
|
||||
for await (const { text, phonemes, audio } of stream) {
|
||||
console.log({ text, phonemes });
|
||||
audio.save(`audio-${i++}.wav`);
|
||||
}
|
||||
})();
|
||||
|
||||
// Next, add text to the stream. Note that the text can be added at different times.
|
||||
// For this example, let's pretend we're consuming text from an LLM, one word at a time.
|
||||
const text = "Kokoro is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, Kokoro can be deployed anywhere from production environments to personal projects. It can even run 100% locally in your browser, powered by Transformers.js!";
|
||||
const tokens = text.match(/\s*\S+/g);
|
||||
for (const token of tokens) {
|
||||
splitter.push(token);
|
||||
await new Promise((resolve) => setTimeout(resolve, 10));
|
||||
}
|
||||
|
||||
// Finally, close the stream to signal that no more text will be added.
|
||||
splitter.close();
|
||||
|
||||
// Alternatively, if you'd like to keep the stream open, but flush any remaining text, you can use the `flush` method.
|
||||
// splitter.flush();
|
||||
```
|
||||
|
||||
## Voices/Samples
|
||||
|
||||
> [!TIP]
|
||||
> You can find samples for each of the voices in the [model card](https://huggingface.co/onnx-community/Kokoro-82M-v1.0-ONNX#samples) on Hugging Face.
|
||||
|
||||
### American English
|
||||
|
||||
| Name | Traits | Target Quality | Training Duration | Overall Grade |
|
||||
| ------------ | ------ | -------------- | ----------------- | ------------- |
|
||||
| **af_heart** | 🚺❤️ | | | **A** |
|
||||
| af_alloy | 🚺 | B | MM minutes | C |
|
||||
| af_aoede | 🚺 | B | H hours | C+ |
|
||||
| af_bella | 🚺🔥 | **A** | **HH hours** | **A-** |
|
||||
| af_jessica | 🚺 | C | MM minutes | D |
|
||||
| af_kore | 🚺 | B | H hours | C+ |
|
||||
| af_nicole | 🚺🎧 | B | **HH hours** | B- |
|
||||
| af_nova | 🚺 | B | MM minutes | C |
|
||||
| af_river | 🚺 | C | MM minutes | D |
|
||||
| af_sarah | 🚺 | B | H hours | C+ |
|
||||
| af_sky | 🚺 | B | _M minutes_ 🤏 | C- |
|
||||
| am_adam | 🚹 | D | H hours | F+ |
|
||||
| am_echo | 🚹 | C | MM minutes | D |
|
||||
| am_eric | 🚹 | C | MM minutes | D |
|
||||
| am_fenrir | 🚹 | B | H hours | C+ |
|
||||
| am_liam | 🚹 | C | MM minutes | D |
|
||||
| am_michael | 🚹 | B | H hours | C+ |
|
||||
| am_onyx | 🚹 | C | MM minutes | D |
|
||||
| am_puck | 🚹 | B | H hours | C+ |
|
||||
| am_santa | 🚹 | C | _M minutes_ 🤏 | D- |
|
||||
|
||||
### British English
|
||||
|
||||
| Name | Traits | Target Quality | Training Duration | Overall Grade |
|
||||
| ----------- | ------ | -------------- | ----------------- | ------------- |
|
||||
| bf_alice | 🚺 | C | MM minutes | D |
|
||||
| bf_emma | 🚺 | B | **HH hours** | B- |
|
||||
| bf_isabella | 🚺 | B | MM minutes | C |
|
||||
| bf_lily | 🚺 | C | MM minutes | D |
|
||||
| bm_daniel | 🚹 | C | MM minutes | D |
|
||||
| bm_fable | 🚹 | B | MM minutes | C |
|
||||
| bm_george | 🚹 | B | MM minutes | C |
|
||||
| bm_lewis | 🚹 | C | H hours | D+ |
|
||||
24
kokoro.js/demo/.gitignore
vendored
24
kokoro.js/demo/.gitignore
vendored
@@ -1,24 +0,0 @@
|
||||
# Logs
|
||||
logs
|
||||
*.log
|
||||
npm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
pnpm-debug.log*
|
||||
lerna-debug.log*
|
||||
|
||||
node_modules
|
||||
dist
|
||||
dist-ssr
|
||||
*.local
|
||||
|
||||
# Editor directories and files
|
||||
.vscode/*
|
||||
!.vscode/extensions.json
|
||||
.idea
|
||||
.DS_Store
|
||||
*.suo
|
||||
*.ntvs*
|
||||
*.njsproj
|
||||
*.sln
|
||||
*.sw?
|
||||
@@ -1,56 +0,0 @@
|
||||
---
|
||||
title: Kokoro Text-to-Speech
|
||||
emoji: 🗣️
|
||||
colorFrom: indigo
|
||||
colorTo: purple
|
||||
sdk: static
|
||||
pinned: false
|
||||
license: apache-2.0
|
||||
short_description: High-quality speech synthesis powered by Kokoro TTS
|
||||
header: mini
|
||||
models:
|
||||
- onnx-community/Kokoro-82M-ONNX
|
||||
custom_headers:
|
||||
cross-origin-embedder-policy: require-corp
|
||||
cross-origin-opener-policy: same-origin
|
||||
cross-origin-resource-policy: cross-origin
|
||||
---
|
||||
|
||||
# Kokoro Text-to-Speech
|
||||
|
||||
A simple React + Vite application for running [Kokoro](https://github.com/hexgrad/kokoro), a frontier text-to-speech model for its size. The model runs 100% locally in the browser using [kokoro-js](https://www.npmjs.com/package/kokoro-js) and [🤗 Transformers.js](https://www.npmjs.com/package/@huggingface/transformers)!
|
||||
|
||||
## Getting Started
|
||||
|
||||
Follow the steps below to set up and run the application.
|
||||
|
||||
### 1. Clone the Repository
|
||||
|
||||
```sh
|
||||
git clone https://github.com/hexgrad/kokoro.git
|
||||
```
|
||||
|
||||
### 2. Build the Dependencies
|
||||
|
||||
```sh
|
||||
cd kokoro/kokoro.js
|
||||
npm i
|
||||
npm run build
|
||||
```
|
||||
|
||||
### 3. Setup the Demo Project
|
||||
|
||||
Note this depends on build output from the previous step.
|
||||
|
||||
```sh
|
||||
cd demo
|
||||
npm i
|
||||
```
|
||||
|
||||
### 4. Start the Development Server
|
||||
|
||||
```sh
|
||||
npm run dev
|
||||
```
|
||||
|
||||
The application should now be running locally. Open your browser and go to [http://localhost:5173](http://localhost:5173) to see it in action.
|
||||
@@ -1,35 +0,0 @@
|
||||
import js from "@eslint/js";
|
||||
import globals from "globals";
|
||||
import react from "eslint-plugin-react";
|
||||
import reactHooks from "eslint-plugin-react-hooks";
|
||||
import reactRefresh from "eslint-plugin-react-refresh";
|
||||
|
||||
// Parsing setup: browser globals, JSX enabled, ES modules.
const languageOptions = {
  ecmaVersion: 2020,
  globals: globals.browser,
  parserOptions: {
    ecmaVersion: "latest",
    ecmaFeatures: { jsx: true },
    sourceType: "module",
  },
};

// Recommended presets first, then the two project overrides. Later keys win,
// so the spread order is kept exactly as it was.
const rules = {
  ...js.configs.recommended.rules,
  ...react.configs.recommended.rules,
  ...react.configs["jsx-runtime"].rules,
  ...reactHooks.configs.recommended.rules,
  "react/jsx-no-target-blank": "off",
  "react-refresh/only-export-components": ["warn", { allowConstantExport: true }],
};

// Flat ESLint config: ignore build output, lint every .js/.jsx file.
export default [
  { ignores: ["dist"] },
  {
    files: ["**/*.{js,jsx}"],
    languageOptions,
    settings: { react: { version: "18.3" } },
    plugins: {
      react,
      "react-hooks": reactHooks,
      "react-refresh": reactRefresh,
    },
    rules,
  },
];
|
||||
@@ -1,13 +0,0 @@
|
||||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<link rel="icon" type="image/svg+xml" href="/hf-logo.svg" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>Kokoro Text-to-Speech</title>
|
||||
</head>
|
||||
<body>
|
||||
<div id="root"></div>
|
||||
<script type="module" src="/src/main.jsx"></script>
|
||||
</body>
|
||||
</html>
|
||||
5350
kokoro.js/demo/package-lock.json
generated
5350
kokoro.js/demo/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -1,33 +0,0 @@
|
||||
{
|
||||
"name": "kokoro-web",
|
||||
"private": true,
|
||||
"version": "0.0.0",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"dev": "vite",
|
||||
"build": "vite build",
|
||||
"lint": "eslint .",
|
||||
"preview": "vite preview"
|
||||
},
|
||||
"dependencies": {
|
||||
"kokoro-js": "file:..",
|
||||
"motion": "^11.12.0",
|
||||
"react": "^18.3.1",
|
||||
"react-dom": "^18.3.1"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@eslint/js": "^9.15.0",
|
||||
"@types/react": "^18.3.12",
|
||||
"@types/react-dom": "^18.3.1",
|
||||
"@vitejs/plugin-react": "^4.3.4",
|
||||
"autoprefixer": "^10.4.20",
|
||||
"eslint": "^9.15.0",
|
||||
"eslint-plugin-react": "^7.37.2",
|
||||
"eslint-plugin-react-hooks": "^5.0.0",
|
||||
"eslint-plugin-react-refresh": "^0.4.14",
|
||||
"globals": "^15.12.0",
|
||||
"postcss": "^8.4.49",
|
||||
"tailwindcss": "^3.4.15",
|
||||
"vite": "^6.0.1"
|
||||
}
|
||||
}
|
||||
@@ -1,6 +0,0 @@
|
||||
export default {
|
||||
plugins: {
|
||||
tailwindcss: {},
|
||||
autoprefixer: {},
|
||||
},
|
||||
};
|
||||
File diff suppressed because one or more lines are too long
|
Before Width: | Height: | Size: 34 KiB |
@@ -1,9 +0,0 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" width="1600" height="198">
|
||||
<defs>
|
||||
<linearGradient id="a" x1="50%" x2="50%" y1="-10.959%" y2="100%">
|
||||
<stop stop-color="#57BBC1" stop-opacity=".25" offset="0%"/>
|
||||
<stop stop-color="#015871" offset="100%"/>
|
||||
</linearGradient>
|
||||
</defs>
|
||||
<path fill="url(#a)" fill-rule="evenodd" d="M.005 121C311 121 409.898-.25 811 0c400 0 500 121 789 121v77H0s.005-48 .005-77z" transform="matrix(-1 0 0 1 1600 0)"/>
|
||||
</svg>
|
||||
|
Before Width: | Height: | Size: 465 B |
@@ -1,138 +0,0 @@
|
||||
import { useRef, useState, useEffect } from "react";
|
||||
import { motion } from "motion/react";
|
||||
|
||||
export default function App() {
|
||||
// Create a reference to the worker object.
|
||||
const worker = useRef(null);
|
||||
|
||||
const [inputText, setInputText] = useState("Life is like a box of chocolates. You never know what you're gonna get.");
|
||||
const [selectedSpeaker, setSelectedSpeaker] = useState("af_heart");
|
||||
|
||||
const [voices, setVoices] = useState([]);
|
||||
const [status, setStatus] = useState(null);
|
||||
const [error, setError] = useState(null);
|
||||
const [loadingMessage, setLoadingMessage] = useState("Loading...");
|
||||
|
||||
const [results, setResults] = useState([]);
|
||||
|
||||
// We use the `useEffect` hook to setup the worker as soon as the `App` component is mounted.
|
||||
useEffect(() => {
|
||||
// Create the worker if it does not yet exist.
|
||||
worker.current ??= new Worker(new URL("./worker.js", import.meta.url), {
|
||||
type: "module",
|
||||
});
|
||||
|
||||
// Create a callback function for messages from the worker thread.
|
||||
const onMessageReceived = (e) => {
|
||||
switch (e.data.status) {
|
||||
case "device":
|
||||
setLoadingMessage(`Loading model (device="${e.data.device}")`);
|
||||
break;
|
||||
case "ready":
|
||||
setStatus("ready");
|
||||
setVoices(e.data.voices);
|
||||
break;
|
||||
case "error":
|
||||
setError(e.data.data);
|
||||
break;
|
||||
case "complete":
|
||||
const { audio, text } = e.data;
|
||||
// Generation complete: re-enable the "Generate" button
|
||||
setResults((prev) => [{ text, src: audio }, ...prev]);
|
||||
setStatus("ready");
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
const onErrorReceived = (e) => {
|
||||
console.error("Worker error:", e);
|
||||
setError(e.message);
|
||||
};
|
||||
|
||||
// Attach the callback function as an event listener.
|
||||
worker.current.addEventListener("message", onMessageReceived);
|
||||
worker.current.addEventListener("error", onErrorReceived);
|
||||
|
||||
// Define a cleanup function for when the component is unmounted.
|
||||
return () => {
|
||||
worker.current.removeEventListener("message", onMessageReceived);
|
||||
worker.current.removeEventListener("error", onErrorReceived);
|
||||
};
|
||||
}, []);
|
||||
|
||||
const handleSubmit = (e) => {
|
||||
e.preventDefault();
|
||||
setStatus("running");
|
||||
|
||||
worker.current.postMessage({
|
||||
type: "generate",
|
||||
text: inputText.trim(),
|
||||
voice: selectedSpeaker,
|
||||
});
|
||||
};
|
||||
|
||||
return (
|
||||
<div className="relative w-full min-h-screen bg-gradient-to-br from-gray-900 to-gray-700 flex flex-col items-center justify-center p-4 relative overflow-hidden font-sans">
|
||||
<motion.div initial={{ opacity: 1 }} animate={{ opacity: status === null ? 1 : 0 }} transition={{ duration: 0.5 }} className="absolute w-screen h-screen justify-center flex flex-col items-center z-10 bg-gray-800/95 backdrop-blur-md" style={{ pointerEvents: status === null ? "auto" : "none" }}>
|
||||
<div className="w-[250px] h-[250px] border-4 border-white shadow-[0_0_0_5px_#4973ff] rounded-full overflow-hidden">
|
||||
<div className="loading-wave"></div>
|
||||
</div>
|
||||
<p className={`text-3xl my-5 text-center ${error ? "text-red-500" : "text-white"}`}>{error ?? loadingMessage}</p>
|
||||
</motion.div>
|
||||
|
||||
<div className="max-w-3xl w-full space-y-8 relative z-[2]">
|
||||
<div className="text-center">
|
||||
<h1 className="text-5xl font-extrabold text-gray-100 mb-2 drop-shadow-lg font-heading">Kokoro Text-to-Speech</h1>
|
||||
<p className="text-2xl text-gray-300 font-semibold font-subheading">
|
||||
Powered by
|
||||
<a href="https://github.com/hexgrad/kokoro" target="_blank" rel="noreferrer" className="underline">
|
||||
Kokoro
|
||||
</a>
|
||||
and
|
||||
<a href="https://huggingface.co/docs/transformers.js" target="_blank" rel="noreferrer" className="underline">
|
||||
<img width="40" src="hf-logo.svg" className="inline translate-y-[-2px] me-1"></img>Transformers.js
|
||||
</a>
|
||||
</p>
|
||||
</div>
|
||||
<div className="bg-gray-800/50 backdrop-blur-sm border border-gray-700 rounded-lg p-6">
|
||||
<form onSubmit={handleSubmit} className="space-y-4">
|
||||
<textarea placeholder="Enter text..." value={inputText} onChange={(e) => setInputText(e.target.value)} className="w-full min-h-[100px] max-h-[300px] bg-gray-700/50 backdrop-blur-sm border-2 border-gray-600 rounded-xl resize-y text-gray-100 placeholder-gray-400 px-3 py-2 focus:outline-none focus:ring-2 focus:ring-blue-500 focus:border-transparent" rows={Math.min(8, inputText.split("\n").length)} />
|
||||
<div className="flex flex-col items-center space-y-4">
|
||||
<select value={selectedSpeaker} onChange={(e) => setSelectedSpeaker(e.target.value)} className="w-full bg-gray-700/50 backdrop-blur-sm border-2 border-gray-600 rounded-xl text-gray-100 px-3 py-2 focus:outline-none focus:ring-2 focus:ring-blue-500 focus:border-transparent">
|
||||
{Object.entries(voices).map(([id, voice]) => (
|
||||
<option key={id} value={id}>
|
||||
{voice.name} ({voice.language === "en-us" ? "American" : "British"} {voice.gender})
|
||||
</option>
|
||||
))}
|
||||
</select>
|
||||
<button type="submit" className="inline-flex justify-center items-center px-6 py-2 text-lg font-semibold bg-gradient-to-t from-blue-600 to-purple-600 hover:from-blue-700 hover:to-purple-700 transition-colors duration-300 rounded-xl text-white disabled:opacity-50" disabled={status === "running" || inputText.trim() === ""}>
|
||||
{status === "running" ? "Generating..." : "Generate"}
|
||||
</button>
|
||||
</div>
|
||||
</form>
|
||||
</div>
|
||||
|
||||
{results.length > 0 && (
|
||||
<motion.div initial={{ y: 50, opacity: 0 }} animate={{ y: 0, opacity: 1 }} transition={{ duration: 0.5 }} className="max-h-[250px] overflow-y-auto px-2 mt-4 space-y-6 relative z-[2]">
|
||||
{results.map((result, i) => (
|
||||
<div key={i}>
|
||||
<div className="text-white bg-gray-800/70 backdrop-blur-sm border border-gray-700 rounded-lg p-4 z-10">
|
||||
<span className="absolute right-5 font-bold">#{results.length - i}</span>
|
||||
<p className="mb-3 max-w-[95%]">{result.text}</p>
|
||||
<audio controls src={result.src} className="w-full">
|
||||
Your browser does not support the audio element.
|
||||
</audio>
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
</motion.div>
|
||||
)}
|
||||
</div>
|
||||
|
||||
<div className="bg-[#015871] pointer-events-none absolute left-0 w-full h-[5%] bottom-[-50px]">
|
||||
<div className="wave"></div>
|
||||
<div className="wave"></div>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
@@ -1,100 +0,0 @@
|
||||
@tailwind base;
@tailwind components;
@tailwind utilities;

/*
 * Wave animations adapted from the following two demos:
 *  - https://codepen.io/upasanaasopa/pen/poObEWZ
 *  - https://codepen.io/breakstorm00/pen/qBJZQNB
 */

/* Global reset: border-box sizing and no default spacing on any element or pseudo-element. */
*,
*::before,
*::after {
  box-sizing: border-box;
  margin: 0;
  padding: 0;
}

/* Circular "liquid" container used by the loading overlay. */
.loading-wave {
  position: relative;
  top: 0;
  width: 100%;
  height: 100%;
  border-radius: 50%;
  background: #2c74b3;
  box-shadow: inset 0 0 50px 0 rgba(0, 0, 0, 0.5);
}

/* Two oversized rounded squares that rotate above the liquid to fake a wave surface. */
.loading-wave::before,
.loading-wave::after {
  content: "";
  position: absolute;
  top: 0;
  left: 50%;
  width: 200%;
  height: 200%;
  background: black;
  transform: translate(-50%, -75%);
}

.loading-wave::before {
  border-radius: 45%;
  background: rgba(255, 255, 255, 1);
  animation: animate 5s linear infinite;
}

.loading-wave::after {
  border-radius: 40%;
  background: rgba(255, 255, 255, 0.5);
  animation: animate 10s linear infinite;
}

/* Horizontally scrolling wave strip at the bottom of the page. */
.wave {
  position: absolute;
  top: -198px;
  width: 6400px;
  height: 198px;
  background: url(/wave.svg) repeat-x;
  animation: wave 7s cubic-bezier(0.36, 0.45, 0.63, 0.53) infinite;
  transform: translate3d(0, 0, 0);
}

/* The second strip is offset in time and additionally bobs up and down. */
.wave:nth-of-type(2) {
  top: -175px;
  animation:
    wave 7s cubic-bezier(0.36, 0.45, 0.63, 0.53) -0.125s infinite,
    swell 7s ease -1.25s infinite;
  opacity: 1;
}

/* Slide the strip left by one tile width. */
@keyframes wave {
  from {
    margin-left: 0;
  }

  to {
    margin-left: -1600px;
  }
}

/* Vertical bobbing of the second strip. */
@keyframes swell {
  from,
  to {
    transform: translate3d(0, -25px, 0);
  }

  50% {
    transform: translate3d(0, 5px, 0);
  }
}

/* Full rotation of the loading-wave overlays around their (offset) centre. */
@keyframes animate {
  from {
    transform: translate(-50%, -75%) rotate(0deg);
  }

  to {
    transform: translate(-50%, -75%) rotate(360deg);
  }
}
|
||||
@@ -1,10 +0,0 @@
|
||||
import { StrictMode } from "react";
|
||||
import { createRoot } from "react-dom/client";
|
||||
import "./index.css";
|
||||
import App from "./App.jsx";
|
||||
|
||||
createRoot(document.getElementById("root")).render(
|
||||
<StrictMode>
|
||||
<App />
|
||||
</StrictMode>,
|
||||
);
|
||||
@@ -1,8 +0,0 @@
|
||||
/**
 * Report whether WebGPU can be used in the current context.
 *
 * Resolves to `true` only when `navigator.gpu.requestAdapter()` yields a truthy adapter.
 * Any exception raised along the way (for example because `navigator.gpu` is not defined)
 * is swallowed and reported as `false`.
 *
 * @returns {Promise<boolean>} Whether an adapter could be obtained.
 */
export async function detectWebGPU() {
  let supported = false;
  try {
    supported = Boolean(await navigator.gpu.requestAdapter());
  } catch {
    supported = false;
  }
  return supported;
}
|
||||
@@ -1,29 +0,0 @@
|
||||
import { KokoroTTS } from "kokoro-js";
|
||||
import { detectWebGPU } from "./utils.js";
|
||||
|
||||
// Device detection: use WebGPU when an adapter can be obtained, otherwise fall back to WASM.
const device = (await detectWebGPU()) ? "webgpu" : "wasm";
self.postMessage({ status: "device", device });

/**
 * Report a failure to the main thread.
 *
 * The demo UI reads the message of an "error" event from `e.data.data`, while this worker
 * historically sent it as `error`. Both keys are populated so the message is actually shown
 * and any consumer of the old `error` key keeps working.
 * @param {unknown} err The caught error.
 */
function postError(err) {
  const message = err instanceof Error ? err.message : String(err);
  self.postMessage({ status: "error", error: message, data: message });
}

// Load the model
const model_id = "onnx-community/Kokoro-82M-v1.0-ONNX";
const tts = await KokoroTTS.from_pretrained(model_id, {
  // Quantized weights when running on WASM, full precision on WebGPU.
  dtype: device === "wasm" ? "q8" : "fp32",
  device,
}).catch((e) => {
  postError(e);
  throw e; // Abort the worker script: nothing below can run without a model.
});
self.postMessage({ status: "ready", voices: tts.voices, device });

// Listen for messages from the main thread
self.addEventListener("message", async (e) => {
  const { text, voice } = e.data;

  try {
    // Generate speech
    const audio = await tts.generate(text, { voice });

    // Send the audio file back to the main thread as an object URL
    const blob = audio.toBlob();
    self.postMessage({ status: "complete", audio: URL.createObjectURL(blob), text });
  } catch (err) {
    // Without this, a failed generation would only surface as an unhandled rejection
    // inside the worker and the main thread would never hear about it.
    console.error("Speech generation failed:", err);
    postError(err);
  }
});
|
||||
@@ -1,8 +0,0 @@
|
||||
// Tailwind configuration: scan the HTML entry point and every source file for class names;
// no theme extensions and no plugins.
/** @type {import('tailwindcss').Config} */
const config = {
  content: ["./index.html", "./src/**/*.{js,ts,jsx,tsx}"],
  theme: {
    extend: {},
  },
  plugins: [],
};

export default config;
|
||||
@@ -1,12 +0,0 @@
|
||||
import { defineConfig } from "vite";
|
||||
import react from "@vitejs/plugin-react";
|
||||
|
||||
// https://vite.dev/config/

// Only errors are logged when NODE_ENV is "development"; otherwise the "info" level is used.
const logLevel = process.env.NODE_ENV === "development" ? "error" : "info";

export default defineConfig({
  plugins: [react()],
  // Web workers are emitted as ES modules.
  worker: { format: "es" },
  build: {
    target: "esnext",
  },
  logLevel,
});
|
||||
2972
kokoro.js/package-lock.json
generated
2972
kokoro.js/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -1,65 +0,0 @@
|
||||
{
|
||||
"name": "kokoro-js",
|
||||
"version": "1.2.0",
|
||||
"type": "module",
|
||||
"exports": {
|
||||
"types": "./types/kokoro.d.ts",
|
||||
"node": {
|
||||
"import": "./dist/kokoro.js",
|
||||
"require": "./dist/kokoro.cjs"
|
||||
},
|
||||
"default": "./dist/kokoro.web.js"
|
||||
},
|
||||
"scripts": {
|
||||
"build": "rm -rf dist types && rollup -c && tsc && cp ../LICENSE LICENSE",
|
||||
"format": "prettier --write . --print-width 1000",
|
||||
"test": "vitest run"
|
||||
},
|
||||
"keywords": [
|
||||
"kokoro",
|
||||
"tts",
|
||||
"text-to-speech"
|
||||
],
|
||||
"author": {
|
||||
"name": "hexgrad",
|
||||
"email": "hello@hexgrad.com"
|
||||
},
|
||||
"browser": {
|
||||
"path": false,
|
||||
"fs/promises": false
|
||||
},
|
||||
"contributors": [
|
||||
"Xenova"
|
||||
],
|
||||
"license": "Apache-2.0",
|
||||
"description": "High-quality text-to-speech for the web",
|
||||
"dependencies": {
|
||||
"@huggingface/transformers": "^3.3.3",
|
||||
"phonemizer": "^1.2.1"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@rollup/plugin-node-resolve": "^16.0.0",
|
||||
"@rollup/plugin-terser": "^0.4.4",
|
||||
"prettier": "3.4.2",
|
||||
"rollup": "^4.30.1",
|
||||
"typescript": "^5.7.3",
|
||||
"vitest": "^2.1.8"
|
||||
},
|
||||
"files": [
|
||||
"types",
|
||||
"dist",
|
||||
"voices",
|
||||
"README.md",
|
||||
"LICENSE"
|
||||
],
|
||||
"homepage": "https://github.com/hexgrad/kokoro",
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "git+https://github.com/hexgrad/kokoro.git"
|
||||
},
|
||||
"publishConfig": {
|
||||
"access": "public"
|
||||
},
|
||||
"jsdelivr": "./dist/kokoro.web.js",
|
||||
"unpkg": "./dist/kokoro.web.js"
|
||||
}
|
||||
@@ -1,42 +0,0 @@
|
||||
import terser from "@rollup/plugin-terser";
|
||||
import { nodeResolve } from "@rollup/plugin-node-resolve";
|
||||
|
||||
/**
 * Build the plugin list shared by every bundle.
 * @param {boolean} browser Forwarded to node-resolve's `browser` option.
 */
function plugins(browser) {
  const resolve = nodeResolve({ browser });
  const minify = terser({ format: { comments: false } });
  return [resolve, minify];
}

// One entry per emitted bundle: two Node builds (CJS + ESM) followed by the web build.
const OUTPUT_CONFIGS = [
  ["./dist/kokoro.cjs", "cjs"],
  ["./dist/kokoro.js", "esm"],
  ["./dist/kokoro.web.js", "esm"],
].map(([file, format]) => ({ file, format }));

// Web build: silence warnings whose message mentions @huggingface/transformers.
const WEB_SPECIFIC_CONFIG = {
  onwarn(warning, warn) {
    if (warning.message.includes("@huggingface/transformers")) return;
    warn(warning);
  },
};

// Node builds: leave runtime dependencies external instead of bundling them.
const NODE_SPECIFIC_CONFIG = {
  external: ["@huggingface/transformers", "phonemizer"],
};

/** Turn one output description into a full Rollup configuration object. */
function toRollupConfig(output) {
  const web = output.file.endsWith(".web.js");
  const targetSpecific = web ? WEB_SPECIFIC_CONFIG : NODE_SPECIFIC_CONFIG;
  return {
    input: "./src/kokoro.js",
    output,
    plugins: plugins(web),
    ...targetSpecific,
  };
}

export default OUTPUT_CONFIGS.map((output) => toRollupConfig(output));
|
||||
@@ -1,152 +0,0 @@
|
||||
import { StyleTextToSpeech2Model, AutoTokenizer, Tensor, RawAudio } from "@huggingface/transformers";
|
||||
import { phonemize } from "./phonemize.js";
|
||||
import { TextSplitterStream } from "./splitter.js";
|
||||
import { getVoiceData, VOICES } from "./voices.js";
|
||||
|
||||
// Number of float values that make up one style vector in the voice data.
const STYLE_DIM = 256;
// Sample rate (Hz) attached to the generated audio.
const SAMPLE_RATE = 24000;

/**
 * @typedef {Object} GenerateOptions
 * @property {keyof typeof VOICES} [voice="af_heart"] The voice
 * @property {number} [speed=1] The speaking speed
 */

/**
 * @typedef {Object} StreamProperties
 * @property {RegExp} [split_pattern] The pattern to split the input text. If unset, the default sentence splitter will be used.
 * @typedef {GenerateOptions & StreamProperties} StreamGenerateOptions
 */

export class KokoroTTS {
  /**
   * Create a new KokoroTTS instance.
   * @param {import('@huggingface/transformers').StyleTextToSpeech2Model} model The model
   * @param {import('@huggingface/transformers').PreTrainedTokenizer} tokenizer The tokenizer
   */
  constructor(model, tokenizer) {
    this.model = model;
    this.tokenizer = tokenizer;
  }

  /**
   * Load a KokoroTTS model from the Hugging Face Hub.
   * @param {string} model_id The model id
   * @param {Object} options Additional options
   * @param {"fp32"|"fp16"|"q8"|"q4"|"q4f16"} [options.dtype="fp32"] The data type to use.
   * @param {"wasm"|"webgpu"|"cpu"|null} [options.device=null] The device to run the model on.
   * @param {import("@huggingface/transformers").ProgressCallback} [options.progress_callback=null] A callback function that is called with progress information.
   * @returns {Promise<KokoroTTS>} The loaded model
   */
  static async from_pretrained(model_id, { dtype = "fp32", device = null, progress_callback = null } = {}) {
    // Start both downloads before awaiting so the model and tokenizer load concurrently.
    const model = StyleTextToSpeech2Model.from_pretrained(model_id, { progress_callback, dtype, device });
    const tokenizer = AutoTokenizer.from_pretrained(model_id, { progress_callback });

    const info = await Promise.all([model, tokenizer]);
    return new KokoroTTS(...info);
  }

  /** The table of available voices, keyed by voice id. */
  get voices() {
    return VOICES;
  }

  /** Print the table of available voices to the console. */
  list_voices() {
    console.table(VOICES);
  }

  /**
   * Check that `voice` is a known voice id and derive its language code.
   * @param {string} voice The voice id to validate.
   * @returns {"a"|"b"} The first character of the voice id, used as the language code.
   * @throws {Error} If the voice id is not a key of `VOICES`.
   */
  _validate_voice(voice) {
    // `Object.hasOwn` avoids relying on a `hasOwnProperty` method being present on VOICES.
    if (!Object.hasOwn(VOICES, voice)) {
      console.error(`Voice "${voice}" not found. Available voices:`);
      console.table(VOICES);
      throw new Error(`Voice "${voice}" not found. Should be one of: ${Object.keys(VOICES).join(", ")}.`);
    }
    const language = /** @type {"a"|"b"} */ (voice.at(0)); // "a" or "b"
    return language;
  }

  /**
   * Generate audio from text.
   *
   * @param {string} text The input text
   * @param {GenerateOptions} options Additional options
   * @returns {Promise<RawAudio>} The generated audio
   * @throws {Error} If the voice id is unknown.
   */
  async generate(text, { voice = "af_heart", speed = 1 } = {}) {
    const language = this._validate_voice(voice);

    const phonemes = await phonemize(text, language);
    const { input_ids } = this.tokenizer(phonemes, {
      truncation: true,
    });

    return this.generate_from_ids(input_ids, { voice, speed });
  }

  /**
   * Generate audio from input ids.
   * @param {Tensor} input_ids The input ids
   * @param {GenerateOptions} options Additional options
   * @returns {Promise<RawAudio>} The generated audio
   */
  async generate_from_ids(input_ids, { voice = "af_heart", speed = 1 } = {}) {
    // Select voice style based on number of input tokens:
    // the last dimension minus 2, clamped to the range [0, 509].
    const num_tokens = Math.min(Math.max(input_ids.dims.at(-1) - 2, 0), 509);

    // Load voice style: one STYLE_DIM-sized slice per token count.
    const data = await getVoiceData(voice);
    const offset = num_tokens * STYLE_DIM;
    const voiceData = data.slice(offset, offset + STYLE_DIM);

    // Prepare model inputs
    const inputs = {
      input_ids,
      style: new Tensor("float32", voiceData, [1, STYLE_DIM]),
      speed: new Tensor("float32", [speed], [1]),
    };

    // Generate audio
    const { waveform } = await this.model(inputs);
    return new RawAudio(waveform.data, SAMPLE_RATE);
  }

  /**
   * Generate audio from text in a streaming fashion.
   *
   * When `text` is a `TextSplitterStream`, the caller owns it and is responsible for
   * closing it. When `text` is a string, an internal splitter is created, filled and
   * closed here so that the final sentence is flushed and the generator terminates.
   *
   * @param {string|TextSplitterStream} text The input text
   * @param {StreamGenerateOptions} options Additional options
   * @returns {AsyncGenerator<{text: string, phonemes: string, audio: RawAudio}, void, void>}
   * @throws {Error} If the voice id is unknown or `text` is neither a string nor a TextSplitterStream.
   */
  async *stream(text, { voice = "af_heart", speed = 1, split_pattern = null } = {}) {
    const language = this._validate_voice(voice);

    /** @type {TextSplitterStream} */
    let splitter;
    if (text instanceof TextSplitterStream) {
      splitter = text;
    } else if (typeof text === "string") {
      splitter = new TextSplitterStream();
      const chunks = split_pattern
        ? text
            .split(split_pattern)
            .map((chunk) => chunk.trim())
            .filter((chunk) => chunk.length > 0)
        : [text];
      // NOTE(review): pushed chunks are appended to the splitter's buffer back-to-back;
      // confirm that is the intended behaviour when `split_pattern` is supplied.
      splitter.push(...chunks);
      // Nobody else holds a reference to this splitter, so it must be closed here:
      // closing flushes the trailing text as a sentence and lets the loop below finish.
      splitter.close();
    } else {
      throw new Error("Invalid input type. Expected string or TextSplitterStream.");
    }
    for await (const sentence of splitter) {
      const phonemes = await phonemize(sentence, language);
      const { input_ids } = this.tokenizer(phonemes, {
        truncation: true,
      });

      // TODO: There may be some cases where - even with splitting - the text is too long.
      // In that case, we should split the text into smaller chunks and process them separately.
      // For now, we just truncate these exceptionally long chunks
      const audio = await this.generate_from_ids(input_ids, { voice, speed });
      yield { text: sentence, phonemes, audio };
    }
  }
}
|
||||
|
||||
export { TextSplitterStream };
|
||||
@@ -1,204 +0,0 @@
|
||||
import { phonemize as espeakng } from "phonemizer";
|
||||
|
||||
/**
 * Split `text` around every match of `regex`, keeping the matched delimiters.
 *
 * `String.prototype.split` drops delimiters, and wrapping the pattern in a capturing
 * group interferes with groups the pattern may already contain, so the pieces are
 * assembled by hand from `matchAll`.
 * @param {string} text The text to split.
 * @param {RegExp} regex The regex to split on (must carry the `g` flag, as required by `matchAll`).
 * @returns {{match: boolean; text: string}[]} Alternating non-matching and matching pieces, in order; empty pieces are omitted.
 */
function split(text, regex) {
  const pieces = [];
  let cursor = 0;

  for (const hit of text.matchAll(regex)) {
    const delimiter = hit[0];
    const start = hit.index;

    // Text between the previous delimiter and this one.
    if (start > cursor) {
      pieces.push({ match: false, text: text.slice(cursor, start) });
    }
    // The delimiter itself (zero-length matches are ignored).
    if (delimiter.length > 0) {
      pieces.push({ match: true, text: delimiter });
    }
    cursor = start + delimiter.length;
  }

  // Whatever follows the last delimiter.
  if (cursor < text.length) {
    pieces.push({ match: false, text: text.slice(cursor) });
  }
  return pieces;
}
|
||||
|
||||
/**
 * Rewrite a matched number so that it is read out naturally.
 *
 *  - Decimals (anything containing ".") are returned untouched.
 *  - Clock times "h:mm" become "h o'clock", "h oh m" or "h m".
 *  - Four-digit years (optionally followed by "s") are split into two pairs,
 *    e.g. "1999" -> "19 99", "1905" -> "19 oh 5", "1900s" -> "19 hundreds".
 *    Years below 1100 and years such as 2005 are returned untouched.
 * @param {string} match The matched number
 * @returns {string} The phonetic equivalent
 */
function split_num(match) {
  if (match.includes(".")) return match;

  if (match.includes(":")) {
    const [hours, minutes] = match.split(":").map(Number);
    if (minutes === 0) return `${hours} o'clock`;
    if (minutes < 10) return `${hours} oh ${minutes}`;
    return `${hours} ${minutes}`;
  }

  const year = parseInt(match.slice(0, 4), 10);
  const withinMillennium = year % 1000;
  if (year < 1100 || withinMillennium < 10) return match;

  const century = match.slice(0, 2);
  const remainder = parseInt(match.slice(2, 4), 10);
  const plural = match.endsWith("s") ? "s" : "";

  if (withinMillennium >= 100 && withinMillennium <= 999) {
    if (remainder === 0) return `${century} hundred${plural}`;
    if (remainder < 10) return `${century} oh ${remainder}${plural}`;
  }
  return `${century} ${remainder}${plural}`;
}
|
||||
|
||||
/**
 * Spell out a matched monetary amount, moving the currency after the number.
 *
 * A leading "$" selects dollars/cents; any other leading character selects pounds/pence.
 * Amounts that are not plain numbers (e.g. "$5 million") are always pluralised.
 * @param {string} match The matched currency
 * @returns {string} The formatted currency
 */
function flip_money(match) {
  const symbol = match[0];
  const amount = match.slice(1);
  const bill = symbol === "$" ? "dollar" : "pound";

  // Something like "5 million": keep the wording and append the plural unit.
  if (isNaN(Number(amount))) {
    return `${amount} ${bill}s`;
  }

  // Whole amounts: singular only for exactly "1".
  if (!match.includes(".")) {
    return amount === "1" ? `${amount} ${bill}` : `${amount} ${bill}s`;
  }

  // Amounts with a fractional part: a single digit counts as tens ("1.5" -> 50).
  const [whole, fraction] = amount.split(".");
  const minor = parseInt(fraction.padEnd(2, "0"), 10);

  let coins;
  if (symbol === "$") {
    coins = minor === 1 ? "cent" : "cents";
  } else {
    coins = minor === 1 ? "penny" : "pence";
  }
  const billSuffix = whole === "1" ? "" : "s";
  return `${whole} ${bill}${billSuffix} and ${minor} ${coins}`;
}
|
||||
|
||||
/**
 * Spell out a decimal number: the integer part, the word "point",
 * then each fractional digit separated by spaces.
 * @param {string} match The matched number (must contain a ".")
 * @returns {string} The formatted number
 */
function point_num(match) {
  const parts = match.split(".");
  const whole = parts[0];
  const digits = parts[1].split("").join(" ");
  return `${whole} point ${digits}`;
}
|
||||
|
||||
/**
 * Normalize text for phonemization.
 *
 * Applies, in order: quote/bracket unification, conversion of CJK and fullwidth
 * punctuation to ASCII, whitespace clean-up, expansion of a few abbreviations,
 * and rewriting of numbers, times, years and currencies into speakable words.
 * The order matters: several later rules rely on the output of earlier ones.
 * @param {string} text The text to normalize
 * @returns {string} The normalized text
 */
function normalize_text(text) {
  return (
    text
      // 1. Handle quotes and brackets
      .replace(/[‘’]/g, "'")
      .replace(/«/g, "“")
      .replace(/»/g, "”")
      .replace(/[“”]/g, '"')
      .replace(/\(/g, "«")
      .replace(/\)/g, "»")

      // 2. Replace uncommon punctuation marks.
      // The fullwidth forms are written as escapes so they cannot be silently folded
      // into their ASCII look-alikes; matching ASCII here would double-space ordinary
      // text, break the time and thousands-separator rules below, and a bare `?`
      // is not even a valid pattern.
      .replace(/、/g, ", ")
      .replace(/。/g, ". ")
      .replace(/\uFF01/g, "! ") // fullwidth exclamation mark
      .replace(/\uFF0C/g, ", ") // fullwidth comma
      .replace(/\uFF1A/g, ": ") // fullwidth colon
      .replace(/\uFF1B/g, "; ") // fullwidth semicolon
      .replace(/\uFF1F/g, "? ") // fullwidth question mark

      // 3. Whitespace normalization
      .replace(/[^\S \n]/g, " ") // any whitespace other than space/newline -> space
      .replace(/ {2,}/g, " ") // collapse every run of spaces, not just the first one
      .replace(/(?<=\n) +(?=\n)/g, "")

      // 4. Abbreviations
      .replace(/\bD[Rr]\.(?= [A-Z])/g, "Doctor")
      .replace(/\b(?:Mr\.|MR\.(?= [A-Z]))/g, "Mister")
      .replace(/\b(?:Ms\.|MS\.(?= [A-Z]))/g, "Miss")
      .replace(/\b(?:Mrs\.|MRS\.(?= [A-Z]))/g, "Mrs")
      .replace(/\betc\.(?! [A-Z])/gi, "etc")

      // 5. Normalize casual words
      .replace(/\b(y)eah?\b/gi, "$1e'a")

      // 6. Handle numbers and currencies
      .replace(/\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)/g, split_num)
      .replace(/(?<=\d),(?=\d)/g, "")
      .replace(/[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b/gi, flip_money)
      .replace(/\d*\.\d+/g, point_num)
      .replace(/(?<=\d)-(?=\d)/g, " to ")
      .replace(/(?<=\d)S/g, " S")

      // 7. Handle possessives
      .replace(/(?<=[BCDFGHJ-NP-TV-Z])'?s\b/g, "'S")
      .replace(/(?<=X')S\b/g, "s")

      // 8. Handle hyphenated words/letters
      .replace(/(?:[A-Za-z]\.){2,} [a-z]/g, (m) => m.replace(/\./g, "-"))
      .replace(/(?<=[A-Z])\.(?=[A-Z])/gi, "-")

      // 9. Strip leading and trailing whitespace
      .trim()
  );
}
|
||||
|
||||
/**
 * Make a string safe to embed in a regular expression by prefixing every
 * regex metacharacter with a backslash.
 *
 * @param {string} string The string to escape.
 * @returns {string} The escaped string.
 */
function escapeRegExp(string) {
  const metaCharacters = /[.*+?^${}()|[\]\\]/g;
  // "$&" in the replacement stands for the matched character itself.
  const escaped = string.replace(metaCharacters, "\\$&");
  return escaped;
}
|
||||
|
||||
// Characters that are passed through to the output verbatim instead of being phonemized.
const PUNCTUATION = ';:,.!?¡¿—…"«»“”(){}[]';
// One or more punctuation runs, each with optional surrounding whitespace.
const PUNCTUATION_PATTERN = new RegExp(`(\\s*[${escapeRegExp(PUNCTUATION)}]+\\s*)+`, "g");

/**
 * Convert text to phonemes with the eSpeak-NG phonemizer.
 *
 * Punctuation runs are kept as-is; only the text between them is phonemized.
 * The resulting phoneme string then receives a handful of fix-ups.
 * @param {string} text The text to phonemize
 * @param {"a"|"b"} language The language to use ("a" selects "en-us", anything else "en")
 * @param {boolean} norm Whether to normalize the text
 * @returns {Promise<string>} The phonemized text
 */
export async function phonemize(text, language = "a", norm = true) {
  // 1. Normalize text (optional)
  const input = norm ? normalize_text(text) : text;

  // 2. Cut the input into punctuation / non-punctuation sections
  const sections = split(input, PUNCTUATION_PATTERN);

  // 3. Phonemize the non-punctuation sections (concurrently) and stitch everything back together
  const lang = language === "a" ? "en-us" : "en";
  const pieces = await Promise.all(
    sections.map(async (section) => {
      if (section.match) {
        return section.text;
      }
      const phones = await espeakng(section.text, lang);
      return phones.join(" ");
    }),
  );
  const ps = pieces.join("");

  // 4. Post-process phonemes
  let processed = ps
    // https://en.wiktionary.org/wiki/kokoro#English
    .replace(/kəkˈoːɹoʊ/g, "kˈoʊkəɹoʊ")
    .replace(/kəkˈɔːɹəʊ/g, "kˈəʊkəɹəʊ")
    .replace(/ʲ/g, "j")
    .replace(/r/g, "ɹ")
    .replace(/x/g, "k")
    .replace(/ɬ/g, "l")
    .replace(/(?<=[a-zɹː])(?=hˈʌndɹɪd)/g, " ")
    .replace(/ z(?=[;:,.!?¡¿—…"«»“” ]|$)/g, "z");

  // 5. Extra fix-up that only applies to American English
  if (language === "a") {
    processed = processed.replace(/(?<=nˈaɪn)ti(?!ː)/g, "di");
  }
  return processed.trim();
}
|
||||
@@ -1,344 +0,0 @@
|
||||
/**
 * Returns true if the character is considered a sentence terminator.
 * This includes ASCII (".", "!", "?") and common Unicode terminators:
 * the horizontal ellipsis, the ideographic full stop, and the fullwidth
 * question and exclamation marks.
 * NOTE: We also include newlines here, as this is favourable for text-to-speech systems.
 * @param {string} c The character to test.
 * @param {boolean} [includeNewlines=true] Whether to treat newlines as terminators.
 * @returns {boolean}
 */
function isSentenceTerminator(c, includeNewlines = true) {
  // The non-ASCII terminators are spelled as escapes so they cannot be folded into
  // their ASCII look-alikes (which would leave the fullwidth marks unrecognised).
  return ".!?\u2026\u3002\uFF1F\uFF01".includes(c) || (includeNewlines && c === "\n");
}
|
||||
|
||||
/**
 * Tells whether a character belongs with the sentence terminator that precedes it,
 * i.e. whether it is a closing quote or closing bracket.
 * @param {string} c The character to test.
 * @returns {boolean}
 */
function isTrailingChar(c) {
  const closers = "\"')]}」』";
  return closers.indexOf(c) !== -1;
}
|
||||
|
||||
/**
 * Reads one token out of `buffer`: the run of non-whitespace characters that
 * begins at `start` and ends just before the next whitespace character
 * (or at the end of the buffer).
 * @param {string} buffer The input text.
 * @param {number} start The starting index.
 * @returns {string} The extracted token (empty when `start` points at whitespace).
 */
function getTokenFromBuffer(buffer, start) {
  let cursor = start;
  for (; cursor < buffer.length; ++cursor) {
    if (/\s/.test(buffer[cursor])) {
      break;
    }
  }
  return buffer.substring(start, cursor);
}
|
||||
|
||||
// Lower-case abbreviations after which a period does not end the sentence.
// Initialisms made of single letters joined by periods ("i.e", "e.g", "u.s.a", "u.s")
// are not listed here; they are handled separately.
const ABBREVIATIONS = new Set([
  // Titles, ranks and offices
  "mr", "mrs", "ms", "dr", "prof", "sr", "jr", "sgt", "col", "gen", "rep", "sen", "gov", "lt", "maj", "capt",
  // Places, companies and miscellany
  "st", "mt", "etc", "co", "inc", "ltd", "dept", "vs", "p", "pg",
  // Months
  "jan", "feb", "mar", "apr", "jun", "jul", "aug", "sep", "sept", "oct", "nov", "dec",
  // Days of the week
  "sun", "mon", "tu", "tue", "tues", "wed", "th", "thu", "thur", "thurs", "fri", "sat",
]);

/**
 * Checks a token against the list of known abbreviations, ignoring case,
 * a possessive ending ('s) and any trailing periods.
 * @param {string} token The token to check.
 * @returns {boolean}
 */
function isAbbreviation(token) {
  const withoutPossessive = token.replace(/['’]s$/i, "");
  const bare = withoutPossessive.replace(/\.+$/, "");
  return ABBREVIATIONS.has(bare.toLowerCase());
}
|
||||
|
||||
// Map of closing punctuation to their corresponding opening punctuation.
// Non-ASCII angle brackets that are easily confused are written as escapes.
const MATCHING = new Map([
  [")", "("],
  ["]", "["],
  ["}", "{"],
  ["》", "《"],
  ["\u3009", "\u3008"], // CJK angle brackets
  ["›", "‹"],
  ["»", "«"],
  // Left/right-pointing angle brackets. This entry used to repeat the CJK pair above,
  // which made it a no-op (a Map keeps only one entry per key).
  ["\u232A", "\u2329"],
  ["」", "「"],
  ["』", "『"],
  ["〕", "〔"],
  ["】", "【"],
]);
// Set of opening punctuation characters.
const OPENING = new Set(MATCHING.values());

/**
 * Updates the nesting stack to track quotes and paired punctuation.
 * This supports both standard (", ', (), [], {}) and Japanese quotes (「」, 『』).
 * (An apostrophe between letters is ignored so that contractions remain intact.)
 *
 * The stack is mutated in place:
 *  - a quote character closes the innermost entry if it is the same quote, otherwise it opens one;
 *  - an opening bracket is pushed;
 *  - a closing bracket pops the innermost entry only when that entry is its partner,
 *    and is ignored otherwise.
 * @param {string} c The current character.
 * @param {string[]} stack The current nesting stack.
 * @param {number} i The index of the character in the buffer.
 * @param {string} buffer The full text being processed.
 */
function updateStack(c, stack, i, buffer) {
  // Handle standard quotes.
  if (c === '"' || c === "'") {
    // Ignore an apostrophe if it's between letters (e.g., in contractions).
    if (c === "'" && i > 0 && i < buffer.length - 1 && /[A-Za-z]/.test(buffer[i - 1]) && /[A-Za-z]/.test(buffer[i + 1])) {
      return;
    }
    // Quotes use the same character to open and close, so the top of the stack decides.
    if (stack.length && stack.at(-1) === c) {
      stack.pop();
    } else {
      stack.push(c);
    }
    return;
  }
  // Handle opening punctuation.
  if (OPENING.has(c)) {
    stack.push(c);
    return;
  }
  // Handle closing punctuation.
  const expectedOpening = MATCHING.get(c);
  if (expectedOpening && stack.length && stack.at(-1) === expectedOpening) {
    stack.pop();
  }
}
|
||||
|
||||
/**
|
||||
* A simple stream-based text splitter that emits complete sentences.
|
||||
*/
|
||||
export class TextSplitterStream {
|
||||
constructor() {
|
||||
this._buffer = "";
|
||||
this._sentences = [];
|
||||
this._resolver = null;
|
||||
this._closed = false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Push one or more text chunks into the stream.
|
||||
* @param {...string} texts Text fragments to process.
|
||||
*/
|
||||
push(...texts) {
|
||||
for (const txt of texts) {
|
||||
this._buffer += txt;
|
||||
this._process();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes the stream, signaling that no more text will be pushed.
|
||||
* This will flush any remaining text in the buffer as a sentence
|
||||
* and allow the consuming process to finish processing the stream.
|
||||
*/
|
||||
close() {
|
||||
if (this._closed) {
|
||||
throw new Error("Stream is already closed.");
|
||||
}
|
||||
this._closed = true;
|
||||
this.flush();
|
||||
}
|
||||
|
||||
/**
|
||||
* Flushes any remaining text in the buffer as a sentence.
|
||||
*/
|
||||
flush() {
|
||||
const remainder = this._buffer.trim();
|
||||
if (remainder.length > 0) {
|
||||
this._sentences.push(remainder);
|
||||
}
|
||||
this._buffer = "";
|
||||
this._resolve();
|
||||
}
|
||||
|
||||
/**
|
||||
* Resolve the pending promise to signal that sentences are available.
|
||||
* @private
|
||||
*/
|
||||
_resolve() {
|
||||
if (this._resolver) {
|
||||
this._resolver();
|
||||
this._resolver = null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Processes the internal buffer to extract complete sentences.
|
||||
* If the potential sentence boundary is at the end of the current buffer,
|
||||
* it waits for more text before splitting.
|
||||
* @private
|
||||
*/
|
||||
_process() {
|
||||
let sentenceStart = 0;
|
||||
const buffer = this._buffer;
|
||||
const len = buffer.length;
|
||||
let i = 0;
|
||||
let stack = [];
|
||||
|
||||
// Helper to scan from the current index over trailing terminators and punctuation.
|
||||
const scanBoundary = (idx) => {
|
||||
let end = idx;
|
||||
// Consume contiguous sentence terminators (excluding newlines).
|
||||
while (end + 1 < len && isSentenceTerminator(buffer[end + 1], false)) {
|
||||
++end;
|
||||
}
|
||||
// Consume trailing characters (e.g., closing quotes/brackets).
|
||||
while (end + 1 < len && isTrailingChar(buffer[end + 1])) {
|
||||
++end;
|
||||
}
|
||||
let nextNonSpace = end + 1;
|
||||
while (nextNonSpace < len && /\s/.test(buffer[nextNonSpace])) {
|
||||
++nextNonSpace;
|
||||
}
|
||||
return { end, nextNonSpace };
|
||||
};
|
||||
|
||||
while (i < len) {
|
||||
const c = buffer[i];
|
||||
updateStack(c, stack, i, buffer);
|
||||
|
||||
// Only consider splitting if we're not inside any nested structure.
|
||||
if (stack.length === 0 && isSentenceTerminator(c)) {
|
||||
const currentSegment = buffer.slice(sentenceStart, i);
|
||||
// Skip splitting for likely numbered lists (e.g., "1." or "\n2.").
|
||||
if (/(^|\n)\d+$/.test(currentSegment)) {
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
|
||||
const { end: boundaryEnd, nextNonSpace } = scanBoundary(i);
|
||||
|
||||
// If the terminator is not a newline and there's no extra whitespace,
|
||||
// we might be in the middle of a token (e.g., "$9.99"), so skip splitting.
|
||||
if (i === nextNonSpace - 1 && c !== "\n") {
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Wait for more text if there's no non-whitespace character yet.
|
||||
if (nextNonSpace === len) {
|
||||
break;
|
||||
}
|
||||
|
||||
// Determine the token immediately preceding the terminator.
|
||||
let tokenStart = i - 1;
|
||||
while (tokenStart >= 0 && /\S/.test(buffer[tokenStart])) {
|
||||
tokenStart--;
|
||||
}
|
||||
tokenStart = Math.max(sentenceStart, tokenStart + 1);
|
||||
const token = getTokenFromBuffer(buffer, tokenStart);
|
||||
if (!token) {
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
|
||||
// --- URL/email protection ---
|
||||
// If the token appears to be a URL or email (contains "://" or "@")
|
||||
// and does not already end with a terminator, skip splitting.
|
||||
if ((/https?[,:]\/\//.test(token) || token.includes("@")) && !isSentenceTerminator(token.at(-1))) {
|
||||
i = tokenStart + token.length;
|
||||
continue;
|
||||
}
|
||||
|
||||
// --- Abbreviation protection ---
|
||||
if (isAbbreviation(token)) {
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
|
||||
// --- Middle initials heuristic ---
|
||||
// If the token is a series of single-letter initials (each ending in a period)
|
||||
// and is followed by a capitalized word, assume it's part of a name.
|
||||
if (/^([A-Za-z]\.)+$/.test(token) && nextNonSpace < len && /[A-Z]/.test(buffer[nextNonSpace])) {
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
|
||||
// --- Lookahead heuristic ---
|
||||
// If the terminator is a period and the next non–whitespace character is lowercase,
|
||||
// assume it is not the end of a sentence.
|
||||
if (c === "." && nextNonSpace < len && /[a-z]/.test(buffer[nextNonSpace])) {
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Special case: ellipsis that stands alone should be merged with the following sentence.
|
||||
const sentence = buffer.substring(sentenceStart, boundaryEnd + 1).trim();
|
||||
if (sentence === "..." || sentence === "…") {
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Accept the sentence boundary.
|
||||
if (sentence) {
|
||||
this._sentences.push(sentence);
|
||||
}
|
||||
// Move to the next sentence.
|
||||
i = sentenceStart = boundaryEnd + 1;
|
||||
continue;
|
||||
}
|
||||
++i;
|
||||
}
|
||||
|
||||
// Remove the processed portion of the buffer.
|
||||
this._buffer = buffer.substring(sentenceStart);
|
||||
|
||||
// Resolve any pending promise if sentences are available.
|
||||
if (this._sentences.length > 0) {
|
||||
this._resolve();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Async iterator to yield sentences as they become available.
|
||||
* @returns {AsyncGenerator<string, void, void>}
|
||||
*/
|
||||
async *[Symbol.asyncIterator]() {
|
||||
if (this._resolver) {
|
||||
throw new Error("Another iterator is already active.");
|
||||
}
|
||||
while (true) {
|
||||
if (this._sentences.length > 0) {
|
||||
yield this._sentences.shift();
|
||||
} else if (this._closed) {
|
||||
// No more text will be pushed.
|
||||
break;
|
||||
} else {
|
||||
// Wait for more text.
|
||||
await new Promise((resolve) => {
|
||||
this._resolver = resolve;
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Synchronous iterator that flushes the buffer and returns all sentences.
|
||||
* @returns {Iterator<string>}
|
||||
*/
|
||||
[Symbol.iterator]() {
|
||||
this.flush();
|
||||
const iterator = this._sentences[Symbol.iterator]();
|
||||
this._sentences = [];
|
||||
return iterator;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the array of sentences currently available.
|
||||
* @type {string[]} The array of sentences.
|
||||
* @readonly
|
||||
*/
|
||||
get sentences() {
|
||||
return this._sentences;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Splits a complete text into sentences in one go.
 * @param {string} text The text to split.
 * @returns {string[]} An array of sentences.
 */
export function split(text) {
  const stream = new TextSplitterStream();
  stream.push(text);
  // Synchronous iteration flushes the remainder and drains the queue.
  return Array.from(stream);
}
|
||||
@@ -1,479 +0,0 @@
|
||||
import path from "path";
|
||||
import fs from "fs/promises";
|
||||
|
||||
export const VOICES = Object.freeze({
|
||||
af_heart: {
|
||||
name: "Heart",
|
||||
language: "en-us",
|
||||
gender: "Female",
|
||||
traits: "❤️",
|
||||
targetQuality: "A",
|
||||
overallGrade: "A",
|
||||
},
|
||||
af_alloy: {
|
||||
name: "Alloy",
|
||||
language: "en-us",
|
||||
gender: "Female",
|
||||
targetQuality: "B",
|
||||
overallGrade: "C",
|
||||
},
|
||||
af_aoede: {
|
||||
name: "Aoede",
|
||||
language: "en-us",
|
||||
gender: "Female",
|
||||
targetQuality: "B",
|
||||
overallGrade: "C+",
|
||||
},
|
||||
af_bella: {
|
||||
name: "Bella",
|
||||
language: "en-us",
|
||||
gender: "Female",
|
||||
traits: "🔥",
|
||||
targetQuality: "A",
|
||||
overallGrade: "A-",
|
||||
},
|
||||
af_jessica: {
|
||||
name: "Jessica",
|
||||
language: "en-us",
|
||||
gender: "Female",
|
||||
targetQuality: "C",
|
||||
overallGrade: "D",
|
||||
},
|
||||
af_kore: {
|
||||
name: "Kore",
|
||||
language: "en-us",
|
||||
gender: "Female",
|
||||
targetQuality: "B",
|
||||
overallGrade: "C+",
|
||||
},
|
||||
af_nicole: {
|
||||
name: "Nicole",
|
||||
language: "en-us",
|
||||
gender: "Female",
|
||||
traits: "🎧",
|
||||
targetQuality: "B",
|
||||
overallGrade: "B-",
|
||||
},
|
||||
af_nova: {
|
||||
name: "Nova",
|
||||
language: "en-us",
|
||||
gender: "Female",
|
||||
targetQuality: "B",
|
||||
overallGrade: "C",
|
||||
},
|
||||
af_river: {
|
||||
name: "River",
|
||||
language: "en-us",
|
||||
gender: "Female",
|
||||
targetQuality: "C",
|
||||
overallGrade: "D",
|
||||
},
|
||||
af_sarah: {
|
||||
name: "Sarah",
|
||||
language: "en-us",
|
||||
gender: "Female",
|
||||
targetQuality: "B",
|
||||
overallGrade: "C+",
|
||||
},
|
||||
af_sky: {
|
||||
name: "Sky",
|
||||
language: "en-us",
|
||||
gender: "Female",
|
||||
targetQuality: "B",
|
||||
overallGrade: "C-",
|
||||
},
|
||||
am_adam: {
|
||||
name: "Adam",
|
||||
language: "en-us",
|
||||
gender: "Male",
|
||||
targetQuality: "D",
|
||||
overallGrade: "F+",
|
||||
},
|
||||
am_echo: {
|
||||
name: "Echo",
|
||||
language: "en-us",
|
||||
gender: "Male",
|
||||
targetQuality: "C",
|
||||
overallGrade: "D",
|
||||
},
|
||||
am_eric: {
|
||||
name: "Eric",
|
||||
language: "en-us",
|
||||
gender: "Male",
|
||||
targetQuality: "C",
|
||||
overallGrade: "D",
|
||||
},
|
||||
am_fenrir: {
|
||||
name: "Fenrir",
|
||||
language: "en-us",
|
||||
gender: "Male",
|
||||
targetQuality: "B",
|
||||
overallGrade: "C+",
|
||||
},
|
||||
am_liam: {
|
||||
name: "Liam",
|
||||
language: "en-us",
|
||||
gender: "Male",
|
||||
targetQuality: "C",
|
||||
overallGrade: "D",
|
||||
},
|
||||
am_michael: {
|
||||
name: "Michael",
|
||||
language: "en-us",
|
||||
gender: "Male",
|
||||
targetQuality: "B",
|
||||
overallGrade: "C+",
|
||||
},
|
||||
am_onyx: {
|
||||
name: "Onyx",
|
||||
language: "en-us",
|
||||
gender: "Male",
|
||||
targetQuality: "C",
|
||||
overallGrade: "D",
|
||||
},
|
||||
am_puck: {
|
||||
name: "Puck",
|
||||
language: "en-us",
|
||||
gender: "Male",
|
||||
targetQuality: "B",
|
||||
overallGrade: "C+",
|
||||
},
|
||||
am_santa: {
|
||||
name: "Santa",
|
||||
language: "en-us",
|
||||
gender: "Male",
|
||||
targetQuality: "C",
|
||||
overallGrade: "D-",
|
||||
},
|
||||
bf_emma: {
|
||||
name: "Emma",
|
||||
language: "en-gb",
|
||||
gender: "Female",
|
||||
traits: "🚺",
|
||||
targetQuality: "B",
|
||||
overallGrade: "B-",
|
||||
},
|
||||
bf_isabella: {
|
||||
name: "Isabella",
|
||||
language: "en-gb",
|
||||
gender: "Female",
|
||||
targetQuality: "B",
|
||||
overallGrade: "C",
|
||||
},
|
||||
bm_george: {
|
||||
name: "George",
|
||||
language: "en-gb",
|
||||
gender: "Male",
|
||||
targetQuality: "B",
|
||||
overallGrade: "C",
|
||||
},
|
||||
bm_lewis: {
|
||||
name: "Lewis",
|
||||
language: "en-gb",
|
||||
gender: "Male",
|
||||
targetQuality: "C",
|
||||
overallGrade: "D+",
|
||||
},
|
||||
bf_alice: {
|
||||
name: "Alice",
|
||||
language: "en-gb",
|
||||
gender: "Female",
|
||||
traits: "🚺",
|
||||
targetQuality: "C",
|
||||
overallGrade: "D",
|
||||
},
|
||||
bf_lily: {
|
||||
name: "Lily",
|
||||
language: "en-gb",
|
||||
gender: "Female",
|
||||
traits: "🚺",
|
||||
targetQuality: "C",
|
||||
overallGrade: "D",
|
||||
},
|
||||
bm_daniel: {
|
||||
name: "Daniel",
|
||||
language: "en-gb",
|
||||
gender: "Male",
|
||||
traits: "🚹",
|
||||
targetQuality: "C",
|
||||
overallGrade: "D",
|
||||
},
|
||||
bm_fable: {
|
||||
name: "Fable",
|
||||
language: "en-gb",
|
||||
gender: "Male",
|
||||
traits: "🚹",
|
||||
targetQuality: "B",
|
||||
overallGrade: "C",
|
||||
},
|
||||
|
||||
// TODO: Add support for other languages:
|
||||
// jf_alpha: {
|
||||
// name: "alpha",
|
||||
// language: "ja",
|
||||
// gender: "Female",
|
||||
// traits: "🚺",
|
||||
// targetQuality: "B",
|
||||
// overallGrade: "C+",
|
||||
// },
|
||||
// jf_gongitsune: {
|
||||
// name: "gongitsune",
|
||||
// language: "ja",
|
||||
// gender: "Female",
|
||||
// traits: "🚺",
|
||||
// targetQuality: "B",
|
||||
// overallGrade: "C",
|
||||
// },
|
||||
// jf_nezumi: {
|
||||
// name: "nezumi",
|
||||
// language: "ja",
|
||||
// gender: "Female",
|
||||
// traits: "🚺",
|
||||
// targetQuality: "B",
|
||||
// overallGrade: "C-",
|
||||
// },
|
||||
// jf_tebukuro: {
|
||||
// name: "tebukuro",
|
||||
// language: "ja",
|
||||
// gender: "Female",
|
||||
// traits: "🚺",
|
||||
// targetQuality: "B",
|
||||
// overallGrade: "C",
|
||||
// },
|
||||
// jm_kumo: {
|
||||
// name: "kumo",
|
||||
// language: "ja",
|
||||
// gender: "Male",
|
||||
// traits: "🚹",
|
||||
// targetQuality: "B",
|
||||
// overallGrade: "C-",
|
||||
// },
|
||||
// zf_xiaobei: {
|
||||
// name: "xiaobei",
|
||||
// language: "zh",
|
||||
// gender: "Female",
|
||||
// traits: "🚺",
|
||||
// targetQuality: "C",
|
||||
// overallGrade: "D",
|
||||
// },
|
||||
// zf_xiaoni: {
|
||||
// name: "xiaoni",
|
||||
// language: "zh",
|
||||
// gender: "Female",
|
||||
// traits: "🚺",
|
||||
// targetQuality: "C",
|
||||
// overallGrade: "D",
|
||||
// },
|
||||
// zf_xiaoxiao: {
|
||||
// name: "xiaoxiao",
|
||||
// language: "zh",
|
||||
// gender: "Female",
|
||||
// traits: "🚺",
|
||||
// targetQuality: "C",
|
||||
// overallGrade: "D",
|
||||
// },
|
||||
// zf_xiaoyi: {
|
||||
// name: "xiaoyi",
|
||||
// language: "zh",
|
||||
// gender: "Female",
|
||||
// traits: "🚺",
|
||||
// targetQuality: "C",
|
||||
// overallGrade: "D",
|
||||
// },
|
||||
// zm_yunjian: {
|
||||
// name: "yunjian",
|
||||
// language: "zh",
|
||||
// gender: "Male",
|
||||
// traits: "🚹",
|
||||
// targetQuality: "C",
|
||||
// overallGrade: "D",
|
||||
// },
|
||||
// zm_yunxi: {
|
||||
// name: "yunxi",
|
||||
// language: "zh",
|
||||
// gender: "Male",
|
||||
// traits: "🚹",
|
||||
// targetQuality: "C",
|
||||
// overallGrade: "D",
|
||||
// },
|
||||
// zm_yunxia: {
|
||||
// name: "yunxia",
|
||||
// language: "zh",
|
||||
// gender: "Male",
|
||||
// traits: "🚹",
|
||||
// targetQuality: "C",
|
||||
// overallGrade: "D",
|
||||
// },
|
||||
// zm_yunyang: {
|
||||
// name: "yunyang",
|
||||
// language: "zh",
|
||||
// gender: "Male",
|
||||
// traits: "🚹",
|
||||
// targetQuality: "C",
|
||||
// overallGrade: "D",
|
||||
// },
|
||||
// ef_dora: {
|
||||
// name: "dora",
|
||||
// language: "es",
|
||||
// gender: "Female",
|
||||
// traits: "🚺",
|
||||
// targetQuality: "C",
|
||||
// overallGrade: "D",
|
||||
// },
|
||||
// em_alex: {
|
||||
// name: "alex",
|
||||
// language: "es",
|
||||
// gender: "Male",
|
||||
// traits: "🚹",
|
||||
// targetQuality: "C",
|
||||
// overallGrade: "D",
|
||||
// },
|
||||
// em_santa: {
|
||||
// name: "santa",
|
||||
// language: "es",
|
||||
// gender: "Male",
|
||||
// traits: "🚹",
|
||||
// targetQuality: "C",
|
||||
// overallGrade: "D",
|
||||
// },
|
||||
// ff_siwis: {
|
||||
// name: "siwis",
|
||||
// language: "fr",
|
||||
// gender: "Female",
|
||||
// traits: "🚺",
|
||||
// targetQuality: "B",
|
||||
// overallGrade: "B-",
|
||||
// },
|
||||
// hf_alpha: {
|
||||
// name: "alpha",
|
||||
// language: "hi",
|
||||
// gender: "Female",
|
||||
// traits: "🚺",
|
||||
// targetQuality: "B",
|
||||
// overallGrade: "C",
|
||||
// },
|
||||
// hf_beta: {
|
||||
// name: "beta",
|
||||
// language: "hi",
|
||||
// gender: "Female",
|
||||
// traits: "🚺",
|
||||
// targetQuality: "B",
|
||||
// overallGrade: "C",
|
||||
// },
|
||||
// hm_omega: {
|
||||
// name: "omega",
|
||||
// language: "hi",
|
||||
// gender: "Male",
|
||||
// traits: "🚹",
|
||||
// targetQuality: "B",
|
||||
// overallGrade: "C",
|
||||
// },
|
||||
// hm_psi: {
|
||||
// name: "psi",
|
||||
// language: "hi",
|
||||
// gender: "Male",
|
||||
// traits: "🚹",
|
||||
// targetQuality: "B",
|
||||
// overallGrade: "C",
|
||||
// },
|
||||
// if_sara: {
|
||||
// name: "sara",
|
||||
// language: "it",
|
||||
// gender: "Female",
|
||||
// traits: "🚺",
|
||||
// targetQuality: "B",
|
||||
// overallGrade: "C",
|
||||
// },
|
||||
// im_nicola: {
|
||||
// name: "nicola",
|
||||
// language: "it",
|
||||
// gender: "Male",
|
||||
// traits: "🚹",
|
||||
// targetQuality: "B",
|
||||
// overallGrade: "C",
|
||||
// },
|
||||
// pf_dora: {
|
||||
// name: "dora",
|
||||
// language: "pt-br",
|
||||
// gender: "Female",
|
||||
// traits: "🚺",
|
||||
// targetQuality: "C",
|
||||
// overallGrade: "D",
|
||||
// },
|
||||
// pm_alex: {
|
||||
// name: "alex",
|
||||
// language: "pt-br",
|
||||
// gender: "Male",
|
||||
// traits: "🚹",
|
||||
// targetQuality: "C",
|
||||
// overallGrade: "D",
|
||||
// },
|
||||
// pm_santa: {
|
||||
// name: "santa",
|
||||
// language: "pt-br",
|
||||
// gender: "Male",
|
||||
// traits: "🚹",
|
||||
// targetQuality: "C",
|
||||
// overallGrade: "D",
|
||||
// },
|
||||
});
|
||||
|
||||
const VOICE_DATA_URL = "https://huggingface.co/onnx-community/Kokoro-82M-v1.0-ONNX/resolve/main/voices";
|
||||
|
||||
/**
 * Loads the raw binary data of a voice.
 *
 * When `fs.readFile` is available, the file is read from `../voices/<id>.bin`
 * relative to this module's directory. Otherwise it is fetched from
 * `VOICE_DATA_URL`, going through the "kokoro-voices" Cache Storage cache
 * when that cache can be opened.
 *
 * @param {keyof typeof VOICES} id
 * @returns {Promise<ArrayBufferLike>}
 * @throws {Error} If the voice file has to be fetched and the server answers with a non-2xx status.
 */
async function getVoiceFile(id) {
  if (fs?.readFile) {
    const dirname = typeof __dirname !== "undefined" ? __dirname : import.meta.dirname;
    const file = path.resolve(dirname, `../voices/${id}.bin`);
    const data = await fs.readFile(file);
    // A Node.js Buffer may be a view onto a larger ArrayBuffer, so only hand out
    // the bytes that actually belong to the file.
    const { buffer, byteOffset, byteLength } = data;
    if (byteOffset === 0 && byteLength === buffer.byteLength) {
      return buffer;
    }
    return buffer.slice(byteOffset, byteOffset + byteLength);
  }

  const url = `${VOICE_DATA_URL}/${id}.bin`;

  let cache;
  try {
    cache = await caches.open("kokoro-voices");
    const cachedResponse = await cache.match(url);
    if (cachedResponse) {
      return await cachedResponse.arrayBuffer();
    }
  } catch (e) {
    // Caching is best-effort: fall through to a plain fetch.
    console.warn("Unable to open cache", e);
  }

  // No cache, or cache failed to open. Fetch the file.
  const response = await fetch(url);
  if (!response.ok) {
    // Never treat (or cache) an error page as voice data.
    throw new Error(`Unable to fetch voice "${id}" from ${url}: ${response.status} ${response.statusText}`);
  }
  const buffer = await response.arrayBuffer();

  if (cache) {
    try {
      // NOTE: We use `new Response(buffer, ...)` instead of `response.clone()` to handle LFS files
      await cache.put(
        url,
        new Response(buffer, {
          headers: response.headers,
        }),
      );
    } catch (e) {
      console.warn("Unable to cache file", e);
    }
  }

  return buffer;
}
|
||||
|
||||
// In-memory cache of decoded voice data, keyed by voice id.
const VOICE_CACHE = new Map();

/**
 * Returns the data of a voice as a Float32Array.
 * The voice file is loaded on first use and kept in memory afterwards.
 * @param {string} voice The voice id.
 * @returns {Promise<Float32Array>}
 */
export async function getVoiceData(voice) {
  if (!VOICE_CACHE.has(voice)) {
    const raw = await getVoiceFile(voice);
    VOICE_CACHE.set(voice, new Float32Array(raw));
  }
  return VOICE_CACHE.get(voice);
}
|
||||
@@ -1,95 +0,0 @@
|
||||
import { describe, test, expect } from "vitest";
|
||||
import { phonemize } from "../src/phonemize.js";
|
||||
|
||||
const A_TEST_CASES = new Map([
|
||||
["‘Hello’", "həlˈoʊ"],
|
||||
["‘Test’ and ‘Example’", "tˈɛst ænd ɛɡzˈæmpəl"],
|
||||
["«Bonjour»", '"bɔːnʒˈʊɹ"'],
|
||||
["«Test «nested» quotes»", '"tˈɛst "nˈɛstᵻd" kwˈoʊts"'],
|
||||
["(Hello)", "«həlˈoʊ»"],
|
||||
["(Nested (Parentheses))", "«nˈɛstᵻd «pɚɹˈɛnθəsˌiːz»»"],
|
||||
["こんにちは、世界!", "dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ, tʃˈaɪniːzlˌɛɾɚ tʃˈaɪniːzlˌɛɾɚ!"],
|
||||
["これはテストです:はい?", "dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ: dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ?"],
|
||||
["Hello World", "həlˈoʊ wˈɜːld"],
|
||||
["Hello World", "həlˈoʊ wˈɜːld"],
|
||||
["Hello\n \nWorld", "həlˈoʊ wˈɜːld"],
|
||||
["Dr. Smith", "dˈɑːktɚ smˈɪθ"],
|
||||
["DR. Brown", "dˈɑːktɚ bɹˈaʊn"],
|
||||
["Mr. Smith", "mˈɪstɚ smˈɪθ"],
|
||||
["MR. Anderson", "mˈɪstɚɹ ˈændɚsən"],
|
||||
["Ms. Taylor", "mˈɪs tˈeɪlɚ"],
|
||||
["MS. Carter", "mˈɪs kˈɑːɹɾɚ"],
|
||||
["Mrs. Johnson", "mˈɪsɪz dʒˈɑːnsən"],
|
||||
["MRS. Wilson", "mˈɪsɪz wˈɪlsən"],
|
||||
["Apples, oranges, etc.", "ˈæpəlz, ˈɔɹɪndʒᵻz, ɛtsˈɛtɹə"],
|
||||
["Apples, etc. Pears.", "ˈæpəlz, ɛtsˈɛtɹə. pˈɛɹz."],
|
||||
["Yeah", "jˈɛə"],
|
||||
["yeah", "jˈɛə"],
|
||||
["1990", "nˈaɪntiːn nˈaɪndi"],
|
||||
["12:34", "twˈɛlv θˈɜːɾi fˈoːɹ"],
|
||||
["2022s", "twˈɛnti twˈɛnti tˈuːz"],
|
||||
["1,000", "wˈʌn θˈaʊzənd"],
|
||||
["12,345,678", "twˈɛlv mˈɪliən θɹˈiː hˈʌndɹɪd fˈoːɹɾi fˈaɪv θˈaʊzənd sˈɪks hˈʌndɹɪd sˈɛvənti ˈeɪt"],
|
||||
["$100", "wˈʌn hˈʌndɹɪd dˈɑːlɚz"],
|
||||
["£1.50", "wˈʌn pˈaʊnd ænd fˈɪfti pˈɛns"],
|
||||
["12.34", "twˈɛlv pˈɔɪnt θɹˈiː fˈoːɹ"],
|
||||
["0.01", "zˈiəɹoʊ pˈɔɪnt zˈiəɹoʊ wˈʌn"],
|
||||
["10-20", "tˈɛn tə twˈɛnti"],
|
||||
["5-10", "fˈaɪv tə tˈɛn"],
|
||||
["10S", "tˈɛn ˈɛs"],
|
||||
["5S", "fˈaɪv ˈɛs"],
|
||||
["Cat's tail", "kˈæts tˈeɪl"],
|
||||
["X's mark", "ˈɛksᵻz mˈɑːɹk"],
|
||||
["U.S.A.", "jˈuːˈɛsˈeɪ."],
|
||||
["A.B.C", "ˈeɪbˈiːsˈiː"],
|
||||
]);
|
||||
|
||||
const B_TEST_CASES = new Map([
|
||||
["‘Hello’", "həlˈəʊ"],
|
||||
["‘Test’ and ‘Example’", "tˈɛst and ɛɡzˈampəl"],
|
||||
["«Bonjour»", '"bɔːnʒˈʊə"'],
|
||||
["«Test «nested» quotes»", '"tˈɛst "nˈɛstɪd" kwˈəʊts"'],
|
||||
["(Hello)", "«həlˈəʊ»"],
|
||||
["(Nested (Parentheses))", "«nˈɛstɪd «pəɹˈɛnθəsˌiːz»»"],
|
||||
["こんにちは、世界!", "dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə, tʃˈaɪniːzlˌɛtə tʃˈaɪniːzlˌɛtə!"],
|
||||
["これはテストです:はい?", "dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə: dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə?"],
|
||||
["Hello World", "həlˈəʊ wˈɜːld"],
|
||||
["Hello World", "həlˈəʊ wˈɜːld"],
|
||||
["Hello\n \nWorld", "həlˈəʊ wˈɜːld"],
|
||||
["Dr. Smith", "dˈɒktə smˈɪθ"],
|
||||
["DR. Brown", "dˈɒktə bɹˈaʊn"],
|
||||
["Mr. Smith", "mˈɪstə smˈɪθ"],
|
||||
["MR. Anderson", "mˈɪstəɹ ˈandəsən"],
|
||||
["Ms. Taylor", "mˈɪs tˈeɪlə"],
|
||||
["MS. Carter", "mˈɪs kˈɑːtə"],
|
||||
["Mrs. Johnson", "mˈɪsɪz dʒˈɒnsən"],
|
||||
["Apples, oranges, etc.", "ˈapəlz, ˈɒɹɪndʒɪz, ɛtsˈɛtɹə"],
|
||||
["Apples, etc. Pears.", "ˈapəlz, ɛtsˈɛtɹə. pˈeəz."],
|
||||
["1990", "nˈaɪntiːn nˈaɪnti"],
|
||||
["12:34", "twˈɛlv θˈɜːti fˈɔː"],
|
||||
["1,000", "wˈɒn θˈaʊzənd"],
|
||||
["12,345,678", "twˈɛlv mˈɪliən θɹˈiː hˈʌndɹɪdən fˈɔːti fˈaɪv θˈaʊzənd sˈɪks hˈʌndɹɪdən sˈɛvənti ˈeɪt"],
|
||||
["$100", "wˈɒn hˈʌndɹɪd dˈɒləz"],
|
||||
["£1.50", "wˈɒn pˈaʊnd and fˈɪfti pˈɛns"],
|
||||
["12.34", "twˈɛlv pˈɔɪnt θɹˈiː fˈɔː"],
|
||||
["0.01", "zˈiəɹəʊ pˈɔɪnt zˈiəɹəʊ wˈɒn"],
|
||||
["Cat's tail", "kˈats tˈeɪl"],
|
||||
["X's mark", "ˈɛksɪz mˈɑːk"],
|
||||
]);
|
||||
|
||||
// Runs every [input, expected] pair of both tables through `phonemize`.
describe("phonemize", () => {
  // Default language argument.
  describe("en-us", () => {
    A_TEST_CASES.forEach((expected, input) => {
      test(`phonemize("${input}")`, async () => {
        const actual = await phonemize(input);
        expect(actual).toEqual(expected);
      });
    });
  });

  // Language argument "b".
  describe("en-gb", () => {
    B_TEST_CASES.forEach((expected, input) => {
      test(`phonemize("${input}")`, async () => {
        const actual = await phonemize(input, "b");
        expect(actual).toEqual(expected);
      });
    });
  });
});
|
||||
@@ -1,654 +0,0 @@
|
||||
import { describe, test, expect } from "vitest";
|
||||
import { TextSplitterStream, split } from "../src/splitter.js";
|
||||
|
||||
const TESTS = [
|
||||
{
|
||||
name: "Basic sentence splitting",
|
||||
input: "This is a test. This is another test.",
|
||||
target: ["This is a test.", "This is another test."],
|
||||
},
|
||||
{
|
||||
name: "Sentence with dash (em dash)",
|
||||
input: "This is a test — yes, it is.",
|
||||
target: ["This is a test — yes, it is."],
|
||||
},
|
||||
{
|
||||
name: "Sentences with quoted speech",
|
||||
input: 'She said, "Hello there. How are you?". I replied, "I\'m fine."',
|
||||
target: ['She said, "Hello there. How are you?".', 'I replied, "I\'m fine."'],
|
||||
},
|
||||
{
|
||||
name: "Sentences with abbreviations",
|
||||
input: "Dr. Smith is here. At 10 a.m. I saw him.",
|
||||
target: ["Dr. Smith is here.", "At 10 a.m. I saw him."],
|
||||
},
|
||||
{
|
||||
name: "Advanced sentences with abbreviations",
|
||||
input: "I went to Dr. Smith this morning at 10 a.m. and said hi.",
|
||||
target: ["I went to Dr. Smith this morning at 10 a.m. and said hi."],
|
||||
},
|
||||
{
|
||||
name: "Abbreviations with possessive",
|
||||
input: "The Dr.'s office.",
|
||||
target: ["The Dr.'s office."],
|
||||
},
|
||||
{
|
||||
name: "Ellipses in sentences",
|
||||
input: "Wait... what just happened? I don't understand...",
|
||||
target: ["Wait... what just happened?", "I don't understand..."],
|
||||
},
|
||||
{
|
||||
name: "Sentences with numbers and decimals",
|
||||
input: "The price is $4.99. Do you want to buy it?",
|
||||
target: ["The price is $4.99.", "Do you want to buy it?"],
|
||||
},
|
||||
{
|
||||
name: "Sentences starting and ending with numbers",
|
||||
input: "10 people died in 2025. 20 people died in 2026.",
|
||||
target: ["10 people died in 2025.", "20 people died in 2026."],
|
||||
},
|
||||
{
|
||||
name: "Sentences with scientific notation",
|
||||
input: "The star is 3.2×10^4 light-years away.",
|
||||
target: ["The star is 3.2×10^4 light-years away."],
|
||||
},
|
||||
{
|
||||
name: "Sentences with multiple punctuation marks",
|
||||
input: "What?! Are you serious?! This is crazy...",
|
||||
target: ["What?!", "Are you serious?!", "This is crazy..."],
|
||||
},
|
||||
{
|
||||
name: "Sentences with parentheses",
|
||||
input: "This is an example (which is quite useful). Do you agree?",
|
||||
target: ["This is an example (which is quite useful).", "Do you agree?"],
|
||||
},
|
||||
{
|
||||
name: "Nested sentences with parentheses",
|
||||
input: "This is an example (This is pretty cool. Another sentence). Do you agree?",
|
||||
target: ["This is an example (This is pretty cool. Another sentence).", "Do you agree?"],
|
||||
},
|
||||
{
|
||||
name: "Sentences with newlines",
|
||||
input: "First sentence.\nSecond sentence.\nThird sentence.",
|
||||
target: ["First sentence.", "Second sentence.", "Third sentence."],
|
||||
},
|
||||
{
|
||||
name: "Sentences with emojis",
|
||||
input: "I love pizza! 🍕 Do you? 😊",
|
||||
target: ["I love pizza!", "🍕 Do you?", "😊"],
|
||||
},
|
||||
{
|
||||
name: "Sentences with unicode and non-Latin characters",
|
||||
input: "これはテストです。 次の文です。",
|
||||
target: ["これはテストです。", "次の文です。"],
|
||||
},
|
||||
{
|
||||
name: "Sentences with bullet points",
|
||||
input: "- First point.\n- Second point.\n- Third point.",
|
||||
target: ["- First point.", "- Second point.", "- Third point."],
|
||||
},
|
||||
{
|
||||
name: "Sentences with email addresses",
|
||||
input: "My email is test@example.com. Contact me!",
|
||||
target: ["My email is test@example.com.", "Contact me!"],
|
||||
},
|
||||
{
|
||||
name: "Sentences with URLs",
|
||||
input: "Visit https://example.com. It's a great site!",
|
||||
target: ["Visit https://example.com.", "It's a great site!"],
|
||||
},
|
||||
{
|
||||
name: "Sentences with URLs (subdomains)",
|
||||
input: "Visit https://test.example.com. It's a great site!",
|
||||
target: ["Visit https://test.example.com.", "It's a great site!"],
|
||||
},
|
||||
{
|
||||
name: "Sentences with trailing spaces",
|
||||
input: " This is a sentence. Another one. ",
|
||||
target: ["This is a sentence.", "Another one."],
|
||||
},
|
||||
{
|
||||
name: "Sentences with contractions",
|
||||
input: "You can't be serious. It's too late.",
|
||||
target: ["You can't be serious.", "It's too late."],
|
||||
},
|
||||
{
|
||||
name: "Sentences with title case and proper nouns",
|
||||
input: "Mr. Johnson went to New York. He loves it there.",
|
||||
target: ["Mr. Johnson went to New York.", "He loves it there."],
|
||||
},
|
||||
{
|
||||
name: "Sentences with mixed cases",
|
||||
input: "i am happy. Are you?",
|
||||
target: ["i am happy.", "Are you?"],
|
||||
},
|
||||
{
|
||||
name: "Sentences with missing punctuation",
|
||||
input: "This is a test without punctuation What should happen",
|
||||
target: ["This is a test without punctuation What should happen"],
|
||||
},
|
||||
{
|
||||
name: "Sentences with mixed symbols",
|
||||
input: "Hello @John! How's it going? #excited",
|
||||
target: ["Hello @John!", "How's it going?", "#excited"],
|
||||
},
|
||||
{
|
||||
name: "Sentences with math expressions",
|
||||
input: "The result is 3.14. It's an approximation of pi.",
|
||||
target: ["The result is 3.14.", "It's an approximation of pi."],
|
||||
},
|
||||
{
|
||||
name: "Excessive punctuation",
|
||||
input: "Wait!!!! Are you sure??? This is insane!!! Right???",
|
||||
target: ["Wait!!!!", "Are you sure???", "This is insane!!!", "Right???"],
|
||||
},
|
||||
{
|
||||
name: "Mixed languages in one line",
|
||||
input: "English sentence. 这是一句中文? Another English sentence!",
|
||||
target: ["English sentence.", "这是一句中文?", "Another English sentence!"],
|
||||
},
|
||||
{
|
||||
name: "Sequence of punctuation plus emoji",
|
||||
input: "What??! 🤯Wait?? Hello!",
|
||||
target: ["What??!", "🤯Wait??", "Hello!"],
|
||||
},
|
||||
{
|
||||
name: "Nested parentheses and quotes",
|
||||
input: '(This is "very (strange)" text). Right?',
|
||||
target: ['(This is "very (strange)" text).', "Right?"],
|
||||
},
|
||||
{
|
||||
name: "Sentence with ellipsis following a question mark",
|
||||
input: "Are you coming? ... I don't know.",
|
||||
target: ["Are you coming?", "... I don't know."],
|
||||
},
|
||||
{
|
||||
name: "Sentence with mixed punctuation marks (colon, comma, question mark)",
|
||||
input: "What do you think: Is this the answer, or not?",
|
||||
target: ["What do you think: Is this the answer, or not?"],
|
||||
},
|
||||
{
|
||||
name: "Sentence with parentheses and question mark",
|
||||
input: "Did you understand (after all)?",
|
||||
target: ["Did you understand (after all)?"],
|
||||
},
|
||||
{
|
||||
name: "Sentence with repeated punctuation marks (exclamation)",
|
||||
input: "What a great day!!! This is amazing!!!",
|
||||
target: ["What a great day!!!", "This is amazing!!!"],
|
||||
},
|
||||
{
|
||||
name: "Sentence with multiple short sentences and abbreviations",
|
||||
input: "Dr. Lee is busy. Mr. Brown is in a meeting.",
|
||||
target: ["Dr. Lee is busy.", "Mr. Brown is in a meeting."],
|
||||
},
|
||||
{
|
||||
name: "Sentence with only emojis",
|
||||
input: "🍕🍔🍟🍦",
|
||||
target: ["🍕🍔🍟🍦"],
|
||||
},
|
||||
{
|
||||
name: "Sentence with single quotes around a word",
|
||||
input: "The word 'apple' is red.",
|
||||
target: ["The word 'apple' is red."],
|
||||
},
|
||||
{
|
||||
name: "Sentence with an email and a period",
|
||||
input: "My email is example@domain.com. Please contact me.",
|
||||
target: ["My email is example@domain.com.", "Please contact me."],
|
||||
},
|
||||
{
|
||||
name: "Sentence with non-standard punctuation (pipe)",
|
||||
input: "This | is | a | test.",
|
||||
target: ["This | is | a | test."],
|
||||
},
|
||||
{
|
||||
name: "Sentence with a URL and a period after it",
|
||||
input: "You can find more info at https://www.website.com. It’s reliable.",
|
||||
target: ["You can find more info at https://www.website.com.", "It’s reliable."],
|
||||
},
|
||||
{
|
||||
name: "Sentence with multiple hashtags",
|
||||
input: "I love coding! #developer #javascript #testing",
|
||||
target: ["I love coding!", "#developer #javascript #testing"],
|
||||
},
|
||||
{
|
||||
name: "Sentence with numbers and currency",
|
||||
input: "I have $99.99 in my wallet. It's not enough.",
|
||||
target: ["I have $99.99 in my wallet.", "It's not enough."],
|
||||
},
|
||||
{
|
||||
name: "Sentence with mixed punctuation marks and parentheses",
|
||||
input: "Are you sure (really)? I don't think so!",
|
||||
target: ["Are you sure (really)?", "I don't think so!"],
|
||||
},
|
||||
{
|
||||
name: "Sentence with parentheses and ellipses",
|
||||
input: "This is a test (and it’s great)... seriously.",
|
||||
target: ["This is a test (and it’s great)... seriously."],
|
||||
},
|
||||
{
|
||||
name: "Sentence with an uncommon abbreviation",
|
||||
input: "The event is scheduled for noon PST. I’ll be there.",
|
||||
target: ["The event is scheduled for noon PST.", "I’ll be there."],
|
||||
},
|
||||
{
|
||||
name: "Sentence with a phone number",
|
||||
input: "Call me at 555-1234. Or email me at example@domain.com.",
|
||||
target: ["Call me at 555-1234.", "Or email me at example@domain.com."],
|
||||
},
|
||||
{
|
||||
name: "Sentence with nested punctuation (quotes inside quotes)",
|
||||
input: 'He said, "It\'s a test," and left.',
|
||||
target: ['He said, "It\'s a test," and left.'],
|
||||
},
|
||||
{
|
||||
name: "Sentences only containing a quotation",
|
||||
input: `"It's not like I'm using," Case heard someone say, as he shouldered his way through the crowd around the door of the Chat. "It's like my body's developed this massive drug deficiency."\nThis is a test.`,
|
||||
target: [`"It's not like I'm using," Case heard someone say, as he shouldered his way through the crowd around the door of the Chat.`, `"It's like my body's developed this massive drug deficiency."`, "This is a test."],
|
||||
},
|
||||
{
|
||||
name: "Sentence with a URL containing a question mark",
|
||||
input: "Visit https://www.example.com?query=test. It’s useful.",
|
||||
target: ["Visit https://www.example.com?query=test.", "It’s useful."],
|
||||
},
|
||||
{
|
||||
name: "Sentence with mixed punctuation and commas",
|
||||
input: "Hello, how are you? I'm fine, thanks.",
|
||||
target: ["Hello, how are you?", "I'm fine, thanks."],
|
||||
},
|
||||
{
|
||||
name: "Sentence with a comma before 'and'",
|
||||
input: "I like ice cream, and I like cake.",
|
||||
target: ["I like ice cream, and I like cake."],
|
||||
},
|
||||
{
|
||||
name: "Sentence with capital letters inside parentheses",
|
||||
input: "I went to the store (THE BIG ONE).",
|
||||
target: ["I went to the store (THE BIG ONE)."],
|
||||
},
|
||||
{
|
||||
name: "Sentence with dates and periods",
|
||||
input: "The event is on January 1st. It's a new year.",
|
||||
target: ["The event is on January 1st.", "It's a new year."],
|
||||
},
|
||||
{
|
||||
name: "Sentence with suffixes and periods",
|
||||
input: "Kokoro.js is powered by Transformers.js, a JavaScript library by Hugging Face.",
|
||||
target: ["Kokoro.js is powered by Transformers.js, a JavaScript library by Hugging Face."],
|
||||
},
|
||||
{
|
||||
name: "Non-splitting after a period",
|
||||
input: "Pi is 3.14 i.e., a mathematical constant. J.R.R. Tolkien wrote The Lord of the Rings. Wait... what? The files are /path/to/file.txt, VIDEO.MP4 and image.jpg.",
|
||||
target: ["Pi is 3.14 i.e., a mathematical constant.", "J.R.R. Tolkien wrote The Lord of the Rings.", "Wait... what?", "The files are /path/to/file.txt, VIDEO.MP4 and image.jpg."],
|
||||
},
|
||||
{
|
||||
name: "Long text with multiple sentences",
|
||||
input: `The sky above the port was the color of television, tuned to a dead channel.\n"It's not like I'm using," Case heard someone say, as he shouldered his way through the crowd around the door of the Chat. "It's like my body's developed this massive drug deficiency."\nIt was a Sprawl voice and a Sprawl joke. The Chatsubo was a bar for professional expatriates; you could drink there for a week and never hear two words in Japanese.\nThese were to have an enormous impact, not only because they were associated with Constantine, but also because, as in so many other areas, the decisions taken by Constantine (or in his name) were to have great significance for centuries to come. One of the main issues was the shape that Christian churches were to take, since there was not, apparently, a tradition of monumental church buildings when Constantine decided to help the Christian church build a series of truly spectacular structures. The main form that these churches took was that of the basilica, a multipurpose rectangular structure, based ultimately on the earlier Greek stoa, which could be found in most of the great cities of the empire. Christianity, unlike classical polytheism, needed a large interior space for the celebration of its religious services, and the basilica aptly filled that need. We naturally do not know the degree to which the emperor was involved in the design of new churches, but it is tempting to connect this with the secular basilica that Constantine completed in the Roman forum (the so-called Basilica of Maxentius) and the one he probably built in Trier, in connection with his residence in the city at a time when he was still caesar.`,
|
||||
target: [
|
||||
"The sky above the port was the color of television, tuned to a dead channel.",
|
||||
"\"It's not like I'm using,\" Case heard someone say, as he shouldered his way through the crowd around the door of the Chat.",
|
||||
"\"It's like my body's developed this massive drug deficiency.\"",
|
||||
"It was a Sprawl voice and a Sprawl joke.",
|
||||
"The Chatsubo was a bar for professional expatriates; you could drink there for a week and never hear two words in Japanese.",
|
||||
"These were to have an enormous impact, not only because they were associated with Constantine, but also because, as in so many other areas, the decisions taken by Constantine (or in his name) were to have great significance for centuries to come.",
|
||||
"One of the main issues was the shape that Christian churches were to take, since there was not, apparently, a tradition of monumental church buildings when Constantine decided to help the Christian church build a series of truly spectacular structures.",
|
||||
"The main form that these churches took was that of the basilica, a multipurpose rectangular structure, based ultimately on the earlier Greek stoa, which could be found in most of the great cities of the empire.",
|
||||
"Christianity, unlike classical polytheism, needed a large interior space for the celebration of its religious services, and the basilica aptly filled that need.",
|
||||
"We naturally do not know the degree to which the emperor was involved in the design of new churches, but it is tempting to connect this with the secular basilica that Constantine completed in the Roman forum (the so-called Basilica of Maxentius) and the one he probably built in Trier, in connection with his residence in the city at a time when he was still caesar.",
|
||||
],
|
||||
},
|
||||
];
|
||||
|
||||
// Tests adapted from https://github.com/textlint-rule/sentence-splitter/blob/master/test/sentence-splitter-test.ts
|
||||
// Each fixture is `{ name, input, target }`: `input` is the raw text handed to the
// splitter and `target` is the exact list of sentences the suites expect back.
TESTS.push(
  // --- Inputs that must stay in one piece ---
  { name: "Basic sentence splitting", input: "text", target: ["text"] },
  { name: "Should not split number", input: "Temperature is 40.2 degrees.", target: ["Temperature is 40.2 degrees."] },
  { name: "Should not split in pair string with same mark", input: 'I hear "I\'m back to home." from radio.', target: ['I hear "I\'m back to home." from radio.'] },
  { name: "Should not split in pair string", input: "彼は「ココにある。」と言った。", target: ["彼は「ココにある。」と言った。"] },
  // Disabled fixture, kept for reference:
  // {
  //   name: "Should not split in pair string and correct after sentence",
  //   input: "彼は「ココにある。」と言った。だけではそれは違った。",
  //   target: ["彼は「ココにある。」と言った。", "だけではそれは違った。"],
  // },

  // --- Line breaks ---
  // NOTE(review): this input contains no line break, so it is identical to
  // "Basic sentence splitting" above; confirm whether a leading "\n" was intended.
  { name: "Should split by first line break", input: "text", target: ["text"] },
  { name: "Should split by last line break", input: "text\n", target: ["text"] },
  { name: "Should split by double line break", input: "text\n\ntext", target: ["text", "text"] },

  // --- Full-width period ---
  { name: "Should split by 。", input: "text。。text", target: ["text。。", "text"] },
  { name: "Should split by 。 and linebreak", input: "text。\ntext", target: ["text。", "text"] },

  // --- Period followed by whitespace ---
  { name: "Should split by . and whitespace", input: "1st text. 2nd text", target: ["1st text.", "2nd text"] },
  // NOTE(review): as written this input has a single space between the sentences,
  // the same as the fixture above; confirm whether several spaces were intended.
  { name: "Should split by multiple whitespaces", input: "1st text. 2nd text", target: ["1st text.", "2nd text"] },
  { name: "Should support start and end whitespace", input: " text. ", target: ["text."] },
  { name: "Should split by text, whitespaces, and newline", input: "1st text. \n 2nd text", target: ["1st text.", "2nd text"] },

  // --- Other terminators and list markers ---
  { name: "Should split by !?", input: "text!?text", target: ["text!?", "text"] },
  { name: "Should split by last 。", input: "text。", target: ["text。"] },
  { name: "Should not split numbered list", input: "1. 1st text.\n2. 2nd text.\n10. 10th text.", target: ["1. 1st text.", "2. 2nd text.", "10. 10th text."] },
);
|
||||
|
||||
// Tests adapted from https://github.com/wikimedia/sentencex-js/blob/main/test/en.test.js
|
||||
// Each fixture is `{ name, input, target }`: `input` is the raw text handed to the
// splitter and `target` is the exact list of sentences the suites expect back.
TESTS.push(
  // --- Titles and plain sentence boundaries ---
  { name: "Dr. title should not split", input: "This is Dr. Watson", target: ["This is Dr. Watson"] },
  { name: "Basic sentence split", input: "Roses Are Red. Violets Are Blue", target: ["Roses Are Red.", "Violets Are Blue"] },
  { name: "Exclamation and question split", input: "Hello! How are you?", target: ["Hello!", "How are you?"] },
  { name: "Simple period split", input: "This is a test.", target: ["This is a test."] },
  { name: "Mr. title should not split", input: "Mr. Smith went to Washington.", target: ["Mr. Smith went to Washington."] },
  { name: "Words ending in title-like suffixes should split", input: "He hit the drums. Then he hit the cymbals.", target: ["He hit the drums.", "Then he hit the cymbals."] },
  // The "suprise" spelling is part of the fixture data and is kept unchanged.
  { name: "Surprise sentence should not split", input: "What a suprise?!", target: ["What a suprise?!"] },
  { name: "Ellipsis should not split", input: "That's all folks...", target: ["That's all folks..."] },

  // --- Line breaks ---
  { name: "Single line break should split", input: "First line\nSecond line", target: ["First line", "Second line"] },
  { name: "Double line break should split", input: "First line\nSecond line\n\nThird line", target: ["First line", "Second line", "Third line"] },

  // --- Abbreviations, initials and amounts ---
  // Title corrected: it used to read "Abbreviations should not split", but the
  // expectation below does split after "UK.", so the old title said the opposite
  // of what is asserted.
  { name: "Sentence ending in an uppercase abbreviation should split", input: "This is UK. Not US", target: ["This is UK.", "Not US"] },
  { name: "Dollar amount should not split", input: "This balloon costs $1.20", target: ["This balloon costs $1.20"] },
  { name: "Basic multiple sentence split", input: "Hello World. My name is Jonas.", target: ["Hello World.", "My name is Jonas."] },
  { name: "Basic question and sentence split", input: "What is your name? My name is Jonas.", target: ["What is your name?", "My name is Jonas."] },
  { name: "Exclamation and period split", input: "There it is! I found it.", target: ["There it is!", "I found it."] },
  { name: "Middle initial should not split", input: "My name is Jonas E. Smith.", target: ["My name is Jonas E. Smith."] },
  { name: "Page reference should not split", input: "Please turn to p. 55.", target: ["Please turn to p. 55."] },
  { name: "Co. abbreviation should not split", input: "Were Jane and co. at the party?", target: ["Were Jane and co. at the party?"] },
  { name: "Business name should not split", input: "They closed the deal with Pitt, Briggs & Co. at noon.", target: ["They closed the deal with Pitt, Briggs & Co. at noon."] },
  { name: "Mount abbreviation should not split", input: "I can see Mt. Fuji from here.", target: ["I can see Mt. Fuji from here."] },
  { name: "Saint abbreviation should not split", input: "St. Michael's Church is on 5th st. near the light.", target: ["St. Michael's Church is on 5th st. near the light."] },
  { name: "JFK Jr. should not split", input: "That is JFK Jr.'s book.", target: ["That is JFK Jr.'s book."] },
  { name: "Country abbreviation should not split", input: "I visited the U.S.A. last year.", target: ["I visited the U.S.A. last year."] },
  { name: "Dollar amount with period split", input: "She has $100.00. It is in her bag.", target: ["She has $100.00.", "It is in her bag."] },

  // --- Emails and URLs: the dotted token stays intact, the following sentence is separate ---
  { name: "Email should not split", input: "Her email is Jane.Doe@example.com. I sent her an email.", target: ["Her email is Jane.Doe@example.com.", "I sent her an email."] },
  { name: "URL should not split", input: "The site is, https://www.example.50.com/new-site/awesome_content.html. Please check it out.", target: ["The site is, https://www.example.50.com/new-site/awesome_content.html.", "Please check it out."] },
  // Disabled fixture, kept for reference:
  // {
  //   name: "Yahoo! should not split",
  //   input: "She works at Yahoo! in the accounting department.",
  //   target: ["She works at Yahoo! in the accounting department."],
  // },

  // --- Repeated / mixed terminators ---
  { name: "Multiple exclamations should split", input: "Hello!! Long time no see.", target: ["Hello!!", "Long time no see."] },
  { name: "Mixed punctuation should split", input: "Hello?! Is that you?", target: ["Hello?!", "Is that you?"] },
  // Disabled fixture, kept for reference:
  // {
  //   name: "Numbered reference should not split",
  //   input: "Saint Maximus (died 250) is a Christian saint and martyr.[1] The emperor Decius published a decree ordering the veneration of busts of the deified emperors.",
  //   target: ["Saint Maximus (died 250) is a Christian saint and martyr.[1]", "The emperor Decius published a decree ordering the veneration of busts of the deified emperors."],
  // },
);
|
||||
|
||||
// Fixtures for the "streaming" suite. `input` is an ordered list of text chunks
// that the suite pushes into a TextSplitterStream before closing it; `target` is
// the list of sentences expected from the stream.
const STREAMED_TESTS = [
  {
    // The "(chunked)" suffix keeps this title distinct from the TESTS fixture
    // called "Basic sentence splitting", which runs in the same "streaming"
    // suite; identical titles would make a failure report ambiguous.
    name: "Basic sentence splitting (chunked)",
    // Chunk boundaries fall mid-sentence and inside the "$1.99" amount.
    input: [
      "I went",
      " to the",
      " store. I",
      " bought an apple for $1.",
      "99. It was",
      " a good deal.",
    ],
    target: [
      "I went to the store.",
      "I bought an apple for $1.99.",
      "It was a good deal.",
    ],
  },
  {
    name: "URL with query parameters",
    // The URL is cut around its dots and before the "?" of the query string.
    input: ["Visit https://www", ".example.", "com", "?query=test."],
    target: ["Visit https://www.example.com?query=test."],
  },
];
|
||||
|
||||
describe("Sentence splitting", () => {
  // One-shot API: each fixture's whole input string goes through `split` in a single call.
  describe("synchronous", () => {
    TESTS.forEach(({ name, input, target }) => {
      test(name, () => {
        expect(split(input)).toEqual(target);
      });
    });
  });

  // Iterating a TextSplitterStream directly, first with `for ... of`, then with `for await ... of`.
  describe("for loop", () => {
    test("synchronous for ... of", () => {
      const stream = new TextSplitterStream();

      // Reads everything the stream currently yields through synchronous iteration.
      const drain = () => {
        const collected = [];
        for (const piece of stream) {
          collected.push(piece);
        }
        return collected;
      };

      // Round 1: initial text made of two complete sentences.
      stream.push("Hello, how are you? I'm fine, thanks.");
      expect(drain()).toEqual(["Hello, how are you?", "I'm fine, thanks."]);

      // Round 2: the unfinished trailing fragment is expected to be yielded as well.
      stream.push("This is a test. This is unfinish-");
      expect(drain()).toEqual(["This is a test.", "This is unfinish-"]);

      // Round 3: the late remainder is expected on its own, not re-joined with round 2.
      stream.push("ed.");
      expect(drain()).toEqual(["ed."]);
    });

    test("asynchronous for ... of", async () => {
      const stream = new TextSplitterStream();

      // Initial text, pushed before the consumer starts.
      stream.push("Hello, how are");

      // Start consuming right away; the loop is expected to finish only after close().
      const received = [];
      const consumer = (async () => {
        for await (const piece of stream) {
          received.push(piece);
        }
      })();

      // Remaining chunks arrive on timers, in order, followed by the close.
      const schedule = [
        [10, " you? I'm fine, thanks."],
        [20, " This is a test. This is unfinish-"],
        [30, "ed."],
      ];
      for (const [delayMs, chunk] of schedule) {
        setTimeout(() => {
          stream.push(chunk);
        }, delayMs);
      }
      setTimeout(() => {
        stream.close();
      }, 40);

      await consumer;
      expect(received).toEqual(["Hello, how are you?", "I'm fine, thanks.", "This is a test.", "This is unfinish-ed."]);
    });
  });

  // Chunked input: all chunks are pushed, the stream is closed, and the async consumer's output is compared.
  describe("streaming", () => {
    // The most extreme chunking: every TESTS input is fed one character at a time.
    const perCharacter = TESTS.map(({ name, input, target }) => ({ name, input: [...input], target }));
    // Pre-defined chunked fixtures run first, then the per-character variants.
    const cases = STREAMED_TESTS.concat(perCharacter);

    cases.forEach(({ name, input, target }) => {
      test(name, async () => {
        const stream = new TextSplitterStream();

        const received = [];
        const consumer = (async () => {
          for await (const piece of stream) {
            received.push(piece);
          }
        })();

        stream.push(...input);
        stream.close();

        await consumer;
        expect(received).toEqual(target);
      });
    });
  });
});
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"include": ["src/**/*"],
|
||||
"compilerOptions": {
|
||||
"checkJs": true,
|
||||
"target": "esnext",
|
||||
"module": "nodenext",
|
||||
"moduleResolution": "nodenext",
|
||||
"outDir": "types",
|
||||
"strict": false,
|
||||
"skipLibCheck": true,
|
||||
"declaration": true,
|
||||
"declarationMap": true,
|
||||
"noEmit": false,
|
||||
"emitDeclarationOnly": true
|
||||
}
|
||||
}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user