Create Kokoro TTS JavaScript library (#3)
* Set up JS project * Finalise JS library * Update README * Fix package.json repository url * Rename package -> `kokoro-js` * Fix samples in README * Cleanup README * Bump `phonemizer` version * Create web demo * Run prettier * Link to model used in demo * Enable multithreading in HF space demo (~40% faster) * Add link to demo in README * Bump to v1.0.1
This commit is contained in:
90
kokoro.js/src/kokoro.js
Normal file
90
kokoro.js/src/kokoro.js
Normal file
@@ -0,0 +1,90 @@
|
||||
import { StyleTextToSpeech2Model, AutoTokenizer, Tensor, RawAudio } from "@huggingface/transformers";
|
||||
import { phonemize } from "./phonemize.js";
|
||||
import { getVoiceData, VOICES } from "./voices.js";
|
||||
|
||||
// Number of float32 values in a single voice style vector.
const STYLE_DIM = 256;

// Sampling rate (in Hz) attached to the generated audio.
const SAMPLE_RATE = 24000;
|
||||
|
||||
export class KokoroTTS {
  /**
   * Create a new KokoroTTS instance.
   * @param {import('@huggingface/transformers').StyleTextToSpeech2Model} model The model
   * @param {import('@huggingface/transformers').PreTrainedTokenizer} tokenizer The tokenizer
   */
  constructor(model, tokenizer) {
    this.model = model;
    this.tokenizer = tokenizer;
  }

  /**
   * Load a KokoroTTS model from the Hugging Face Hub.
   * @param {string} model_id The model id
   * @param {Object} options Additional options
   * @param {"fp32"|"fp16"|"q8"|"q4"|"q4f16"} [options.dtype="fp32"] The data type to use.
   * @param {"wasm"|"webgpu"|"cpu"|null} [options.device=null] The device to run the model on.
   * @param {import("@huggingface/transformers").ProgressCallback} [options.progress_callback=null] A callback function that is called with progress information.
   * @returns {Promise<KokoroTTS>} The loaded model
   */
  static async from_pretrained(model_id, { dtype = "fp32", device = null, progress_callback = null } = {}) {
    // Start both loads before awaiting either, so they proceed in parallel.
    const model = StyleTextToSpeech2Model.from_pretrained(model_id, { progress_callback, dtype, device });
    const tokenizer = AutoTokenizer.from_pretrained(model_id, { progress_callback });

    const info = await Promise.all([model, tokenizer]);
    return new KokoroTTS(...info);
  }

  /**
   * The table of available voices, keyed by voice id.
   */
  get voices() {
    return VOICES;
  }

  /**
   * Print the table of available voices to the console.
   */
  list_voices() {
    console.table(VOICES);
  }

  /**
   * Generate audio from text.
   *
   * Note: The model and tokenizer are the ones passed to the constructor (see `from_pretrained`);
   * only the style data for the requested voice is fetched here, via `getVoiceData`.
   * @param {string} text The input text
   * @param {Object} options Additional options
   * @param {keyof typeof VOICES} [options.voice="af"] The voice style to use
   * @param {number} [options.speed=1] The speaking speed
   * @returns {Promise<RawAudio>} The generated audio
   * @throws {Error} If `options.voice` is not a key of `VOICES`.
   */
  async generate(text, { voice = "af", speed = 1 } = {}) {
    if (!Object.hasOwn(VOICES, voice)) {
      console.error(`Voice "${voice}" not found. Available voices:`);
      console.table(VOICES);
      throw new Error(`Voice "${voice}" not found. Should be one of: ${Object.keys(VOICES).join(", ")}.`);
    }

    // The first character of the voice id selects the phonemizer language.
    const language = voice.at(0); // "a" or "b"
    const phonemes = await phonemize(text, language);
    const { input_ids } = this.tokenizer(phonemes, {
      truncation: true,
    });

    // Select voice style based on number of input tokens.
    // Two tokens are excluded from the count (presumably the start/end markers added by the
    // tokenizer — confirm against the tokenizer config).
    const num_tokens = Math.max(input_ids.dims.at(-1) - 2, 0);

    // Load voice style: a flat array of consecutive STYLE_DIM-sized vectors.
    const data = await getVoiceData(voice);

    // Clamp to the last available vector so an unexpectedly long input cannot produce a
    // short (or empty) slice, which would fail later with an opaque tensor-shape error.
    const num_styles = Math.floor(data.length / STYLE_DIM);
    const style_index = Math.min(num_tokens, Math.max(num_styles - 1, 0));
    const offset = style_index * STYLE_DIM;
    const voiceData = data.slice(offset, offset + STYLE_DIM);

    // Prepare model inputs
    const inputs = {
      input_ids,
      style: new Tensor("float32", voiceData, [1, STYLE_DIM]),
      speed: new Tensor("float32", [speed], [1]),
    };

    // Generate audio
    const { waveform } = await this.model(inputs);

    return new RawAudio(waveform.data, SAMPLE_RATE);
  }
}
|
||||
197
kokoro.js/src/phonemize.js
Normal file
197
kokoro.js/src/phonemize.js
Normal file
@@ -0,0 +1,197 @@
|
||||
import { phonemize as espeakng } from "phonemizer";
|
||||
|
||||
/**
 * Helper function to split a string on a regex, but keep the delimiters.
 * This is required, because the JavaScript `.split()` method does not keep the delimiters,
 * and wrapping in a capturing group causes issues with existing capturing groups (due to nesting).
 *
 * The whole of `text` is always scanned: the caller's regex is not mutated, its `lastIndex`
 * is ignored, and a regex without the `g` flag is treated as if it had it.
 * @param {string} text The text to split.
 * @param {RegExp} regex The regex to split on.
 * @returns {{match: boolean; text: string}[]} The split string, in order. Delimiters have `match: true`.
 */
function split(text, regex) {
  // `matchAll` throws on non-global patterns and starts scanning at `regex.lastIndex`,
  // so work on a fresh global copy (whose `lastIndex` is 0).
  const flags = regex.flags.includes("g") ? regex.flags : `${regex.flags}g`;
  const pattern = new RegExp(regex.source, flags);

  const result = [];
  let prev = 0;
  for (const match of text.matchAll(pattern)) {
    const fullMatch = match[0];
    if (prev < match.index) {
      // Text between the previous delimiter and this one.
      result.push({ match: false, text: text.slice(prev, match.index) });
    }
    if (fullMatch.length > 0) {
      result.push({ match: true, text: fullMatch });
    }
    prev = match.index + fullMatch.length;
  }
  if (prev < text.length) {
    // Trailing text after the last delimiter.
    result.push({ match: false, text: text.slice(prev) });
  }
  return result;
}
|
||||
|
||||
/**
 * Helper function to split numbers into phonetic equivalents.
 *
 * Handles three shapes of input:
 *  - decimals (contain "."): returned unchanged;
 *  - times (contain ":"): "10:00" -> "10 o'clock", "9:05" -> "9 oh 5", "12:30" -> "12 30";
 *  - four-digit numbers with an optional trailing "s", read as years:
 *    "1905" -> "19 oh 5", "1900" -> "19 hundred", "1990s" -> "19 90s".
 *    Values below 1100, or whose last three digits are below 10 (e.g. "2005"), are returned unchanged.
 * @param {string} match The matched number
 * @returns {string} The phonetic equivalent
 */
function split_num(match) {
  if (match.includes(".")) {
    return match;
  }

  if (match.includes(":")) {
    const [h, m] = match.split(":").map(Number);
    if (m === 0) {
      return `${h} o'clock`;
    }
    if (m < 10) {
      return `${h} oh ${m}`;
    }
    return `${h} ${m}`;
  }

  const year = parseInt(match.slice(0, 4), 10);
  if (year < 1100 || year % 1000 < 10) {
    return match;
  }

  const left = match.slice(0, 2);
  const right = parseInt(match.slice(2, 4), 10);
  const suffix = match.endsWith("s") ? "s" : "";

  // `year % 1000` can never exceed 999, so only the lower bound needs checking.
  if (year % 1000 >= 100) {
    if (right === 0) {
      return `${left} hundred${suffix}`;
    }
    if (right < 10) {
      return `${left} oh ${right}${suffix}`;
    }
  }
  return `${left} ${right}${suffix}`;
}
|
||||
|
||||
/**
 * Helper function to format monetary values.
 *
 * Moves the currency from a leading symbol to a trailing word, e.g.
 * "$1" -> "1 dollar", "$5 million" -> "5 million dollars", "£3.20" -> "3 pounds and 20 pence".
 * A leading "$" means dollars/cents; any other leading character is treated as pounds/pence.
 * @param {string} match The matched currency (symbol followed by the amount)
 * @returns {string} The formatted currency
 */
function flip_money(match) {
  const isDollar = match[0] === "$";
  const amount = match.slice(1);
  const bill = isDollar ? "dollar" : "pound";

  // Not a plain number (e.g. "5 million"): keep the wording and pluralise the unit.
  if (Number.isNaN(Number(amount))) {
    return `${amount} ${bill}s`;
  }

  // Whole amount.
  if (!amount.includes(".")) {
    const suffix = amount === "1" ? "" : "s";
    return `${amount} ${bill}${suffix}`;
  }

  // Amount with a fractional part: a single fractional digit means tens ("2.5" -> 50).
  const [b, c] = amount.split(".");
  const d = parseInt(c.padEnd(2, "0"), 10);

  let coins;
  if (isDollar) {
    coins = d === 1 ? "cent" : "cents";
  } else {
    coins = d === 1 ? "penny" : "pence";
  }
  return `${b} ${bill}${b === "1" ? "" : "s"} and ${d} ${coins}`;
}
|
||||
|
||||
/**
 * Helper function to process decimal numbers.
 *
 * The integer part is kept as-is and the fractional digits are spelled out one by one,
 * e.g. "3.14" -> "3 point 1 4".
 * @param {string} match The matched number (must contain a ".")
 * @returns {string} The formatted number
 */
function point_num(match) {
  const [whole, fraction] = match.split(".");
  const digits = fraction.split("").join(" ");
  return `${whole} point ${digits}`;
}
|
||||
|
||||
/**
 * Normalize text for phonemization.
 *
 * Applies, in order: quote/bracket unification, replacement of CJK and full-width
 * punctuation with ASCII equivalents, whitespace clean-up, expansion of a few
 * abbreviations, number/currency verbalisation (via `split_num`, `flip_money`
 * and `point_num`), possessive handling, and initialism handling.
 * @param {string} text The text to normalize
 * @returns {string} The normalized text
 */
function normalize_text(text) {
  return (
    text
      // 1. Handle quotes and brackets
      .replace(/[‘’]/g, "'")
      .replace(/«/g, "“")
      .replace(/»/g, "”")
      .replace(/[“”]/g, '"')
      .replace(/\(/g, "«")
      .replace(/\)/g, "»")

      // 2. Replace uncommon punctuation marks.
      // The full-width forms are written as escapes so they cannot be confused with
      // (or silently converted to) their ASCII look-alikes: ASCII ":" and "," must be
      // left alone here for the time and thousands-separator rules in step 6.
      .replace(/、/g, ", ")
      .replace(/。/g, ". ")
      .replace(/\uFF01/g, "! ") // full-width exclamation mark
      .replace(/\uFF0C/g, ", ") // full-width comma
      .replace(/\uFF1A/g, ": ") // full-width colon
      .replace(/\uFF1B/g, "; ") // full-width semicolon
      .replace(/\uFF1F/g, "? ") // full-width question mark

      // 3. Whitespace normalization
      .replace(/[^\S \n]/g, " ")
      .replace(/ {2,}/g, " ") // collapse every run of spaces, not just the first
      .replace(/(?<=\n) +(?=\n)/g, "")

      // 4. Abbreviations
      .replace(/\bD[Rr]\.(?= [A-Z])/g, "Doctor")
      .replace(/\b(?:Mr\.|MR\.(?= [A-Z]))/g, "Mister")
      .replace(/\b(?:Ms\.|MS\.(?= [A-Z]))/g, "Miss")
      .replace(/\b(?:Mrs\.|MRS\.(?= [A-Z]))/g, "Mrs")
      .replace(/\betc\.(?! [A-Z])/gi, "etc")

      // 5. Normalize casual words
      .replace(/\b(y)eah?\b/gi, "$1e'a")

      // 6. Handle numbers and currencies
      .replace(/\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)/g, split_num)
      .replace(/(?<=\d),(?=\d)/g, "")
      .replace(/[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b/gi, flip_money)
      .replace(/\d*\.\d+/g, point_num)
      .replace(/(?<=\d)-(?=\d)/g, " to ")
      .replace(/(?<=\d)S/g, " S")

      // 7. Handle possessives
      .replace(/(?<=[BCDFGHJ-NP-TV-Z])'?s\b/g, "'S")
      .replace(/(?<=X')S\b/g, "s")

      // 8. Handle hyphenated words/letters
      .replace(/(?:[A-Za-z]\.){2,} [a-z]/g, (m) => m.replace(/\./g, "-"))
      .replace(/(?<=[A-Z])\.(?=[A-Z])/gi, "-")

      // 9. Strip leading and trailing whitespace
      .trim()
  );
}
|
||||
|
||||
/**
 * Escapes regular expression special characters from a string by replacing them with their escaped counterparts,
 * so the result can be embedded in a `RegExp` source and match the input literally.
 *
 * @param {string} string The string to escape.
 * @returns {string} The escaped string.
 */
function escapeRegExp(string) {
  return string.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string
}
|
||||
|
||||
// Characters that are kept verbatim in the phoneme output instead of being phonemized.
const PUNCTUATION = ';:,.!?¡¿—…"«»“”(){}[]';

// Matches one or more runs of punctuation together with any surrounding whitespace.
// Global, because it is used to find every such run in a text.
const PUNCTUATION_PATTERN = new RegExp(`(\\s*[${escapeRegExp(PUNCTUATION)}]+\\s*)+`, "g");
|
||||
|
||||
/**
 * Convert text into the phoneme string consumed by the tokenizer.
 *
 * Punctuation runs (as matched by `PUNCTUATION_PATTERN`) are copied through unchanged;
 * everything in between is phonemized and then post-processed.
 * @param {string} text The input text.
 * @param {string} [language="a"] Language selector: "a" uses the "en-us" phonemizer voice, anything else uses "en".
 * @param {boolean} [norm=true] Whether to run `normalize_text` on the input first.
 * @returns {Promise<string>} The phonemized text.
 */
export async function phonemize(text, language = "a", norm = true) {
  // 1. Normalize text
  if (norm) {
    text = normalize_text(text);
  }

  // 2. Split into chunks, to ensure we preserve punctuation
  const sections = split(text, PUNCTUATION_PATTERN);

  // 3. Convert each section to phonemes
  const lang = language === "a" ? "en-us" : "en";
  const pieces = await Promise.all(
    sections.map(async ({ match, text: chunk }) => {
      if (match) {
        // Punctuation is passed through untouched.
        return chunk;
      }
      return (await espeakng(chunk, lang)).join(" ");
    }),
  );
  const ps = pieces.join("");

  // 4. Post-process phonemes
  let processed = ps
    // https://en.wiktionary.org/wiki/kokoro#English
    .replace(/kəkˈoːɹoʊ/g, "kˈoʊkəɹoʊ")
    .replace(/kəkˈɔːɹəʊ/g, "kˈəʊkəɹəʊ")
    .replace(/ʲ/g, "j")
    .replace(/r/g, "ɹ")
    .replace(/x/g, "k")
    .replace(/ɬ/g, "l")
    .replace(/(?<=[a-zɹː])(?=hˈʌndɹɪd)/g, " ")
    .replace(/ z(?=[;:,.!?¡¿—…"«»“” ]|$)/g, "z");

  // 5. Additional post-processing for American English
  if (language === "a") {
    processed = processed.replace(/(?<=nˈaɪn)ti(?!ː)/g, "di");
  }
  return processed.trim();
}
|
||||
121
kokoro.js/src/voices.js
Normal file
121
kokoro.js/src/voices.js
Normal file
@@ -0,0 +1,121 @@
|
||||
import path from "path";
|
||||
import fs from "fs/promises";
|
||||
|
||||
// Available voices, keyed by voice id.
// NOTE: `Object.freeze` is shallow — the table itself cannot gain or lose voices,
// but the individual entries are not frozen.
export const VOICES = Object.freeze({
  af: {
    // Default voice is a 50-50 mix of Bella & Sarah
    name: "Default",
    language: "en-us",
    gender: "Female",
  },
  af_bella: {
    name: "Bella",
    language: "en-us",
    gender: "Female",
  },
  af_nicole: {
    name: "Nicole",
    language: "en-us",
    gender: "Female",
  },
  af_sarah: {
    name: "Sarah",
    language: "en-us",
    gender: "Female",
  },
  af_sky: {
    name: "Sky",
    language: "en-us",
    gender: "Female",
  },
  am_adam: {
    name: "Adam",
    language: "en-us",
    gender: "Male",
  },
  am_michael: {
    name: "Michael",
    language: "en-us",
    gender: "Male",
  },

  bf_emma: {
    name: "Emma",
    language: "en-gb",
    gender: "Female",
  },
  bf_isabella: {
    name: "Isabella",
    language: "en-gb",
    gender: "Female",
  },
  bm_george: {
    name: "George",
    language: "en-gb",
    gender: "Male",
  },
  bm_lewis: {
    name: "Lewis",
    language: "en-gb",
    gender: "Male",
  },
});
|
||||
|
||||
// Base URL from which voice files are downloaded when they cannot be read from disk.
const VOICE_DATA_URL = "https://huggingface.co/onnx-community/Kokoro-82M-ONNX/resolve/main/voices";

/**
 * Load the raw bytes of a voice file.
 *
 * When `fs.readFile` is available, the file is read from `../voices/<id>.bin` relative to
 * this module. Otherwise it is fetched from `VOICE_DATA_URL`, going through the
 * "kokoro-voices" cache when the `caches` API can be opened. Cache failures are
 * logged and otherwise ignored (best effort).
 * @param {keyof typeof VOICES} id The voice id.
 * @returns {Promise<ArrayBufferLike>} The contents of the voice file.
 * @throws {Error} If the file has to be downloaded and the server responds with a non-OK status.
 */
async function getVoiceFile(id) {
  if (fs?.readFile) {
    const file = path.resolve(import.meta.dirname ?? __dirname, `../voices/${id}.bin`);
    const { buffer, byteOffset, byteLength } = await fs.readFile(file);

    // A Node.js Buffer may be a view onto a larger ArrayBuffer, so only hand back
    // the backing store directly when it corresponds exactly to the file contents.
    if (byteOffset === 0 && byteLength === buffer.byteLength) {
      return buffer;
    }
    return buffer.slice(byteOffset, byteOffset + byteLength);
  }

  const url = `${VOICE_DATA_URL}/${id}.bin`;

  let cache;
  try {
    cache = await caches.open("kokoro-voices");
    const cachedResponse = await cache.match(url);
    if (cachedResponse) {
      return await cachedResponse.arrayBuffer();
    }
  } catch (e) {
    console.warn("Unable to open cache", e);
  }

  // No cache, or cache failed to open. Fetch the file.
  const response = await fetch(url);
  if (!response.ok) {
    // Fail loudly rather than caching an error page and treating it as voice data.
    throw new Error(`Unable to fetch voice data from "${url}": ${response.status} ${response.statusText}`);
  }
  const buffer = await response.arrayBuffer();

  if (cache) {
    try {
      // NOTE: We use `new Response(buffer, ...)` instead of `response.clone()` to handle LFS files
      await cache.put(
        url,
        new Response(buffer, {
          headers: response.headers,
        }),
      );
    } catch (e) {
      console.warn("Unable to cache file", e);
    }
  }

  return buffer;
}
|
||||
|
||||
// In-memory cache of voice data, keyed by voice id. Stores the (possibly still pending)
// load promise so that concurrent requests for the same voice share a single load.
const VOICE_CACHE = new Map();

/**
 * Get the style data for a voice as a `Float32Array`.
 *
 * The data is loaded with `getVoiceFile` on the first request for a given voice and
 * reused afterwards; every successful call for the same voice resolves to the same array.
 * A failed load is not cached, so a later call will try again.
 * @param {keyof typeof VOICES} voice The voice id.
 * @returns {Promise<Float32Array>} The voice data.
 */
export async function getVoiceData(voice) {
  let pending = VOICE_CACHE.get(voice);
  if (pending === undefined) {
    pending = (async () => new Float32Array(await getVoiceFile(voice)))();
    VOICE_CACHE.set(voice, pending);

    // Forget failed loads so that the next call can retry.
    pending.catch(() => VOICE_CACHE.delete(voice));
  }
  return pending;
}
|
||||
Reference in New Issue
Block a user