Kokoro.js v1.2.0: Streaming support (#92)
* Set up JS project * Finalise JS library * Update README * Fix package.json repository url * Rename package -> `kokoro-js` * Fix samples in README * Cleanup README * Bump `phonemizer` version * Create web demo * Run prettier * Link to model used in demo * Enable multithreading in HF space demo (~40% faster) * Add link to demo in README * Bump to v1.0.1 * Update voices * Update versions * Update phonemize JSDoc * Use updated voice pack * Update versions * Update demo (v1.0 & WebGPU support) * Update README * Enforce maximum number of tokens * Update README * [version] Update to 1.1.1 * Create simple sentence splitter * Update `npm run test` * Update API to use sync and async iterators * Add support for streamed generation in kokoro.js * Always split on newlines * Remove debug line * Improvements * Add more matching punctuation marks * Update comments * nits * Export TextSplitterStream too * Update splitter.js * Update README * [version] Update to 1.2.0
This commit is contained in:
@@ -1,10 +1,23 @@
|
||||
import { StyleTextToSpeech2Model, AutoTokenizer, Tensor, RawAudio } from "@huggingface/transformers";
|
||||
import { phonemize } from "./phonemize.js";
|
||||
import { TextSplitterStream } from "./splitter.js";
|
||||
import { getVoiceData, VOICES } from "./voices.js";
|
||||
|
||||
const STYLE_DIM = 256;
|
||||
const SAMPLE_RATE = 24000;
|
||||
|
||||
/**
|
||||
* @typedef {Object} GenerateOptions
|
||||
* @property {keyof typeof VOICES} [voice="af_heart"] The voice
|
||||
* @property {number} [speed=1] The speaking speed
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {Object} StreamProperties
|
||||
* @property {RegExp} [split_pattern] The pattern to split the input text. If unset, the default sentence splitter will be used.
|
||||
* @typedef {GenerateOptions & StreamProperties} StreamGenerateOptions
|
||||
*/
|
||||
|
||||
export class KokoroTTS {
|
||||
/**
|
||||
* Create a new KokoroTTS instance.
|
||||
@@ -47,34 +60,37 @@ export class KokoroTTS {
|
||||
console.table(VOICES);
|
||||
throw new Error(`Voice "${voice}" not found. Should be one of: ${Object.keys(VOICES).join(", ")}.`);
|
||||
}
|
||||
const language = /** @type {"a"|"b"} */ (voice.at(0)); // "a" or "b"
|
||||
return language;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate audio from text.
|
||||
*
|
||||
* @param {string} text The input text
|
||||
* @param {Object} options Additional options
|
||||
* @param {keyof typeof VOICES} [options.voice="af_heart"] The voice style to use
|
||||
* @param {number} [options.speed=1] The speaking speed
|
||||
* @param {GenerateOptions} options Additional options
|
||||
* @returns {Promise<RawAudio>} The generated audio
|
||||
*/
|
||||
async generate(text, { voice = "af_heart", speed = 1 } = {}) {
|
||||
this._validate_voice(voice);
|
||||
const language = this._validate_voice(voice);
|
||||
|
||||
const language = /** @type {"a"|"b"} */ (voice.at(0)); // "a" or "b"
|
||||
const phonemes = await phonemize(text, language);
|
||||
const { input_ids } = this.tokenizer(phonemes, {
|
||||
truncation: true,
|
||||
});
|
||||
|
||||
return this.generate_from_ids(input_ids, { voice, speed });
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate audio from input ids.
|
||||
* @param {Tensor} input_ids The input ids
|
||||
* @param {GenerateOptions} options Additional options
|
||||
* @returns {Promise<RawAudio>} The generated audio
|
||||
*/
|
||||
async generate_from_ids(input_ids, { voice = "af_heart", speed = 1 } = {}) {
|
||||
// Select voice style based on number of input tokens
|
||||
const num_tokens = Math.min(
|
||||
Math.max(
|
||||
input_ids.dims.at(-1) - 2,
|
||||
0,
|
||||
),
|
||||
509,
|
||||
);
|
||||
const num_tokens = Math.min(Math.max(input_ids.dims.at(-1) - 2, 0), 509);
|
||||
|
||||
// Load voice style
|
||||
const data = await getVoiceData(voice);
|
||||
@@ -90,7 +106,47 @@ export class KokoroTTS {
|
||||
|
||||
// Generate audio
|
||||
const { waveform } = await this.model(inputs);
|
||||
|
||||
return new RawAudio(waveform.data, SAMPLE_RATE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate audio from text in a streaming fashion.
|
||||
* @param {string|TextSplitterStream} text The input text
|
||||
* @param {StreamGenerateOptions} options Additional options
|
||||
* @returns {AsyncGenerator<{text: string, phonemes: string, audio: RawAudio}, void, void>}
|
||||
*/
|
||||
async *stream(text, { voice = "af_heart", speed = 1, split_pattern = null } = {}) {
|
||||
const language = this._validate_voice(voice);
|
||||
|
||||
/** @type {TextSplitterStream} */
|
||||
let splitter;
|
||||
if (text instanceof TextSplitterStream) {
|
||||
splitter = text;
|
||||
} else if (typeof text === "string") {
|
||||
splitter = new TextSplitterStream();
|
||||
const chunks = split_pattern
|
||||
? text
|
||||
.split(split_pattern)
|
||||
.map((chunk) => chunk.trim())
|
||||
.filter((chunk) => chunk.length > 0)
|
||||
: [text];
|
||||
splitter.push(...chunks);
|
||||
} else {
|
||||
throw new Error("Invalid input type. Expected string or TextSplitterStream.");
|
||||
}
|
||||
for await (const sentence of splitter) {
|
||||
const phonemes = await phonemize(sentence, language);
|
||||
const { input_ids } = this.tokenizer(phonemes, {
|
||||
truncation: true,
|
||||
});
|
||||
|
||||
// TODO: There may be some cases where - even with splitting - the text is too long.
|
||||
// In that case, we should split the text into smaller chunks and process them separately.
|
||||
// For now, we just truncate these exceptionally long chunks
|
||||
const audio = await this.generate_from_ids(input_ids, { voice, speed });
|
||||
yield { text: sentence, phonemes, audio };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
export { TextSplitterStream };
|
||||
|
||||
344
kokoro.js/src/splitter.js
Normal file
344
kokoro.js/src/splitter.js
Normal file
@@ -0,0 +1,344 @@
|
||||
/**
|
||||
* Returns true if the character is considered a sentence terminator.
|
||||
* This includes ASCII (".", "!", "?") and common Unicode terminators.
|
||||
* NOTE: We also include newlines here, as this is favourable for text-to-speech systems.
|
||||
* @param {string} c The character to test.
|
||||
* @param {boolean} [includeNewlines=true] Whether to treat newlines as terminators.
|
||||
* @returns {boolean}
|
||||
*/
|
||||
function isSentenceTerminator(c, includeNewlines = true) {
|
||||
return ".!?…。?!".includes(c) || (includeNewlines && c === "\n");
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the character should be attached to the sentence terminator,
|
||||
* such as closing quotes or brackets.
|
||||
* @param {string} c The character to test.
|
||||
* @returns {boolean}
|
||||
*/
|
||||
function isTrailingChar(c) {
|
||||
return "\"')]}」』".includes(c);
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts a token (a contiguous sequence of non–whitespace characters)
|
||||
* from the buffer starting at the given index.
|
||||
* @param {string} buffer The input text.
|
||||
* @param {number} start The starting index.
|
||||
* @returns {string} The extracted token.
|
||||
*/
|
||||
function getTokenFromBuffer(buffer, start) {
|
||||
let end = start;
|
||||
while (end < buffer.length && !/\s/.test(buffer[end])) {
|
||||
++end;
|
||||
}
|
||||
return buffer.substring(start, end);
|
||||
}
|
||||
|
||||
// List of common abbreviations. Note that strings with single letters joined by periods
|
||||
// (e.g., "i.e", "e.g", "u.s.a", "u.s") are handled separately.
|
||||
const ABBREVIATIONS = new Set(["mr", "mrs", "ms", "dr", "prof", "sr", "jr", "sgt", "col", "gen", "rep", "sen", "gov", "lt", "maj", "capt", "st", "mt", "etc", "co", "inc", "ltd", "dept", "vs", "p", "pg", "jan", "feb", "mar", "apr", "jun", "jul", "aug", "sep", "sept", "oct", "nov", "dec", "sun", "mon", "tu", "tue", "tues", "wed", "th", "thu", "thur", "thurs", "fri", "sat"]);
|
||||
|
||||
/**
|
||||
* Determines if the given token (or series of initials) is a known abbreviation.
|
||||
* @param {string} token The token to check.
|
||||
* @returns {boolean}
|
||||
*/
|
||||
function isAbbreviation(token) {
|
||||
// Remove possessive endings and trailing periods.
|
||||
token = token.replace(/['’]s$/i, "").replace(/\.+$/, "");
|
||||
return ABBREVIATIONS.has(token.toLowerCase());
|
||||
}
|
||||
|
||||
// Map of closing punctuation to their corresponding opening punctuation.
|
||||
const MATCHING = new Map([
|
||||
[")", "("],
|
||||
["]", "["],
|
||||
["}", "{"],
|
||||
["》", "《"],
|
||||
["〉", "〈"],
|
||||
["›", "‹"],
|
||||
["»", "«"],
|
||||
["〉", "〈"],
|
||||
["」", "「"],
|
||||
["』", "『"],
|
||||
["〕", "〔"],
|
||||
["】", "【"],
|
||||
]);
|
||||
// Set of opening punctuation characters.
|
||||
const OPENING = new Set(MATCHING.values());
|
||||
|
||||
/**
|
||||
* Updates the nesting stack to track quotes and paired punctuation.
|
||||
* This supports both standard (", ', (), [], {}) and Japanese quotes (「」「』『』).
|
||||
* (An apostrophe between letters is ignored so that contractions remain intact.)
|
||||
* @param {string} c The current character.
|
||||
* @param {string[]} stack The current nesting stack.
|
||||
* @param {number} i The index of the character in the buffer.
|
||||
* @param {string} buffer The full text being processed.
|
||||
*/
|
||||
function updateStack(c, stack, i, buffer) {
|
||||
// Handle standard quotes.
|
||||
if (c === '"' || c === "'") {
|
||||
// Ignore an apostrophe if it's between letters (e.g., in contractions).
|
||||
if (c === "'" && i > 0 && i < buffer.length - 1 && /[A-Za-z]/.test(buffer[i - 1]) && /[A-Za-z]/.test(buffer[i + 1])) {
|
||||
return;
|
||||
}
|
||||
if (stack.length && stack.at(-1) === c) {
|
||||
stack.pop();
|
||||
} else {
|
||||
stack.push(c);
|
||||
}
|
||||
return;
|
||||
}
|
||||
// Handle opening punctuation.
|
||||
if (OPENING.has(c)) {
|
||||
stack.push(c);
|
||||
return;
|
||||
}
|
||||
// Handle closing punctuation.
|
||||
const expectedOpening = MATCHING.get(c);
|
||||
if (expectedOpening && stack.length && stack.at(-1) === expectedOpening) {
|
||||
stack.pop();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A simple stream-based text splitter that emits complete sentences.
|
||||
*/
|
||||
export class TextSplitterStream {
|
||||
constructor() {
|
||||
this._buffer = "";
|
||||
this._sentences = [];
|
||||
this._resolver = null;
|
||||
this._closed = false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Push one or more text chunks into the stream.
|
||||
* @param {...string} texts Text fragments to process.
|
||||
*/
|
||||
push(...texts) {
|
||||
for (const txt of texts) {
|
||||
this._buffer += txt;
|
||||
this._process();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes the stream, signaling that no more text will be pushed.
|
||||
* This will flush any remaining text in the buffer as a sentence
|
||||
* and allow the consuming process to finish processing the stream.
|
||||
*/
|
||||
close() {
|
||||
if (this._closed) {
|
||||
throw new Error("Stream is already closed.");
|
||||
}
|
||||
this._closed = true;
|
||||
this.flush();
|
||||
}
|
||||
|
||||
/**
|
||||
* Flushes any remaining text in the buffer as a sentence.
|
||||
*/
|
||||
flush() {
|
||||
const remainder = this._buffer.trim();
|
||||
if (remainder.length > 0) {
|
||||
this._sentences.push(remainder);
|
||||
}
|
||||
this._buffer = "";
|
||||
this._resolve();
|
||||
}
|
||||
|
||||
/**
|
||||
* Resolve the pending promise to signal that sentences are available.
|
||||
* @private
|
||||
*/
|
||||
_resolve() {
|
||||
if (this._resolver) {
|
||||
this._resolver();
|
||||
this._resolver = null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Processes the internal buffer to extract complete sentences.
|
||||
* If the potential sentence boundary is at the end of the current buffer,
|
||||
* it waits for more text before splitting.
|
||||
* @private
|
||||
*/
|
||||
_process() {
|
||||
let sentenceStart = 0;
|
||||
const buffer = this._buffer;
|
||||
const len = buffer.length;
|
||||
let i = 0;
|
||||
let stack = [];
|
||||
|
||||
// Helper to scan from the current index over trailing terminators and punctuation.
|
||||
const scanBoundary = (idx) => {
|
||||
let end = idx;
|
||||
// Consume contiguous sentence terminators (excluding newlines).
|
||||
while (end + 1 < len && isSentenceTerminator(buffer[end + 1], false)) {
|
||||
++end;
|
||||
}
|
||||
// Consume trailing characters (e.g., closing quotes/brackets).
|
||||
while (end + 1 < len && isTrailingChar(buffer[end + 1])) {
|
||||
++end;
|
||||
}
|
||||
let nextNonSpace = end + 1;
|
||||
while (nextNonSpace < len && /\s/.test(buffer[nextNonSpace])) {
|
||||
++nextNonSpace;
|
||||
}
|
||||
return { end, nextNonSpace };
|
||||
};
|
||||
|
||||
while (i < len) {
|
||||
const c = buffer[i];
|
||||
updateStack(c, stack, i, buffer);
|
||||
|
||||
// Only consider splitting if we're not inside any nested structure.
|
||||
if (stack.length === 0 && isSentenceTerminator(c)) {
|
||||
const currentSegment = buffer.slice(sentenceStart, i);
|
||||
// Skip splitting for likely numbered lists (e.g., "1." or "\n2.").
|
||||
if (/(^|\n)\d+$/.test(currentSegment)) {
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
|
||||
const { end: boundaryEnd, nextNonSpace } = scanBoundary(i);
|
||||
|
||||
// If the terminator is not a newline and there's no extra whitespace,
|
||||
// we might be in the middle of a token (e.g., "$9.99"), so skip splitting.
|
||||
if (i === nextNonSpace - 1 && c !== "\n") {
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Wait for more text if there's no non-whitespace character yet.
|
||||
if (nextNonSpace === len) {
|
||||
break;
|
||||
}
|
||||
|
||||
// Determine the token immediately preceding the terminator.
|
||||
let tokenStart = i - 1;
|
||||
while (tokenStart >= 0 && /\S/.test(buffer[tokenStart])) {
|
||||
tokenStart--;
|
||||
}
|
||||
tokenStart = Math.max(sentenceStart, tokenStart + 1);
|
||||
const token = getTokenFromBuffer(buffer, tokenStart);
|
||||
if (!token) {
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
|
||||
// --- URL/email protection ---
|
||||
// If the token appears to be a URL or email (contains "://" or "@")
|
||||
// and does not already end with a terminator, skip splitting.
|
||||
if ((/https?[,:]\/\//.test(token) || token.includes("@")) && !isSentenceTerminator(token.at(-1))) {
|
||||
i = tokenStart + token.length;
|
||||
continue;
|
||||
}
|
||||
|
||||
// --- Abbreviation protection ---
|
||||
if (isAbbreviation(token)) {
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
|
||||
// --- Middle initials heuristic ---
|
||||
// If the token is a series of single-letter initials (each ending in a period)
|
||||
// and is followed by a capitalized word, assume it's part of a name.
|
||||
if (/^([A-Za-z]\.)+$/.test(token) && nextNonSpace < len && /[A-Z]/.test(buffer[nextNonSpace])) {
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
|
||||
// --- Lookahead heuristic ---
|
||||
// If the terminator is a period and the next non–whitespace character is lowercase,
|
||||
// assume it is not the end of a sentence.
|
||||
if (c === "." && nextNonSpace < len && /[a-z]/.test(buffer[nextNonSpace])) {
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Special case: ellipsis that stands alone should be merged with the following sentence.
|
||||
const sentence = buffer.substring(sentenceStart, boundaryEnd + 1).trim();
|
||||
if (sentence === "..." || sentence === "…") {
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Accept the sentence boundary.
|
||||
if (sentence) {
|
||||
this._sentences.push(sentence);
|
||||
}
|
||||
// Move to the next sentence.
|
||||
i = sentenceStart = boundaryEnd + 1;
|
||||
continue;
|
||||
}
|
||||
++i;
|
||||
}
|
||||
|
||||
// Remove the processed portion of the buffer.
|
||||
this._buffer = buffer.substring(sentenceStart);
|
||||
|
||||
// Resolve any pending promise if sentences are available.
|
||||
if (this._sentences.length > 0) {
|
||||
this._resolve();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Async iterator to yield sentences as they become available.
|
||||
* @returns {AsyncGenerator<string, void, void>}
|
||||
*/
|
||||
async *[Symbol.asyncIterator]() {
|
||||
if (this._resolver) {
|
||||
throw new Error("Another iterator is already active.");
|
||||
}
|
||||
while (true) {
|
||||
if (this._sentences.length > 0) {
|
||||
yield this._sentences.shift();
|
||||
} else if (this._closed) {
|
||||
// No more text will be pushed.
|
||||
break;
|
||||
} else {
|
||||
// Wait for more text.
|
||||
await new Promise((resolve) => {
|
||||
this._resolver = resolve;
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Synchronous iterator that flushes the buffer and returns all sentences.
|
||||
* @returns {Iterator<string>}
|
||||
*/
|
||||
[Symbol.iterator]() {
|
||||
this.flush();
|
||||
const iterator = this._sentences[Symbol.iterator]();
|
||||
this._sentences = [];
|
||||
return iterator;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the array of sentences currently available.
|
||||
* @type {string[]} The array of sentences.
|
||||
* @readonly
|
||||
*/
|
||||
get sentences() {
|
||||
return this._sentences;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Splits the input text into an array of sentences.
|
||||
* @param {string} text The text to split.
|
||||
* @returns {string[]} An array of sentences.
|
||||
*/
|
||||
export function split(text) {
|
||||
const splitter = new TextSplitterStream();
|
||||
splitter.push(text);
|
||||
return [...splitter];
|
||||
}
|
||||
Reference in New Issue
Block a user