Kokoro.js v1.2.0: Streaming support (#92)

* Set up JS project

* Finalise JS library

* Update README

* Fix package.json repository url

* Rename package -> `kokoro-js`

* Fix samples in README

* Cleanup README

* Bump `phonemizer` version

* Create web demo

* Run prettier

* Link to model used in demo

* Enable multithreading in HF space demo (~40% faster)

* Add link to demo in README

* Bump to v1.0.1

* Update voices

* Update versions

* Update phonemize JSDoc

* Use updated voice pack

* Update versions

* Update demo (v1.0 & WebGPU support)

* Update README

* Enforce maximum number of tokens

* Update README

* [version] Update to 1.1.1

* Create simple sentence splitter

* Update `npm run test`

* Update API to use sync and async iterators

* Add support for streamed generation in kokoro.js

* Always split on newlines

* Remove debug line

* Improvements

* Add more matching punctuation marks

* Update comments

* nits

* Export TextSplitterStream too

* Update splitter.js

* Update README

* [version] Update to 1.2.0
This commit is contained in:
Joshua Lochner
2025-02-15 21:06:33 +02:00
committed by GitHub
parent 93abff8795
commit 5229a254b7
6 changed files with 1109 additions and 17 deletions

View File

@@ -1,10 +1,23 @@
import { StyleTextToSpeech2Model, AutoTokenizer, Tensor, RawAudio } from "@huggingface/transformers";
import { phonemize } from "./phonemize.js";
import { TextSplitterStream } from "./splitter.js";
import { getVoiceData, VOICES } from "./voices.js";
const STYLE_DIM = 256;
const SAMPLE_RATE = 24000;
/**
* @typedef {Object} GenerateOptions
* @property {keyof typeof VOICES} [voice="af_heart"] The voice
* @property {number} [speed=1] The speaking speed
*/
/**
* @typedef {Object} StreamProperties
* @property {RegExp} [split_pattern] The pattern to split the input text. If unset, the default sentence splitter will be used.
* @typedef {GenerateOptions & StreamProperties} StreamGenerateOptions
*/
export class KokoroTTS {
/**
* Create a new KokoroTTS instance.
@@ -47,34 +60,37 @@ export class KokoroTTS {
console.table(VOICES);
throw new Error(`Voice "${voice}" not found. Should be one of: ${Object.keys(VOICES).join(", ")}.`);
}
const language = /** @type {"a"|"b"} */ (voice.at(0)); // "a" or "b"
return language;
}
/**
* Generate audio from text.
*
* @param {string} text The input text
* @param {Object} options Additional options
* @param {keyof typeof VOICES} [options.voice="af_heart"] The voice style to use
* @param {number} [options.speed=1] The speaking speed
* @param {GenerateOptions} options Additional options
* @returns {Promise<RawAudio>} The generated audio
*/
async generate(text, { voice = "af_heart", speed = 1 } = {}) {
this._validate_voice(voice);
const language = this._validate_voice(voice);
const language = /** @type {"a"|"b"} */ (voice.at(0)); // "a" or "b"
const phonemes = await phonemize(text, language);
const { input_ids } = this.tokenizer(phonemes, {
truncation: true,
});
return this.generate_from_ids(input_ids, { voice, speed });
}
/**
* Generate audio from input ids.
* @param {Tensor} input_ids The input ids
* @param {GenerateOptions} options Additional options
* @returns {Promise<RawAudio>} The generated audio
*/
async generate_from_ids(input_ids, { voice = "af_heart", speed = 1 } = {}) {
// Select voice style based on number of input tokens
const num_tokens = Math.min(
Math.max(
input_ids.dims.at(-1) - 2,
0,
),
509,
);
const num_tokens = Math.min(Math.max(input_ids.dims.at(-1) - 2, 0), 509);
// Load voice style
const data = await getVoiceData(voice);
@@ -90,7 +106,47 @@ export class KokoroTTS {
// Generate audio
const { waveform } = await this.model(inputs);
return new RawAudio(waveform.data, SAMPLE_RATE);
}
/**
* Generate audio from text in a streaming fashion.
* @param {string|TextSplitterStream} text The input text
* @param {StreamGenerateOptions} options Additional options
* @returns {AsyncGenerator<{text: string, phonemes: string, audio: RawAudio}, void, void>}
*/
async *stream(text, { voice = "af_heart", speed = 1, split_pattern = null } = {}) {
const language = this._validate_voice(voice);
/** @type {TextSplitterStream} */
let splitter;
if (text instanceof TextSplitterStream) {
splitter = text;
} else if (typeof text === "string") {
splitter = new TextSplitterStream();
const chunks = split_pattern
? text
.split(split_pattern)
.map((chunk) => chunk.trim())
.filter((chunk) => chunk.length > 0)
: [text];
splitter.push(...chunks);
} else {
throw new Error("Invalid input type. Expected string or TextSplitterStream.");
}
for await (const sentence of splitter) {
const phonemes = await phonemize(sentence, language);
const { input_ids } = this.tokenizer(phonemes, {
truncation: true,
});
// TODO: There may be some cases where - even with splitting - the text is too long.
// In that case, we should split the text into smaller chunks and process them separately.
// For now, we just truncate these exceptionally long chunks
const audio = await this.generate_from_ids(input_ids, { voice, speed });
yield { text: sentence, phonemes, audio };
}
}
}
export { TextSplitterStream };

344
kokoro.js/src/splitter.js Normal file
View File

@@ -0,0 +1,344 @@
/**
 * Returns true if the character is considered a sentence terminator.
 * Terminators include ASCII (".", "!", "?") and common Unicode equivalents;
 * newlines may optionally count as terminators, which is favourable for
 * text-to-speech systems.
 * @param {string} c The character to test.
 * @param {boolean} [includeNewlines=true] Whether to treat newlines as terminators.
 * @returns {boolean}
 */
function isSentenceTerminator(c, includeNewlines = true) {
  if (includeNewlines && c === "\n") {
    return true;
  }
  return ".!?…。?!".includes(c);
}
/**
 * Returns true if the character should be attached to the sentence terminator,
 * such as closing quotes or brackets.
 * @param {string} c The character to test.
 * @returns {boolean}
 */
function isTrailingChar(c) {
  const attachable = "\"')]}」』";
  return attachable.indexOf(c) !== -1;
}
/**
 * Extracts a token (a contiguous run of non-whitespace characters)
 * from the buffer starting at the given index.
 * @param {string} buffer The input text.
 * @param {number} start The starting index.
 * @returns {string} The extracted token (empty if `start` is on whitespace or out of range).
 */
function getTokenFromBuffer(buffer, start) {
  // Match the non-whitespace run beginning exactly at `start`.
  return /^\S*/.exec(buffer.slice(start))[0];
}
// List of common abbreviations. Note that strings with single letters joined by periods
// (e.g., "i.e", "e.g", "u.s.a", "u.s") are handled separately.
const ABBREVIATIONS = new Set([
  // Titles and honorifics
  "mr", "mrs", "ms", "dr", "prof", "sr", "jr",
  // Ranks and offices
  "sgt", "col", "gen", "rep", "sen", "gov", "lt", "maj", "capt",
  // Miscellaneous
  "st", "mt", "etc", "co", "inc", "ltd", "dept", "vs", "p", "pg",
  // Months ("may" is not listed, as it is also a common word)
  "jan", "feb", "mar", "apr", "jun", "jul", "aug", "sep", "sept", "oct", "nov", "dec",
  // Days of the week
  "sun", "mon", "tu", "tue", "tues", "wed", "th", "thu", "thur", "thurs", "fri", "sat",
]);
/**
 * Determines if the given token (or series of initials) is a known abbreviation.
 * @param {string} token The token to check.
 * @returns {boolean} True if the token (case-insensitively) is a known abbreviation.
 */
function isAbbreviation(token) {
  // Strip a possessive ending — accepting both the straight (') and
  // typographic (’) apostrophe — and any trailing periods, so tokens
  // like "Dr.'s" or "Dr.’s" reduce to "dr" before the lookup.
  token = token.replace(/['’]s$/i, "").replace(/\.+$/, "");
  return ABBREVIATIONS.has(token.toLowerCase());
}
// Map of closing punctuation to their corresponding opening punctuation.
// NOTE(review): two entries in the previous rendering were garbled to
// ["", ""] — restored here as the curly double-quote (”/“) and curly
// single-quote (’/‘) pairs; a duplicate 〉/〈 entry was dropped (Map keys
// are unique, so the duplicate was dead anyway). Confirm against upstream.
const MATCHING = new Map([
  [")", "("],
  ["]", "["],
  ["}", "{"],
  ["》", "《"],
  ["〉", "〈"],
  ["”", "“"],
  ["»", "«"],
  ["’", "‘"],
  ["」", "「"],
  ["』", "『"],
  ["】", "【"],
]);
// Set of opening punctuation characters (derived from the map's values).
const OPENING = new Set(MATCHING.values());
/**
 * Updates the nesting stack used to track quotes and paired punctuation, so
 * that sentence splitting is suppressed inside quoted/bracketed spans.
 * Supports symmetric straight quotes (", '), brackets ((), [], {}), and the
 * directional CJK/European quote pairs listed in MATCHING.
 * (An apostrophe between letters is ignored so that contractions remain intact.)
 * @param {string} c The current character.
 * @param {string[]} stack The current nesting stack (mutated in place).
 * @param {number} i The index of the character in the buffer.
 * @param {string} buffer The full text being processed.
 */
function updateStack(c, stack, i, buffer) {
  // Straight quotes are symmetric: the same character both opens and closes,
  // so pop when it matches the innermost entry and push otherwise.
  if (c === '"' || c === "'") {
    // Ignore an apostrophe if it's between letters (e.g., in contractions).
    if (c === "'" && i > 0 && i < buffer.length - 1 && /[A-Za-z]/.test(buffer[i - 1]) && /[A-Za-z]/.test(buffer[i + 1])) {
      return;
    }
    if (stack.length && stack.at(-1) === c) {
      stack.pop();
    } else {
      stack.push(c);
    }
    return;
  }
  // Handle opening punctuation.
  if (OPENING.has(c)) {
    stack.push(c);
    return;
  }
  // Handle closing punctuation: only pop when it matches the innermost
  // opener. An unmatched closer (e.g. a stray ’ used as a trailing
  // possessive apostrophe) is deliberately a no-op.
  const expectedOpening = MATCHING.get(c);
  if (expectedOpening && stack.length && stack.at(-1) === expectedOpening) {
    stack.pop();
  }
}
/**
 * A simple stream-based text splitter that emits complete sentences.
 *
 * Text is fed in incrementally via push(); complete sentences are consumed
 * either through the async iterator (streaming, ends when close() is called)
 * or the sync iterator (which flushes whatever remains in the buffer).
 */
export class TextSplitterStream {
  constructor() {
    this._buffer = ""; // Unprocessed text awaiting a sentence boundary.
    this._sentences = []; // Completed sentences not yet consumed.
    this._resolver = null; // Pending promise resolver for the async iterator.
    this._closed = false; // True once close() has been called.
  }
  /**
   * Push one or more text chunks into the stream.
   * Each chunk is appended to the buffer and processed immediately.
   * @param {...string} texts Text fragments to process.
   */
  push(...texts) {
    for (const txt of texts) {
      this._buffer += txt;
      this._process();
    }
  }
  /**
   * Closes the stream, signaling that no more text will be pushed.
   * This will flush any remaining text in the buffer as a sentence
   * and allow the consuming process to finish processing the stream.
   * @throws {Error} If the stream has already been closed.
   */
  close() {
    if (this._closed) {
      throw new Error("Stream is already closed.");
    }
    this._closed = true;
    this.flush();
  }
  /**
   * Flushes any remaining text in the buffer as a sentence,
   * and wakes a waiting async iterator (even if the buffer was empty).
   */
  flush() {
    const remainder = this._buffer.trim();
    if (remainder.length > 0) {
      this._sentences.push(remainder);
    }
    this._buffer = "";
    this._resolve();
  }
  /**
   * Resolve the pending promise to signal that sentences are available
   * (or that the stream state changed, e.g. it was closed).
   * @private
   */
  _resolve() {
    if (this._resolver) {
      this._resolver();
      this._resolver = null;
    }
  }
  /**
   * Processes the internal buffer to extract complete sentences.
   * If the potential sentence boundary is at the end of the current buffer,
   * it waits for more text before splitting.
   * @private
   */
  _process() {
    let sentenceStart = 0; // Start index (in `buffer`) of the sentence being built.
    const buffer = this._buffer;
    const len = buffer.length;
    let i = 0;
    let stack = []; // Nesting stack of open quotes/brackets (see updateStack).
    // Helper to scan from the current index over trailing terminators and punctuation.
    // Returns the last index belonging to the boundary and the index of the
    // first non-whitespace character after it.
    const scanBoundary = (idx) => {
      let end = idx;
      // Consume contiguous sentence terminators (excluding newlines).
      while (end + 1 < len && isSentenceTerminator(buffer[end + 1], false)) {
        ++end;
      }
      // Consume trailing characters (e.g., closing quotes/brackets).
      while (end + 1 < len && isTrailingChar(buffer[end + 1])) {
        ++end;
      }
      let nextNonSpace = end + 1;
      while (nextNonSpace < len && /\s/.test(buffer[nextNonSpace])) {
        ++nextNonSpace;
      }
      return { end, nextNonSpace };
    };
    while (i < len) {
      const c = buffer[i];
      updateStack(c, stack, i, buffer);
      // Only consider splitting if we're not inside any nested structure.
      if (stack.length === 0 && isSentenceTerminator(c)) {
        const currentSegment = buffer.slice(sentenceStart, i);
        // Skip splitting for likely numbered lists (e.g., "1." or "\n2.").
        if (/(^|\n)\d+$/.test(currentSegment)) {
          ++i;
          continue;
        }
        const { end: boundaryEnd, nextNonSpace } = scanBoundary(i);
        // If the terminator is not a newline and there's no extra whitespace,
        // we might be in the middle of a token (e.g., "$9.99"), so skip splitting.
        if (i === nextNonSpace - 1 && c !== "\n") {
          ++i;
          continue;
        }
        // Wait for more text if there's no non-whitespace character yet.
        if (nextNonSpace === len) {
          break;
        }
        // Determine the token immediately preceding the terminator.
        let tokenStart = i - 1;
        while (tokenStart >= 0 && /\S/.test(buffer[tokenStart])) {
          tokenStart--;
        }
        tokenStart = Math.max(sentenceStart, tokenStart + 1);
        const token = getTokenFromBuffer(buffer, tokenStart);
        if (!token) {
          ++i;
          continue;
        }
        // --- URL/email protection ---
        // If the token appears to be a URL or email (contains "://" or "@")
        // and does not already end with a terminator, skip splitting.
        // NOTE(review): the character class [,:] also matches "http,//" —
        // presumably to tolerate typo'd URLs; confirm this is intentional.
        if ((/https?[,:]\/\//.test(token) || token.includes("@")) && !isSentenceTerminator(token.at(-1))) {
          i = tokenStart + token.length;
          continue;
        }
        // --- Abbreviation protection ---
        if (isAbbreviation(token)) {
          ++i;
          continue;
        }
        // --- Middle initials heuristic ---
        // If the token is a series of single-letter initials (each ending in a period)
        // and is followed by a capitalized word, assume it's part of a name.
        if (/^([A-Za-z]\.)+$/.test(token) && nextNonSpace < len && /[A-Z]/.test(buffer[nextNonSpace])) {
          ++i;
          continue;
        }
        // --- Lookahead heuristic ---
        // If the terminator is a period and the next nonwhitespace character is lowercase,
        // assume it is not the end of a sentence.
        if (c === "." && nextNonSpace < len && /[a-z]/.test(buffer[nextNonSpace])) {
          ++i;
          continue;
        }
        // Special case: ellipsis that stands alone should be merged with the following sentence.
        const sentence = buffer.substring(sentenceStart, boundaryEnd + 1).trim();
        if (sentence === "..." || sentence === "…") {
          ++i;
          continue;
        }
        // Accept the sentence boundary.
        if (sentence) {
          this._sentences.push(sentence);
        }
        // Move to the next sentence.
        i = sentenceStart = boundaryEnd + 1;
        continue;
      }
      ++i;
    }
    // Remove the processed portion of the buffer.
    this._buffer = buffer.substring(sentenceStart);
    // Resolve any pending promise if sentences are available.
    if (this._sentences.length > 0) {
      this._resolve();
    }
  }
  /**
   * Async iterator to yield sentences as they become available.
   * Only one async iterator may be active at a time, since they would
   * otherwise compete for the single pending resolver.
   * @returns {AsyncGenerator<string, void, void>}
   */
  async *[Symbol.asyncIterator]() {
    if (this._resolver) {
      throw new Error("Another iterator is already active.");
    }
    while (true) {
      if (this._sentences.length > 0) {
        yield this._sentences.shift();
      } else if (this._closed) {
        // No more text will be pushed.
        break;
      } else {
        // Wait for more text.
        await new Promise((resolve) => {
          this._resolver = resolve;
        });
      }
    }
  }
  /**
   * Synchronous iterator that flushes the buffer and returns all sentences.
   * Note: this consumes the pending sentences (the internal list is reset),
   * but does NOT close the stream.
   * @returns {Iterator<string>}
   */
  [Symbol.iterator]() {
    this.flush();
    const iterator = this._sentences[Symbol.iterator]();
    this._sentences = [];
    return iterator;
  }
  /**
   * Returns the array of sentences currently available.
   * NOTE(review): this exposes the internal array by reference —
   * mutating it affects the stream. Treat it as read-only.
   * @type {string[]} The array of sentences.
   * @readonly
   */
  get sentences() {
    return this._sentences;
  }
}
/**
 * Splits the input text into an array of sentences.
 * Convenience wrapper around TextSplitterStream for one-shot, non-streaming use.
 * @param {string} text The text to split.
 * @returns {string[]} An array of sentences.
 */
export function split(text) {
  const stream = new TextSplitterStream();
  stream.push(text);
  return Array.from(stream);
}