Kokoro.js v1.2.0: Streaming support (#92)

* Set up JS project

* Finalise JS library

* Update README

* Fix package.json repository url

* Rename package -> `kokoro-js`

* Fix samples in README

* Cleanup README

* Bump `phonemizer` version

* Create web demo

* Run prettier

* Link to model used in demo

* Enable multithreading in HF space demo (~40% faster)

* Add link to demo in README

* Bump to v1.0.1

* Update voices

* Update versions

* Update phonemize JSDoc

* Use updated voice pack

* Update versions

* Update demo (v1.0 & WebGPU support)

* Update README

* Enforce maximum number of tokens

* Update README

* [version] Update to 1.1.1

* Create simple sentence splitter

* Update `npm run test`

* Update API to use sync and async iterators

* Add support for streamed generation in kokoro.js

* Always split on newlines

* Remove debug line

* Improvements

* Add more matching punctuation marks

* Update comments

* nits

* Export TextSplitterStream too

* Update splitter.js

* Update README

* [version] Update to 1.2.0
This commit is contained in:
Joshua Lochner
2025-02-15 21:06:33 +02:00
committed by GitHub
parent 93abff8795
commit 5229a254b7
6 changed files with 1109 additions and 17 deletions

View File

@@ -1,10 +1,23 @@
import { StyleTextToSpeech2Model, AutoTokenizer, Tensor, RawAudio } from "@huggingface/transformers";
import { phonemize } from "./phonemize.js";
import { TextSplitterStream } from "./splitter.js";
import { getVoiceData, VOICES } from "./voices.js";
const STYLE_DIM = 256;
const SAMPLE_RATE = 24000;
/**
* @typedef {Object} GenerateOptions
* @property {keyof typeof VOICES} [voice="af_heart"] The voice
* @property {number} [speed=1] The speaking speed
*/
/**
* @typedef {Object} StreamProperties
* @property {RegExp} [split_pattern] The pattern to split the input text. If unset, the default sentence splitter will be used.
* @typedef {GenerateOptions & StreamProperties} StreamGenerateOptions
*/
export class KokoroTTS {
/**
* Create a new KokoroTTS instance.
@@ -47,34 +60,37 @@ export class KokoroTTS {
console.table(VOICES);
throw new Error(`Voice "${voice}" not found. Should be one of: ${Object.keys(VOICES).join(", ")}.`);
}
const language = /** @type {"a"|"b"} */ (voice.at(0)); // "a" or "b"
return language;
}
/**
* Generate audio from text.
*
* @param {string} text The input text
* @param {Object} options Additional options
* @param {keyof typeof VOICES} [options.voice="af_heart"] The voice style to use
* @param {number} [options.speed=1] The speaking speed
* @param {GenerateOptions} options Additional options
* @returns {Promise<RawAudio>} The generated audio
*/
async generate(text, { voice = "af_heart", speed = 1 } = {}) {
this._validate_voice(voice);
const language = this._validate_voice(voice);
const language = /** @type {"a"|"b"} */ (voice.at(0)); // "a" or "b"
const phonemes = await phonemize(text, language);
const { input_ids } = this.tokenizer(phonemes, {
truncation: true,
});
return this.generate_from_ids(input_ids, { voice, speed });
}
/**
* Generate audio from input ids.
* @param {Tensor} input_ids The input ids
* @param {GenerateOptions} options Additional options
* @returns {Promise<RawAudio>} The generated audio
*/
async generate_from_ids(input_ids, { voice = "af_heart", speed = 1 } = {}) {
// Select voice style based on number of input tokens
const num_tokens = Math.min(
Math.max(
input_ids.dims.at(-1) - 2,
0,
),
509,
);
const num_tokens = Math.min(Math.max(input_ids.dims.at(-1) - 2, 0), 509);
// Load voice style
const data = await getVoiceData(voice);
@@ -90,7 +106,47 @@ export class KokoroTTS {
// Generate audio
const { waveform } = await this.model(inputs);
return new RawAudio(waveform.data, SAMPLE_RATE);
}
/**
* Generate audio from text in a streaming fashion.
* @param {string|TextSplitterStream} text The input text
* @param {StreamGenerateOptions} options Additional options
* @returns {AsyncGenerator<{text: string, phonemes: string, audio: RawAudio}, void, void>}
*/
async *stream(text, { voice = "af_heart", speed = 1, split_pattern = null } = {}) {
const language = this._validate_voice(voice);
/** @type {TextSplitterStream} */
let splitter;
if (text instanceof TextSplitterStream) {
splitter = text;
} else if (typeof text === "string") {
splitter = new TextSplitterStream();
const chunks = split_pattern
? text
.split(split_pattern)
.map((chunk) => chunk.trim())
.filter((chunk) => chunk.length > 0)
: [text];
splitter.push(...chunks);
} else {
throw new Error("Invalid input type. Expected string or TextSplitterStream.");
}
for await (const sentence of splitter) {
const phonemes = await phonemize(sentence, language);
const { input_ids } = this.tokenizer(phonemes, {
truncation: true,
});
// TODO: There may be some cases where - even with splitting - the text is too long.
// In that case, we should split the text into smaller chunks and process them separately.
// For now, we just truncate these exceptionally long chunks
const audio = await this.generate_from_ids(input_ids, { voice, speed });
yield { text: sentence, phonemes, audio };
}
}
}
export { TextSplitterStream };

344
kokoro.js/src/splitter.js Normal file
View File

@@ -0,0 +1,344 @@
/**
 * Returns true if the character is considered a sentence terminator.
 * Terminators include ASCII (".", "!", "?") and common Unicode equivalents;
 * newlines may optionally count as terminators, which is favourable for
 * text-to-speech systems.
 * @param {string} c The character to test.
 * @param {boolean} [includeNewlines=true] Whether to treat newlines as terminators.
 * @returns {boolean}
 */
function isSentenceTerminator(c, includeNewlines = true) {
  if (includeNewlines && c === "\n") {
    return true;
  }
  return ".!?…。?!".includes(c);
}
/**
 * Returns true if the character should be attached to the sentence terminator,
 * such as closing quotes or brackets.
 * @param {string} c The character to test.
 * @returns {boolean}
 */
function isTrailingChar(c) {
  const attachable = "\"')]}」』";
  return attachable.indexOf(c) !== -1;
}
/**
 * Extracts a token (a contiguous run of non-whitespace characters)
 * from the buffer starting at the given index.
 * @param {string} buffer The input text.
 * @param {number} start The starting index.
 * @returns {string} The extracted token (empty if `start` is on whitespace or out of range).
 */
function getTokenFromBuffer(buffer, start) {
  // Match the non-whitespace run beginning exactly at `start`.
  return /^\S*/.exec(buffer.slice(start))[0];
}
// List of common abbreviations. Note that strings with single letters joined by periods
// (e.g., "i.e", "e.g", "u.s.a", "u.s") are handled separately.
const ABBREVIATIONS = new Set([
  // Titles and honorifics
  "mr", "mrs", "ms", "dr", "prof", "sr", "jr",
  // Ranks and offices
  "sgt", "col", "gen", "rep", "sen", "gov", "lt", "maj", "capt",
  // Miscellaneous
  "st", "mt", "etc", "co", "inc", "ltd", "dept", "vs", "p", "pg",
  // Months ("may" is not listed, as it is also a common word)
  "jan", "feb", "mar", "apr", "jun", "jul", "aug", "sep", "sept", "oct", "nov", "dec",
  // Days of the week
  "sun", "mon", "tu", "tue", "tues", "wed", "th", "thu", "thur", "thurs", "fri", "sat",
]);
/**
 * Determines if the given token (or series of initials) is a known abbreviation.
 * @param {string} token The token to check.
 * @returns {boolean} True if the token (case-insensitively) is a known abbreviation.
 */
function isAbbreviation(token) {
  // Strip a possessive ending — accepting both the straight (') and
  // typographic (’) apostrophe — and any trailing periods, so tokens
  // like "Dr.'s" or "Dr.’s" reduce to "dr" before the lookup.
  token = token.replace(/['’]s$/i, "").replace(/\.+$/, "");
  return ABBREVIATIONS.has(token.toLowerCase());
}
// Map of closing punctuation to their corresponding opening punctuation.
// NOTE(review): two entries in the previous rendering were garbled to
// ["", ""] — restored here as the curly double-quote (”/“) and curly
// single-quote (’/‘) pairs; a duplicate 〉/〈 entry was dropped (Map keys
// are unique, so the duplicate was dead anyway). Confirm against upstream.
const MATCHING = new Map([
  [")", "("],
  ["]", "["],
  ["}", "{"],
  ["》", "《"],
  ["〉", "〈"],
  ["”", "“"],
  ["»", "«"],
  ["’", "‘"],
  ["」", "「"],
  ["』", "『"],
  ["】", "【"],
]);
// Set of opening punctuation characters (derived from the map's values).
const OPENING = new Set(MATCHING.values());
/**
 * Updates the nesting stack used to track quotes and paired punctuation, so
 * that sentence splitting is suppressed inside quoted/bracketed spans.
 * Supports symmetric straight quotes (", '), brackets ((), [], {}), and the
 * directional CJK/European quote pairs listed in MATCHING.
 * (An apostrophe between letters is ignored so that contractions remain intact.)
 * @param {string} c The current character.
 * @param {string[]} stack The current nesting stack (mutated in place).
 * @param {number} i The index of the character in the buffer.
 * @param {string} buffer The full text being processed.
 */
function updateStack(c, stack, i, buffer) {
  // Straight quotes are symmetric: the same character both opens and closes,
  // so pop when it matches the innermost entry and push otherwise.
  if (c === '"' || c === "'") {
    // Ignore an apostrophe if it's between letters (e.g., in contractions).
    if (c === "'" && i > 0 && i < buffer.length - 1 && /[A-Za-z]/.test(buffer[i - 1]) && /[A-Za-z]/.test(buffer[i + 1])) {
      return;
    }
    if (stack.length && stack.at(-1) === c) {
      stack.pop();
    } else {
      stack.push(c);
    }
    return;
  }
  // Handle opening punctuation.
  if (OPENING.has(c)) {
    stack.push(c);
    return;
  }
  // Handle closing punctuation: only pop when it matches the innermost
  // opener. An unmatched closer (e.g. a stray ’ used as a trailing
  // possessive apostrophe) is deliberately a no-op.
  const expectedOpening = MATCHING.get(c);
  if (expectedOpening && stack.length && stack.at(-1) === expectedOpening) {
    stack.pop();
  }
}
/**
 * A simple stream-based text splitter that emits complete sentences.
 *
 * Text is fed in incrementally via push(); complete sentences are consumed
 * either through the async iterator (streaming, ends when close() is called)
 * or the sync iterator (which flushes whatever remains in the buffer).
 */
export class TextSplitterStream {
  constructor() {
    this._buffer = ""; // Unprocessed text awaiting a sentence boundary.
    this._sentences = []; // Completed sentences not yet consumed.
    this._resolver = null; // Pending promise resolver for the async iterator.
    this._closed = false; // True once close() has been called.
  }
  /**
   * Push one or more text chunks into the stream.
   * Each chunk is appended to the buffer and processed immediately.
   * @param {...string} texts Text fragments to process.
   */
  push(...texts) {
    for (const txt of texts) {
      this._buffer += txt;
      this._process();
    }
  }
  /**
   * Closes the stream, signaling that no more text will be pushed.
   * This will flush any remaining text in the buffer as a sentence
   * and allow the consuming process to finish processing the stream.
   * @throws {Error} If the stream has already been closed.
   */
  close() {
    if (this._closed) {
      throw new Error("Stream is already closed.");
    }
    this._closed = true;
    this.flush();
  }
  /**
   * Flushes any remaining text in the buffer as a sentence,
   * and wakes a waiting async iterator (even if the buffer was empty).
   */
  flush() {
    const remainder = this._buffer.trim();
    if (remainder.length > 0) {
      this._sentences.push(remainder);
    }
    this._buffer = "";
    this._resolve();
  }
  /**
   * Resolve the pending promise to signal that sentences are available
   * (or that the stream state changed, e.g. it was closed).
   * @private
   */
  _resolve() {
    if (this._resolver) {
      this._resolver();
      this._resolver = null;
    }
  }
  /**
   * Processes the internal buffer to extract complete sentences.
   * If the potential sentence boundary is at the end of the current buffer,
   * it waits for more text before splitting.
   * @private
   */
  _process() {
    let sentenceStart = 0; // Start index (in `buffer`) of the sentence being built.
    const buffer = this._buffer;
    const len = buffer.length;
    let i = 0;
    let stack = []; // Nesting stack of open quotes/brackets (see updateStack).
    // Helper to scan from the current index over trailing terminators and punctuation.
    // Returns the last index belonging to the boundary and the index of the
    // first non-whitespace character after it.
    const scanBoundary = (idx) => {
      let end = idx;
      // Consume contiguous sentence terminators (excluding newlines).
      while (end + 1 < len && isSentenceTerminator(buffer[end + 1], false)) {
        ++end;
      }
      // Consume trailing characters (e.g., closing quotes/brackets).
      while (end + 1 < len && isTrailingChar(buffer[end + 1])) {
        ++end;
      }
      let nextNonSpace = end + 1;
      while (nextNonSpace < len && /\s/.test(buffer[nextNonSpace])) {
        ++nextNonSpace;
      }
      return { end, nextNonSpace };
    };
    while (i < len) {
      const c = buffer[i];
      updateStack(c, stack, i, buffer);
      // Only consider splitting if we're not inside any nested structure.
      if (stack.length === 0 && isSentenceTerminator(c)) {
        const currentSegment = buffer.slice(sentenceStart, i);
        // Skip splitting for likely numbered lists (e.g., "1." or "\n2.").
        if (/(^|\n)\d+$/.test(currentSegment)) {
          ++i;
          continue;
        }
        const { end: boundaryEnd, nextNonSpace } = scanBoundary(i);
        // If the terminator is not a newline and there's no extra whitespace,
        // we might be in the middle of a token (e.g., "$9.99"), so skip splitting.
        if (i === nextNonSpace - 1 && c !== "\n") {
          ++i;
          continue;
        }
        // Wait for more text if there's no non-whitespace character yet.
        if (nextNonSpace === len) {
          break;
        }
        // Determine the token immediately preceding the terminator.
        let tokenStart = i - 1;
        while (tokenStart >= 0 && /\S/.test(buffer[tokenStart])) {
          tokenStart--;
        }
        tokenStart = Math.max(sentenceStart, tokenStart + 1);
        const token = getTokenFromBuffer(buffer, tokenStart);
        if (!token) {
          ++i;
          continue;
        }
        // --- URL/email protection ---
        // If the token appears to be a URL or email (contains "://" or "@")
        // and does not already end with a terminator, skip splitting.
        // NOTE(review): the character class [,:] also matches "http,//" —
        // presumably to tolerate typo'd URLs; confirm this is intentional.
        if ((/https?[,:]\/\//.test(token) || token.includes("@")) && !isSentenceTerminator(token.at(-1))) {
          i = tokenStart + token.length;
          continue;
        }
        // --- Abbreviation protection ---
        if (isAbbreviation(token)) {
          ++i;
          continue;
        }
        // --- Middle initials heuristic ---
        // If the token is a series of single-letter initials (each ending in a period)
        // and is followed by a capitalized word, assume it's part of a name.
        if (/^([A-Za-z]\.)+$/.test(token) && nextNonSpace < len && /[A-Z]/.test(buffer[nextNonSpace])) {
          ++i;
          continue;
        }
        // --- Lookahead heuristic ---
        // If the terminator is a period and the next nonwhitespace character is lowercase,
        // assume it is not the end of a sentence.
        if (c === "." && nextNonSpace < len && /[a-z]/.test(buffer[nextNonSpace])) {
          ++i;
          continue;
        }
        // Special case: ellipsis that stands alone should be merged with the following sentence.
        const sentence = buffer.substring(sentenceStart, boundaryEnd + 1).trim();
        if (sentence === "..." || sentence === "…") {
          ++i;
          continue;
        }
        // Accept the sentence boundary.
        if (sentence) {
          this._sentences.push(sentence);
        }
        // Move to the next sentence.
        i = sentenceStart = boundaryEnd + 1;
        continue;
      }
      ++i;
    }
    // Remove the processed portion of the buffer.
    this._buffer = buffer.substring(sentenceStart);
    // Resolve any pending promise if sentences are available.
    if (this._sentences.length > 0) {
      this._resolve();
    }
  }
  /**
   * Async iterator to yield sentences as they become available.
   * Only one async iterator may be active at a time, since they would
   * otherwise compete for the single pending resolver.
   * @returns {AsyncGenerator<string, void, void>}
   */
  async *[Symbol.asyncIterator]() {
    if (this._resolver) {
      throw new Error("Another iterator is already active.");
    }
    while (true) {
      if (this._sentences.length > 0) {
        yield this._sentences.shift();
      } else if (this._closed) {
        // No more text will be pushed.
        break;
      } else {
        // Wait for more text.
        await new Promise((resolve) => {
          this._resolver = resolve;
        });
      }
    }
  }
  /**
   * Synchronous iterator that flushes the buffer and returns all sentences.
   * Note: this consumes the pending sentences (the internal list is reset),
   * but does NOT close the stream.
   * @returns {Iterator<string>}
   */
  [Symbol.iterator]() {
    this.flush();
    const iterator = this._sentences[Symbol.iterator]();
    this._sentences = [];
    return iterator;
  }
  /**
   * Returns the array of sentences currently available.
   * NOTE(review): this exposes the internal array by reference —
   * mutating it affects the stream. Treat it as read-only.
   * @type {string[]} The array of sentences.
   * @readonly
   */
  get sentences() {
    return this._sentences;
  }
}
/**
 * Splits the input text into an array of sentences.
 * Convenience wrapper around TextSplitterStream for one-shot, non-streaming use.
 * @param {string} text The text to split.
 * @returns {string[]} An array of sentences.
 */
export function split(text) {
  const stream = new TextSplitterStream();
  stream.push(text);
  return Array.from(stream);
}