Kokoro.js v1.2.0: Streaming support (#92)

* Set up JS project

* Finalise JS library

* Update README

* Fix package.json repository url

* Rename package -> `kokoro-js`

* Fix samples in README

* Cleanup README

* Bump `phonemizer` version

* Create web demo

* Run prettier

* Link to model used in demo

* Enable multithreading in HF space demo (~40% faster)

* Add link to demo in README

* Bump to v1.0.1

* Update voices

* Update versions

* Update phonemize JSDoc

* Use updated voice pack

* Update versions

* Update demo (v1.0 & WebGPU support)

* Update README

* Enforce maximum number of tokens

* Update README

* [version] Update to 1.1.1

* Create simple sentence splitter

* Update `npm run test`

* Update API to use sync and async iterators

* Add support for streamed generation in kokoro.js

* Always split on newlines

* Remove debug line

* Improvements

* Add more matching punctuation marks

* Update comments

* nits

* Export TextSplitterStream too

* Update splitter.js

* Update README

* [version] Update to 1.2.0
This commit is contained in:
Joshua Lochner
2025-02-15 21:06:33 +02:00
committed by GitHub
parent 93abff8795
commit 5229a254b7
6 changed files with 1109 additions and 17 deletions

View File

@@ -37,6 +37,44 @@ const audio = await tts.generate(text, {
audio.save("audio.wav"); audio.save("audio.wav");
``` ```
Or if you'd prefer to stream the output, you can do that with:
```js
import { KokoroTTS, TextSplitterStream } from "kokoro-js";
const model_id = "onnx-community/Kokoro-82M-v1.0-ONNX";
const tts = await KokoroTTS.from_pretrained(model_id, {
dtype: "fp32", // Options: "fp32", "fp16", "q8", "q4", "q4f16"
// device: "webgpu", // Options: "wasm", "webgpu" (web) or "cpu" (node).
});
// First, set up the stream
const splitter = new TextSplitterStream();
const stream = tts.stream(splitter);
(async () => {
let i = 0;
for await (const { text, phonemes, audio } of stream) {
console.log({ text, phonemes });
audio.save(`audio-${i++}.wav`);
}
})();
// Next, add text to the stream. Note that the text can be added at different times.
// For this example, let's pretend we're consuming text from an LLM, one word at a time.
const text = "Kokoro is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, Kokoro can be deployed anywhere from production environments to personal projects. It can even run 100% locally in your browser, powered by Transformers.js!";
const tokens = text.match(/\s*\S+/g);
for (const token of tokens) {
splitter.push(token);
await new Promise((resolve) => setTimeout(resolve, 10));
}
// Finally, close the stream to signal that no more text will be added.
splitter.close();
// Alternatively, if you'd like to keep the stream open, but flush any remaining text, you can use the `flush` method.
// splitter.flush();
```
## Voices/Samples ## Voices/Samples
> [!TIP] > [!TIP]

View File

@@ -1,12 +1,12 @@
{ {
"name": "kokoro-js", "name": "kokoro-js",
"version": "1.1.1", "version": "1.2.0",
"lockfileVersion": 3, "lockfileVersion": 3,
"requires": true, "requires": true,
"packages": { "packages": {
"": { "": {
"name": "kokoro-js", "name": "kokoro-js",
"version": "1.1.1", "version": "1.2.0",
"license": "Apache-2.0", "license": "Apache-2.0",
"dependencies": { "dependencies": {
"@huggingface/transformers": "^3.3.3", "@huggingface/transformers": "^3.3.3",

View File

@@ -1,6 +1,6 @@
{ {
"name": "kokoro-js", "name": "kokoro-js",
"version": "1.1.1", "version": "1.2.0",
"type": "module", "type": "module",
"exports": { "exports": {
"types": "./types/kokoro.d.ts", "types": "./types/kokoro.d.ts",
@@ -13,7 +13,7 @@
"scripts": { "scripts": {
"build": "rm -rf dist types && rollup -c && tsc && cp ../LICENSE LICENSE", "build": "rm -rf dist types && rollup -c && tsc && cp ../LICENSE LICENSE",
"format": "prettier --write . --print-width 1000", "format": "prettier --write . --print-width 1000",
"test": "vitest" "test": "vitest run"
}, },
"keywords": [ "keywords": [
"kokoro", "kokoro",

View File

@@ -1,10 +1,23 @@
import { StyleTextToSpeech2Model, AutoTokenizer, Tensor, RawAudio } from "@huggingface/transformers"; import { StyleTextToSpeech2Model, AutoTokenizer, Tensor, RawAudio } from "@huggingface/transformers";
import { phonemize } from "./phonemize.js"; import { phonemize } from "./phonemize.js";
import { TextSplitterStream } from "./splitter.js";
import { getVoiceData, VOICES } from "./voices.js"; import { getVoiceData, VOICES } from "./voices.js";
const STYLE_DIM = 256; const STYLE_DIM = 256;
const SAMPLE_RATE = 24000; const SAMPLE_RATE = 24000;
/**
* @typedef {Object} GenerateOptions
* @property {keyof typeof VOICES} [voice="af_heart"] The voice
* @property {number} [speed=1] The speaking speed
*/
/**
* @typedef {Object} StreamProperties
* @property {RegExp} [split_pattern] The pattern to split the input text. If unset, the default sentence splitter will be used.
* @typedef {GenerateOptions & StreamProperties} StreamGenerateOptions
*/
export class KokoroTTS { export class KokoroTTS {
/** /**
* Create a new KokoroTTS instance. * Create a new KokoroTTS instance.
@@ -47,34 +60,37 @@ export class KokoroTTS {
console.table(VOICES); console.table(VOICES);
throw new Error(`Voice "${voice}" not found. Should be one of: ${Object.keys(VOICES).join(", ")}.`); throw new Error(`Voice "${voice}" not found. Should be one of: ${Object.keys(VOICES).join(", ")}.`);
} }
const language = /** @type {"a"|"b"} */ (voice.at(0)); // "a" or "b"
return language;
} }
/** /**
* Generate audio from text. * Generate audio from text.
* *
* @param {string} text The input text * @param {string} text The input text
* @param {Object} options Additional options * @param {GenerateOptions} options Additional options
* @param {keyof typeof VOICES} [options.voice="af_heart"] The voice style to use
* @param {number} [options.speed=1] The speaking speed
* @returns {Promise<RawAudio>} The generated audio * @returns {Promise<RawAudio>} The generated audio
*/ */
async generate(text, { voice = "af_heart", speed = 1 } = {}) { async generate(text, { voice = "af_heart", speed = 1 } = {}) {
this._validate_voice(voice); const language = this._validate_voice(voice);
const language = /** @type {"a"|"b"} */ (voice.at(0)); // "a" or "b"
const phonemes = await phonemize(text, language); const phonemes = await phonemize(text, language);
const { input_ids } = this.tokenizer(phonemes, { const { input_ids } = this.tokenizer(phonemes, {
truncation: true, truncation: true,
}); });
return this.generate_from_ids(input_ids, { voice, speed });
}
/**
* Generate audio from input ids.
* @param {Tensor} input_ids The input ids
* @param {GenerateOptions} options Additional options
* @returns {Promise<RawAudio>} The generated audio
*/
async generate_from_ids(input_ids, { voice = "af_heart", speed = 1 } = {}) {
// Select voice style based on number of input tokens // Select voice style based on number of input tokens
const num_tokens = Math.min( const num_tokens = Math.min(Math.max(input_ids.dims.at(-1) - 2, 0), 509);
Math.max(
input_ids.dims.at(-1) - 2,
0,
),
509,
);
// Load voice style // Load voice style
const data = await getVoiceData(voice); const data = await getVoiceData(voice);
@@ -90,7 +106,47 @@ export class KokoroTTS {
// Generate audio // Generate audio
const { waveform } = await this.model(inputs); const { waveform } = await this.model(inputs);
return new RawAudio(waveform.data, SAMPLE_RATE); return new RawAudio(waveform.data, SAMPLE_RATE);
} }
/**
* Generate audio from text in a streaming fashion.
* @param {string|TextSplitterStream} text The input text
* @param {StreamGenerateOptions} options Additional options
* @returns {AsyncGenerator<{text: string, phonemes: string, audio: RawAudio}, void, void>}
*/
async *stream(text, { voice = "af_heart", speed = 1, split_pattern = null } = {}) {
const language = this._validate_voice(voice);
/** @type {TextSplitterStream} */
let splitter;
if (text instanceof TextSplitterStream) {
splitter = text;
} else if (typeof text === "string") {
splitter = new TextSplitterStream();
const chunks = split_pattern
? text
.split(split_pattern)
.map((chunk) => chunk.trim())
.filter((chunk) => chunk.length > 0)
: [text];
splitter.push(...chunks);
} else {
throw new Error("Invalid input type. Expected string or TextSplitterStream.");
} }
for await (const sentence of splitter) {
const phonemes = await phonemize(sentence, language);
const { input_ids } = this.tokenizer(phonemes, {
truncation: true,
});
// TODO: There may be some cases where - even with splitting - the text is too long.
// In that case, we should split the text into smaller chunks and process them separately.
// For now, we just truncate these exceptionally long chunks
const audio = await this.generate_from_ids(input_ids, { voice, speed });
yield { text: sentence, phonemes, audio };
}
}
}
export { TextSplitterStream };

344
kokoro.js/src/splitter.js Normal file
View File

@@ -0,0 +1,344 @@
/**
 * Returns true if the character is considered a sentence terminator.
 * This covers ASCII (".", "!", "?"), the ellipsis ("…"), the CJK full stop ("。")
 * and the fullwidth question/exclamation marks ("？", "！").
 * NOTE: Newlines are also treated as terminators by default, as this is favourable
 * for text-to-speech systems.
 *
 * Only single-character strings can match: the empty string and multi-character
 * strings always yield `false` (a plain `String.prototype.includes` check would
 * report `true` for `""` and for substrings such as `".!"`).
 *
 * @param {string} c The character to test.
 * @param {boolean} [includeNewlines=true] Whether to treat newlines as terminators.
 * @returns {boolean}
 */
function isSentenceTerminator(c, includeNewlines = true) {
  if (typeof c !== "string" || c.length !== 1) {
    return false;
  }
  // ". ! ?" + U+2026 (…) + U+3002 (。) + U+FF1F (？) + U+FF01 (！).
  // Non-ASCII characters are written as escapes so they survive re-encoding.
  if (".!?\u2026\u3002\uFF1F\uFF01".includes(c)) {
    return true;
  }
  return Boolean(includeNewlines) && c === "\n";
}
/**
 * Returns true if the character should be attached to the sentence terminator,
 * such as closing quotes or brackets.
 *
 * Only single-character strings can match; the empty string and multi-character
 * strings always yield `false` (a bare `includes` check would accept `""`).
 *
 * @param {string} c The character to test.
 * @returns {boolean}
 */
function isTrailingChar(c) {
  if (typeof c !== "string" || c.length !== 1) {
    return false;
  }
  return "\"')]}」』".includes(c);
}
/**
 * Reads one token out of `buffer`, beginning at `start` and running up to
 * (but not including) the first whitespace character or the end of the buffer.
 * @param {string} buffer The input text.
 * @param {number} start The starting index.
 * @returns {string} The extracted token (empty if `start` is on whitespace or past the end).
 */
function getTokenFromBuffer(buffer, start) {
  let stop = start;
  for (; stop < buffer.length; ++stop) {
    if (/\s/.test(buffer[stop])) {
      break;
    }
  }
  return buffer.substring(start, stop);
}
// List of common abbreviations. Note that strings with single letters joined by periods
// (e.g., "i.e", "e.g", "u.s.a", "u.s") are handled separately.
const ABBREVIATIONS = new Set(["mr", "mrs", "ms", "dr", "prof", "sr", "jr", "sgt", "col", "gen", "rep", "sen", "gov", "lt", "maj", "capt", "st", "mt", "etc", "co", "inc", "ltd", "dept", "vs", "p", "pg", "jan", "feb", "mar", "apr", "jun", "jul", "aug", "sep", "sept", "oct", "nov", "dec", "sun", "mon", "tu", "tue", "tues", "wed", "th", "thu", "thur", "thurs", "fri", "sat"]);
/**
* Determines if the given token (or series of initials) is a known abbreviation.
* @param {string} token The token to check.
* @returns {boolean}
*/
function isAbbreviation(token) {
// Remove possessive endings and trailing periods.
token = token.replace(/[']s$/i, "").replace(/\.+$/, "");
return ABBREVIATIONS.has(token.toLowerCase());
}
// Map of closing punctuation to their corresponding opening punctuation.
// Every key and value is a single, non-empty character and each pair is listed once.
// NOTE(review): this table previously contained two `["", ""]` entries and listed the
// "〉"/"〈" pair twice. Those entries had no effect other than putting "" into OPENING,
// so they were dropped. Presumably further bracket pairs were intended in their place;
// confirm against the upstream source and re-add them if so.
const MATCHING = new Map([
  [")", "("],
  ["]", "["],
  ["}", "{"],
  ["》", "《"],
  ["〉", "〈"],
  ["»", "«"],
  ["」", "「"],
  ["』", "『"],
  ["】", "【"],
]);

// Set of opening punctuation characters (derived from MATCHING so the two stay in sync).
const OPENING = new Set(MATCHING.values());
/**
 * Updates the nesting stack to track quotes and paired punctuation.
 * Straight quotes (", ') toggle: a quote equal to the top of the stack closes it,
 * otherwise it is pushed as an opening quote. Characters in OPENING are pushed, and a
 * closing character from MATCHING pops the stack only when its opening partner is on
 * top (unbalanced closers are ignored).
 *
 * An apostrophe between two Latin-script letters is ignored so that contractions and
 * possessives remain intact. This includes accented letters (e.g., "café's"), which an
 * ASCII-only check would misread as an opening quote, suppressing every later split.
 *
 * @param {string} c The current character.
 * @param {string[]} stack The current nesting stack (mutated in place).
 * @param {number} i The index of the character in the buffer.
 * @param {string} buffer The full text being processed.
 */
function updateStack(c, stack, i, buffer) {
  // Handle standard quotes.
  if (c === '"' || c === "'") {
    // Ignore an apostrophe if it's between letters (e.g., in contractions).
    if (c === "'" && i > 0 && i < buffer.length - 1) {
      // Script=Latin covers A-Z/a-z plus accented Latin letters, while leaving
      // scripts written without spaces (e.g., CJK) on the normal quote path.
      const latinLetter = /\p{Script=Latin}/u;
      if (latinLetter.test(buffer[i - 1]) && latinLetter.test(buffer[i + 1])) {
        return;
      }
    }
    if (stack.length && stack.at(-1) === c) {
      stack.pop();
    } else {
      stack.push(c);
    }
    return;
  }

  // Handle opening punctuation.
  if (OPENING.has(c)) {
    stack.push(c);
    return;
  }

  // Handle closing punctuation.
  const expectedOpening = MATCHING.get(c);
  if (expectedOpening && stack.length && stack.at(-1) === expectedOpening) {
    stack.pop();
  }
}
/**
 * A simple stream-based text splitter that emits complete sentences.
 *
 * Text is appended with `push()`. Each push re-scans the internal buffer and moves every
 * sentence whose boundary is confirmed (i.e., followed by further non-whitespace text)
 * into a queue. The queue can be consumed with `for await` (waits for more text until
 * `close()` is called) or with a synchronous `for...of`/spread (flushes the buffer first).
 */
export class TextSplitterStream {
  constructor() {
    // Text received so far that has not yet been emitted as a sentence.
    this._buffer = "";
    // Completed sentences waiting to be consumed.
    this._sentences = [];
    // Resolver of the promise the async iterator is currently awaiting, if any.
    this._resolver = null;
    // Set by close(); tells the async iterator to stop once the queue is drained.
    this._closed = false;
  }

  /**
   * Push one or more text chunks into the stream.
   * The buffer is re-processed after every chunk.
   * @param {...string} texts Text fragments to process.
   */
  push(...texts) {
    for (const txt of texts) {
      this._buffer += txt;
      this._process();
    }
  }

  /**
   * Closes the stream, signaling that no more text will be pushed.
   * This will flush any remaining text in the buffer as a sentence
   * and allow the consuming process to finish processing the stream.
   * @throws {Error} If the stream has already been closed.
   */
  close() {
    if (this._closed) {
      throw new Error("Stream is already closed.");
    }
    this._closed = true;
    this.flush();
  }

  /**
   * Flushes any remaining (trimmed, non-empty) text in the buffer as a sentence,
   * empties the buffer and wakes up a waiting async iterator.
   */
  flush() {
    const remainder = this._buffer.trim();
    if (remainder.length > 0) {
      this._sentences.push(remainder);
    }
    this._buffer = "";
    this._resolve();
  }

  /**
   * Resolve the pending promise (if any) to signal that sentences are available
   * or that the stream state changed.
   * @private
   */
  _resolve() {
    if (this._resolver) {
      this._resolver();
      this._resolver = null;
    }
  }

  /**
   * Processes the internal buffer to extract complete sentences.
   * If the potential sentence boundary is at the end of the current buffer,
   * it waits for more text before splitting.
   * @private
   */
  _process() {
    let sentenceStart = 0;
    const buffer = this._buffer;
    const len = buffer.length;
    let i = 0;
    // Nesting of quotes/brackets since the start of the buffer; rebuilt on every call.
    const stack = [];

    // Helper to scan from the current index over trailing terminators and punctuation.
    // Returns the last index belonging to the boundary (`end`) and the index of the
    // next non-whitespace character (`nextNonSpace`, equal to `len` if there is none).
    const scanBoundary = (idx) => {
      let end = idx;
      // Consume contiguous sentence terminators (excluding newlines).
      while (end + 1 < len && isSentenceTerminator(buffer[end + 1], false)) {
        ++end;
      }
      // Consume trailing characters (e.g., closing quotes/brackets).
      while (end + 1 < len && isTrailingChar(buffer[end + 1])) {
        ++end;
      }
      let nextNonSpace = end + 1;
      while (nextNonSpace < len && /\s/.test(buffer[nextNonSpace])) {
        ++nextNonSpace;
      }
      return { end, nextNonSpace };
    };

    while (i < len) {
      const c = buffer[i];
      updateStack(c, stack, i, buffer);

      // Only consider splitting if we're not inside any nested structure.
      if (stack.length === 0 && isSentenceTerminator(c)) {
        const currentSegment = buffer.slice(sentenceStart, i);
        // Skip splitting for likely numbered lists (e.g., "1." or "\n2.").
        if (/(^|\n)\d+$/.test(currentSegment)) {
          ++i;
          continue;
        }

        const { end: boundaryEnd, nextNonSpace } = scanBoundary(i);

        // If the terminator is not a newline and there's no extra whitespace,
        // we might be in the middle of a token (e.g., "$9.99"), so skip splitting.
        if (i === nextNonSpace - 1 && c !== "\n") {
          ++i;
          continue;
        }

        // Wait for more text if there's no non-whitespace character yet.
        if (nextNonSpace === len) {
          break;
        }

        // Determine the token immediately preceding the terminator.
        let tokenStart = i - 1;
        while (tokenStart >= 0 && /\S/.test(buffer[tokenStart])) {
          tokenStart--;
        }
        tokenStart = Math.max(sentenceStart, tokenStart + 1);
        const token = getTokenFromBuffer(buffer, tokenStart);
        if (!token) {
          ++i;
          continue;
        }

        // --- URL/email protection ---
        // If the token appears to be a URL or email (contains "://" or "@")
        // and does not already end with a terminator, skip splitting.
        // A newline terminator is excluded: the token then ends exactly at `i`, so
        // jumping to its end would not advance the scan (an endless loop), and
        // newlines are meant to split regardless.
        const looksLikeUrlOrEmail = /https?[,:]\/\//.test(token) || token.includes("@");
        if (c !== "\n" && looksLikeUrlOrEmail && !isSentenceTerminator(token.at(-1))) {
          // Math.max guarantees forward progress even for unexpected token shapes.
          i = Math.max(i + 1, tokenStart + token.length);
          continue;
        }

        // --- Abbreviation protection ---
        if (isAbbreviation(token)) {
          ++i;
          continue;
        }

        // --- Middle initials heuristic ---
        // If the token is a series of single-letter initials (each ending in a period)
        // and is followed by a capitalized word, assume it's part of a name.
        if (/^([A-Za-z]\.)+$/.test(token) && nextNonSpace < len && /[A-Z]/.test(buffer[nextNonSpace])) {
          ++i;
          continue;
        }

        // --- Lookahead heuristic ---
        // If the terminator is a period and the next nonwhitespace character is lowercase,
        // assume it is not the end of a sentence.
        if (c === "." && nextNonSpace < len && /[a-z]/.test(buffer[nextNonSpace])) {
          ++i;
          continue;
        }

        // Special case: ellipsis that stands alone should be merged with the following sentence.
        const sentence = buffer.substring(sentenceStart, boundaryEnd + 1).trim();
        if (sentence === "..." || sentence === "…") {
          ++i;
          continue;
        }

        // Accept the sentence boundary.
        if (sentence) {
          this._sentences.push(sentence);
        }

        // Move to the next sentence.
        i = sentenceStart = boundaryEnd + 1;
        continue;
      }
      ++i;
    }

    // Remove the processed portion of the buffer.
    this._buffer = buffer.substring(sentenceStart);

    // Resolve any pending promise if sentences are available.
    if (this._sentences.length > 0) {
      this._resolve();
    }
  }

  /**
   * Async iterator to yield sentences as they become available.
   * Finishes once the stream is closed and the queue is empty.
   * @returns {AsyncGenerator<string, void, void>}
   * @throws {Error} If another iterator is currently waiting for text.
   */
  async *[Symbol.asyncIterator]() {
    if (this._resolver) {
      throw new Error("Another iterator is already active.");
    }
    while (true) {
      if (this._sentences.length > 0) {
        yield this._sentences.shift();
      } else if (this._closed) {
        // No more text will be pushed.
        break;
      } else {
        // Wait for more text; push()/flush()/close() resolve this via _resolve().
        await new Promise((resolve) => {
          this._resolver = resolve;
        });
      }
    }
  }

  /**
   * Synchronous iterator that flushes the buffer and returns all sentences.
   * The internal queue is emptied; the returned iterator owns the sentences.
   * @returns {Iterator<string>}
   */
  [Symbol.iterator]() {
    this.flush();
    const iterator = this._sentences[Symbol.iterator]();
    this._sentences = [];
    return iterator;
  }

  /**
   * Returns the array of sentences currently available.
   * @type {string[]} The array of sentences.
   * @readonly
   */
  get sentences() {
    return this._sentences;
  }
}
/**
 * Convenience wrapper: feeds `text` through a fresh TextSplitterStream and
 * collects every sentence it produces (including the flushed remainder).
 * @param {string} text The text to split.
 * @returns {string[]} An array of sentences.
 */
export function split(text) {
  const stream = new TextSplitterStream();
  stream.push(text);
  return Array.from(stream);
}

View File

@@ -0,0 +1,654 @@
import { describe, test, expect } from "vitest";
import { TextSplitterStream, split } from "../src/splitter.js";
const TESTS = [
{
name: "Basic sentence splitting",
input: "This is a test. This is another test.",
target: ["This is a test.", "This is another test."],
},
{
name: "Sentence with dash (em dash)",
input: "This is a test — yes, it is.",
target: ["This is a test — yes, it is."],
},
{
name: "Sentences with quoted speech",
input: 'She said, "Hello there. How are you?". I replied, "I\'m fine."',
target: ['She said, "Hello there. How are you?".', 'I replied, "I\'m fine."'],
},
{
name: "Sentences with abbreviations",
input: "Dr. Smith is here. At 10 a.m. I saw him.",
target: ["Dr. Smith is here.", "At 10 a.m. I saw him."],
},
{
name: "Advanced sentences with abbreviations",
input: "I went to Dr. Smith this morning at 10 a.m. and said hi.",
target: ["I went to Dr. Smith this morning at 10 a.m. and said hi."],
},
{
name: "Abbreviations with possessive",
input: "The Dr.'s office.",
target: ["The Dr.'s office."],
},
{
name: "Ellipses in sentences",
input: "Wait... what just happened? I don't understand...",
target: ["Wait... what just happened?", "I don't understand..."],
},
{
name: "Sentences with numbers and decimals",
input: "The price is $4.99. Do you want to buy it?",
target: ["The price is $4.99.", "Do you want to buy it?"],
},
{
name: "Sentences starting and ending with numbers",
input: "10 people died in 2025. 20 people died in 2026.",
target: ["10 people died in 2025.", "20 people died in 2026."],
},
{
name: "Sentences with scientific notation",
input: "The star is 3.2×10^4 light-years away.",
target: ["The star is 3.2×10^4 light-years away."],
},
{
name: "Sentences with multiple punctuation marks",
input: "What?! Are you serious?! This is crazy...",
target: ["What?!", "Are you serious?!", "This is crazy..."],
},
{
name: "Sentences with parentheses",
input: "This is an example (which is quite useful). Do you agree?",
target: ["This is an example (which is quite useful).", "Do you agree?"],
},
{
name: "Nested sentences with parentheses",
input: "This is an example (This is pretty cool. Another sentence). Do you agree?",
target: ["This is an example (This is pretty cool. Another sentence).", "Do you agree?"],
},
{
name: "Sentences with newlines",
input: "First sentence.\nSecond sentence.\nThird sentence.",
target: ["First sentence.", "Second sentence.", "Third sentence."],
},
{
name: "Sentences with emojis",
input: "I love pizza! 🍕 Do you? 😊",
target: ["I love pizza!", "🍕 Do you?", "😊"],
},
{
name: "Sentences with unicode and non-Latin characters",
input: "これはテストです。 次の文です。",
target: ["これはテストです。", "次の文です。"],
},
{
name: "Sentences with bullet points",
input: "- First point.\n- Second point.\n- Third point.",
target: ["- First point.", "- Second point.", "- Third point."],
},
{
name: "Sentences with email addresses",
input: "My email is test@example.com. Contact me!",
target: ["My email is test@example.com.", "Contact me!"],
},
{
name: "Sentences with URLs",
input: "Visit https://example.com. It's a great site!",
target: ["Visit https://example.com.", "It's a great site!"],
},
{
name: "Sentences with URLs (subdomains)",
input: "Visit https://test.example.com. It's a great site!",
target: ["Visit https://test.example.com.", "It's a great site!"],
},
{
name: "Sentences with trailing spaces",
input: " This is a sentence. Another one. ",
target: ["This is a sentence.", "Another one."],
},
{
name: "Sentences with contractions",
input: "You can't be serious. It's too late.",
target: ["You can't be serious.", "It's too late."],
},
{
name: "Sentences with title case and proper nouns",
input: "Mr. Johnson went to New York. He loves it there.",
target: ["Mr. Johnson went to New York.", "He loves it there."],
},
{
name: "Sentences with mixed cases",
input: "i am happy. Are you?",
target: ["i am happy.", "Are you?"],
},
{
name: "Sentences with missing punctuation",
input: "This is a test without punctuation What should happen",
target: ["This is a test without punctuation What should happen"],
},
{
name: "Sentences with mixed symbols",
input: "Hello @John! How's it going? #excited",
target: ["Hello @John!", "How's it going?", "#excited"],
},
{
name: "Sentences with math expressions",
input: "The result is 3.14. It's an approximation of pi.",
target: ["The result is 3.14.", "It's an approximation of pi."],
},
{
name: "Excessive punctuation",
input: "Wait!!!! Are you sure??? This is insane!!! Right???",
target: ["Wait!!!!", "Are you sure???", "This is insane!!!", "Right???"],
},
{
name: "Mixed languages in one line",
input: "English sentence. 这是一句中文? Another English sentence!",
target: ["English sentence.", "这是一句中文?", "Another English sentence!"],
},
{
name: "Sequence of punctuation plus emoji",
input: "What??! 🤯Wait?? Hello!",
target: ["What??!", "🤯Wait??", "Hello!"],
},
{
name: "Nested parentheses and quotes",
input: '(This is "very (strange)" text). Right?',
target: ['(This is "very (strange)" text).', "Right?"],
},
{
name: "Sentence with ellipsis following a question mark",
input: "Are you coming? ... I don't know.",
target: ["Are you coming?", "... I don't know."],
},
{
name: "Sentence with mixed punctuation marks (colon, comma, question mark)",
input: "What do you think: Is this the answer, or not?",
target: ["What do you think: Is this the answer, or not?"],
},
{
name: "Sentence with parentheses and question mark",
input: "Did you understand (after all)?",
target: ["Did you understand (after all)?"],
},
{
name: "Sentence with repeated punctuation marks (exclamation)",
input: "What a great day!!! This is amazing!!!",
target: ["What a great day!!!", "This is amazing!!!"],
},
{
name: "Sentence with multiple short sentences and abbreviations",
input: "Dr. Lee is busy. Mr. Brown is in a meeting.",
target: ["Dr. Lee is busy.", "Mr. Brown is in a meeting."],
},
{
name: "Sentence with only emojis",
input: "🍕🍔🍟🍦",
target: ["🍕🍔🍟🍦"],
},
{
name: "Sentence with single quotes around a word",
input: "The word 'apple' is red.",
target: ["The word 'apple' is red."],
},
{
name: "Sentence with an email and a period",
input: "My email is example@domain.com. Please contact me.",
target: ["My email is example@domain.com.", "Please contact me."],
},
{
name: "Sentence with non-standard punctuation (pipe)",
input: "This | is | a | test.",
target: ["This | is | a | test."],
},
{
name: "Sentence with a URL and a period after it",
input: "You can find more info at https://www.website.com. Its reliable.",
target: ["You can find more info at https://www.website.com.", "Its reliable."],
},
{
name: "Sentence with multiple hashtags",
input: "I love coding! #developer #javascript #testing",
target: ["I love coding!", "#developer #javascript #testing"],
},
{
name: "Sentence with numbers and currency",
input: "I have $99.99 in my wallet. It's not enough.",
target: ["I have $99.99 in my wallet.", "It's not enough."],
},
{
name: "Sentence with mixed punctuation marks and parentheses",
input: "Are you sure (really)? I don't think so!",
target: ["Are you sure (really)?", "I don't think so!"],
},
{
name: "Sentence with parentheses and ellipses",
input: "This is a test (and its great)... seriously.",
target: ["This is a test (and its great)... seriously."],
},
{
name: "Sentence with an uncommon abbreviation",
input: "The event is scheduled for noon PST. Ill be there.",
target: ["The event is scheduled for noon PST.", "Ill be there."],
},
{
name: "Sentence with a phone number",
input: "Call me at 555-1234. Or email me at example@domain.com.",
target: ["Call me at 555-1234.", "Or email me at example@domain.com."],
},
{
name: "Sentence with nested punctuation (quotes inside quotes)",
input: 'He said, "It\'s a test," and left.',
target: ['He said, "It\'s a test," and left.'],
},
{
name: "Sentences only containing a quotation",
input: `"It's not like I'm using," Case heard someone say, as he shouldered his way through the crowd around the door of the Chat. "It's like my body's developed this massive drug deficiency."\nThis is a test.`,
target: [`"It's not like I'm using," Case heard someone say, as he shouldered his way through the crowd around the door of the Chat.`, `"It's like my body's developed this massive drug deficiency."`, "This is a test."],
},
{
name: "Sentence with a URL containing a question mark",
input: "Visit https://www.example.com?query=test. Its useful.",
target: ["Visit https://www.example.com?query=test.", "Its useful."],
},
{
name: "Sentence with mixed punctuation and commas",
input: "Hello, how are you? I'm fine, thanks.",
target: ["Hello, how are you?", "I'm fine, thanks."],
},
{
name: "Sentence with a comma before 'and'",
input: "I like ice cream, and I like cake.",
target: ["I like ice cream, and I like cake."],
},
{
name: "Sentence with capital letters inside parentheses",
input: "I went to the store (THE BIG ONE).",
target: ["I went to the store (THE BIG ONE)."],
},
{
name: "Sentence with dates and periods",
input: "The event is on January 1st. It's a new year.",
target: ["The event is on January 1st.", "It's a new year."],
},
{
name: "Sentence with suffixes and periods",
input: "Kokoro.js is powered by Transformers.js, a JavaScript library by Hugging Face.",
target: ["Kokoro.js is powered by Transformers.js, a JavaScript library by Hugging Face."],
},
{
name: "Non-splitting after a period",
input: "Pi is 3.14 i.e., a mathematical constant. J.R.R. Tolkien wrote The Lord of the Rings. Wait... what? The files are /path/to/file.txt, VIDEO.MP4 and image.jpg.",
target: ["Pi is 3.14 i.e., a mathematical constant.", "J.R.R. Tolkien wrote The Lord of the Rings.", "Wait... what?", "The files are /path/to/file.txt, VIDEO.MP4 and image.jpg."],
},
{
name: "Long text with multiple sentences",
input: `The sky above the port was the color of television, tuned to a dead channel.\n"It's not like I'm using," Case heard someone say, as he shouldered his way through the crowd around the door of the Chat. "It's like my body's developed this massive drug deficiency."\nIt was a Sprawl voice and a Sprawl joke. The Chatsubo was a bar for professional expatriates; you could drink there for a week and never hear two words in Japanese.\nThese were to have an enormous impact, not only because they were associated with Constantine, but also because, as in so many other areas, the decisions taken by Constantine (or in his name) were to have great significance for centuries to come. One of the main issues was the shape that Christian churches were to take, since there was not, apparently, a tradition of monumental church buildings when Constantine decided to help the Christian church build a series of truly spectacular structures. The main form that these churches took was that of the basilica, a multipurpose rectangular structure, based ultimately on the earlier Greek stoa, which could be found in most of the great cities of the empire. Christianity, unlike classical polytheism, needed a large interior space for the celebration of its religious services, and the basilica aptly filled that need. We naturally do not know the degree to which the emperor was involved in the design of new churches, but it is tempting to connect this with the secular basilica that Constantine completed in the Roman forum (the so-called Basilica of Maxentius) and the one he probably built in Trier, in connection with his residence in the city at a time when he was still caesar.`,
target: [
"The sky above the port was the color of television, tuned to a dead channel.",
"\"It's not like I'm using,\" Case heard someone say, as he shouldered his way through the crowd around the door of the Chat.",
"\"It's like my body's developed this massive drug deficiency.\"",
"It was a Sprawl voice and a Sprawl joke.",
"The Chatsubo was a bar for professional expatriates; you could drink there for a week and never hear two words in Japanese.",
"These were to have an enormous impact, not only because they were associated with Constantine, but also because, as in so many other areas, the decisions taken by Constantine (or in his name) were to have great significance for centuries to come.",
"One of the main issues was the shape that Christian churches were to take, since there was not, apparently, a tradition of monumental church buildings when Constantine decided to help the Christian church build a series of truly spectacular structures.",
"The main form that these churches took was that of the basilica, a multipurpose rectangular structure, based ultimately on the earlier Greek stoa, which could be found in most of the great cities of the empire.",
"Christianity, unlike classical polytheism, needed a large interior space for the celebration of its religious services, and the basilica aptly filled that need.",
"We naturally do not know the degree to which the emperor was involved in the design of new churches, but it is tempting to connect this with the secular basilica that Constantine completed in the Roman forum (the so-called Basilica of Maxentius) and the one he probably built in Trier, in connection with his residence in the city at a time when he was still caesar.",
],
},
];
// Tests adapted from https://github.com/textlint-rule/sentence-splitter/blob/master/test/sentence-splitter-test.ts
// Each row is [name, input, target]; rows are expanded into the
// `{ name, input, target }` objects that the suites below destructure.
TESTS.push(
  ...[
    ["Basic sentence splitting", "text", ["text"]],
    ["Should not split number", "Temperature is 40.2 degrees.", ["Temperature is 40.2 degrees."]],
    [
      "Should not split in pair string with same mark",
      'I hear "I\'m back to home." from radio.',
      ['I hear "I\'m back to home." from radio.'],
    ],
    ["Should not split in pair string", "彼は「ココにある。」と言った。", ["彼は「ココにある。」と言った。"]],
    // Disabled case, kept for reference (presumably not supported by the splitter yet — confirm):
    // [
    //   "Should not split in pair string and correct after sentence",
    //   "彼は「ココにある。」と言った。だけではそれは違った。",
    //   ["彼は「ココにある。」と言った。", "だけではそれは違った。"],
    // ],
    // NOTE(review): this input contains no line break, so it currently repeats the
    // basic case above — confirm whether "\ntext" was intended.
    ["Should split by first line break", "text", ["text"]],
    ["Should split by last line break", "text\n", ["text"]],
    ["Should split by double line break", "text\n\ntext", ["text", "text"]],
    ["Should split by 。", "text。。text", ["text。。", "text"]],
    ["Should split by 。 and linebreak", "text。\ntext", ["text。", "text"]],
    ["Should split by . and whitespace", "1st text. 2nd text", ["1st text.", "2nd text"]],
    ["Should split by multiple whitespaces", "1st text. 2nd text", ["1st text.", "2nd text"]],
    ["Should support start and end whitespace", " text. ", ["text."]],
    ["Should split by text, whitespaces, and newline", "1st text. \n 2nd text", ["1st text.", "2nd text"]],
    ["Should split by !?", "text!?text", ["text!?", "text"]],
    ["Should split by last 。", "text。", ["text。"]],
    // The "N." list markers must stay attached to their items; only the newlines separate entries.
    [
      "Should not split numbered list",
      "1. 1st text.\n2. 2nd text.\n10. 10th text.",
      ["1. 1st text.", "2. 2nd text.", "10. 10th text."],
    ],
  ].map(([name, input, target]) => ({ name, input, target })),
);
// Tests adapted from https://github.com/wikimedia/sentencex-js/blob/main/test/en.test.js
// English-language cases. Each entry is `{ name, input, target }`, where `target`
// is the exact list of sentences expected for `input`.
TESTS.push(
  // --- Titles and basic terminators ---
  {
    name: "Dr. title should not split",
    input: "This is Dr. Watson",
    target: ["This is Dr. Watson"],
  },
  {
    name: "Basic sentence split",
    input: "Roses Are Red. Violets Are Blue",
    target: ["Roses Are Red.", "Violets Are Blue"],
  },
  {
    name: "Exclamation and question split",
    input: "Hello! How are you?",
    target: ["Hello!", "How are you?"],
  },
  {
    name: "Simple period split",
    input: "This is a test.",
    target: ["This is a test."],
  },
  {
    name: "Mr. title should not split",
    input: "Mr. Smith went to Washington.",
    target: ["Mr. Smith went to Washington."],
  },
  {
    name: "Words ending in title-like suffixes should split",
    input: "He hit the drums. Then he hit the cymbals.",
    target: ["He hit the drums.", "Then he hit the cymbals."],
  },
  {
    name: "Surprise sentence should not split",
    input: "What a suprise?!",
    target: ["What a suprise?!"],
  },
  {
    name: "Ellipsis should not split",
    input: "That's all folks...",
    target: ["That's all folks..."],
  },
  // --- Line breaks ---
  {
    name: "Single line break should split",
    input: "First line\nSecond line",
    target: ["First line", "Second line"],
  },
  {
    name: "Double line break should split",
    input: "First line\nSecond line\n\nThird line",
    target: ["First line", "Second line", "Third line"],
  },
  // --- Abbreviations, numbers and names ---
  {
    // The expected output ends the first sentence right after "UK.", so the
    // title describes a split rather than the absence of one.
    name: "Uppercase abbreviation at sentence end should split",
    input: "This is UK. Not US",
    target: ["This is UK.", "Not US"],
  },
  {
    name: "Dollar amount should not split",
    input: "This balloon costs $1.20",
    target: ["This balloon costs $1.20"],
  },
  {
    name: "Basic multiple sentence split",
    input: "Hello World. My name is Jonas.",
    target: ["Hello World.", "My name is Jonas."],
  },
  {
    name: "Basic question and sentence split",
    input: "What is your name? My name is Jonas.",
    target: ["What is your name?", "My name is Jonas."],
  },
  {
    name: "Exclamation and period split",
    input: "There it is! I found it.",
    target: ["There it is!", "I found it."],
  },
  {
    name: "Middle initial should not split",
    input: "My name is Jonas E. Smith.",
    target: ["My name is Jonas E. Smith."],
  },
  {
    name: "Page reference should not split",
    input: "Please turn to p. 55.",
    target: ["Please turn to p. 55."],
  },
  {
    name: "Co. abbreviation should not split",
    input: "Were Jane and co. at the party?",
    target: ["Were Jane and co. at the party?"],
  },
  {
    name: "Business name should not split",
    input: "They closed the deal with Pitt, Briggs & Co. at noon.",
    target: ["They closed the deal with Pitt, Briggs & Co. at noon."],
  },
  {
    name: "Mount abbreviation should not split",
    input: "I can see Mt. Fuji from here.",
    target: ["I can see Mt. Fuji from here."],
  },
  {
    name: "Saint abbreviation should not split",
    input: "St. Michael's Church is on 5th st. near the light.",
    target: ["St. Michael's Church is on 5th st. near the light."],
  },
  {
    name: "JFK Jr. should not split",
    input: "That is JFK Jr.'s book.",
    target: ["That is JFK Jr.'s book."],
  },
  {
    name: "Country abbreviation should not split",
    input: "I visited the U.S.A. last year.",
    target: ["I visited the U.S.A. last year."],
  },
  {
    name: "Dollar amount with period split",
    input: "She has $100.00. It is in her bag.",
    target: ["She has $100.00.", "It is in her bag."],
  },
  // --- Emails and URLs: inner dots must not split, the final period still ends the sentence ---
  {
    name: "Email should not split",
    input: "Her email is Jane.Doe@example.com. I sent her an email.",
    target: ["Her email is Jane.Doe@example.com.", "I sent her an email."],
  },
  {
    name: "URL should not split",
    input: "The site is, https://www.example.50.com/new-site/awesome_content.html. Please check it out.",
    target: ["The site is, https://www.example.50.com/new-site/awesome_content.html.", "Please check it out."],
  },
  // Disabled case, kept for reference (presumably not supported by the splitter yet — confirm):
  // {
  //   name: "Yahoo! should not split",
  //   input: "She works at Yahoo! in the accounting department.",
  //   target: ["She works at Yahoo! in the accounting department."],
  // },
  // --- Repeated / mixed terminators ---
  {
    name: "Multiple exclamations should split",
    input: "Hello!! Long time no see.",
    target: ["Hello!!", "Long time no see."],
  },
  {
    name: "Mixed punctuation should split",
    input: "Hello?! Is that you?",
    target: ["Hello?!", "Is that you?"],
  },
  // Disabled case, kept for reference (presumably not supported by the splitter yet — confirm):
  // {
  //   name: "Numbered reference should not split",
  //   input: "Saint Maximus (died 250) is a Christian saint and martyr.[1] The emperor Decius published a decree ordering the veneration of busts of the deified emperors.",
  //   target: ["Saint Maximus (died 250) is a Christian saint and martyr.[1]", "The emperor Decius published a decree ordering the veneration of busts of the deified emperors."],
  // },
);
/**
 * Hand-written streaming cases.
 *
 * Each row is [name, input, target]: `input` is the list of text chunks handed
 * to the splitter and `target` is the list of sentences expected back. The
 * chunk boundaries deliberately fall inside a price ("$1." + "99") and inside
 * a URL, so a period at the end of a chunk is not necessarily a sentence end.
 */
const STREAMED_TESTS = [
  [
    "Basic sentence splitting",
    ["I went", " to the", " store. I", " bought an apple for $1.", "99. It was", " a good deal."],
    ["I went to the store.", "I bought an apple for $1.99.", "It was a good deal."],
  ],
  [
    "URL with query parameters",
    ["Visit https://www", ".example.", "com", "?query=test."],
    ["Visit https://www.example.com?query=test."],
  ],
].map(([name, input, target]) => ({ name, input, target }));
describe("Sentence splitting", () => {
// One test per table entry: the whole input is split in a single `split()` call.
describe("synchronous", () => {
  TESTS.forEach(({ name, input, target }) => {
    test(name, () => {
      const actual = split(input);
      expect(actual).toEqual(target);
    });
  });
});
// Exercises both iteration protocols of TextSplitterStream.
describe("for loop", () => {
  // Collects whatever a plain `for ... of` over the splitter yields right now.
  const drain = (source) => {
    const collected = [];
    for (const sentence of source) {
      collected.push(sentence);
    }
    return collected;
  };

  test("synchronous for ... of", () => {
    const splitter = new TextSplitterStream();

    // 1. Initial text, then consume the current stream.
    splitter.push("Hello, how are you? I'm fine, thanks.");
    expect(drain(splitter)).toEqual(["Hello, how are you?", "I'm fine, thanks."]);

    // 2. Consume again: the unfinished tail is expected to be handed out as-is.
    splitter.push("This is a test. This is unfinish-");
    expect(drain(splitter)).toEqual(["This is a test.", "This is unfinish-"]);

    // 3. Consume again: the tail was already handed out in step 2, so only the new text is expected.
    splitter.push("ed.");
    expect(drain(splitter)).toEqual(["ed."]);
  });

  test("asynchronous for ... of", async () => {
    const splitter = new TextSplitterStream();

    // Initial text, available before the consumer starts.
    splitter.push("Hello, how are");

    // Start consuming right away; the loop is expected to run until `close()` is called.
    const received = [];
    const consumer = (async () => {
      for await (const sentence of splitter) {
        received.push(sentence);
      }
    })();

    // Feed the remaining text in timed steps (delay in ms), then close the stream.
    const schedule = [
      [10, () => splitter.push(" you? I'm fine, thanks.")],
      [20, () => splitter.push(" This is a test. This is unfinish-")],
      [30, () => splitter.push("ed.")],
      [40, () => splitter.close()],
    ];
    for (const [delayMs, action] of schedule) {
      setTimeout(action, delayMs);
    }

    await consumer;
    expect(received).toEqual(["Hello, how are you?", "I'm fine, thanks.", "This is a test.", "This is unfinish-ed."]);
  });
});
// Feeds each case to a TextSplitterStream chunk by chunk and checks the
// sentences collected through the async iterator.
describe("streaming", () => {
  const tests = [
    // Pre-defined test cases
    ...STREAMED_TESTS,
    // Test that adding character by character (the most extreme case) also works correctly.
    // The title gets a suffix so these cases cannot collide with a STREAMED_TESTS title
    // (both tables contain "Basic sentence splitting"), which would make reports ambiguous.
    ...TESTS.map(({ name, input, target }) => ({
      name: `${name} (character by character)`,
      input: Array.from(input),
      target,
    })),
  ];
  for (const { name, input, target } of tests) {
    test(name, async () => {
      const streamer = new TextSplitterStream();
      const sentences = [];
      // Start the consumer first so it is already waiting when the chunks arrive.
      const consumeStream = (async () => {
        for await (const sentence of streamer) {
          sentences.push(sentence);
        }
      })();
      streamer.push(...input);
      // Closing lets the `for await` loop finish so `consumeStream` resolves.
      streamer.close();
      await consumeStream;
      expect(sentences).toEqual(target);
    });
  }
});
});