Fix: add text chunking for non-English language pipeline (#105)

Co-authored-by: Your Name <your.email@example.com>
2025-02-20 03:15:21 +01:00
parent e648c0605a
commit 2dd9df6779
1 changed files with 55 additions and 10 deletions
--- a/kokoro/pipeline.py
+++ b/kokoro/pipeline.py
@@ -342,7 +342,7 @@ class KPipeline:
        self,
        text: Union[str, List[str]],
        voice: Optional[str] = None,
-        speed: float = 1,
+        speed: Number = 1,
        split_pattern: Optional[str] = r'\n+',
        model: Optional[KModel] = None
    ) -> Generator['KPipeline.Result', None, None]:
@@ -350,10 +350,17 @@ class KPipeline:
        if model and voice is None:
            raise ValueError('Specify a voice: en_us_pipeline(text="Hello world!", voice="af_heart")')
        pack = self.load_voice(voice).to(model.device) if model else None
        # Convert input to list of segments
        if isinstance(text, str):
            text = re.split(split_pattern, text.strip()) if split_pattern else [text]
        # Process each segment
        for graphemes in text:
-            # TODO(misaki): Unify G2P interface between English and non-English
+            if not graphemes.strip():  # Skip empty segments
                continue
            # English processing (unchanged)
            if self.lang_code in 'ab':
                logger.debug(f"Processing English text: {graphemes[:50]}{'...' if len(graphemes) > 50 else ''}")
                _, tokens = self.g2p(graphemes)
@@ -367,12 +374,50 @@ class KPipeline:
                    if output is not None and output.pred_dur is not None:
                        KPipeline.join_timestamps(tks, output.pred_dur)
                    yield self.Result(graphemes=gs, phonemes=ps, tokens=tks, output=output)
            # Non-English processing with chunking
            else:
-                ps = self.g2p(graphemes)
+                # Split long text into smaller chunks (roughly 400 characters each)
-                if not ps:
+                # Using sentence boundaries when possible
-                    continue
+                chunk_size = 400
-                elif len(ps) > 510:
+                chunks = []
-                    logger.warning(f'Truncating len(ps) == {len(ps)} > 510')
+                
-                    ps = ps[:510]
+                # Try to split on sentence boundaries first
-                output = KPipeline.infer(model, ps, pack, speed) if model else None
+                sentences = re.split(r'([.!?]+)', graphemes)
-                yield self.Result(graphemes=graphemes, phonemes=ps, output=output)
+                current_chunk = ""
                for i in range(0, len(sentences), 2):
                    sentence = sentences[i]
                    # Add the punctuation back if it exists
                    if i + 1 < len(sentences):
                        sentence += sentences[i + 1]
                    if len(current_chunk) + len(sentence) <= chunk_size:
                        current_chunk += sentence
                    else:
                        if current_chunk:
                            chunks.append(current_chunk.strip())
                        current_chunk = sentence
                if current_chunk:
                    chunks.append(current_chunk.strip())
                # If no chunks were created (no sentence boundaries), fall back to character-based chunking
                if not chunks:
                    chunks = [graphemes[i:i+chunk_size] for i in range(0, len(graphemes), chunk_size)]
                # Process each chunk
                for chunk in chunks:
                    if not chunk.strip():
                        continue
                    ps = self.g2p(chunk)
                    if not ps:
                        continue
                    elif len(ps) > 510:
                        logger.warning(f'Truncating len(ps) == {len(ps)} > 510')
                        ps = ps[:510]
                    output = KPipeline.infer(model, ps, pack, speed) if model else None
                    yield self.Result(graphemes=chunk, phonemes=ps, output=output)