Add Result.text_index to be able to map segments to paragraphs (#111)

* Add Result.text_index to be able to map segments to paragraphs
* Fix the `speed` parameter type annotation (`Number` -> `float`), re: #105
2025-02-23 19:30:25 +03:00
parent 2dd9df6779
commit 52f7eb740b
1 changed files with 5 additions and 4 deletions
--- a/kokoro/pipeline.py
+++ b/kokoro/pipeline.py
@@ -316,6 +316,7 @@ class KPipeline:
        phonemes: str
        tokens: Optional[List[en.MToken]] = None
        output: Optional[KModel.Output] = None
+        text_index: Optional[int] = None

        @property
        def audio(self) -> Optional[torch.FloatTensor]:
@@ -342,7 +343,7 @@ class KPipeline:
        self,
        text: Union[str, List[str]],
        voice: Optional[str] = None,
-        speed: Number = 1,
+        speed: float = 1,
        split_pattern: Optional[str] = r'\n+',
        model: Optional[KModel] = None
    ) -> Generator['KPipeline.Result', None, None]:
@@ -356,7 +357,7 @@ class KPipeline:
            text = re.split(split_pattern, text.strip()) if split_pattern else [text]
            
        # Process each segment
-        for graphemes in text:
+        for graphemes_index, graphemes in enumerate(text):
            if not graphemes.strip():  # Skip empty segments
                continue
                
@@ -373,7 +374,7 @@ class KPipeline:
                    output = KPipeline.infer(model, ps, pack, speed) if model else None
                    if output is not None and output.pred_dur is not None:
                        KPipeline.join_timestamps(tks, output.pred_dur)
-                    yield self.Result(graphemes=gs, phonemes=ps, tokens=tks, output=output)
+                    yield self.Result(graphemes=gs, phonemes=ps, tokens=tks, output=output, text_index=graphemes_index)
            
            # Non-English processing with chunking
            else:
@@ -419,5 +420,5 @@ class KPipeline:
                        ps = ps[:510]
                        
                    output = KPipeline.infer(model, ps, pack, speed) if model else None
-                    yield self.Result(graphemes=chunk, phonemes=ps, output=output)
+                    yield self.Result(graphemes=chunk, phonemes=ps, output=output, text_index=graphemes_index)