Allow pipeline to take a voice style tensor directly. (#93)

2025-02-15 00:48:08 -06:00
parent 1145c0b7f6
commit 330d110c05
2 changed files with 12 additions and 1 deletions
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@ You can run this cell on [Google Colab](https://colab.research.google.com/). [Li
 from kokoro import KPipeline
 from IPython.display import display, Audio
 import soundfile as sf
+import torch
 # 🇺🇸 'a' => American English, 🇬🇧 'b' => British English
 # 🇯🇵 'j' => Japanese: pip install misaki[ja]
 # 🇨🇳 'z' => Mandarin Chinese: pip install misaki[zh]
@@ -49,6 +50,14 @@ generator = pipeline(
    text, voice='af_heart', # <= change voice here
    speed=1, split_pattern=r'\n+'
 )
+
+# Alternatively, load a voice tensor from a file and pass it directly as `voice`:
+voice_tensor = torch.load('path/to/voice.pt', weights_only=True)
+generator = pipeline(
+    text, voice=voice_tensor,
+    speed=1, split_pattern=r'\n+'
+)
+
 for i, (gs, ps, audio) in enumerate(generator):
    print(i)  # i => index
    print(gs) # gs => graphemes/text
--- a/kokoro/pipeline.py
+++ b/kokoro/pipeline.py
@@ -146,7 +146,9 @@ class KPipeline:
    If multiple voices are requested, they are averaged.
    Delimiter is optional and defaults to ','.
    """
-    def load_voice(self, voice: str, delimiter: str = ",") -> torch.FloatTensor:
+    def load_voice(self, voice: Union[str, torch.FloatTensor], delimiter: str = ",") -> torch.FloatTensor:
+        if isinstance(voice, torch.FloatTensor):
+            return voice
        if voice in self.voices:
            return self.voices[voice]
        logger.debug(f"Loading voice: {voice}")