diff --git a/src/cpp/piper.cpp b/src/cpp/piper.cpp index 00d4a47ac..da529be94 100644 --- a/src/cpp/piper.cpp +++ b/src/cpp/piper.cpp @@ -69,6 +69,8 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) { auto phonemeTypeStr = configRoot["phoneme_type"].get(); if (phonemeTypeStr == "text") { phonemizeConfig.phonemeType = TextPhonemes; + } else if (phonemeTypeStr == "raw") { + phonemizeConfig.phonemeType = RawPhonemes; } } @@ -467,7 +469,16 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text, spdlog::debug("Phonemizing text: {}", text); std::vector> phonemes; - if (voice.phonemizeConfig.phonemeType == eSpeakPhonemes) { + if (voice.phonemizeConfig.phonemeType == RawPhonemes) { + // Use text as raw utf-8 phonemes + phonemes.emplace_back(); + // UTF-8 wide + std::wstring_convert> converter; + std::wstring wide_text = converter.from_bytes(text); + for (wchar_t c : wide_text) { + phonemes.back().push_back(c); // Adds each wide character as a phoneme + } + } else if (voice.phonemizeConfig.phonemeType == eSpeakPhonemes) { // Use espeak-ng for phonemization eSpeakPhonemeConfig eSpeakConfig; eSpeakConfig.voice = voice.phonemizeConfig.eSpeak.voice; diff --git a/src/cpp/piper.hpp b/src/cpp/piper.hpp index 7b956f798..2f7280e14 100644 --- a/src/cpp/piper.hpp +++ b/src/cpp/piper.hpp @@ -34,7 +34,7 @@ struct PiperConfig { std::unique_ptr tashkeelState; }; -enum PhonemeType { eSpeakPhonemes, TextPhonemes }; +enum PhonemeType { eSpeakPhonemes, TextPhonemes, RawPhonemes }; struct PhonemizeConfig { PhonemeType phonemeType = eSpeakPhonemes; diff --git a/src/python/piper_train/preprocess.py b/src/python/piper_train/preprocess.py index 40c20c992..539a529b2 100644 --- a/src/python/piper_train/preprocess.py +++ b/src/python/piper_train/preprocess.py @@ -103,6 +103,9 @@ def main() -> None: parser.add_argument( "--debug", action="store_true", help="Print DEBUG messages to the console" ) + parser.add_argument( + "--raw-phonemes", action="store_true", help="Raw espeak compatible phonemes" + ) args = parser.parse_args() if args.single_speaker and (args.speaker_id is not None): @@ -299,7 +302,10 @@ def phonemize_batch_espeak( utt.text = tashkeel_run(utt.text) _LOGGER.debug(utt) - all_phonemes = phonemize_espeak(casing(utt.text), args.language) + if args.raw_phonemes: + all_phonemes = [utt.text.split()] + else: + all_phonemes = phonemize_espeak(casing(utt.text), args.language) # Flatten utt.phonemes = [