VITS-Umamusume-voice-synthesizer

Runtime error

App Files Files Community

cymic committed on Feb 10, 2023

Commit

0d3dcb0

1 Parent(s): 252a6de

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -89

app.py CHANGED Viewed

@@ -26,7 +26,6 @@ from text.cleaners import japanese_cleaners
 from gradio import routes
 from typing import List, Type
 import os
-os.system('pip install gradio==3.18.0')
 def audio_postprocess(self, y):
     if y is None:
@@ -242,31 +241,6 @@ download_audio_js = """
 }}
 """
-def monkey_patch():
-    def postprocess(self, y):
-                    """
-                    Any postprocessing needed to be performed on a block context.
-                    """
-                    return y
-    gr.blocks.BlockContext.postprocess = postprocess
-def get_types(cls_set: List[Type], component: str):
-    docset = []
-    types = []
-    if component == "input":
-        for cls in cls_set:
-            doc = inspect.getdoc(cls)
-            doc_lines = doc.split("\n")
-            docset.append(doc_lines[1].split(":")[-1])
-            types.append(doc_lines[1].split(")")[0].split("(")[-1])
-    else:
-        for cls in cls_set:
-            doc = inspect.getdoc(cls)
-            doc_lines = doc.split("\n")
-            docset.append(doc_lines[-1].split(":")[-1])
-            types.append(doc_lines[-1].split(")")[0].split("(")[-1])
-    return docset, types
-routes.get_types = get_types
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -274,18 +248,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
     app = gr.Blocks()
     with app:
-        gr.Markdown("# Umamusume voice synthesizer 赛马娘语音合成器\n\n"
-                    "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=Plachta.VITS-Umamusume-voice-synthesizer)\n\n"
-                    "This synthesizer is created based on [VITS](https://arxiv.org/abs/2106.06103) model, trained on voice data extracted from mobile game Umamusume Pretty Derby \n\n"
-                    "这个合成器是基于VITS文本到语音模型，在从手游《賽馬娘：Pretty Derby》解包的语音数据上训练得到。[Dataset Link](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)\n\n"
-                    "[introduction video / 模型介绍视频](https://www.bilibili.com/video/BV1T84y1e7p5/?vd_source=6d5c00c796eff1cbbe25f1ae722c2f9f#reply607277701)\n\n"
-                    "You may duplicate this space or [open in Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing) to run it privately and without any queue.\n\n"
-                    "您可以复制该空间至私人空间运行或打开[Google Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing)在线运行。\n\n"
-                    "If you have any suggestions or bug reports, feel free to open discussion in [Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions).\n\n"
-                    "若有bug反馈或建议，请在[Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions)下开启一个新的Discussion。 \n\n"
-                    "If your input language is not Japanese, it will be translated to Japanese by Google translator, but accuracy is not guaranteed.\n\n"
-                    "如果您的输入语言不是日语，则会由谷歌翻译自动翻译为日语，但是准确性不能保证。\n\n"
-                    )
         with gr.Row():
             with gr.Column():
                 # We instantiate the Textbox class
@@ -333,35 +296,34 @@ if __name__ == "__main__":
                 noise_scale_w_slider = gr.Slider(minimum=0.1, maximum=5, value=0.8, step=0.1, label='噪声偏差 noise_scale_w')
-            with gr.Column():
-                text_output = gr.Textbox(label="Output Text")
-                phoneme_output = gr.Textbox(label="Output Phonemes", interactive=False)
-                audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
-                btn = gr.Button("Generate!")
-                cus_dur_gn_btn = gr.Button("Regenerate with custom phoneme durations")
-                download = gr.Button("Download Audio")
-                download.click(None, [], [], _js=download_audio_js.format(audio_id="tts-audio"), api_name="download_audio")
-                with gr.Accordion(label="Speaking Pace Control", open=True):
-                    duration_output = gr.Textbox(label="Duration of each phoneme", placeholder="After you generate a sentence, the detailed information of each phoneme's duration will be presented here.",
-                                                interactive = True)
-                    gr.Markdown(
-                        "The number after the : mark represents the length of each phoneme in the generated audio, while the number inside ( ) represents the lenght of spacing between each phoneme and its next phoneme. "
-                        "You can manually change the numbers to adjust the length of each phoneme, so that speaking pace can be completely controlled. "
-                        "Note that these numbers should be integers only. \n\n(1 represents a length of 0.01161 seconds)\n\n"
-                        "音素冒号后的数字代表音素在生成音频中的长度，( )内的数字代表每个音素与下一个音素之间间隔的长度。"
-                        "您可以手动修改这些数字来控制每个音素以及间隔的长度，从而完全控制合成音频的说话节奏。"
-                        "注意这些数字只能是整数。 \n\n(1 代表 0.01161 秒的长度)\n\n"
-                    )
-                monkey_patch()
-                btn.click(infer, inputs=[textbox, char_dropdown, language_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider, symbol_input],
-                  outputs=[text_output, audio_output, phoneme_output, duration_output])#, api_name="1")
-                cus_dur_gn_btn.click(infer_from_phoneme_dur, inputs=[duration_output, char_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider],
-                          outputs=[phoneme_output, audio_output])#, api_name="2")
         examples = [['haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......', '29:米浴', '日本語', 1, 0.667, 0.8, True],
                     ['お疲れ様です，トレーナーさん。', '1:无声铃鹿', '日本語', 1, 0.667, 0.8, False],
@@ -377,30 +339,6 @@ if __name__ == "__main__":
             outputs=[text_output, audio_output],
             fn=infer
         )
-        gr.Markdown("# Updates Logs 更新日志：\n\n"
-                   "2023/1/24：\n\n"
-                   "Improved the format of phoneme length control.\n\n"
-                   "改善了音素控制的格式。\n\n"
-                   "2023/1/24：\n\n"
-                   "Added more precise control on pace of speaking by modifying the duration of each phoneme.\n\n"
-                   "增加了对说话节奏的音素级控制。\n\n"
-                   "2023/1/13：\n\n"
-                   "Added one example of phoneme input.\n\n"
-                   "增加了音素输入的example（米浴喘气）\n\n"
-                   "2023/1/12：\n\n"
-                   "Added phoneme input, which enables more precise control on output audio.\n\n"
-                   "增加了音素输入的功能，可以对语气和语调做到一定程度的精细控制。\n\n"
-                   "Adjusted UI arrangements.\n\n"
-                   "调整了UI的布局。\n\n"
-                   "2023/1/10：\n\n"
-                   "Dataset used for training is now uploaded to [here](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)\n\n"
-                   "数据集已上传，您可以在[这里](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)下载。\n\n"
-                   "2023/1/9：\n\n"
-                   "Model inference has been fully converted to onnxruntime. There will be no more Runtime Error: Memory Limit Exceeded\n\n"
-                   "模型推理已全面转为onnxruntime，现在不会出现Runtime Error: Memory Limit Exceeded了。\n\n"
-                   "Now integrated to [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts) collection.\n\n"
-                   "现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
-                   )
     ifa = gr.Interface(lambda: None, inputs=[textbox], outputs=[text_output])
     app.queue(concurrency_count=3).launch(show_api=True, share=args.share)

 from gradio import routes
 from typing import List, Type
 import os
 def audio_postprocess(self, y):
     if y is None:
 }}
 """
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     args = parser.parse_args()
     app = gr.Blocks()
     with app:
+        gr.Markdown("# Umamusume voice synthesizer 赛马娘语音合成器\n\n")
         with gr.Row():
             with gr.Column():
                 # We instantiate the Textbox class
                 noise_scale_w_slider = gr.Slider(minimum=0.1, maximum=5, value=0.8, step=0.1, label='噪声偏差 noise_scale_w')
+            text_output = gr.Textbox(label="Output Text")
+            phoneme_output = gr.Textbox(label="Output Phonemes", interactive=False)
+            audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
+            btn = gr.Button("Generate!")
+            cus_dur_gn_btn = gr.Button("Regenerate with custom phoneme durations")
+            download = gr.Button("Download Audio")
+            download.click(None, [], [], _js=download_audio_js.format(audio_id="tts-audio"), api_name="download_audio")
+            with gr.Accordion(label="Speaking Pace Control", open=True):
+                duration_output = gr.Textbox(label="Duration of each phoneme", placeholder="After you generate a sentence, the detailed information of each phoneme's duration will be presented here.",
+                                            interactive = True)
+            gr.Markdown(
+                "The number after the : mark represents the length of each phoneme in the generated audio, while the number inside ( ) represents the lenght of spacing between each phoneme and its next phoneme. "
+                "You can manually change the numbers to adjust the length of each phoneme, so that speaking pace can be completely controlled. "
+                "Note that these numbers should be integers only. \n\n(1 represents a length of 0.01161 seconds)\n\n"
+                "音素冒号后的数字代表音素在生成音频中的长度，( )内的数字代表每个音素与下一个音素之间间隔的长度。"
+                "您可以手动修改这些数字来控制每个音素以及间隔的长度，从而完全控制合成音频的说话节奏。"
+                "注意这些数字只能是整数。 \n\n(1 代表 0.01161 秒的长度)\n\n"
+            )
+        monkey_patch()
+        btn.click(infer, inputs=[textbox, char_dropdown, language_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider, symbol_input],
+          outputs=[text_output, audio_output, phoneme_output, duration_output])#, api_name="1")
+        cus_dur_gn_btn.click(infer_from_phoneme_dur, inputs=[duration_output, char_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider],
+                  outputs=[phoneme_output, audio_output])#, api_name="2")
         examples = [['haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......', '29:米浴', '日本語', 1, 0.667, 0.8, True],
                     ['お疲れ様です，トレーナーさん。', '1:无声铃鹿', '日本語', 1, 0.667, 0.8, False],
             outputs=[text_output, audio_output],
             fn=infer
         )
     ifa = gr.Interface(lambda: None, inputs=[textbox], outputs=[text_output])
     app.queue(concurrency_count=3).launch(show_api=True, share=args.share)