Text-to-Audio
Audiocraft
English
audiogen
styletts2
shift-tts
sound
audio-generation
text-to-speech
mimic3
Instructions to use dkounadis/artificial-styletts2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Audiocraft
How to use dkounadis/artificial-styletts2 with Audiocraft:
```python
from audiocraft.models import AudioGen

model = AudioGen.get_pretrained("dkounadis/artificial-styletts2")
model.set_generation_params(duration=5)  # generate 5 seconds.
descriptions = ['dog barking', 'sirene of an emergency vehicle', 'footsteps in a corridor']
wav = model.generate(descriptions)  # generates 3 samples.
```
- Notebooks
- Google Colab
- Kaggle
| # https://github.com/audeering/shift/tree/main - MAKE Mimic-3 voice / harvard 1x 4x | |
| import shutil | |
| import csv | |
| import io | |
| import os | |
| import typing | |
| import wave | |
| import sys | |
| from mimic3_tts.__main__ import (CommandLineInterfaceState, | |
| get_args, | |
| initialize_args, | |
| initialize_tts, | |
| # print_voices, | |
| # process_lines, | |
| shutdown_tts, | |
| OutputNaming, | |
| process_line) | |
| import time | |
| import json | |
| import os | |
| import numpy as np | |
| from pathlib import Path | |
| import audiofile | |
| # ================================================ LIST OF VOICES | |
ROOT_DIR = '/data/dkounadis/mimic3-voices/'


def _collect_voices(root_dir):
    """Return ``(english, foreign)`` lists of voice keys found under ``root_dir``.

    The directory ``<root_dir>voices/<lang>/<voice>/`` is scanned. A voice
    whose folder contains ``speakers.txt`` contributes one key per non-blank
    line of that file, formatted ``<lang>/<voice>#<speaker>``; a voice without
    that file contributes the single key ``<lang>/<voice>``.

    A language folder counts as English when its name contains ``'en_'``.
    Blank lines in ``speakers.txt`` are skipped so that no key ends in a
    dangling ``#``.
    """
    english, foreign = [], []
    voices_dir = os.path.join(root_dir, 'voices')
    for lang in os.listdir(voices_dir):
        # Same discovery logic for both groups; only the target list differs.
        bucket = english if 'en_' in lang else foreign
        for voice in os.listdir(os.path.join(voices_dir, lang)):
            key = lang + '/' + voice
            speakers_file = os.path.join(voices_dir, lang, voice, 'speakers.txt')
            try:
                with open(speakers_file, 'r') as f:
                    speakers = [line.rstrip() for line in f]
            except FileNotFoundError:
                # No speakers.txt: the voice is addressed without a '#speaker' suffix.
                bucket.append(key)
                continue
            bucket.extend(key + '#' + spk for spk in speakers if spk)
    return english, foreign


english_voices, foreign_voices = _collect_voices(ROOT_DIR)

# Print the discovered voices, foreign first, then English.
for voice_key in foreign_voices:
    print(voice_key)
print('\n_______________________________\n')
for voice_key in english_voices:
    print(voice_key)
| # ====================================================== LIST Mimic-3 ALL VOICES | |
| # list_voices = [ | |
| # 'en_US/m-ailabs_low#mary_ann', | |
| # 'en_UK/apope_low', | |
| # 'de_DE/thorsten-emotion_low#neutral', # is the 4x really interesting we can just write it in Section | |
| # # 'ko_KO/kss_low', | |
| # 'fr_FR/m-ailabs_low#gilles_g_le_blanc', | |
| # #'human', | |
| # ] # special - for human we load specific style file - no Mimic3 is run | |
| # ================================== ====== END INTERFACE | |
def process_lines(state: CommandLineInterfaceState, wav_path=None):
    """Synthesize every entry of ``state.texts`` and save the audio to ``wav_path``.

    Local replacement for the ``process_lines`` of ``mimic3_tts.__main__``
    (whose import is commented out in this file): each line is handed to
    ``process_line`` and the audio accumulated in ``state.all_audio`` is
    written to a WAV file instead of being played.

    Args:
        state: Mimic-3 CLI state; it must already be initialised by the
            caller. Reads ``state.args``, ``state.texts``, ``state.all_audio``
            and the ``sample_rate_hz`` / ``sample_width_bytes`` /
            ``num_channels`` fields.
        wav_path: Destination of the WAV file. ``open()`` raises ``TypeError``
            if it is left as ``None`` and there is audio to write.

    The file is written only when ``state.all_audio`` is non-empty, stdout is
    a TTY and ``state.args.stdout`` is falsy; otherwise a notice is printed
    and nothing is written.
    """
    args = state.args

    print(f'why waitings in the for loop LIN {state.texts=}\n')
    for line in state.texts:
        line_voice: typing.Optional[str] = None
        line_id = ""
        # Blank lines are NOT skipped here; they are forwarded to process_line as-is.
        line = line.strip()

        if args.output_naming == OutputNaming.ID:
            # Line has the format id|text instead of just text
            with io.StringIO(line) as line_io:
                reader = csv.reader(line_io, delimiter=args.csv_delimiter)
                row = next(reader)
                line_id, line = row[0], row[-1]
                if args.csv_voice:
                    line_voice = row[1]

        process_line(line, state, line_id=line_id, line_voice=line_voice)

    # NOTE(review): fixed pause, presumably so that the synthesized audio has
    # time to land in state.all_audio before it is read - confirm against
    # mimic3_tts internals.
    time.sleep(4)

    if not state.all_audio:
        print('\n\nDOES NOT TTSING --> ADD SOME time.sleep(4)', wav_path)
        return

    if not sys.stdout.isatty() or state.args.stdout:
        # Previously this case was silent, which let callers read a stale file.
        print('\n\nWAV NOT WRITTEN (stdout is not a TTY or --stdout is set)', wav_path)
        return

    # Write the WAV straight to disk; the wave writer fills in the header
    # from the stream parameters held in `state`.
    with open(wav_path, 'wb') as wav_file:
        wav_writer: wave.Wave_write = wave.open(wav_file, 'wb')
        with wav_writer:
            wav_writer.setframerate(state.sample_rate_hz)
            wav_writer.setsampwidth(state.sample_width_bytes)
            wav_writer.setnchannels(state.num_channels)
            wav_writer.writeframes(state.all_audio)
    print('\n\n5T', wav_path)
| # ----------------------------------------------------------------------------- | |
| # cat _tmp_ssml.txt | mimic3 --cuda --ssml --noise-w 0.90001 --length-scale 0.91 --noise-scale 0.04 > noise_w=0.90_en_happy_2.wav | |
| # ====================================================================== | |
# Synthesize the Harvard sentences with Mimic-3 for each voice group
# (english / foreign) and each speaking rate (1x, 4x), cycling through the
# voices of the group, and save one concatenated WAV per (rate, group).

# The sentence lists do not depend on voice group or rate: load them once.
with open('harvard.json', 'r') as f:
    harvard_individual_sentences = json.load(f)['sentences']

for lang, list_voices in [
    ['english', english_voices],
    ['foreign', foreign_voices],
]:
    if not list_voices:
        # Avoids ZeroDivisionError in the voice rotation below.
        print(f'No {lang} voices available - skipping')
        continue

    for rate in [1, 4]:
        # Flatten the first 4 lists so the running index advances once per
        # sentence across all of them.
        sentences = []
        for list_of_10 in harvard_individual_sentences[:4]:  # 77
            sentences.extend(list_of_10['sentences'])

        total_audio_mimic3 = []
        for ix, text in enumerate(sentences):
            # Rotate through the voices so consecutive sentences use different ones.
            _voice = list_voices[ix % len(list_voices)]
            print(ix, lang, text)

            # The last character of the sentence is dropped and replaced by ", .. !!!".
            _ssml = (
                '<speak>'
                '<prosody volume=\'64\'>'
                f'<prosody rate=\'{rate}\'>'
                f'<voice name=\'{_voice}\'>'
                '<s>'
                f'{text[:-1] + ", .. !!!"}'
                '</s>'
                '</voice>'
                '</prosody>'
                '</prosody>'
                '</speak>'
            )
            # Kept on disk so the same input can be replayed manually through
            # the mimic3 command line.
            with open('_tmp_ssml.txt', 'w') as f:
                f.write(_ssml)

            args = get_args()
            args.ssml = True
            args.text = [_ssml]
            args.interactive = False
            state = CommandLineInterfaceState(args=args)
            initialize_args(state)
            initialize_tts(state)

            style_path = 'tmp1.wav'
            # Remove the previous sentence's file so that a failed synthesis
            # cannot be mistaken for a fresh result.
            Path(style_path).unlink(missing_ok=True)
            process_lines(state, wav_path=style_path)
            shutdown_tts(state)

            if not os.path.isfile(style_path):
                print(f'No audio written for sentence {ix} ({_voice}) - skipping')
                continue
            x, _fs = audiofile.read(style_path)
            total_audio_mimic3.append(x)

        if not total_audio_mimic3:
            # np.concatenate raises on an empty list.
            print(f'No audio produced for rate={rate} lang={lang}')
            continue

        # NOTE(review): output is written at a fixed 22050 Hz rather than the
        # rate returned by audiofile.read - confirm all voices use 22050 Hz.
        total_audio_mimic3 = np.concatenate(total_audio_mimic3)
        audiofile.write(f'harvards_upload_mimic3_{rate}_{lang}.wav', total_audio_mimic3, 22050)
        print(total_audio_mimic3.shape, 'LEN\n')