# NOTE(review): the original paste carried stray "Spaces:" / "Runtime error"
# lines here — extraction residue from the tool that captured this script,
# not part of the script itself. Replaced with this comment header.
#!/usr/bin/env bash
#
# Launch a local single-GPU Magpie TTS training run via the NeMo
# examples/tts/magpietts.py Hydra entry point.
# Assumes the container mounts /checkpoints, /data/TTS and
# /nemo_codec_checkpoints — TODO confirm against the container setup.
set -euo pipefail

### Good
# NOTE(review): CKPT, HPARAM, PROJECT and EXP_NAME are defined but never
# referenced by the python command below — kept for reference/future use.
CKPT="/checkpoints/streaming/magpie/jason/magpieTTS--val_loss=5.1255-epoch=89-last.ckpt"
HPARAM="/checkpoints/streaming/magpie/jason/magpietts_en_jason_inference.yaml"
PROJECT='magpieTTS_en_newcodec_exps'

# Codec model selection (25 FPS causal codec, 8 codebooks).
CODECMODEL_NAME="C25FPS_Causal_8cb"
# CODEC_PATH="/nemo_codec_checkpoints/21fps_causal_codecmodel.nemo"
CODEC_PATH="/nemo_codec_checkpoints/Low_Frame-rate_25FPS_Speech_Codec++.nemo"

# Experiment hyperparameters — folded into EXP_NAME for bookkeeping.
DATASETS="HRLLM"
MODEL_STRUCTURE="MP_CE"
LR=1e-4
PRECISION=bf16
SAMPLE_WEIGHT_LIBRIVOX=0.2
BATCH_SIZE=12
EXP_NAME="${MODEL_STRUCTURE}_${CODECMODEL_NAME}_${DATASETS}_lr${LR}_bs${BATCH_SIZE}_precision${PRECISION}_w${SAMPLE_WEIGHT_LIBRIVOX}_bin_4"

# EXP_DIR="/lustre/fsw/llmservice_nemo_speechlm/users/subhankarg/experiments_slurm_logs/T5TTSMarch2025/magpie2503_causal_codec_causal_enc_attnprior_nolocaltransformer"
# NOTE(review): the original script assigned DOCKER_EXP_DIR="/checkpoints/results"
# near the top and then immediately shadowed it with this value; the dead
# first assignment has been removed.
DOCKER_EXP_DIR="/checkpoints/local_training/magpie2503_causal_codec_causal_enc_attnprior_nolocaltransformer"

# Pin the run to GPU 0 (trainer.devices=-1 below then uses all *visible* GPUs).
export CUDA_VISIBLE_DEVICES=0

# Earlier wait-k overrides, kept for reference:
# ++model.train_ds.dataset.include_wait_k=True \
# ++model.train_ds.dataset.max_wait_k=2 \
# ++model.validation_ds.dataset.include_wait_k=True \
# ++model.validation_ds.dataset.max_wait_k=2 \
python examples/tts/magpietts.py \
  --config-path=/workspace/NeMo/examples/tts/conf/magpietts \
  --config-name=magpietts_en_bpe_25fps \
  exp_manager.exp_dir="${DOCKER_EXP_DIR}" \
  ++exp_manager.name="magpieTTS" \
  ++exp_manager.create_tensorboard_logger=true \
  ++exp_manager.create_wandb_logger=false \
  +exp_manager.version=0 \
  weighted_sampling_steps_per_epoch=1000 \
  max_epochs=500 \
  batch_size="${BATCH_SIZE}" \
  ~model.text_tokenizers.english_phoneme \
  ++model.text_tokenizers.qwen._target_=AutoTokenizer \
  ++model.text_tokenizers.qwen.pretrained_model="Qwen/Qwen3-1.7B" \
  ++model.use_bpe_char_tokenizer=true \
  +train_ds_meta.libri100train.manifest_path="/data/TTS/manifests/libri100__phoneme__nemo_audio_21fps_8codebooks_2kcodes_v2bWithWavLM_simplet5_withContextAudioPaths.json" \
  +train_ds_meta.libri100train.audio_dir="/data/TTS/LibriTTS" \
  +train_ds_meta.libri100train.feature_dir="/data/TTS/LibriTTS" \
  +train_ds_meta.libri100train.sample_weight=1.0 \
  +train_ds_meta.libri100train.tokenizer_names="[qwen]" \
  +val_ds_meta.libridev.manifest_path="/data/TTS/manifests/dev_clean_withContextAudioPaths.json" \
  +val_ds_meta.libridev.audio_dir="/data/TTS/LibriTTS" \
  +val_ds_meta.libridev.feature_dir="/data/TTS/LibriTTS" \
  +val_ds_meta.libridev.tokenizer_names="[qwen]" \
  model.train_ds.dataset.min_duration=0.2 \
  model.validation_ds.dataset.min_duration=0.2 \
  ++model.train_ds.dataset.load_cached_codes_if_available=false \
  ++model.validation_ds.dataset.load_cached_codes_if_available=false \
  ++model.train_ds.dataset.include_wait_k=True \
  ++model.train_ds.dataset.max_wait_k=3 \
  ++model.train_ds.dataset.wait_k_step_size=2 \
  ++model.validation_ds.dataset.include_wait_k=True \
  ++model.validation_ds.dataset.max_wait_k=3 \
  ++model.validation_ds.dataset.wait_k_step_size=2 \
  model.context_duration_min=5.0 \
  model.context_duration_max=5.0 \
  model.codecmodel_path="${CODEC_PATH}" \
  model.model_type="decoder_context_tts" \
  model.use_text_conditioning_encoder=false \
  model.alignment_loss_scale=0.002 \
  model.prior_scaling_factor=0.1 \
  model.prior_end_step=0 \
  model.prior_scaledown_start_step=0 \
  model.indefinite_prior_prob=0.5 \
  model.use_alignment_encoder=true \
  model.binarize_prior_after_step=5 \
  model.alignment_encoder_loss_scale=1.0 \
  model.use_prior_for_aligner=true \
  model.prior_future_context=0 \
  model.prior_past_context=1 \
  model.prior_future_decay=0.05 \
  model.prior_past_decay=0.03 \
  model.binarized_prior_epsilon=0.0001 \
  model.use_local_transformer=false \
  +model.cfg_unconditional_prob=0.1 \
  model.encoder.is_causal=true \
  model.encoder.kernel_size=3 \
  model.context_encoder.is_causal=true \
  model.decoder.kernel_size=3 \
  model.decoder.xa_n_heads=1 \
  model.embedding_dim=768 \
  model.train_ds.dataloader_params.num_workers=2 \
  model.validation_ds.dataloader_params.num_workers=2 \
  trainer.devices=-1 \
  trainer.val_check_interval=100 \
  +trainer.num_sanity_val_steps=0 \
  trainer.precision="${PRECISION}" \
  model.optim.lr="${LR}"
# +trainer.check_val_every_n_epoch=1 \