#!/bin/bash
#SBATCH -A                    # account (<< CHANGE ! >>)
#SBATCH -J ""                 # job name (<< CHANGE ! >>)
#SBATCH -p                    # partition (queue) (<< CHANGE ! >>)
#SBATCH -N 1                  # number of nodes
#SBATCH -t 4:00:00            # wall time
#SBATCH --time-min=04:00:00
#SBATCH --ntasks-per-node=8   # n tasks per node (one task per GPU)
#SBATCH --gpus-per-node=8
#SBATCH --exclusive
#SBATCH --overcommit
#SBATCH --mem=0

# """""""""""""""NOTE"""""""""""""""
# This is a template script for running NeMo on Slurm. It is provided only as a
# reference to get you started with multi-node, multi-GPU training.
# Adjust the paths, account, partition, and other parameters to your cluster setup.
# Make sure the paths to the data, code, and results directories are correct.
# Make sure your WandB API key is set below.
# Make sure your NGC key is in ~/.config/enroot/.credentials.
# Make sure the container image is correct (import it to a .sqsh file to improve loading times).
# This script is untested and may require adjustments for your cluster setup.
# Once you have adjusted the script, submit it with:
#   sbatch slurm_example.sh
# """

set -x

WANDB=""  # replace with your own WandB API key

CONTAINER=nvcr.io/nvidia/nemo:25.02.rc4  # Adjust to your needs, and make sure your NGC key is in ~/.config/enroot/.credentials

CODE_DIR=  # Adjust to your needs.
DATA_DIR=  # Adjust to your needs, depending on how you want to mount the data.
MOUNTS="--container-mounts=${CODE_DIR},${DATA_DIR}:/data"

CONFIG_PATH="${CODE_DIR}/examples/asr/conf/speech_multitask"  # Adjust if launching from outside this directory.
CONFIG_NAME="fast-conformer_aed.yaml"

EXP_NAME="canary-1b-repro"

RESULTS_DIR=""  # Adjust to your needs.
mkdir -p "${RESULTS_DIR}"

# && export AIS_ENDPOINT="" \  # Update and uncomment if you want to use AIS for data storage.
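# The launch command below is a sketch of a typical NeMo AED training
# invocation, assembled from the variables defined above; it is an assumption,
# not a verified original. It calls the speech_to_text_aed.py example with
# Hydra overrides. The exp_manager and trainer overrides shown here are
# illustrative; replace or extend them with the values your run actually needs
# (dataset manifests, tokenizer, batch sizes, etc.).
read -r -d '' cmd <<EOF
cd ${CODE_DIR} \
&& export PYTHONPATH="${CODE_DIR}:\${PYTHONPATH}" \
&& export HYDRA_FULL_ERROR=1 \
&& export WANDB_API_KEY="${WANDB}" \
&& python ${CODE_DIR}/examples/asr/speech_multitask/speech_to_text_aed.py \
    --config-path="${CONFIG_PATH}" \
    --config-name="${CONFIG_NAME}" \
    exp_manager.name="${EXP_NAME}" \
    exp_manager.exp_dir="${RESULTS_DIR}" \
    exp_manager.create_wandb_logger=true \
    exp_manager.wandb_logger_kwargs.name="${EXP_NAME}" \
    trainer.num_nodes=${SLURM_JOB_NUM_NODES}
EOF

# Launch one task per GPU inside the container; write per-job, per-node logs
# (%j = job ID, %n = node ID) under RESULTS_DIR.
OUTFILE="${RESULTS_DIR}/slurm-%j-%n.out"
ERRFILE="${RESULTS_DIR}/error-%j-%n.out"

srun -o "${OUTFILE}" -e "${ERRFILE}" \
    --container-image="${CONTAINER}" ${MOUNTS} \
    bash -c "${cmd}"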