trm-arc2-8gpu / COMMANDS.txt
seconds-0's picture
Refresh with step119432 resume checkpoint
520035d verified
raw
history blame contribute delete
535 Bytes
# Launch pretrain.py through torch.distributed.run on one node with 8 worker
# processes. The arguments are grouped into two arrays and expanded in their
# original order, so the argv handed to python3 is exactly the same as a single
# backslash-continued command line would produce.

# Flags consumed by the torch.distributed.run launcher itself.
launcher_flags=(
  --nproc_per_node 8
  --rdzv_backend=c10d
  # NOTE(review): port 0 presumably lets the launcher choose a free port -- confirm.
  --rdzv_endpoint=localhost:0
  --nnodes=1
)

# key=value overrides handed to pretrain.py.
# NOTE(review): the leading "+" on run_name/load_checkpoint looks like
# append-style config override syntax (e.g. Hydra) -- confirm against pretrain.py.
train_overrides=(
  arch=trm
  # Quoted so the shell never treats the brackets as a glob pattern.
  'data_paths=[data/arc2concept-aug-1000]'
  arch.L_layers=2
  arch.H_cycles=3
  arch.L_cycles=4
  +run_name=trm_arc2_8gpu_resume_step115815_plus100k_v2
  ema=True
  checkpoint_every_eval=True
  epochs=24000
  eval_interval=100
  global_batch_size=768
  '+load_checkpoint=/workspace/TinyRecursiveModels/checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_plus100k/step_115815'
)

python3 -m torch.distributed.run "${launcher_flags[@]}" pretrain.py "${train_overrides[@]}"