| python3 -m torch.distributed.run --nproc_per_node 8 --rdzv_backend=c10d --rdzv_endpoint=localhost:0 --nnodes=1 pretrain.py \ | |
| arch=trm \ | |
| data_paths="[data/arc2concept-aug-1000]" \ | |
| arch.L_layers=2 \ | |
| arch.H_cycles=3 arch.L_cycles=4 \ | |
| +run_name=trm_arc2_8gpu_resume_step115815_plus100k_v2 ema=True \ | |
| checkpoint_every_eval=True \ | |
| epochs=24000 eval_interval=100 \ | |
| global_batch_size=768 \ | |
| +load_checkpoint="/workspace/TinyRecursiveModels/checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_plus100k/step_115815" | |