| checkpoints: | |
| checkpoint_interval: 500 | |
| checkpoints_path: /scratch/craffel/checkpoints/commav0p1-ablations-1p82G-commonpile0p1filteredyoutube-seed-6- | |
| checkpoints_path_is_shared_file_system: false | |
| resume_checkpoint_path: s3://comma-v0.1-ablations/checkpoints/commav0p1-ablations-1p82G-commonpile0p1filteredyoutube-seed-6- | |
| save_initial_state: true | |
| data: | |
| dataset: | |
| dataloader_type: single | |
| dataset_max_tokens: null | |
| dataset_weights: null | |
| datasets: | |
| - bits_per_token: 16 | |
| filename_pattern: .*\.ds$ | |
| folder: /scratch/dataset/commav0p1-ablations-1p82G-commonpile0p1filteredyoutube-seed-6-/ | |
| original_folder: null | |
| seed: 6 | |
| shuffle: true | |
| skip_tokens: 0 | |
| pad_samples_to_global_batch_size: false | |
| skip_in_stream: true | |
| num_loading_workers: 0 | |
| seed: 6 | |
| experiment_logger: | |
| tensorboard_logger: | |
| push_to_hub_interval: 300 | |
| repo_id: craffel/commav0p1-ablations | |
| repo_public: false | |
| tensorboard_dir: /scratch/craffel/tensorboard-craffel-commav0p1-ablations | |
| wandb_logger: null | |
| general: | |
| benchmark_csv_path: null | |
| consumed_train_samples: 14336000 | |
| ignore_sanity_checks: true | |
| project: commav0p1-ablations | |
| run: commav0p1-ablations-1p82G-commonpile0p1filteredyoutube-seed-6- | |
| seed: 42 | |
| step: 14000 | |
| kill_switch_path: null | |
| lighteval: | |
| batch_size: 16 | |
| checkpoints_path: null | |
| generation: null | |
| logging: | |
| hub_repo_details: null | |
| hub_repo_results: null | |
| hub_repo_tensorboard: craffel/commav0p1-ablations | |
| local_output_path: /scratch/craffel/lighteval/commav0p1-ablations-1p82G-commonpile0p1filteredyoutube-seed-6- | |
| push_details_to_hub: false | |
| push_results_to_hub: false | |
| push_results_to_tensorboard: true | |
| tensorboard_metric_prefix: e | |
| parallelism: | |
| dp: 8 | |
| expert_parallel_size: 1 | |
| pp: 1 | |
| pp_engine: 1f1b | |
| tp: 1 | |
| tp_linear_async_communication: false | |
| tp_mode: ALL_REDUCE | |
| slurm_script_dir: /fsx/craffel/train/eval-scripts | |
| slurm_template: /fsx/craffel/run_eval.slurm.jinja | |
| tasks: | |
| custom_tasks: brrr.lighteval.evaluation_tasks | |
| dataset_loading_processes: 8 | |
| max_samples: 1000 | |
| multichoice_continuations_start_space: null | |
| no_multichoice_continuations_start_space: null | |
| num_fewshot_seeds: null | |
| tasks: early-signal | |
| wandb: null | |
| logging: | |
| iteration_step_info_interval: 1 | |
| log_level: info | |
| log_level_replica: info | |
| model: | |
| ddp_bucket_cap_mb: 25 | |
| dtype: bfloat16 | |
| init_method: | |
| std: 0.02 | |
| make_vocab_size_divisible_by: 1 | |
| model_config: | |
| bos_token_id: 1 | |
| eos_token_id: 2 | |
| hidden_act: silu | |
| hidden_size: 2048 | |
| initializer_range: 0.02 | |
| intermediate_size: 8192 | |
| is_llama_config: true | |
| max_position_embeddings: 2048 | |
| num_attention_heads: 32 | |
| num_hidden_layers: 24 | |
| num_key_value_heads: 32 | |
| pad_token_id: null | |
| pretraining_tp: 1 | |
| rms_norm_eps: 1.0e-05 | |
| rope_scaling: null | |
| tie_word_embeddings: true | |
| use_cache: true | |
| vocab_size: 50272 | |
| optimizer: | |
| accumulate_grad_in_fp32: true | |
| adam_beta1: 0.9 | |
| adam_beta2: 0.95 | |
| adam_eps: 1.0e-08 | |
| clip_grad: 1.0 | |
| learning_rate_scheduler: | |
| learning_rate: 0.0003 | |
| lr_decay_starting_step: null | |
| lr_decay_steps: null | |
| lr_decay_style: cosine | |
| lr_warmup_steps: 500 | |
| lr_warmup_style: linear | |
| min_decay_lr: 3.0e-05 | |
| torch_adam_is_fused: true | |
| weight_decay: 0.1 | |
| zero_stage: 0 | |
| parallelism: | |
| dp: 64 | |
| expert_parallel_size: 1 | |
| pp: 1 | |
| pp_engine: 1f1b | |
| tp: 1 | |
| tp_linear_async_communication: true | |
| tp_mode: REDUCE_SCATTER | |
| profiler: null | |
| s3_upload: | |
| remove_after_upload: true | |
| s5cmd_concurrency: 5 | |
| s5cmd_numworkers: 16 | |
| s5cmd_path: /fsx/craffel/miniconda3/envs/exp/bin/s5cmd | |
| upload_s3_path: s3://comma-v0.1-ablations/checkpoints/commav0p1-ablations-1p82G-commonpile0p1filteredyoutube-seed-6- | |
| tokenizer: | |
| tokenizer_max_length: null | |
| tokenizer_name_or_path: gpt2 | |
| tokenizer_revision: null | |
| tokens: | |
| batch_accumulation_per_replica: 4 | |
| limit_test_batches: 0 | |
| limit_val_batches: 0 | |
| micro_batch_size: 4 | |
| sequence_length: 2048 | |
| train_steps: 14305 | |
| val_check_interval: 100 | |